author     Stefan Schmidt <stefan@datenfreihafen.org>   2021-02-24 13:36:58 +0100
committer  Stefan Schmidt <stefan@datenfreihafen.org>   2021-02-24 13:36:58 +0100
commit     cdd38c5f1ce4398ec58fec95904b75824daab7b5 (patch)
tree       639cf51fe8ee120a13e61b13d448aeaf4d044c74 /arch/x86
parent     04052a318fb93491f1f3b4d282cb806f588e9326 (diff)
parent     fcb3007371e1a4afb03280af1b336a83287fe115 (diff)
Merge remote-tracking branch 'net/master'
Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig | 41
-rw-r--r--  arch/x86/Kconfig.debug | 3
-rw-r--r--  arch/x86/Makefile | 46
-rw-r--r--  arch/x86/boot/code16gcc.h | 12
-rw-r--r--  arch/x86/boot/compressed/Makefile | 6
-rw-r--r--  arch/x86/boot/compressed/head_64.S | 8
-rw-r--r--  arch/x86/boot/compressed/ident_map_64.c | 11
-rw-r--r--  arch/x86/boot/compressed/mem_encrypt.S | 20
-rw-r--r--  arch/x86/boot/compressed/misc.h | 3
-rw-r--r--  arch/x86/boot/compressed/sev-es.c | 5
-rw-r--r--  arch/x86/crypto/aes_glue.c | 1
-rw-r--r--  arch/x86/crypto/aesni-intel_asm.S | 20
-rw-r--r--  arch/x86/crypto/aesni-intel_avx-x86_64.S | 20
-rw-r--r--  arch/x86/crypto/poly1305-x86_64-cryptogams.pl | 2
-rw-r--r--  arch/x86/crypto/poly1305_glue.c | 2
-rw-r--r--  arch/x86/crypto/sha1_ssse3_glue.c | 2
-rw-r--r--  arch/x86/crypto/sha256_ssse3_glue.c | 2
-rw-r--r--  arch/x86/crypto/sha512-avx-asm.S | 2
-rw-r--r--  arch/x86/crypto/sha512-ssse3-asm.S | 2
-rw-r--r--  arch/x86/crypto/sha512_ssse3_glue.c | 2
-rw-r--r--  arch/x86/entry/common.c | 44
-rw-r--r--  arch/x86/entry/entry_64.S | 26
-rw-r--r--  arch/x86/entry/syscalls/syscall_32.tbl | 1
-rw-r--r--  arch/x86/entry/syscalls/syscall_64.tbl | 1
-rw-r--r--  arch/x86/entry/thunk_64.S | 15
-rw-r--r--  arch/x86/entry/vdso/Makefile | 8
-rw-r--r--  arch/x86/entry/vdso/extable.c | 46
-rw-r--r--  arch/x86/entry/vdso/extable.h | 28
-rw-r--r--  arch/x86/entry/vdso/vdso-layout.lds.S | 9
-rw-r--r--  arch/x86/entry/vdso/vdso.lds.S | 1
-rw-r--r--  arch/x86/entry/vdso/vdso2c.c | 2
-rw-r--r--  arch/x86/entry/vdso/vdso2c.h | 50
-rw-r--r--  arch/x86/entry/vdso/vdso32/sigreturn.S | 2
-rw-r--r--  arch/x86/entry/vdso/vma.c | 36
-rw-r--r--  arch/x86/entry/vdso/vsgx.S | 151
-rw-r--r--  arch/x86/entry/vsyscall/vsyscall_64.c | 2
-rw-r--r--  arch/x86/events/amd/core.c | 2
-rw-r--r--  arch/x86/events/core.c | 32
-rw-r--r--  arch/x86/events/intel/core.c | 579
-rw-r--r--  arch/x86/events/intel/cstate.c | 25
-rw-r--r--  arch/x86/events/intel/ds.c | 201
-rw-r--r--  arch/x86/events/intel/lbr.c | 4
-rw-r--r--  arch/x86/events/intel/uncore.c | 68
-rw-r--r--  arch/x86/events/intel/uncore.h | 17
-rw-r--r--  arch/x86/events/intel/uncore_snb.c | 24
-rw-r--r--  arch/x86/events/intel/uncore_snbep.c | 114
-rw-r--r--  arch/x86/events/msr.c | 1
-rw-r--r--  arch/x86/events/perf_event.h | 26
-rw-r--r--  arch/x86/events/probe.c | 7
-rw-r--r--  arch/x86/events/probe.h | 7
-rw-r--r--  arch/x86/events/rapl.c | 48
-rw-r--r--  arch/x86/hyperv/hv_apic.c | 14
-rw-r--r--  arch/x86/hyperv/hv_init.c | 33
-rw-r--r--  arch/x86/hyperv/mmu.c | 12
-rw-r--r--  arch/x86/ia32/ia32_signal.c | 2
-rw-r--r--  arch/x86/include/asm/acpi.h | 11
-rw-r--r--  arch/x86/include/asm/apic.h | 26
-rw-r--r--  arch/x86/include/asm/apicdef.h | 16
-rw-r--r--  arch/x86/include/asm/atomic.h | 2
-rw-r--r--  arch/x86/include/asm/atomic64_64.h | 2
-rw-r--r--  arch/x86/include/asm/barrier.h | 18
-rw-r--r--  arch/x86/include/asm/cacheinfo.h | 4
-rw-r--r--  arch/x86/include/asm/cmpxchg.h | 2
-rw-r--r--  arch/x86/include/asm/compat.h | 26
-rw-r--r--  arch/x86/include/asm/copy_mc_test.h | 75
-rw-r--r--  arch/x86/include/asm/cpufeature.h | 7
-rw-r--r--  arch/x86/include/asm/cpufeatures.h | 19
-rw-r--r--  arch/x86/include/asm/disabled-features.h | 11
-rw-r--r--  arch/x86/include/asm/efi.h | 51
-rw-r--r--  arch/x86/include/asm/elf.h | 17
-rw-r--r--  arch/x86/include/asm/elfcore-compat.h | 31
-rw-r--r--  arch/x86/include/asm/enclu.h | 9
-rw-r--r--  arch/x86/include/asm/entry-common.h | 2
-rw-r--r--  arch/x86/include/asm/fixmap.h | 15
-rw-r--r--  arch/x86/include/asm/fpu/api.h | 50
-rw-r--r--  arch/x86/include/asm/ftrace.h | 18
-rw-r--r--  arch/x86/include/asm/highmem.h | 13
-rw-r--r--  arch/x86/include/asm/hpet.h | 11
-rw-r--r--  arch/x86/include/asm/hw_irq.h | 14
-rw-r--r--  arch/x86/include/asm/hyperv-tlfs.h | 7
-rw-r--r--  arch/x86/include/asm/idtentry.h | 10
-rw-r--r--  arch/x86/include/asm/insn.h | 15
-rw-r--r--  arch/x86/include/asm/inst.h | 15
-rw-r--r--  arch/x86/include/asm/intel-family.h | 1
-rw-r--r--  arch/x86/include/asm/io_apic.h | 79
-rw-r--r--  arch/x86/include/asm/iomap.h | 13
-rw-r--r--  arch/x86/include/asm/irq.h | 2
-rw-r--r--  arch/x86/include/asm/irq_remapping.h | 9
-rw-r--r--  arch/x86/include/asm/irqdomain.h | 3
-rw-r--r--  arch/x86/include/asm/irqflags.h | 46
-rw-r--r--  arch/x86/include/asm/kmap_types.h | 13
-rw-r--r--  arch/x86/include/asm/kprobes.h | 11
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 41
-rw-r--r--  arch/x86/include/asm/livepatch.h | 4
-rw-r--r--  arch/x86/include/asm/local64.h | 1
-rw-r--r--  arch/x86/include/asm/mce.h | 31
-rw-r--r--  arch/x86/include/asm/mem_encrypt.h | 2
-rw-r--r--  arch/x86/include/asm/microcode.h | 2
-rw-r--r--  arch/x86/include/asm/mmu.h | 9
-rw-r--r--  arch/x86/include/asm/mmu_context.h | 8
-rw-r--r--  arch/x86/include/asm/mshyperv.h | 2
-rw-r--r--  arch/x86/include/asm/msi.h | 50
-rw-r--r--  arch/x86/include/asm/msidef.h | 57
-rw-r--r--  arch/x86/include/asm/msr-index.h | 13
-rw-r--r--  arch/x86/include/asm/msr.h | 4
-rw-r--r--  arch/x86/include/asm/mwait.h | 2
-rw-r--r--  arch/x86/include/asm/nmi.h | 1
-rw-r--r--  arch/x86/include/asm/page_32_types.h | 8
-rw-r--r--  arch/x86/include/asm/page_64_types.h | 8
-rw-r--r--  arch/x86/include/asm/paravirt.h | 41
-rw-r--r--  arch/x86/include/asm/paravirt_types.h | 18
-rw-r--r--  arch/x86/include/asm/perf_event.h | 28
-rw-r--r--  arch/x86/include/asm/pgtable_32.h | 18
-rw-r--r--  arch/x86/include/asm/pgtable_64_types.h | 6
-rw-r--r--  arch/x86/include/asm/pgtable_types.h | 3
-rw-r--r--  arch/x86/include/asm/preempt.h | 48
-rw-r--r--  arch/x86/include/asm/processor.h | 2
-rw-r--r--  arch/x86/include/asm/required-features.h | 3
-rw-r--r--  arch/x86/include/asm/resctrl.h | 11
-rw-r--r--  arch/x86/include/asm/seccomp.h | 20
-rw-r--r--  arch/x86/include/asm/set_memory.h | 1
-rw-r--r--  arch/x86/include/asm/sparsemem.h | 10
-rw-r--r--  arch/x86/include/asm/special_insns.h | 6
-rw-r--r--  arch/x86/include/asm/stacktrace.h | 3
-rw-r--r--  arch/x86/include/asm/static_call.h | 7
-rw-r--r--  arch/x86/include/asm/svm.h | 40
-rw-r--r--  arch/x86/include/asm/sync_core.h | 9
-rw-r--r--  arch/x86/include/asm/thermal.h | 13
-rw-r--r--  arch/x86/include/asm/thread_info.h | 17
-rw-r--r--  arch/x86/include/asm/tlb.h | 1
-rw-r--r--  arch/x86/include/asm/topology.h | 9
-rw-r--r--  arch/x86/include/asm/trap_pf.h | 2
-rw-r--r--  arch/x86/include/asm/uv/bios.h | 51
-rw-r--r--  arch/x86/include/asm/uv/uv.h | 10
-rw-r--r--  arch/x86/include/asm/uv/uv_geo.h | 103
-rw-r--r--  arch/x86/include/asm/vdso.h | 7
-rw-r--r--  arch/x86/include/asm/vm86.h | 1
-rw-r--r--  arch/x86/include/asm/vmx.h | 1
-rw-r--r--  arch/x86/include/asm/x86_init.h | 2
-rw-r--r--  arch/x86/include/asm/xen/page.h | 2
-rw-r--r--  arch/x86/include/uapi/asm/kvm.h | 1
-rw-r--r--  arch/x86/include/uapi/asm/kvm_para.h | 1
-rw-r--r--  arch/x86/include/uapi/asm/sgx.h | 168
-rw-r--r--  arch/x86/include/uapi/asm/signal.h | 24
-rw-r--r--  arch/x86/include/uapi/asm/svm.h | 28
-rw-r--r--  arch/x86/include/uapi/asm/vm86.h | 4
-rw-r--r--  arch/x86/include/uapi/asm/vmx.h | 2
-rw-r--r--  arch/x86/kernel/Makefile | 2
-rw-r--r--  arch/x86/kernel/acpi/apei.c | 5
-rw-r--r--  arch/x86/kernel/acpi/wakeup_64.S | 2
-rw-r--r--  arch/x86/kernel/alternative.c | 2
-rw-r--r--  arch/x86/kernel/amd_nb.c | 4
-rw-r--r--  arch/x86/kernel/apic/apic.c | 77
-rw-r--r--  arch/x86/kernel/apic/apic_flat_64.c | 18
-rw-r--r--  arch/x86/kernel/apic/apic_noop.c | 10
-rw-r--r--  arch/x86/kernel/apic/apic_numachip.c | 16
-rw-r--r--  arch/x86/kernel/apic/bigsmp_32.c | 9
-rw-r--r--  arch/x86/kernel/apic/io_apic.c | 525
-rw-r--r--  arch/x86/kernel/apic/ipi.c | 6
-rw-r--r--  arch/x86/kernel/apic/msi.c | 153
-rw-r--r--  arch/x86/kernel/apic/probe_32.c | 9
-rw-r--r--  arch/x86/kernel/apic/vector.c | 73
-rw-r--r--  arch/x86/kernel/apic/x2apic_cluster.c | 16
-rw-r--r--  arch/x86/kernel/apic/x2apic_phys.c | 26
-rw-r--r--  arch/x86/kernel/apic/x2apic_uv_x.c | 66
-rw-r--r--  arch/x86/kernel/asm-offsets.c | 1
-rw-r--r--  arch/x86/kernel/asm-offsets_64.c | 3
-rw-r--r--  arch/x86/kernel/cpu/Makefile | 1
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 42
-rw-r--r--  arch/x86/kernel/cpu/aperfmperf.c | 16
-rw-r--r--  arch/x86/kernel/cpu/bugs.c | 55
-rw-r--r--  arch/x86/kernel/cpu/cacheinfo.c | 8
-rw-r--r--  arch/x86/kernel/cpu/common.c | 3
-rw-r--r--  arch/x86/kernel/cpu/cpuid-deps.c | 1
-rw-r--r--  arch/x86/kernel/cpu/feat_ctl.c | 38
-rw-r--r--  arch/x86/kernel/cpu/hygon.c | 31
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 4
-rw-r--r--  arch/x86/kernel/cpu/mce/Makefile | 2
-rw-r--r--  arch/x86/kernel/cpu/mce/amd.c | 4
-rw-r--r--  arch/x86/kernel/cpu/mce/apei.c | 61
-rw-r--r--  arch/x86/kernel/cpu/mce/core.c | 78
-rw-r--r--  arch/x86/kernel/cpu/mce/inject.c | 4
-rw-r--r--  arch/x86/kernel/cpu/mce/intel.c | 22
-rw-r--r--  arch/x86/kernel/cpu/mce/therm_throt.c | 739
-rw-r--r--  arch/x86/kernel/cpu/microcode/amd.c | 1
-rw-r--r--  arch/x86/kernel/cpu/microcode/core.c | 2
-rw-r--r--  arch/x86/kernel/cpu/microcode/intel.c | 63
-rw-r--r--  arch/x86/kernel/cpu/mshyperv.c | 47
-rw-r--r--  arch/x86/kernel/cpu/mtrr/cleanup.c | 4
-rw-r--r--  arch/x86/kernel/cpu/mtrr/generic.c | 7
-rw-r--r--  arch/x86/kernel/cpu/mtrr/mtrr.c | 7
-rw-r--r--  arch/x86/kernel/cpu/perfctr-watchdog.c | 11
-rw-r--r--  arch/x86/kernel/cpu/resctrl/core.c | 8
-rw-r--r--  arch/x86/kernel/cpu/resctrl/internal.h | 7
-rw-r--r--  arch/x86/kernel/cpu/resctrl/monitor.c | 88
-rw-r--r--  arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 2
-rw-r--r--  arch/x86/kernel/cpu/resctrl/rdtgroup.c | 228
-rw-r--r--  arch/x86/kernel/cpu/scattered.c | 4
-rw-r--r--  arch/x86/kernel/cpu/sgx/Makefile | 5
-rw-r--r--  arch/x86/kernel/cpu/sgx/arch.h | 338
-rw-r--r--  arch/x86/kernel/cpu/sgx/driver.c | 197
-rw-r--r--  arch/x86/kernel/cpu/sgx/driver.h | 29
-rw-r--r--  arch/x86/kernel/cpu/sgx/encl.c | 737
-rw-r--r--  arch/x86/kernel/cpu/sgx/encl.h | 119
-rw-r--r--  arch/x86/kernel/cpu/sgx/encls.h | 231
-rw-r--r--  arch/x86/kernel/cpu/sgx/ioctl.c | 716
-rw-r--r--  arch/x86/kernel/cpu/sgx/main.c | 737
-rw-r--r--  arch/x86/kernel/cpu/sgx/sgx.h | 86
-rw-r--r--  arch/x86/kernel/cpu/topology.c | 12
-rw-r--r--  arch/x86/kernel/cpu/vmware.c | 12
-rw-r--r--  arch/x86/kernel/cpuid.c | 7
-rw-r--r--  arch/x86/kernel/crash_dump_32.c | 48
-rw-r--r--  arch/x86/kernel/devicetree.c | 30
-rw-r--r--  arch/x86/kernel/dumpstack.c | 25
-rw-r--r--  arch/x86/kernel/fpu/core.c | 9
-rw-r--r--  arch/x86/kernel/fpu/xstate.c | 4
-rw-r--r--  arch/x86/kernel/ftrace_64.S | 15
-rw-r--r--  arch/x86/kernel/head64.c | 1
-rw-r--r--  arch/x86/kernel/head_64.S | 45
-rw-r--r--  arch/x86/kernel/hpet.c | 122
-rw-r--r--  arch/x86/kernel/hw_breakpoint.c | 61
-rw-r--r--  arch/x86/kernel/ima_arch.c | 94
-rw-r--r--  arch/x86/kernel/irq.c | 21
-rw-r--r--  arch/x86/kernel/irqflags.S | 11
-rw-r--r--  arch/x86/kernel/kprobes/core.c | 172
-rw-r--r--  arch/x86/kernel/kprobes/ftrace.c | 15
-rw-r--r--  arch/x86/kernel/kprobes/opt.c | 22
-rw-r--r--  arch/x86/kernel/kvm.c | 6
-rw-r--r--  arch/x86/kernel/kvmclock.c | 1
-rw-r--r--  arch/x86/kernel/ldt.c | 10
-rw-r--r--  arch/x86/kernel/module.c | 1
-rw-r--r--  arch/x86/kernel/msr.c | 15
-rw-r--r--  arch/x86/kernel/nmi.c | 6
-rw-r--r--  arch/x86/kernel/paravirt.c | 7
-rw-r--r--  arch/x86/kernel/paravirt_patch.c | 10
-rw-r--r--  arch/x86/kernel/pci-iommu_table.c | 3
-rw-r--r--  arch/x86/kernel/perf_regs.c | 17
-rw-r--r--  arch/x86/kernel/process.c | 12
-rw-r--r--  arch/x86/kernel/process_64.c | 28
-rw-r--r--  arch/x86/kernel/ptrace.c | 46
-rw-r--r--  arch/x86/kernel/reboot.c | 9
-rw-r--r--  arch/x86/kernel/setup.c | 11
-rw-r--r--  arch/x86/kernel/sev-es-shared.c | 30
-rw-r--r--  arch/x86/kernel/sev-es.c | 34
-rw-r--r--  arch/x86/kernel/sev_verify_cbit.S | 89
-rw-r--r--  arch/x86/kernel/signal.c | 4
-rw-r--r--  arch/x86/kernel/signal_compat.c | 9
-rw-r--r--  arch/x86/kernel/smpboot.c | 108
-rw-r--r--  arch/x86/kernel/static_call.c | 17
-rw-r--r--  arch/x86/kernel/step.c | 10
-rw-r--r--  arch/x86/kernel/sys_x86_64.c | 8
-rw-r--r--  arch/x86/kernel/tboot.c | 9
-rw-r--r--  arch/x86/kernel/topology.c | 1
-rw-r--r--  arch/x86/kernel/traps.c | 69
-rw-r--r--  arch/x86/kernel/uprobes.c | 12
-rw-r--r--  arch/x86/kernel/vm86_32.c | 62
-rw-r--r--  arch/x86/kernel/vmlinux.lds.S | 12
-rw-r--r--  arch/x86/kernel/x86_init.c | 1
-rw-r--r--  arch/x86/kvm/Kconfig | 3
-rw-r--r--  arch/x86/kvm/Makefile | 3
-rw-r--r--  arch/x86/kvm/cpuid.c | 34
-rw-r--r--  arch/x86/kvm/cpuid.h | 15
-rw-r--r--  arch/x86/kvm/emulate.c | 10
-rw-r--r--  arch/x86/kvm/hyperv.c | 6
-rw-r--r--  arch/x86/kvm/hyperv.h | 4
-rw-r--r--  arch/x86/kvm/irq.c | 85
-rw-r--r--  arch/x86/kvm/irq_comm.c | 31
-rw-r--r--  arch/x86/kvm/lapic.c | 49
-rw-r--r--  arch/x86/kvm/mmu.h | 11
-rw-r--r--  arch/x86/kvm/mmu/mmu.c | 85
-rw-r--r--  arch/x86/kvm/mmu/mmutrace.h | 29
-rw-r--r--  arch/x86/kvm/mmu/spte.c | 20
-rw-r--r--  arch/x86/kvm/mmu/spte.h | 41
-rw-r--r--  arch/x86/kvm/mmu/tdp_mmu.c | 146
-rw-r--r--  arch/x86/kvm/mmu/tdp_mmu.h | 4
-rw-r--r--  arch/x86/kvm/mtrr.c | 6
-rw-r--r--  arch/x86/kvm/svm/avic.c | 9
-rw-r--r--  arch/x86/kvm/svm/nested.c | 35
-rw-r--r--  arch/x86/kvm/svm/sev.c | 953
-rw-r--r--  arch/x86/kvm/svm/svm.c | 504
-rw-r--r--  arch/x86/kvm/svm/svm.h | 172
-rw-r--r--  arch/x86/kvm/svm/vmenter.S | 50
-rw-r--r--  arch/x86/kvm/trace.h | 97
-rw-r--r--  arch/x86/kvm/vmx/evmcs.c | 6
-rw-r--r--  arch/x86/kvm/vmx/evmcs.h | 3
-rw-r--r--  arch/x86/kvm/vmx/nested.c | 103
-rw-r--r--  arch/x86/kvm/vmx/pmu_intel.c | 6
-rw-r--r--  arch/x86/kvm/vmx/vmenter.S | 2
-rw-r--r--  arch/x86/kvm/vmx/vmx.c | 198
-rw-r--r--  arch/x86/kvm/vmx/vmx.h | 2
-rw-r--r--  arch/x86/kvm/x86.c | 678
-rw-r--r--  arch/x86/kvm/x86.h | 41
-rw-r--r--  arch/x86/lib/copy_mc.c | 4
-rw-r--r--  arch/x86/lib/copy_mc_64.S | 10
-rw-r--r--  arch/x86/lib/insn-eval.c | 10
-rw-r--r--  arch/x86/lib/memcpy_64.S | 4
-rw-r--r--  arch/x86/lib/memmove_64.S | 4
-rw-r--r--  arch/x86/lib/memset_64.S | 4
-rw-r--r--  arch/x86/lib/mmx_32.c | 20
-rw-r--r--  arch/x86/lib/msr-smp.c | 7
-rw-r--r--  arch/x86/lib/usercopy.c | 22
-rw-r--r--  arch/x86/mm/fault.c | 436
-rw-r--r--  arch/x86/mm/highmem_32.c | 59
-rw-r--r--  arch/x86/mm/ident_map.c | 12
-rw-r--r--  arch/x86/mm/init.c | 25
-rw-r--r--  arch/x86/mm/init_32.c | 15
-rw-r--r--  arch/x86/mm/iomap_32.c | 57
-rw-r--r--  arch/x86/mm/mem_encrypt.c | 38
-rw-r--r--  arch/x86/mm/mem_encrypt_identity.c | 4
-rw-r--r--  arch/x86/mm/mmio-mod.c | 2
-rw-r--r--  arch/x86/mm/numa.c | 2
-rw-r--r--  arch/x86/mm/pat/set_memory.c | 4
-rw-r--r--  arch/x86/mm/pgtable.c | 2
-rw-r--r--  arch/x86/mm/tlb.c | 10
-rw-r--r--  arch/x86/net/bpf_jit_comp.c | 422
-rw-r--r--  arch/x86/net/bpf_jit_comp32.c | 6
-rw-r--r--  arch/x86/oprofile/Makefile | 12
-rw-r--r--  arch/x86/oprofile/backtrace.c | 127
-rw-r--r--  arch/x86/oprofile/init.c | 38
-rw-r--r--  arch/x86/oprofile/nmi_int.c | 780
-rw-r--r--  arch/x86/oprofile/op_counter.h | 30
-rw-r--r--  arch/x86/oprofile/op_model_amd.c | 542
-rw-r--r--  arch/x86/oprofile/op_model_p4.c | 723
-rw-r--r--  arch/x86/oprofile/op_model_ppro.c | 245
-rw-r--r--  arch/x86/oprofile/op_x86_model.h | 90
-rw-r--r--  arch/x86/pci/i386.c | 6
-rw-r--r--  arch/x86/pci/init.c | 15
-rw-r--r--  arch/x86/pci/intel_mid_pci.c | 8
-rw-r--r--  arch/x86/pci/mmconfig-shared.c | 4
-rw-r--r--  arch/x86/pci/sta2x11-fixup.c | 3
-rw-r--r--  arch/x86/pci/xen.c | 26
-rw-r--r--  arch/x86/platform/Makefile | 1
-rw-r--r--  arch/x86/platform/efi/efi_64.c | 76
-rw-r--r--  arch/x86/platform/efi/efi_thunk_64.S | 6
-rw-r--r--  arch/x86/platform/efi/quirks.c | 16
-rw-r--r--  arch/x86/platform/geode/alix.c | 19
-rw-r--r--  arch/x86/platform/geode/geos.c | 19
-rw-r--r--  arch/x86/platform/geode/net5501.c | 13
-rw-r--r--  arch/x86/platform/goldfish/Makefile | 2
-rw-r--r--  arch/x86/platform/goldfish/goldfish.c | 54
-rw-r--r--  arch/x86/platform/intel-mid/device_libs/platform_bt.c | 4
-rw-r--r--  arch/x86/platform/uv/Makefile | 2
-rw-r--r--  arch/x86/platform/uv/bios_uv.c | 55
-rw-r--r--  arch/x86/platform/uv/uv_irq.c | 4
-rw-r--r--  arch/x86/platform/uv/uv_sysfs.c | 63
-rw-r--r--  arch/x86/purgatory/purgatory.c | 2
-rw-r--r--  arch/x86/tools/relocs.c | 12
-rw-r--r--  arch/x86/xen/Kconfig | 38
-rw-r--r--  arch/x86/xen/apic.c | 7
-rw-r--r--  arch/x86/xen/efi.c | 37
-rw-r--r--  arch/x86/xen/enlighten_hvm.c | 15
-rw-r--r--  arch/x86/xen/enlighten_pv.c | 47
-rw-r--r--  arch/x86/xen/irq.c | 23
-rw-r--r--  arch/x86/xen/p2m.c | 12
-rw-r--r--  arch/x86/xen/smp_hvm.c | 29
-rw-r--r--  arch/x86/xen/spinlock.c | 12
-rw-r--r--  arch/x86/xen/xen-asm.S | 53
-rw-r--r--  arch/x86/xen/xen-ops.h | 3
358 files changed, 11033 insertions, 7535 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index f6946b81f74a..595193bc2d31 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -14,10 +14,12 @@ config X86_32
select ARCH_WANT_IPC_PARSE_VERSION
select CLKSRC_I8253
select CLONE_BACKWARDS
+ select GENERIC_VDSO_32
select HAVE_DEBUG_STACKOVERFLOW
+ select KMAP_LOCAL
select MODULES_USE_ELF_REL
select OLD_SIGACTION
- select GENERIC_VDSO_32
+ select ARCH_SPLIT_ARG64
config X86_64
def_bool y
@@ -30,6 +32,7 @@ config X86_64
select MODULES_USE_ELF_RELA
select NEED_DMA_MAP_STATE
select SWIOTLB
+ select ARCH_HAS_ELFCORE_COMPAT
config FORCE_DYNAMIC_FTRACE
def_bool y
@@ -91,7 +94,9 @@ config X86
select ARCH_STACKWALK
select ARCH_SUPPORTS_ACPI
select ARCH_SUPPORTS_ATOMIC_RMW
+ select ARCH_SUPPORTS_DEBUG_PAGEALLOC
select ARCH_SUPPORTS_NUMA_BALANCING if X86_64
+ select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP if NR_CPUS <= 4096
select ARCH_USE_BUILTIN_BSWAP
select ARCH_USE_QUEUED_RWLOCKS
select ARCH_USE_QUEUED_SPINLOCKS
@@ -100,6 +105,7 @@ config X86
select ARCH_WANT_DEFAULT_BPF_JIT if X86_64
select ARCH_WANTS_DYNAMIC_TASK_STRUCT
select ARCH_WANT_HUGE_PMD_SHARE
+ select ARCH_WANT_LD_ORPHAN_WARN
select ARCH_WANTS_THP_SWAP if X86_64
select BUILDTIME_TABLE_SORT
select CLKEVT_I8253
@@ -108,7 +114,6 @@ config X86
select DCACHE_WORD_ACCESS
select EDAC_ATOMIC_SCRUB
select EDAC_SUPPORT
- select GENERIC_CLOCKEVENTS
select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 && X86_LOCAL_APIC)
select GENERIC_CLOCKEVENTS_MIN_ADJUST
select GENERIC_CMOS_UPDATE
@@ -162,11 +167,13 @@ config X86
select HAVE_CMPXCHG_DOUBLE
select HAVE_CMPXCHG_LOCAL
select HAVE_CONTEXT_TRACKING if X86_64
+ select HAVE_CONTEXT_TRACKING_OFFSTACK if HAVE_CONTEXT_TRACKING
select HAVE_C_RECORDMCOUNT
select HAVE_DEBUG_KMEMLEAK
select HAVE_DMA_CONTIGUOUS
select HAVE_DYNAMIC_FTRACE
select HAVE_DYNAMIC_FTRACE_WITH_REGS
+ select HAVE_DYNAMIC_FTRACE_WITH_ARGS if X86_64
select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
select HAVE_EBPF_JIT
select HAVE_EFFICIENT_UNALIGNED_ACCESS
@@ -198,8 +205,8 @@ config X86
select HAVE_MIXED_BREAKPOINTS_REGS
select HAVE_MOD_ARCH_SPECIFIC
select HAVE_MOVE_PMD
+ select HAVE_MOVE_PUD
select HAVE_NMI
- select HAVE_OPROFILE
select HAVE_OPTPROBES
select HAVE_PCSPKR_PLATFORM
select HAVE_PERF_EVENTS
@@ -217,6 +224,7 @@ config X86
select HAVE_STACK_VALIDATION if X86_64
select HAVE_STATIC_CALL
select HAVE_STATIC_CALL_INLINE if HAVE_STACK_VALIDATION
+ select HAVE_PREEMPT_DYNAMIC
select HAVE_RSEQ
select HAVE_SYSCALL_TRACEPOINTS
select HAVE_UNSTABLE_SCHED_CLOCK
@@ -329,9 +337,6 @@ config ZONE_DMA32
config AUDIT_ARCH
def_bool y if X86_64
-config ARCH_SUPPORTS_DEBUG_PAGEALLOC
- def_bool y
-
config KASAN_SHADOW_OFFSET
hex
depends on KASAN
@@ -886,7 +891,7 @@ config HPET_TIMER
config HPET_EMULATE_RTC
def_bool y
- depends on HPET_TIMER && (RTC=y || RTC=m || RTC_DRV_CMOS=m || RTC_DRV_CMOS=y)
+ depends on HPET_TIMER && (RTC_DRV_CMOS=m || RTC_DRV_CMOS=y)
config APB_TIMER
def_bool y if X86_INTEL_MID
@@ -1154,10 +1159,6 @@ config X86_MCE_INJECT
If you don't know what a machine check is and you don't do kernel
QA it is safe to say n.
-config X86_THERMAL_VECTOR
- def_bool y
- depends on X86_MCE_INTEL
-
source "arch/x86/events/Kconfig"
config X86_LEGACY_VM86
@@ -1930,6 +1931,23 @@ config X86_INTEL_TSX_MODE_AUTO
side channel attacks- equals the tsx=auto command line parameter.
endchoice
+config X86_SGX
+ bool "Software Guard eXtensions (SGX)"
+ depends on X86_64 && CPU_SUP_INTEL
+ depends on CRYPTO=y
+ depends on CRYPTO_SHA256=y
+ select SRCU
+ select MMU_NOTIFIER
+ help
+ Intel(R) Software Guard eXtensions (SGX) is a set of CPU instructions
+ that can be used by applications to set aside private regions of code
+ and data, referred to as enclaves. An enclave's private memory can
+ only be accessed by code running within the enclave. Accesses from
+ outside the enclave, including other enclaves, are disallowed by
+ hardware.
+
+ If unsure, say N.
+
config EFI
bool "EFI runtime service support"
depends on ACPI
@@ -2843,7 +2861,6 @@ config IA32_EMULATION
depends on X86_64
select ARCH_WANT_OLD_COMPAT_IPC
select BINFMT_ELF
- select COMPAT_BINFMT_ELF
select COMPAT_OLD_SIGACTION
help
Include code to run legacy 32-bit programs under a
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 27b5e2bc6a01..80b57e7f4947 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -62,9 +62,6 @@ config EARLY_PRINTK_USB_XDBC
You should normally say N here, unless you want to debug early
crashes or need a very simple printk logging facility.
-config COPY_MC_TEST
- def_bool n
-
config EFI_PGT_DUMP
bool "Dump the EFI pagetable"
depends on EFI
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 154259f18b8b..b797f1561943 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -24,14 +24,7 @@ endif
# How to compile the 16-bit code. Note we always compile for -march=i386;
# that way we can complain to the user if the CPU is insufficient.
-#
-# The -m16 option is supported by GCC >= 4.9 and clang >= 3.5. For
-# older versions of GCC, include an *assembly* header to make sure that
-# gcc doesn't play any games behind our back.
-CODE16GCC_CFLAGS := -m32 -Wa,$(srctree)/arch/x86/boot/code16gcc.h
-M16_CFLAGS := $(call cc-option, -m16, $(CODE16GCC_CFLAGS))
-
-REALMODE_CFLAGS := $(M16_CFLAGS) -g -Os -DDISABLE_BRANCH_PROFILING \
+REALMODE_CFLAGS := -m16 -g -Os -DDISABLE_BRANCH_PROFILING \
-Wall -Wstrict-prototypes -march=i386 -mregparm=3 \
-fno-strict-aliasing -fomit-frame-pointer -fno-pic \
-mno-mmx -mno-sse
@@ -57,6 +50,9 @@ export BITS
KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow
KBUILD_CFLAGS += $(call cc-option,-mno-avx,)
+# Intel CET isn't enabled in the kernel
+KBUILD_CFLAGS += $(call cc-option,-fcf-protection=none)
+
ifeq ($(CONFIG_X86_32),y)
BITS := 32
UTS_MACHINE := i386
@@ -209,9 +205,6 @@ ifdef CONFIG_X86_64
LDFLAGS_vmlinux += -z max-page-size=0x200000
endif
-# We never want expected sections to be placed heuristically by the
-# linker. All sections should be explicitly named in the linker script.
-LDFLAGS_vmlinux += $(call ld-option, --orphan-handling=warn)
archscripts: scripts_basic
$(Q)$(MAKE) $(build)=arch/x86/tools relocs
@@ -239,9 +232,6 @@ core-y += arch/x86/
drivers-$(CONFIG_MATH_EMULATION) += arch/x86/math-emu/
drivers-$(CONFIG_PCI) += arch/x86/pci/
-# must be linked after kernel/
-drivers-$(CONFIG_OPROFILE) += arch/x86/oprofile/
-
# suspend and hibernation support
drivers-$(CONFIG_PM) += arch/x86/power/
@@ -302,16 +292,20 @@ archclean:
$(Q)$(MAKE) $(clean)=arch/x86/tools
define archhelp
- echo '* bzImage - Compressed kernel image (arch/x86/boot/bzImage)'
- echo ' install - Install kernel using'
- echo ' (your) ~/bin/$(INSTALLKERNEL) or'
- echo ' (distribution) /sbin/$(INSTALLKERNEL) or'
- echo ' install to $$(INSTALL_PATH) and run lilo'
- echo ' fdimage - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)'
- echo ' fdimage144 - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)'
- echo ' fdimage288 - Create 2.8MB boot floppy image (arch/x86/boot/fdimage)'
- echo ' isoimage - Create a boot CD-ROM image (arch/x86/boot/image.iso)'
- echo ' bzdisk/fdimage*/isoimage also accept:'
- echo ' FDARGS="..." arguments for the booted kernel'
- echo ' FDINITRD=file initrd for the booted kernel'
+ echo '* bzImage - Compressed kernel image (arch/x86/boot/bzImage)'
+ echo ' install - Install kernel using (your) ~/bin/$(INSTALLKERNEL) or'
+ echo ' (distribution) /sbin/$(INSTALLKERNEL) or install to '
+ echo ' $$(INSTALL_PATH) and run lilo'
+ echo ''
+ echo ' fdimage - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)'
+ echo ' fdimage144 - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)'
+ echo ' fdimage288 - Create 2.8MB boot floppy image (arch/x86/boot/fdimage)'
+ echo ' isoimage - Create a boot CD-ROM image (arch/x86/boot/image.iso)'
+ echo ' bzdisk/fdimage*/isoimage also accept:'
+ echo ' FDARGS="..." arguments for the booted kernel'
+ echo ' FDINITRD=file initrd for the booted kernel'
+ echo ''
+ echo ' kvm_guest.config - Enable Kconfig items for running this kernel as a KVM guest'
+ echo ' xen.config - Enable Kconfig items for running this kernel as a Xen guest'
+
endef
diff --git a/arch/x86/boot/code16gcc.h b/arch/x86/boot/code16gcc.h
deleted file mode 100644
index e19fd7536307..000000000000
--- a/arch/x86/boot/code16gcc.h
+++ /dev/null
@@ -1,12 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#
-# code16gcc.h
-#
-# This file is added to the assembler via -Wa when compiling 16-bit C code.
-# This is done this way instead via asm() to make sure gcc does not reorder
-# things around us.
-#
-# gcc 4.9+ has a real -m16 option so we can drop this hack long term.
-#
-
- .code16gcc
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index ee249088cbfe..e0bc3988c3fa 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -35,7 +35,7 @@ cflags-$(CONFIG_X86_32) := -march=i386
cflags-$(CONFIG_X86_64) := -mcmodel=small -mno-red-zone
KBUILD_CFLAGS += $(cflags-y)
KBUILD_CFLAGS += -mno-mmx -mno-sse
-KBUILD_CFLAGS += -ffreestanding
+KBUILD_CFLAGS += -ffreestanding -fshort-wchar
KBUILD_CFLAGS += -fno-stack-protector
KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member)
KBUILD_CFLAGS += $(call cc-disable-warning, gnu)
@@ -61,7 +61,9 @@ KBUILD_LDFLAGS += $(call ld-option,--no-ld-generated-unwind-info)
# Compressed kernel should be built as PIE since it may be loaded at any
# address by the bootloader.
LDFLAGS_vmlinux := -pie $(call ld-option, --no-dynamic-linker)
-LDFLAGS_vmlinux += $(call ld-option, --orphan-handling=warn)
+ifdef CONFIG_LD_ORPHAN_WARN
+LDFLAGS_vmlinux += --orphan-handling=warn
+endif
LDFLAGS_vmlinux += -T
hostprogs := mkpiggy
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 017de6cc87dc..e94874f4bbc1 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -241,12 +241,12 @@ SYM_FUNC_START(startup_32)
leal rva(startup_64)(%ebp), %eax
#ifdef CONFIG_EFI_MIXED
movl rva(efi32_boot_args)(%ebp), %edi
- cmp $0, %edi
+ testl %edi, %edi
jz 1f
leal rva(efi64_stub_entry)(%ebp), %eax
movl rva(efi32_boot_args+4)(%ebp), %esi
movl rva(efi32_boot_args+8)(%ebp), %edx // saved bootparams pointer
- cmpl $0, %edx
+ testl %edx, %edx
jnz 1f
/*
* efi_pe_entry uses MS calling convention, which requires 32 bytes of
@@ -592,7 +592,7 @@ SYM_CODE_START(trampoline_32bit_src)
movl %eax, %cr0
/* Check what paging mode we want to be in after the trampoline */
- cmpl $0, %edx
+ testl %edx, %edx
jz 1f
/* We want 5-level paging: don't touch CR3 if it already points to 5-level page tables */
@@ -622,7 +622,7 @@ SYM_CODE_START(trampoline_32bit_src)
/* Enable PAE and LA57 (if required) paging modes */
movl $X86_CR4_PAE, %eax
- cmpl $0, %edx
+ testl %edx, %edx
jz 1f
orl $X86_CR4_LA57, %eax
1:
diff --git a/arch/x86/boot/compressed/ident_map_64.c b/arch/x86/boot/compressed/ident_map_64.c
index a5e5db6ada3c..f7213d0943b8 100644
--- a/arch/x86/boot/compressed/ident_map_64.c
+++ b/arch/x86/boot/compressed/ident_map_64.c
@@ -164,16 +164,7 @@ void initialize_identity_maps(void *rmode)
add_identity_map(cmdline, cmdline + COMMAND_LINE_SIZE);
/* Load the new page-table. */
- write_cr3(top_level_pgt);
-}
-
-/*
- * This switches the page tables to the new level4 that has been built
- * via calls to add_identity_map() above. If booted via startup_32(),
- * this is effectively a no-op.
- */
-void finalize_identity_maps(void)
-{
+ sev_verify_cbit(top_level_pgt);
write_cr3(top_level_pgt);
}
diff --git a/arch/x86/boot/compressed/mem_encrypt.S b/arch/x86/boot/compressed/mem_encrypt.S
index dd07e7b41b11..aa561795efd1 100644
--- a/arch/x86/boot/compressed/mem_encrypt.S
+++ b/arch/x86/boot/compressed/mem_encrypt.S
@@ -68,6 +68,9 @@ SYM_FUNC_START(get_sev_encryption_bit)
SYM_FUNC_END(get_sev_encryption_bit)
.code64
+
+#include "../../kernel/sev_verify_cbit.S"
+
SYM_FUNC_START(set_sev_encryption_mask)
#ifdef CONFIG_AMD_MEM_ENCRYPT
push %rbp
@@ -81,6 +84,19 @@ SYM_FUNC_START(set_sev_encryption_mask)
bts %rax, sme_me_mask(%rip) /* Create the encryption mask */
+ /*
+ * Read MSR_AMD64_SEV again and store it to sev_status. Can't do this in
+ * get_sev_encryption_bit() because this function is 32-bit code and
+ * shared between 64-bit and 32-bit boot path.
+ */
+ movl $MSR_AMD64_SEV, %ecx /* Read the SEV MSR */
+ rdmsr
+
+ /* Store MSR value in sev_status */
+ shlq $32, %rdx
+ orq %rdx, %rax
+ movq %rax, sev_status(%rip)
+
.Lno_sev_mask:
movq %rbp, %rsp /* Restore original stack pointer */
@@ -96,5 +112,7 @@ SYM_FUNC_END(set_sev_encryption_mask)
#ifdef CONFIG_AMD_MEM_ENCRYPT
.balign 8
-SYM_DATA(sme_me_mask, .quad 0)
+SYM_DATA(sme_me_mask, .quad 0)
+SYM_DATA(sev_status, .quad 0)
+SYM_DATA(sev_check_data, .quad 0)
#endif
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 6d31f1b4c4d1..901ea5ebec22 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -12,6 +12,7 @@
#undef CONFIG_PARAVIRT_XXL
#undef CONFIG_PARAVIRT_SPINLOCKS
#undef CONFIG_KASAN
+#undef CONFIG_KASAN_GENERIC
/* cpu_feature_enabled() cannot be used this early */
#define USE_EARLY_PGTABLE_L5
@@ -159,4 +160,6 @@ void boot_page_fault(void);
void boot_stage1_vc(void);
void boot_stage2_vc(void);
+unsigned long sev_verify_cbit(unsigned long cr3);
+
#endif /* BOOT_COMPRESSED_MISC_H */
diff --git a/arch/x86/boot/compressed/sev-es.c b/arch/x86/boot/compressed/sev-es.c
index 954cb2702e23..27826c265aab 100644
--- a/arch/x86/boot/compressed/sev-es.c
+++ b/arch/x86/boot/compressed/sev-es.c
@@ -32,13 +32,12 @@ struct ghcb *boot_ghcb;
*/
static bool insn_has_rep_prefix(struct insn *insn)
{
+ insn_byte_t p;
int i;
insn_get_prefixes(insn);
- for (i = 0; i < insn->prefixes.nbytes; i++) {
- insn_byte_t p = insn->prefixes.bytes[i];
-
+ for_each_insn_prefix(insn, i, p) {
if (p == 0xf2 || p == 0xf3)
return true;
}
diff --git a/arch/x86/crypto/aes_glue.c b/arch/x86/crypto/aes_glue.c
deleted file mode 100644
index 7b7dc05fa1a4..000000000000
--- a/arch/x86/crypto/aes_glue.c
+++ /dev/null
@@ -1 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 1852b19a73a0..d1436c37008b 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -318,7 +318,7 @@ _initial_blocks_\@:
# Main loop - Encrypt/Decrypt remaining blocks
- cmp $0, %r13
+ test %r13, %r13
je _zero_cipher_left_\@
sub $64, %r13
je _four_cipher_left_\@
@@ -437,7 +437,7 @@ _multiple_of_16_bytes_\@:
mov PBlockLen(%arg2), %r12
- cmp $0, %r12
+ test %r12, %r12
je _partial_done\@
GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
@@ -474,7 +474,7 @@ _T_8_\@:
add $8, %r10
sub $8, %r11
psrldq $8, %xmm0
- cmp $0, %r11
+ test %r11, %r11
je _return_T_done_\@
_T_4_\@:
movd %xmm0, %eax
@@ -482,7 +482,7 @@ _T_4_\@:
add $4, %r10
sub $4, %r11
psrldq $4, %xmm0
- cmp $0, %r11
+ test %r11, %r11
je _return_T_done_\@
_T_123_\@:
movd %xmm0, %eax
@@ -619,7 +619,7 @@ _get_AAD_blocks\@:
/* read the last <16B of AAD */
_get_AAD_rest\@:
- cmp $0, %r11
+ test %r11, %r11
je _get_AAD_done\@
READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
@@ -640,7 +640,7 @@ _get_AAD_done\@:
.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
AAD_HASH operation
mov PBlockLen(%arg2), %r13
- cmp $0, %r13
+ test %r13, %r13
je _partial_block_done_\@ # Leave Macro if no partial blocks
# Read in input data without over reading
cmp $16, \PLAIN_CYPH_LEN
@@ -692,7 +692,7 @@ _no_extra_mask_1_\@:
pshufb %xmm2, %xmm3
pxor %xmm3, \AAD_HASH
- cmp $0, %r10
+ test %r10, %r10
jl _partial_incomplete_1_\@
# GHASH computation for the last <16 Byte block
@@ -727,7 +727,7 @@ _no_extra_mask_2_\@:
pshufb %xmm2, %xmm9
pxor %xmm9, \AAD_HASH
- cmp $0, %r10
+ test %r10, %r10
jl _partial_incomplete_2_\@
# GHASH computation for the last <16 Byte block
@@ -747,7 +747,7 @@ _encode_done_\@:
pshufb %xmm2, %xmm9
.endif
# output encrypted Bytes
- cmp $0, %r10
+ test %r10, %r10
jl _partial_fill_\@
mov %r13, %r12
mov $16, %r13
@@ -2720,7 +2720,7 @@ SYM_FUNC_END(aesni_ctr_enc)
*/
SYM_FUNC_START(aesni_xts_crypt8)
FRAME_BEGIN
- cmpb $0, %cl
+ testb %cl, %cl
movl $0, %ecx
movl $240, %r10d
leaq _aesni_enc4, %r11
diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S
index 5fee47956f3b..2cf8e94d986a 100644
--- a/arch/x86/crypto/aesni-intel_avx-x86_64.S
+++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S
@@ -369,7 +369,7 @@ _initial_num_blocks_is_0\@:
_initial_blocks_encrypted\@:
- cmp $0, %r13
+ test %r13, %r13
je _zero_cipher_left\@
sub $128, %r13
@@ -528,7 +528,7 @@ _multiple_of_16_bytes\@:
vmovdqu HashKey(arg2), %xmm13
mov PBlockLen(arg2), %r12
- cmp $0, %r12
+ test %r12, %r12
je _partial_done\@
#GHASH computation for the last <16 Byte block
@@ -573,7 +573,7 @@ _T_8\@:
add $8, %r10
sub $8, %r11
vpsrldq $8, %xmm9, %xmm9
- cmp $0, %r11
+ test %r11, %r11
je _return_T_done\@
_T_4\@:
vmovd %xmm9, %eax
@@ -581,7 +581,7 @@ _T_4\@:
add $4, %r10
sub $4, %r11
vpsrldq $4, %xmm9, %xmm9
- cmp $0, %r11
+ test %r11, %r11
je _return_T_done\@
_T_123\@:
vmovd %xmm9, %eax
@@ -625,7 +625,7 @@ _get_AAD_blocks\@:
cmp $16, %r11
jge _get_AAD_blocks\@
vmovdqu \T8, \T7
- cmp $0, %r11
+ test %r11, %r11
je _get_AAD_done\@
vpxor \T7, \T7, \T7
@@ -644,7 +644,7 @@ _get_AAD_rest8\@:
vpxor \T1, \T7, \T7
jmp _get_AAD_rest8\@
_get_AAD_rest4\@:
- cmp $0, %r11
+ test %r11, %r11
jle _get_AAD_rest0\@
mov (%r10), %eax
movq %rax, \T1
@@ -749,7 +749,7 @@ _done_read_partial_block_\@:
.macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
AAD_HASH ENC_DEC
mov PBlockLen(arg2), %r13
- cmp $0, %r13
+ test %r13, %r13
je _partial_block_done_\@ # Leave Macro if no partial blocks
# Read in input data without over reading
cmp $16, \PLAIN_CYPH_LEN
@@ -801,7 +801,7 @@ _no_extra_mask_1_\@:
vpshufb %xmm2, %xmm3, %xmm3
vpxor %xmm3, \AAD_HASH, \AAD_HASH
- cmp $0, %r10
+ test %r10, %r10
jl _partial_incomplete_1_\@
# GHASH computation for the last <16 Byte block
@@ -836,7 +836,7 @@ _no_extra_mask_2_\@:
vpshufb %xmm2, %xmm9, %xmm9
vpxor %xmm9, \AAD_HASH, \AAD_HASH
- cmp $0, %r10
+ test %r10, %r10
jl _partial_incomplete_2_\@
# GHASH computation for the last <16 Byte block
@@ -856,7 +856,7 @@ _encode_done_\@:
vpshufb %xmm2, %xmm9, %xmm9
.endif
# output encrypted Bytes
- cmp $0, %r10
+ test %r10, %r10
jl _partial_fill_\@
mov %r13, %r12
mov $16, %r13
diff --git a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl
index 7d568012cc15..71fae5a09e56 100644
--- a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl
+++ b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl
@@ -251,7 +251,7 @@ $code.=<<___;
mov %rax,8($ctx)
mov %rax,16($ctx)
- cmp \$0,$inp
+ test $inp,$inp
je .Lno_key
___
$code.=<<___ if (!$kernel);
diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c
index c44aba290fbb..646da46e8d10 100644
--- a/arch/x86/crypto/poly1305_glue.c
+++ b/arch/x86/crypto/poly1305_glue.c
@@ -210,7 +210,7 @@ void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
}
poly1305_simd_emit(&dctx->h, dst, dctx->s);
- *dctx = (struct poly1305_desc_ctx){};
+ memzero_explicit(dctx, sizeof(*dctx));
}
EXPORT_SYMBOL(poly1305_final_arch);
diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
index 18200135603f..44340a1139e0 100644
--- a/arch/x86/crypto/sha1_ssse3_glue.c
+++ b/arch/x86/crypto/sha1_ssse3_glue.c
@@ -22,7 +22,7 @@
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/types.h>
-#include <crypto/sha.h>
+#include <crypto/sha1.h>
#include <crypto/sha1_base.h>
#include <asm/simd.h>
diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c
index dd06249229e1..3a5f6be7dbba 100644
--- a/arch/x86/crypto/sha256_ssse3_glue.c
+++ b/arch/x86/crypto/sha256_ssse3_glue.c
@@ -35,7 +35,7 @@
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/types.h>
-#include <crypto/sha.h>
+#include <crypto/sha2.h>
#include <crypto/sha256_base.h>
#include <linux/string.h>
#include <asm/simd.h>
diff --git a/arch/x86/crypto/sha512-avx-asm.S b/arch/x86/crypto/sha512-avx-asm.S
index 63470fd6ae32..684d58c8bc4f 100644
--- a/arch/x86/crypto/sha512-avx-asm.S
+++ b/arch/x86/crypto/sha512-avx-asm.S
@@ -278,7 +278,7 @@ frame_size = frame_GPRSAVE + GPRSAVE_SIZE
# "blocks" is the message length in SHA512 blocks
########################################################################
SYM_FUNC_START(sha512_transform_avx)
- cmp $0, msglen
+ test msglen, msglen
je nowork
# Allocate Stack Space
diff --git a/arch/x86/crypto/sha512-ssse3-asm.S b/arch/x86/crypto/sha512-ssse3-asm.S
index 7946a1bee85b..50812af0b083 100644
--- a/arch/x86/crypto/sha512-ssse3-asm.S
+++ b/arch/x86/crypto/sha512-ssse3-asm.S
@@ -280,7 +280,7 @@ frame_size = frame_GPRSAVE + GPRSAVE_SIZE
########################################################################
SYM_FUNC_START(sha512_transform_ssse3)
- cmp $0, msglen
+ test msglen, msglen
je nowork
# Allocate Stack Space
diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c
index b0b05c93409e..30e70f4fe2f7 100644
--- a/arch/x86/crypto/sha512_ssse3_glue.c
+++ b/arch/x86/crypto/sha512_ssse3_glue.c
@@ -34,7 +34,7 @@
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/types.h>
-#include <crypto/sha.h>
+#include <crypto/sha2.h>
#include <crypto/sha512_base.h>
#include <asm/simd.h>
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 870efeec8bda..0904f5676e4d 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -73,10 +73,8 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs,
unsigned int nr)
{
if (likely(nr < IA32_NR_syscalls)) {
- instrumentation_begin();
nr = array_index_nospec(nr, IA32_NR_syscalls);
regs->ax = ia32_sys_call_table[nr](regs);
- instrumentation_end();
}
}
@@ -91,8 +89,11 @@ __visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
* or may not be necessary, but it matches the old asm behavior.
*/
nr = (unsigned int)syscall_enter_from_user_mode(regs, nr);
+ instrumentation_begin();
do_syscall_32_irqs_on(regs, nr);
+
+ instrumentation_end();
syscall_exit_to_user_mode(regs);
}
@@ -121,11 +122,12 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
res = get_user(*(u32 *)&regs->bp,
(u32 __user __force *)(unsigned long)(u32)regs->sp);
}
- instrumentation_end();
if (res) {
/* User code screwed up. */
regs->ax = -EFAULT;
+
+ instrumentation_end();
syscall_exit_to_user_mode(regs);
return false;
}
@@ -135,6 +137,8 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
/* Now this is just like a normal syscall. */
do_syscall_32_irqs_on(regs, nr);
+
+ instrumentation_end();
syscall_exit_to_user_mode(regs);
return true;
}
@@ -209,40 +213,6 @@ SYSCALL_DEFINE0(ni_syscall)
return -ENOSYS;
}
-noinstr bool idtentry_enter_nmi(struct pt_regs *regs)
-{
- bool irq_state = lockdep_hardirqs_enabled();
-
- __nmi_enter();
- lockdep_hardirqs_off(CALLER_ADDR0);
- lockdep_hardirq_enter();
- rcu_nmi_enter();
-
- instrumentation_begin();
- trace_hardirqs_off_finish();
- ftrace_nmi_enter();
- instrumentation_end();
-
- return irq_state;
-}
-
-noinstr void idtentry_exit_nmi(struct pt_regs *regs, bool restore)
-{
- instrumentation_begin();
- ftrace_nmi_exit();
- if (restore) {
- trace_hardirqs_on_prepare();
- lockdep_hardirqs_on_prepare(CALLER_ADDR0);
- }
- instrumentation_end();
-
- rcu_nmi_exit();
- lockdep_hardirq_exit();
- if (restore)
- lockdep_hardirqs_on(CALLER_ADDR0);
- __nmi_exit();
-}
-
#ifdef CONFIG_XEN_PV
#ifndef CONFIG_PREEMPTION
/*
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index cad08703c4ad..ce0464d630a2 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -46,14 +46,6 @@
.code64
.section .entry.text, "ax"
-#ifdef CONFIG_PARAVIRT_XXL
-SYM_CODE_START(native_usergs_sysret64)
- UNWIND_HINT_EMPTY
- swapgs
- sysretq
-SYM_CODE_END(native_usergs_sysret64)
-#endif /* CONFIG_PARAVIRT_XXL */
-
/*
* 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
*
@@ -123,7 +115,12 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
* Try to use SYSRET instead of IRET if we're returning to
* a completely clean 64-bit userspace context. If we're not,
* go to the slow exit path.
+ * In the Xen PV case we must use iret anyway.
*/
+
+ ALTERNATIVE "", "jmp swapgs_restore_regs_and_return_to_usermode", \
+ X86_FEATURE_XENPV
+
movq RCX(%rsp), %rcx
movq RIP(%rsp), %r11
@@ -215,7 +212,8 @@ syscall_return_via_sysret:
popq %rdi
popq %rsp
- USERGS_SYSRET64
+ swapgs
+ sysretq
SYM_CODE_END(entry_SYSCALL_64)
/*
@@ -669,7 +667,7 @@ native_irq_return_ldt:
*/
pushq %rdi /* Stash user RDI */
- SWAPGS /* to kernel GS */
+ swapgs /* to kernel GS */
SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi /* to kernel CR3 */
movq PER_CPU_VAR(espfix_waddr), %rdi
@@ -699,7 +697,7 @@ native_irq_return_ldt:
orq PER_CPU_VAR(espfix_stack), %rax
SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
- SWAPGS /* to user GS */
+ swapgs /* to user GS */
popq %rdi /* Restore user RDI */
movq %rax, %rsp
@@ -943,7 +941,7 @@ SYM_CODE_START_LOCAL(paranoid_entry)
ret
.Lparanoid_entry_swapgs:
- SWAPGS
+ swapgs
/*
* The above SAVE_AND_SWITCH_TO_KERNEL_CR3 macro doesn't do an
@@ -1001,7 +999,7 @@ SYM_CODE_START_LOCAL(paranoid_exit)
jnz restore_regs_and_return_to_kernel
/* We are returning to a context with user GSBASE */
- SWAPGS_UNSAFE_STACK
+ swapgs
jmp restore_regs_and_return_to_kernel
SYM_CODE_END(paranoid_exit)
@@ -1426,7 +1424,7 @@ nmi_no_fsgsbase:
jnz nmi_restore
nmi_swapgs:
- SWAPGS_UNSAFE_STACK
+ swapgs
nmi_restore:
POP_REGS
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 0d0667a9fbd7..874aeacde2dd 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -445,3 +445,4 @@
438 i386 pidfd_getfd sys_pidfd_getfd
439 i386 faccessat2 sys_faccessat2
440 i386 process_madvise sys_process_madvise
+441 i386 epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 379819244b91..78672124d28b 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -362,6 +362,7 @@
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
440 common process_madvise sys_process_madvise
+441 common epoll_pwait2 sys_epoll_pwait2
#
# Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S
index ccd32877a3c4..496b11ec469d 100644
--- a/arch/x86/entry/thunk_64.S
+++ b/arch/x86/entry/thunk_64.S
@@ -10,7 +10,7 @@
#include <asm/export.h>
/* rdi: arg1 ... normal C conventions. rax is saved/restored. */
- .macro THUNK name, func, put_ret_addr_in_rdi=0
+ .macro THUNK name, func
SYM_FUNC_START_NOALIGN(\name)
pushq %rbp
movq %rsp, %rbp
@@ -25,13 +25,8 @@ SYM_FUNC_START_NOALIGN(\name)
pushq %r10
pushq %r11
- .if \put_ret_addr_in_rdi
- /* 8(%rbp) is return addr on stack */
- movq 8(%rbp), %rdi
- .endif
-
call \func
- jmp .L_restore
+ jmp __thunk_restore
SYM_FUNC_END(\name)
_ASM_NOKPROBE(\name)
.endm
@@ -44,7 +39,7 @@ SYM_FUNC_END(\name)
#endif
#ifdef CONFIG_PREEMPTION
-SYM_CODE_START_LOCAL_NOALIGN(.L_restore)
+SYM_CODE_START_LOCAL_NOALIGN(__thunk_restore)
popq %r11
popq %r10
popq %r9
@@ -56,6 +51,6 @@ SYM_CODE_START_LOCAL_NOALIGN(.L_restore)
popq %rdi
popq %rbp
ret
- _ASM_NOKPROBE(.L_restore)
-SYM_CODE_END(.L_restore)
+ _ASM_NOKPROBE(__thunk_restore)
+SYM_CODE_END(__thunk_restore)
#endif
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index 21243747965d..02e3e42f380b 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -27,9 +27,10 @@ VDSO32-$(CONFIG_IA32_EMULATION) := y
vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o
vobjs32-y := vdso32/note.o vdso32/system_call.o vdso32/sigreturn.o
vobjs32-y += vdso32/vclock_gettime.o
+vobjs-$(CONFIG_X86_SGX) += vsgx.o
# files to link into kernel
-obj-y += vma.o
+obj-y += vma.o extable.o
KASAN_SANITIZE_vma.o := y
UBSAN_SANITIZE_vma.o := y
KCSAN_SANITIZE_vma.o := y
@@ -98,6 +99,7 @@ $(vobjs): KBUILD_CFLAGS := $(filter-out $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS
CFLAGS_REMOVE_vclock_gettime.o = -pg
CFLAGS_REMOVE_vdso32/vclock_gettime.o = -pg
CFLAGS_REMOVE_vgetcpu.o = -pg
+CFLAGS_REMOVE_vsgx.o = -pg
#
# X32 processes use x32 vDSO to access 64bit kernel data.
@@ -128,8 +130,8 @@ $(obj)/%-x32.o: $(obj)/%.o FORCE
targets += vdsox32.lds $(vobjx32s-y)
-$(obj)/%.so: OBJCOPYFLAGS := -S
-$(obj)/%.so: $(obj)/%.so.dbg FORCE
+$(obj)/%.so: OBJCOPYFLAGS := -S --remove-section __ex_table
+$(obj)/%.so: $(obj)/%.so.dbg
$(call if_changed,objcopy)
$(obj)/vdsox32.so.dbg: $(obj)/vdsox32.lds $(vobjx32s) FORCE
diff --git a/arch/x86/entry/vdso/extable.c b/arch/x86/entry/vdso/extable.c
new file mode 100644
index 000000000000..afcf5b65beef
--- /dev/null
+++ b/arch/x86/entry/vdso/extable.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/err.h>
+#include <linux/mm.h>
+#include <asm/current.h>
+#include <asm/traps.h>
+#include <asm/vdso.h>
+
+struct vdso_exception_table_entry {
+ int insn, fixup;
+};
+
+bool fixup_vdso_exception(struct pt_regs *regs, int trapnr,
+ unsigned long error_code, unsigned long fault_addr)
+{
+ const struct vdso_image *image = current->mm->context.vdso_image;
+ const struct vdso_exception_table_entry *extable;
+ unsigned int nr_entries, i;
+ unsigned long base;
+
+ /*
+ * Do not attempt to fixup #DB or #BP. It's impossible to identify
+ * whether or not a #DB/#BP originated from within an SGX enclave and
+ * SGX enclaves are currently the only use case for vDSO fixup.
+ */
+ if (trapnr == X86_TRAP_DB || trapnr == X86_TRAP_BP)
+ return false;
+
+ if (!current->mm->context.vdso)
+ return false;
+
+ base = (unsigned long)current->mm->context.vdso + image->extable_base;
+ nr_entries = image->extable_len / (sizeof(*extable));
+ extable = image->extable;
+
+ for (i = 0; i < nr_entries; i++) {
+ if (regs->ip == base + extable[i].insn) {
+ regs->ip = base + extable[i].fixup;
+ regs->di = trapnr;
+ regs->si = error_code;
+ regs->dx = fault_addr;
+ return true;
+ }
+ }
+
+ return false;
+}
diff --git a/arch/x86/entry/vdso/extable.h b/arch/x86/entry/vdso/extable.h
new file mode 100644
index 000000000000..b56f6b012941
--- /dev/null
+++ b/arch/x86/entry/vdso/extable.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __VDSO_EXTABLE_H
+#define __VDSO_EXTABLE_H
+
+/*
+ * Inject exception fixup for vDSO code. Unlike normal exception fixup,
+ * vDSO uses a dedicated handler the addresses are relative to the overall
+ * exception table, not each individual entry.
+ */
+#ifdef __ASSEMBLY__
+#define _ASM_VDSO_EXTABLE_HANDLE(from, to) \
+ ASM_VDSO_EXTABLE_HANDLE from to
+
+.macro ASM_VDSO_EXTABLE_HANDLE from:req to:req
+ .pushsection __ex_table, "a"
+ .long (\from) - __ex_table
+ .long (\to) - __ex_table
+ .popsection
+.endm
+#else
+#define _ASM_VDSO_EXTABLE_HANDLE(from, to) \
+ ".pushsection __ex_table, \"a\"\n" \
+ ".long (" #from ") - __ex_table\n" \
+ ".long (" #to ") - __ex_table\n" \
+ ".popsection\n"
+#endif
+
+#endif /* __VDSO_EXTABLE_H */
diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S
index 4d152933547d..dc8da7695859 100644
--- a/arch/x86/entry/vdso/vdso-layout.lds.S
+++ b/arch/x86/entry/vdso/vdso-layout.lds.S
@@ -75,11 +75,18 @@ SECTIONS
* stuff that isn't used at runtime in between.
*/
- .text : { *(.text*) } :text =0x90909090,
+ .text : {
+ *(.text*)
+ *(.fixup)
+ } :text =0x90909090,
+
+
.altinstructions : { *(.altinstructions) } :text
.altinstr_replacement : { *(.altinstr_replacement) } :text
+ __ex_table : { *(__ex_table) } :text
+
/DISCARD/ : {
*(.discard)
*(.discard.*)
diff --git a/arch/x86/entry/vdso/vdso.lds.S b/arch/x86/entry/vdso/vdso.lds.S
index 36b644e16272..4bf48462fca7 100644
--- a/arch/x86/entry/vdso/vdso.lds.S
+++ b/arch/x86/entry/vdso/vdso.lds.S
@@ -27,6 +27,7 @@ VERSION {
__vdso_time;
clock_getres;
__vdso_clock_getres;
+ __vdso_sgx_enter_enclave;
local: *;
};
}
diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c
index 7380908045c7..2d0f3d8bcc25 100644
--- a/arch/x86/entry/vdso/vdso2c.c
+++ b/arch/x86/entry/vdso/vdso2c.c
@@ -101,6 +101,8 @@ struct vdso_sym required_syms[] = {
{"__kernel_sigreturn", true},
{"__kernel_rt_sigreturn", true},
{"int80_landing_pad", true},
+ {"vdso32_rt_sigreturn_landing_pad", true},
+ {"vdso32_sigreturn_landing_pad", true},
};
__attribute__((format(printf, 1, 2))) __attribute__((noreturn))
diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h
index 6f46e11ce539..1c7cfac7e64a 100644
--- a/arch/x86/entry/vdso/vdso2c.h
+++ b/arch/x86/entry/vdso/vdso2c.h
@@ -5,6 +5,41 @@
* are built for 32-bit userspace.
*/
+static void BITSFUNC(copy)(FILE *outfile, const unsigned char *data, size_t len)
+{
+ size_t i;
+
+ for (i = 0; i < len; i++) {
+ if (i % 10 == 0)
+ fprintf(outfile, "\n\t");
+ fprintf(outfile, "0x%02X, ", (int)(data)[i]);
+ }
+}
+
+
+/*
+ * Extract a section from the input data into a standalone blob. Used to
+ * capture kernel-only data that needs to persist indefinitely, e.g. the
+ * exception fixup tables, but only in the kernel, i.e. the section can
+ * be stripped from the final vDSO image.
+ */
+static void BITSFUNC(extract)(const unsigned char *data, size_t data_len,
+ FILE *outfile, ELF(Shdr) *sec, const char *name)
+{
+ unsigned long offset;
+ size_t len;
+
+ offset = (unsigned long)GET_LE(&sec->sh_offset);
+ len = (size_t)GET_LE(&sec->sh_size);
+
+ if (offset + len > data_len)
+ fail("section to extract overruns input data");
+
+ fprintf(outfile, "static const unsigned char %s[%lu] = {", name, len);
+ BITSFUNC(copy)(outfile, data + offset, len);
+ fprintf(outfile, "\n};\n\n");
+}
+
static void BITSFUNC(go)(void *raw_addr, size_t raw_len,
void *stripped_addr, size_t stripped_len,
FILE *outfile, const char *image_name)
@@ -15,7 +50,7 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len,
ELF(Ehdr) *hdr = (ELF(Ehdr) *)raw_addr;
unsigned long i, syms_nr;
ELF(Shdr) *symtab_hdr = NULL, *strtab_hdr, *secstrings_hdr,
- *alt_sec = NULL;
+ *alt_sec = NULL, *extable_sec = NULL;
ELF(Dyn) *dyn = 0, *dyn_end = 0;
const char *secstrings;
INT_BITS syms[NSYMS] = {};
@@ -77,6 +112,8 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len,
if (!strcmp(secstrings + GET_LE(&sh->sh_name),
".altinstructions"))
alt_sec = sh;
+ if (!strcmp(secstrings + GET_LE(&sh->sh_name), "__ex_table"))
+ extable_sec = sh;
}
if (!symtab_hdr)
@@ -155,6 +192,9 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len,
(int)((unsigned char *)stripped_addr)[i]);
}
fprintf(outfile, "\n};\n\n");
+ if (extable_sec)
+ BITSFUNC(extract)(raw_addr, raw_len, outfile,
+ extable_sec, "extable");
fprintf(outfile, "const struct vdso_image %s = {\n", image_name);
fprintf(outfile, "\t.data = raw_data,\n");
@@ -165,6 +205,14 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len,
fprintf(outfile, "\t.alt_len = %lu,\n",
(unsigned long)GET_LE(&alt_sec->sh_size));
}
+ if (extable_sec) {
+ fprintf(outfile, "\t.extable_base = %lu,\n",
+ (unsigned long)GET_LE(&extable_sec->sh_offset));
+ fprintf(outfile, "\t.extable_len = %lu,\n",
+ (unsigned long)GET_LE(&extable_sec->sh_size));
+ fprintf(outfile, "\t.extable = extable,\n");
+ }
+
for (i = 0; i < NSYMS; i++) {
if (required_syms[i].export && syms[i])
fprintf(outfile, "\t.sym_%s = %" PRIi64 ",\n",
diff --git a/arch/x86/entry/vdso/vdso32/sigreturn.S b/arch/x86/entry/vdso/vdso32/sigreturn.S
index c3233ee98a6b..1bd068f72d4c 100644
--- a/arch/x86/entry/vdso/vdso32/sigreturn.S
+++ b/arch/x86/entry/vdso/vdso32/sigreturn.S
@@ -18,6 +18,7 @@ __kernel_sigreturn:
movl $__NR_sigreturn, %eax
SYSCALL_ENTER_KERNEL
.LEND_sigreturn:
+SYM_INNER_LABEL(vdso32_sigreturn_landing_pad, SYM_L_GLOBAL)
nop
.size __kernel_sigreturn,.-.LSTART_sigreturn
@@ -29,6 +30,7 @@ __kernel_rt_sigreturn:
movl $__NR_rt_sigreturn, %eax
SYSCALL_ENTER_KERNEL
.LEND_rt_sigreturn:
+SYM_INNER_LABEL(vdso32_rt_sigreturn_landing_pad, SYM_L_GLOBAL)
nop
.size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn
.previous
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 9185cb1d13b9..825e829ffff1 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -89,30 +89,14 @@ static void vdso_fix_landing(const struct vdso_image *image,
static int vdso_mremap(const struct vm_special_mapping *sm,
struct vm_area_struct *new_vma)
{
- unsigned long new_size = new_vma->vm_end - new_vma->vm_start;
const struct vdso_image *image = current->mm->context.vdso_image;
- if (image->size != new_size)
- return -EINVAL;
-
vdso_fix_landing(image, new_vma);
current->mm->context.vdso = (void __user *)new_vma->vm_start;
return 0;
}
-static int vvar_mremap(const struct vm_special_mapping *sm,
- struct vm_area_struct *new_vma)
-{
- const struct vdso_image *image = new_vma->vm_mm->context.vdso_image;
- unsigned long new_size = new_vma->vm_end - new_vma->vm_start;
-
- if (new_size != -image->sym_vvar_start)
- return -EINVAL;
-
- return 0;
-}
-
#ifdef CONFIG_TIME_NS
static struct page *find_timens_vvar_page(struct vm_area_struct *vma)
{
@@ -252,7 +236,6 @@ static const struct vm_special_mapping vdso_mapping = {
static const struct vm_special_mapping vvar_mapping = {
.name = "[vvar]",
.fault = vvar_fault,
- .mremap = vvar_mremap,
};
/*
@@ -413,10 +396,10 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
#ifdef CONFIG_COMPAT
int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
- int uses_interp)
+ int uses_interp, bool x32)
{
#ifdef CONFIG_X86_X32_ABI
- if (test_thread_flag(TIF_X32)) {
+ if (x32) {
if (!vdso64_enabled)
return 0;
return map_vdso_randomized(&vdso_image_x32);
@@ -436,6 +419,21 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
}
#endif
+bool arch_syscall_is_vdso_sigreturn(struct pt_regs *regs)
+{
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
+ const struct vdso_image *image = current->mm->context.vdso_image;
+ unsigned long vdso = (unsigned long) current->mm->context.vdso;
+
+ if (in_ia32_syscall() && image == &vdso_image_32) {
+ if (regs->ip == vdso + image->sym_vdso32_sigreturn_landing_pad ||
+ regs->ip == vdso + image->sym_vdso32_rt_sigreturn_landing_pad)
+ return true;
+ }
+#endif
+ return false;
+}
+
#ifdef CONFIG_X86_64
static __init int vdso_setup(char *s)
{
diff --git a/arch/x86/entry/vdso/vsgx.S b/arch/x86/entry/vdso/vsgx.S
new file mode 100644
index 000000000000..86a0e94f68df
--- /dev/null
+++ b/arch/x86/entry/vdso/vsgx.S
@@ -0,0 +1,151 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/linkage.h>
+#include <asm/export.h>
+#include <asm/errno.h>
+#include <asm/enclu.h>
+
+#include "extable.h"
+
+/* Relative to %rbp. */
+#define SGX_ENCLAVE_OFFSET_OF_RUN 16
+
+/* The offsets relative to struct sgx_enclave_run. */
+#define SGX_ENCLAVE_RUN_TCS 0
+#define SGX_ENCLAVE_RUN_LEAF 8
+#define SGX_ENCLAVE_RUN_EXCEPTION_VECTOR 12
+#define SGX_ENCLAVE_RUN_EXCEPTION_ERROR_CODE 14
+#define SGX_ENCLAVE_RUN_EXCEPTION_ADDR 16
+#define SGX_ENCLAVE_RUN_USER_HANDLER 24
+#define SGX_ENCLAVE_RUN_USER_DATA 32 /* not used */
+#define SGX_ENCLAVE_RUN_RESERVED_START 40
+#define SGX_ENCLAVE_RUN_RESERVED_END 256
+
+.code64
+.section .text, "ax"
+
+SYM_FUNC_START(__vdso_sgx_enter_enclave)
+ /* Prolog */
+ .cfi_startproc
+ push %rbp
+ .cfi_adjust_cfa_offset 8
+ .cfi_rel_offset %rbp, 0
+ mov %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+ push %rbx
+ .cfi_rel_offset %rbx, -8
+
+ mov %ecx, %eax
+.Lenter_enclave:
+ /* EENTER <= function <= ERESUME */
+ cmp $EENTER, %eax
+ jb .Linvalid_input
+ cmp $ERESUME, %eax
+ ja .Linvalid_input
+
+ mov SGX_ENCLAVE_OFFSET_OF_RUN(%rbp), %rcx
+
+ /* Validate that the reserved area contains only zeros. */
+ mov $SGX_ENCLAVE_RUN_RESERVED_START, %rbx
+1:
+ cmpq $0, (%rcx, %rbx)
+ jne .Linvalid_input
+ add $8, %rbx
+ cmpq $SGX_ENCLAVE_RUN_RESERVED_END, %rbx
+ jne 1b
+
+ /* Load TCS and AEP */
+ mov SGX_ENCLAVE_RUN_TCS(%rcx), %rbx
+ lea .Lasync_exit_pointer(%rip), %rcx
+
+ /* Single ENCLU serving as both EENTER and AEP (ERESUME) */
+.Lasync_exit_pointer:
+.Lenclu_eenter_eresume:
+ enclu
+
+ /* EEXIT jumps here unless the enclave is doing something fancy. */
+ mov SGX_ENCLAVE_OFFSET_OF_RUN(%rbp), %rbx
+
+ /* Set exit_reason. */
+ movl $EEXIT, SGX_ENCLAVE_RUN_LEAF(%rbx)
+
+ /* Invoke userspace's exit handler if one was provided. */
+.Lhandle_exit:
+ cmpq $0, SGX_ENCLAVE_RUN_USER_HANDLER(%rbx)
+ jne .Linvoke_userspace_handler
+
+ /* Success, in the sense that ENCLU was attempted. */
+ xor %eax, %eax
+
+.Lout:
+ pop %rbx
+ leave
+ .cfi_def_cfa %rsp, 8
+ ret
+
+ /* The out-of-line code runs with the pre-leave stack frame. */
+ .cfi_def_cfa %rbp, 16
+
+.Linvalid_input:
+ mov $(-EINVAL), %eax
+ jmp .Lout
+
+.Lhandle_exception:
+ mov SGX_ENCLAVE_OFFSET_OF_RUN(%rbp), %rbx
+
+ /* Set the exception info. */
+ mov %eax, (SGX_ENCLAVE_RUN_LEAF)(%rbx)
+ mov %di, (SGX_ENCLAVE_RUN_EXCEPTION_VECTOR)(%rbx)
+ mov %si, (SGX_ENCLAVE_RUN_EXCEPTION_ERROR_CODE)(%rbx)
+ mov %rdx, (SGX_ENCLAVE_RUN_EXCEPTION_ADDR)(%rbx)
+ jmp .Lhandle_exit
+
+.Linvoke_userspace_handler:
+ /* Pass the untrusted RSP (at exit) to the callback via %rcx. */
+ mov %rsp, %rcx
+
+ /* Save struct sgx_enclave_exception %rbx is about to be clobbered. */
+ mov %rbx, %rax
+
+ /* Save the untrusted RSP offset in %rbx (non-volatile register). */
+ mov %rsp, %rbx
+ and $0xf, %rbx
+
+ /*
+ * Align stack per x86_64 ABI. Note, %rsp needs to be 16-byte aligned
+ * _after_ pushing the parameters on the stack, hence the bonus push.
+ */
+ and $-0x10, %rsp
+ push %rax
+
+ /* Push struct sgx_enclave_exception as a param to the callback. */
+ push %rax
+
+ /* Clear RFLAGS.DF per x86_64 ABI */
+ cld
+
+ /*
+ * Load the callback pointer to %rax and lfence for LVI (load value
+ * injection) protection before making the call.
+ */
+ mov SGX_ENCLAVE_RUN_USER_HANDLER(%rax), %rax
+ lfence
+ call *%rax
+
+ /* Undo the post-exit %rsp adjustment. */
+ lea 0x10(%rsp, %rbx), %rsp
+
+ /*
+ * If the return from callback is zero or negative, return immediately,
+ * else re-execute ENCLU with the positive return value interpreted as
+ * the requested ENCLU function.
+ */
+ cmp $0, %eax
+ jle .Lout
+ jmp .Lenter_enclave
+
+ .cfi_endproc
+
+_ASM_VDSO_EXTABLE_HANDLE(.Lenclu_eenter_eresume, .Lhandle_exception)
+
+SYM_FUNC_END(__vdso_sgx_enter_enclave)
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index 44c33103a955..1b40b9297083 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -316,7 +316,7 @@ static struct vm_area_struct gate_vma __ro_after_init = {
struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
{
#ifdef CONFIG_COMPAT
- if (!mm || mm->context.ia32_compat)
+ if (!mm || !(mm->context.flags & MM_CONTEXT_HAS_VSYSCALL))
return NULL;
#endif
if (vsyscall_mode == NONE)
diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c
index 39eb276d0277..2c1791c4a518 100644
--- a/arch/x86/events/amd/core.c
+++ b/arch/x86/events/amd/core.c
@@ -538,7 +538,7 @@ static void amd_pmu_cpu_starting(int cpu)
if (!x86_pmu.amd_nb_constraints)
return;
- nb_id = amd_get_nb_id(cpu);
+ nb_id = topology_die_id(cpu);
WARN_ON_ONCE(nb_id == BAD_APICID);
for_each_online_cpu(i) {
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index a88c94d65693..6ddeed3cd2ac 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -81,6 +81,8 @@ DEFINE_STATIC_CALL_NULL(x86_pmu_swap_task_ctx, *x86_pmu.swap_task_ctx);
DEFINE_STATIC_CALL_NULL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs);
DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases);
+DEFINE_STATIC_CALL_NULL(x86_pmu_guest_get_msrs, *x86_pmu.guest_get_msrs);
+
u64 __read_mostly hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -253,6 +255,8 @@ static bool check_hw_exists(void)
if (ret)
goto msr_fail;
for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
+ if (fixed_counter_disabled(i))
+ continue;
if (val & (0x03 << i*4)) {
bios_fail = 1;
val_fail = val;
@@ -665,6 +669,12 @@ void x86_pmu_disable_all(void)
}
}
+struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
+{
+ return static_call(x86_pmu_guest_get_msrs)(nr);
+}
+EXPORT_SYMBOL_GPL(perf_guest_get_msrs);
+
/*
* A PMI may land after enabled=0. The PMI could hit before or
* after disable_all.
@@ -1174,7 +1184,7 @@ static inline void x86_assign_hw_event(struct perf_event *event,
case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END:
/* All the metric events are mapped onto the fixed counter 3. */
idx = INTEL_PMC_IDX_FIXED_SLOTS;
- /* fall through */
+ fallthrough;
case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS-1:
hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 +
@@ -1523,6 +1533,8 @@ void perf_event_print_debug(void)
cpu, idx, prev_left);
}
for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
+ if (fixed_counter_disabled(idx))
+ continue;
rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
@@ -1923,6 +1935,8 @@ static void x86_pmu_static_call_update(void)
static_call_update(x86_pmu_drain_pebs, x86_pmu.drain_pebs);
static_call_update(x86_pmu_pebs_aliases, x86_pmu.pebs_aliases);
+
+ static_call_update(x86_pmu_guest_get_msrs, x86_pmu.guest_get_msrs);
}
static void _x86_pmu_read(struct perf_event *event)
@@ -1930,6 +1944,13 @@ static void _x86_pmu_read(struct perf_event *event)
x86_perf_event_update(event);
}
+static inline struct perf_guest_switch_msr *
+perf_guest_get_msrs_nop(int *nr)
+{
+ *nr = 0;
+ return NULL;
+}
+
static int __init init_hw_perf_events(void)
{
struct x86_pmu_quirk *quirk;
@@ -1995,12 +2016,17 @@ static int __init init_hw_perf_events(void)
pr_info("... generic registers: %d\n", x86_pmu.num_counters);
pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask);
pr_info("... max period: %016Lx\n", x86_pmu.max_period);
- pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed);
+ pr_info("... fixed-purpose events: %lu\n",
+ hweight64((((1ULL << x86_pmu.num_counters_fixed) - 1)
+ << INTEL_PMC_IDX_FIXED) & x86_pmu.intel_ctrl));
pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl);
if (!x86_pmu.read)
x86_pmu.read = _x86_pmu_read;
+ if (!x86_pmu.guest_get_msrs)
+ x86_pmu.guest_get_msrs = perf_guest_get_msrs_nop;
+
x86_pmu_static_call_update();
/*
@@ -2602,7 +2628,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *ent
struct stack_frame_ia32 frame;
const struct stack_frame_ia32 __user *fp;
- if (!test_thread_flag(TIF_IA32))
+ if (user_64bit_mode(regs))
return 0;
cs_base = get_segment_base(regs->cs);
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index f1926e9f2143..5bac48d5c18e 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -257,7 +257,8 @@ static struct event_constraint intel_icl_event_constraints[] = {
INTEL_EVENT_CONSTRAINT_RANGE(0x48, 0x54, 0xf),
INTEL_EVENT_CONSTRAINT_RANGE(0x60, 0x8b, 0xf),
INTEL_UEVENT_CONSTRAINT(0x04a3, 0xff), /* CYCLE_ACTIVITY.STALLS_TOTAL */
- INTEL_UEVENT_CONSTRAINT(0x10a3, 0xff), /* CYCLE_ACTIVITY.STALLS_MEM_ANY */
+ INTEL_UEVENT_CONSTRAINT(0x10a3, 0xff), /* CYCLE_ACTIVITY.CYCLES_MEM_ANY */
+ INTEL_UEVENT_CONSTRAINT(0x14a3, 0xff), /* CYCLE_ACTIVITY.STALLS_MEM_ANY */
INTEL_EVENT_CONSTRAINT(0xa3, 0xf), /* CYCLE_ACTIVITY.* */
INTEL_EVENT_CONSTRAINT_RANGE(0xa8, 0xb0, 0xf),
INTEL_EVENT_CONSTRAINT_RANGE(0xb7, 0xbd, 0xf),
@@ -274,6 +275,55 @@ static struct extra_reg intel_icl_extra_regs[] __read_mostly = {
EVENT_EXTRA_END
};
+static struct extra_reg intel_spr_extra_regs[] __read_mostly = {
+ INTEL_UEVENT_EXTRA_REG(0x012a, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0),
+ INTEL_UEVENT_EXTRA_REG(0x012b, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1),
+ INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
+ INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x7fff17, FE),
+ EVENT_EXTRA_END
+};
+
+static struct event_constraint intel_spr_event_constraints[] = {
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x01c0, 0), /* INST_RETIRED.PREC_DIST */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+ FIXED_EVENT_CONSTRAINT(0x0400, 3), /* SLOTS */
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_RETIRING, 0),
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BAD_SPEC, 1),
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_FE_BOUND, 2),
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BE_BOUND, 3),
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_HEAVY_OPS, 4),
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BR_MISPREDICT, 5),
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_FETCH_LAT, 6),
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_MEM_BOUND, 7),
+
+ INTEL_EVENT_CONSTRAINT(0x2e, 0xff),
+ INTEL_EVENT_CONSTRAINT(0x3c, 0xff),
+ /*
+ * Generally event codes < 0x90 are restricted to counters 0-3.
+ * Event codes 0x2E and 0x3C are exceptions and have no restriction.
+ */
+ INTEL_EVENT_CONSTRAINT_RANGE(0x01, 0x8f, 0xf),
+
+ INTEL_UEVENT_CONSTRAINT(0x01a3, 0xf),
+ INTEL_UEVENT_CONSTRAINT(0x02a3, 0xf),
+ INTEL_UEVENT_CONSTRAINT(0x08a3, 0xf),
+ INTEL_UEVENT_CONSTRAINT(0x04a4, 0x1),
+ INTEL_UEVENT_CONSTRAINT(0x08a4, 0x1),
+ INTEL_UEVENT_CONSTRAINT(0x02cd, 0x1),
+ INTEL_EVENT_CONSTRAINT(0xce, 0x1),
+ INTEL_EVENT_CONSTRAINT_RANGE(0xd0, 0xdf, 0xf),
+ /*
+ * Generally event codes >= 0x90 are likely to have no restrictions.
+ * The exceptions are defined above.
+ */
+ INTEL_EVENT_CONSTRAINT_RANGE(0x90, 0xfe, 0xff),
+
+ EVENT_CONSTRAINT_END
+};
+
+
EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3");
EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3");
EVENT_ATTR_STR(mem-stores, mem_st_snb, "event=0xcd,umask=0x2");
@@ -313,11 +363,15 @@ EVENT_ATTR_STR_HT(topdown-recovery-bubbles, td_recovery_bubbles,
EVENT_ATTR_STR_HT(topdown-recovery-bubbles.scale, td_recovery_bubbles_scale,
"4", "2");
-EVENT_ATTR_STR(slots, slots, "event=0x00,umask=0x4");
-EVENT_ATTR_STR(topdown-retiring, td_retiring, "event=0x00,umask=0x80");
-EVENT_ATTR_STR(topdown-bad-spec, td_bad_spec, "event=0x00,umask=0x81");
-EVENT_ATTR_STR(topdown-fe-bound, td_fe_bound, "event=0x00,umask=0x82");
-EVENT_ATTR_STR(topdown-be-bound, td_be_bound, "event=0x00,umask=0x83");
+EVENT_ATTR_STR(slots, slots, "event=0x00,umask=0x4");
+EVENT_ATTR_STR(topdown-retiring, td_retiring, "event=0x00,umask=0x80");
+EVENT_ATTR_STR(topdown-bad-spec, td_bad_spec, "event=0x00,umask=0x81");
+EVENT_ATTR_STR(topdown-fe-bound, td_fe_bound, "event=0x00,umask=0x82");
+EVENT_ATTR_STR(topdown-be-bound, td_be_bound, "event=0x00,umask=0x83");
+EVENT_ATTR_STR(topdown-heavy-ops, td_heavy_ops, "event=0x00,umask=0x84");
+EVENT_ATTR_STR(topdown-br-mispredict, td_br_mispredict, "event=0x00,umask=0x85");
+EVENT_ATTR_STR(topdown-fetch-lat, td_fetch_lat, "event=0x00,umask=0x86");
+EVENT_ATTR_STR(topdown-mem-bound, td_mem_bound, "event=0x00,umask=0x87");
static struct attribute *snb_events_attrs[] = {
EVENT_PTR(td_slots_issued),
@@ -383,6 +437,108 @@ static u64 intel_pmu_event_map(int hw_event)
return intel_perfmon_event_map[hw_event];
}
+static __initconst const u64 spr_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x81d0,
+ [ C(RESULT_MISS) ] = 0xe124,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x82d0,
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_MISS) ] = 0xe424,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x12a,
+ [ C(RESULT_MISS) ] = 0x12a,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x12a,
+ [ C(RESULT_MISS) ] = 0x12a,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x81d0,
+ [ C(RESULT_MISS) ] = 0xe12,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x82d0,
+ [ C(RESULT_MISS) ] = 0xe13,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = 0xe11,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x4c4,
+ [ C(RESULT_MISS) ] = 0x4c5,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x12a,
+ [ C(RESULT_MISS) ] = 0x12a,
+ },
+ },
+};
+
+static __initconst const u64 spr_hw_cache_extra_regs
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x10001,
+ [ C(RESULT_MISS) ] = 0x3fbfc00001,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x3f3ffc0002,
+ [ C(RESULT_MISS) ] = 0x3f3fc00002,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x10c000001,
+ [ C(RESULT_MISS) ] = 0x3fb3000001,
+ },
+ },
+};
+
/*
* Notes on the events:
* - data reads do not include code reads (comparable to earlier tables)
@@ -1900,6 +2056,19 @@ static __initconst const u64 tnt_hw_cache_extra_regs
},
};
+EVENT_ATTR_STR(topdown-fe-bound, td_fe_bound_tnt, "event=0x71,umask=0x0");
+EVENT_ATTR_STR(topdown-retiring, td_retiring_tnt, "event=0xc2,umask=0x0");
+EVENT_ATTR_STR(topdown-bad-spec, td_bad_spec_tnt, "event=0x73,umask=0x6");
+EVENT_ATTR_STR(topdown-be-bound, td_be_bound_tnt, "event=0x74,umask=0x0");
+
+static struct attribute *tnt_events_attrs[] = {
+ EVENT_PTR(td_fe_bound_tnt),
+ EVENT_PTR(td_retiring_tnt),
+ EVENT_PTR(td_bad_spec_tnt),
+ EVENT_PTR(td_be_bound_tnt),
+ NULL,
+};
+
static struct extra_reg intel_tnt_extra_regs[] __read_mostly = {
/* must define OFFCORE_RSP_X first, see intel_fixup_er() */
INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x800ff0ffffff9fffull, RSP_0),
@@ -2120,18 +2289,6 @@ static void intel_tfa_pmu_enable_all(int added)
intel_pmu_enable_all(added);
}
-static void enable_counter_freeze(void)
-{
- update_debugctlmsr(get_debugctlmsr() |
- DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI);
-}
-
-static void disable_counter_freeze(void)
-{
- update_debugctlmsr(get_debugctlmsr() &
- ~DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI);
-}
-
static inline u64 intel_pmu_get_status(void)
{
u64 status;
@@ -2323,8 +2480,8 @@ static void __icl_update_topdown_event(struct perf_event *event,
}
}
-static void update_saved_topdown_regs(struct perf_event *event,
- u64 slots, u64 metrics)
+static void update_saved_topdown_regs(struct perf_event *event, u64 slots,
+ u64 metrics, int metric_end)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct perf_event *other;
@@ -2333,7 +2490,7 @@ static void update_saved_topdown_regs(struct perf_event *event,
event->hw.saved_slots = slots;
event->hw.saved_metric = metrics;
- for_each_set_bit(idx, cpuc->active_mask, INTEL_PMC_IDX_TD_BE_BOUND + 1) {
+ for_each_set_bit(idx, cpuc->active_mask, metric_end + 1) {
if (!is_topdown_idx(idx))
continue;
other = cpuc->events[idx];
@@ -2348,7 +2505,8 @@ static void update_saved_topdown_regs(struct perf_event *event,
* The PERF_METRICS and Fixed counter 3 are read separately. The values may be
* modified by an NMI. The PMU has to be disabled before calling this function.
*/
-static u64 icl_update_topdown_event(struct perf_event *event)
+
+static u64 intel_update_topdown_event(struct perf_event *event, int metric_end)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct perf_event *other;
@@ -2364,7 +2522,7 @@ static u64 icl_update_topdown_event(struct perf_event *event)
/* read PERF_METRICS */
rdpmcl(INTEL_PMC_FIXED_RDPMC_METRICS, metrics);
- for_each_set_bit(idx, cpuc->active_mask, INTEL_PMC_IDX_TD_BE_BOUND + 1) {
+ for_each_set_bit(idx, cpuc->active_mask, metric_end + 1) {
if (!is_topdown_idx(idx))
continue;
other = cpuc->events[idx];
@@ -2390,7 +2548,7 @@ static u64 icl_update_topdown_event(struct perf_event *event)
* Don't need to reset the PERF_METRICS and Fixed counter 3.
* Because the values will be restored in next schedule in.
*/
- update_saved_topdown_regs(event, slots, metrics);
+ update_saved_topdown_regs(event, slots, metrics, metric_end);
reset = false;
}
@@ -2399,12 +2557,18 @@ static u64 icl_update_topdown_event(struct perf_event *event)
wrmsrl(MSR_CORE_PERF_FIXED_CTR3, 0);
wrmsrl(MSR_PERF_METRICS, 0);
if (event)
- update_saved_topdown_regs(event, 0, 0);
+ update_saved_topdown_regs(event, 0, 0, metric_end);
}
return slots;
}
+static u64 icl_update_topdown_event(struct perf_event *event)
+{
+ return intel_update_topdown_event(event, INTEL_PMC_IDX_METRIC_BASE +
+ x86_pmu.num_topdown_events - 1);
+}
+
static void intel_pmu_read_topdown_event(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -2559,8 +2723,11 @@ static void intel_pmu_reset(void)
wrmsrl_safe(x86_pmu_config_addr(idx), 0ull);
wrmsrl_safe(x86_pmu_event_addr(idx), 0ull);
}
- for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++)
+ for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
+ if (fixed_counter_disabled(idx))
+ continue;
wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
+ }
if (ds)
ds->bts_index = ds->bts_buffer_base;
@@ -2630,7 +2797,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
u64 pebs_enabled = cpuc->pebs_enabled;
handled++;
- x86_pmu.drain_pebs(regs);
+ x86_pmu.drain_pebs(regs, &data);
status &= x86_pmu.intel_ctrl | GLOBAL_STATUS_TRACE_TOPAPMI;
/*
@@ -2695,95 +2862,6 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
return handled;
}
-static bool disable_counter_freezing = true;
-static int __init intel_perf_counter_freezing_setup(char *s)
-{
- bool res;
-
- if (kstrtobool(s, &res))
- return -EINVAL;
-
- disable_counter_freezing = !res;
- return 1;
-}
-__setup("perf_v4_pmi=", intel_perf_counter_freezing_setup);
-
-/*
- * Simplified handler for Arch Perfmon v4:
- * - We rely on counter freezing/unfreezing to enable/disable the PMU.
- * This is done automatically on PMU ack.
- * - Ack the PMU only after the APIC.
- */
-
-static int intel_pmu_handle_irq_v4(struct pt_regs *regs)
-{
- struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
- int handled = 0;
- bool bts = false;
- u64 status;
- int pmu_enabled = cpuc->enabled;
- int loops = 0;
-
- /* PMU has been disabled because of counter freezing */
- cpuc->enabled = 0;
- if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
- bts = true;
- intel_bts_disable_local();
- handled = intel_pmu_drain_bts_buffer();
- handled += intel_bts_interrupt();
- }
- status = intel_pmu_get_status();
- if (!status)
- goto done;
-again:
- intel_pmu_lbr_read();
- if (++loops > 100) {
- static bool warned;
-
- if (!warned) {
- WARN(1, "perfevents: irq loop stuck!\n");
- perf_event_print_debug();
- warned = true;
- }
- intel_pmu_reset();
- goto done;
- }
-
-
- handled += handle_pmi_common(regs, status);
-done:
- /* Ack the PMI in the APIC */
- apic_write(APIC_LVTPC, APIC_DM_NMI);
-
- /*
- * The counters start counting immediately while ack the status.
- * Make it as close as possible to IRET. This avoids bogus
- * freezing on Skylake CPUs.
- */
- if (status) {
- intel_pmu_ack_status(status);
- } else {
- /*
- * CPU may issues two PMIs very close to each other.
- * When the PMI handler services the first one, the
- * GLOBAL_STATUS is already updated to reflect both.
- * When it IRETs, the second PMI is immediately
- * handled and it sees clear status. At the meantime,
- * there may be a third PMI, because the freezing bit
- * isn't set since the ack in first PMI handlers.
- * Double check if there is more work to be done.
- */
- status = intel_pmu_get_status();
- if (status)
- goto again;
- }
-
- if (bts)
- intel_bts_enable_local();
- cpuc->enabled = pmu_enabled;
- return handled;
-}
-
/*
* This handler is triggered by the local APIC, so the APIC IRQ handling
* rules apply:
@@ -3549,6 +3627,26 @@ static int core_pmu_hw_config(struct perf_event *event)
return intel_pmu_bts_config(event);
}
+#define INTEL_TD_METRIC_AVAILABLE_MAX (INTEL_TD_METRIC_RETIRING + \
+ ((x86_pmu.num_topdown_events - 1) << 8))
+
+static bool is_available_metric_event(struct perf_event *event)
+{
+ return is_metric_event(event) &&
+ event->attr.config <= INTEL_TD_METRIC_AVAILABLE_MAX;
+}
+
+static inline bool is_mem_loads_event(struct perf_event *event)
+{
+ return (event->attr.config & INTEL_ARCH_EVENT_MASK) == X86_CONFIG(.event=0xcd, .umask=0x01);
+}
+
+static inline bool is_mem_loads_aux_event(struct perf_event *event)
+{
+ return (event->attr.config & INTEL_ARCH_EVENT_MASK) == X86_CONFIG(.event=0x03, .umask=0x82);
+}
+
+
static int intel_pmu_hw_config(struct perf_event *event)
{
int ret = x86_pmu_hw_config(event);
@@ -3622,7 +3720,7 @@ static int intel_pmu_hw_config(struct perf_event *event)
if (event->attr.config & X86_ALL_EVENT_FLAGS)
return -EINVAL;
- if (is_metric_event(event)) {
+ if (is_available_metric_event(event)) {
struct perf_event *leader = event->group_leader;
/* The metric events don't support sampling. */
@@ -3651,6 +3749,33 @@ static int intel_pmu_hw_config(struct perf_event *event)
}
}
+ /*
+ * The load latency event X86_CONFIG(.event=0xcd, .umask=0x01) on SPR
+ * doesn't function quite right. As a work-around it needs to always be
+ * co-scheduled with an auxiliary event X86_CONFIG(.event=0x03, .umask=0x82).
+ * The actual count of this second event is irrelevant; it just needs
+ * to be active to make the first event function correctly.
+ *
+ * In a group, the auxiliary event must come before the load latency
+ * event. This rule simplifies the implementation of the check,
+ * because perf does not have a complete group at this point.
+ */
+ if (x86_pmu.flags & PMU_FL_MEM_LOADS_AUX &&
+ (event->attr.sample_type & PERF_SAMPLE_DATA_SRC) &&
+ is_mem_loads_event(event)) {
+ struct perf_event *leader = event->group_leader;
+ struct perf_event *sibling = NULL;
+
+ if (!is_mem_loads_aux_event(leader)) {
+ for_each_sibling_event(sibling, leader) {
+ if (is_mem_loads_aux_event(sibling))
+ break;
+ }
+ if (list_entry_is_head(sibling, &leader->sibling_list, sibling_list))
+ return -ENODATA;
+ }
+ }
+
if (!(event->attr.config & ARCH_PERFMON_EVENTSEL_ANY))
return 0;
@@ -3666,26 +3791,6 @@ static int intel_pmu_hw_config(struct perf_event *event)
return 0;
}
-#ifdef CONFIG_RETPOLINE
-static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr);
-static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr);
-#endif
-
-struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
-{
-#ifdef CONFIG_RETPOLINE
- if (x86_pmu.guest_get_msrs == intel_guest_get_msrs)
- return intel_guest_get_msrs(nr);
- else if (x86_pmu.guest_get_msrs == core_guest_get_msrs)
- return core_guest_get_msrs(nr);
-#endif
- if (x86_pmu.guest_get_msrs)
- return x86_pmu.guest_get_msrs(nr);
- *nr = 0;
- return NULL;
-}
-EXPORT_SYMBOL_GPL(perf_guest_get_msrs);
-
static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -3851,6 +3956,29 @@ icl_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
}
static struct event_constraint *
+spr_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+ struct event_constraint *c;
+
+ c = icl_get_event_constraints(cpuc, idx, event);
+
+ /*
+ * The :ppp indicates the Precise Distribution (PDist) facility, which
+ * is only supported on the GP counter 0. If a :ppp event which is not
+ * available on the GP counter 0, error out.
+ */
+ if (event->attr.precise_ip == 3) {
+ if (c->idxmsk64 & BIT_ULL(0))
+ return &counter0_constraint;
+
+ return &emptyconstraint;
+ }
+
+ return c;
+}
+
+static struct event_constraint *
glp_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
struct perf_event *event)
{
@@ -3939,6 +4067,14 @@ static u64 nhm_limit_period(struct perf_event *event, u64 left)
return max(left, 32ULL);
}
+static u64 spr_limit_period(struct perf_event *event, u64 left)
+{
+ if (event->attr.precise_ip == 3)
+ return max(left, 128ULL);
+
+ return left;
+}
+
PMU_FORMAT_ATTR(event, "config:0-7" );
PMU_FORMAT_ATTR(umask, "config:8-15" );
PMU_FORMAT_ATTR(edge, "config:18" );
@@ -4080,9 +4216,6 @@ static void intel_pmu_cpu_starting(int cpu)
if (x86_pmu.version > 1)
flip_smm_bit(&x86_pmu.attr_freeze_on_smi);
- if (x86_pmu.counter_freezing)
- enable_counter_freeze();
-
/* Disable perf metrics if any added CPU doesn't support it. */
if (x86_pmu.intel_cap.perf_metrics) {
union perf_capabilities perf_cap;
@@ -4153,9 +4286,6 @@ static void free_excl_cntrs(struct cpu_hw_events *cpuc)
static void intel_pmu_cpu_dying(int cpu)
{
fini_debug_store_on_cpu(cpu);
-
- if (x86_pmu.counter_freezing)
- disable_counter_freeze();
}
void intel_cpuc_finish(struct cpu_hw_events *cpuc)
@@ -4383,6 +4513,9 @@ static const struct x86_cpu_desc isolation_ucodes[] = {
INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_X, 2, 0x0b000014),
INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 3, 0x00000021),
INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 4, 0x00000000),
+ INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 5, 0x00000000),
+ INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 6, 0x00000000),
+ INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 7, 0x00000000),
INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_L, 3, 0x0000007c),
INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE, 3, 0x0000007c),
INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE, 9, 0x0000004e),
@@ -4547,39 +4680,6 @@ static __init void intel_nehalem_quirk(void)
}
}
-static const struct x86_cpu_desc counter_freezing_ucodes[] = {
- INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT, 2, 0x0000000e),
- INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT, 9, 0x0000002e),
- INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT, 10, 0x00000008),
- INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT_D, 1, 0x00000028),
- INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT_PLUS, 1, 0x00000028),
- INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT_PLUS, 8, 0x00000006),
- {}
-};
-
-static bool intel_counter_freezing_broken(void)
-{
- return !x86_cpu_has_min_microcode_rev(counter_freezing_ucodes);
-}
-
-static __init void intel_counter_freezing_quirk(void)
-{
- /* Check if it's already disabled */
- if (disable_counter_freezing)
- return;
-
- /*
- * If the system starts with the wrong ucode, leave the
- * counter-freezing feature permanently disabled.
- */
- if (intel_counter_freezing_broken()) {
- pr_info("PMU counter freezing disabled due to CPU errata,"
- "please upgrade microcode\n");
- x86_pmu.counter_freezing = false;
- x86_pmu.handle_irq = intel_pmu_handle_irq;
- }
-}
-
/*
* enable software workaround for errata:
* SNB: BJ122
@@ -4689,6 +4789,42 @@ static struct attribute *icl_tsx_events_attrs[] = {
NULL,
};
+
+EVENT_ATTR_STR(mem-stores, mem_st_spr, "event=0xcd,umask=0x2");
+EVENT_ATTR_STR(mem-loads-aux, mem_ld_aux, "event=0x03,umask=0x82");
+
+static struct attribute *spr_events_attrs[] = {
+ EVENT_PTR(mem_ld_hsw),
+ EVENT_PTR(mem_st_spr),
+ EVENT_PTR(mem_ld_aux),
+ NULL,
+};
+
+static struct attribute *spr_td_events_attrs[] = {
+ EVENT_PTR(slots),
+ EVENT_PTR(td_retiring),
+ EVENT_PTR(td_bad_spec),
+ EVENT_PTR(td_fe_bound),
+ EVENT_PTR(td_be_bound),
+ EVENT_PTR(td_heavy_ops),
+ EVENT_PTR(td_br_mispredict),
+ EVENT_PTR(td_fetch_lat),
+ EVENT_PTR(td_mem_bound),
+ NULL,
+};
+
+static struct attribute *spr_tsx_events_attrs[] = {
+ EVENT_PTR(tx_start),
+ EVENT_PTR(tx_abort),
+ EVENT_PTR(tx_commit),
+ EVENT_PTR(tx_capacity_read),
+ EVENT_PTR(tx_capacity_write),
+ EVENT_PTR(tx_conflict),
+ EVENT_PTR(cycles_t),
+ EVENT_PTR(cycles_ct),
+ NULL,
+};
+
static ssize_t freeze_on_smi_show(struct device *cdev,
struct device_attribute *attr,
char *buf)
@@ -4912,7 +5048,7 @@ __init int intel_pmu_init(void)
union cpuid10_eax eax;
union cpuid10_ebx ebx;
struct event_constraint *c;
- unsigned int unused;
+ unsigned int fixed_mask;
struct extra_reg *er;
bool pmem = false;
int version, i;
@@ -4934,7 +5070,7 @@ __init int intel_pmu_init(void)
* Check whether the Architectural PerfMon supports
* Branch Misses Retired hw_event or not.
*/
- cpuid(10, &eax.full, &ebx.full, &unused, &edx.full);
+ cpuid(10, &eax.full, &ebx.full, &fixed_mask, &edx.full);
if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT)
return -ENODEV;
@@ -4958,15 +5094,15 @@ __init int intel_pmu_init(void)
* Quirk: v2 perfmon does not report fixed-purpose events, so
* assume at least 3 events, when not running in a hypervisor:
*/
- if (version > 1) {
+ if (version > 1 && version < 5) {
int assume = 3 * !boot_cpu_has(X86_FEATURE_HYPERVISOR);
x86_pmu.num_counters_fixed =
max((int)edx.split.num_counters_fixed, assume);
- }
- if (version >= 4)
- x86_pmu.counter_freezing = !disable_counter_freezing;
+ fixed_mask = (1L << x86_pmu.num_counters_fixed) - 1;
+ } else if (version >= 5)
+ x86_pmu.num_counters_fixed = fls(fixed_mask);
if (boot_cpu_has(X86_FEATURE_PDCM)) {
u64 capabilities;
@@ -4987,6 +5123,12 @@ __init int intel_pmu_init(void)
x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */
+ if (version >= 5) {
+ x86_pmu.intel_cap.anythread_deprecated = edx.split.anythread_deprecated;
+ if (x86_pmu.intel_cap.anythread_deprecated)
+ pr_cont(" AnyThread deprecated, ");
+ }
+
/*
* Install the hw-cache-events table:
*/
@@ -5089,7 +5231,6 @@ __init int intel_pmu_init(void)
case INTEL_FAM6_ATOM_GOLDMONT:
case INTEL_FAM6_ATOM_GOLDMONT_D:
- x86_add_quirk(intel_counter_freezing_quirk);
memcpy(hw_cache_event_ids, glm_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, glm_hw_cache_extra_regs,
@@ -5116,7 +5257,6 @@ __init int intel_pmu_init(void)
break;
case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
- x86_add_quirk(intel_counter_freezing_quirk);
memcpy(hw_cache_event_ids, glp_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, glp_hw_cache_extra_regs,
@@ -5167,6 +5307,7 @@ __init int intel_pmu_init(void)
x86_pmu.lbr_pt_coexist = true;
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
x86_pmu.get_event_constraints = tnt_get_event_constraints;
+ td_attr = tnt_events_attrs;
extra_attr = slm_format_attr;
pr_cont("Tremont events, ");
name = "Tremont";
@@ -5436,6 +5577,7 @@ __init int intel_pmu_init(void)
case INTEL_FAM6_ICELAKE:
case INTEL_FAM6_TIGERLAKE_L:
case INTEL_FAM6_TIGERLAKE:
+ case INTEL_FAM6_ROCKETLAKE:
x86_pmu.late_ack = true;
memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
@@ -5458,15 +5600,53 @@ __init int intel_pmu_init(void)
mem_attr = icl_events_attrs;
td_attr = icl_td_events_attrs;
tsx_attr = icl_tsx_events_attrs;
- x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xca, .umask=0x02);
+ x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xc9, .umask=0x04);
x86_pmu.lbr_pt_coexist = true;
intel_pmu_pebs_data_source_skl(pmem);
+ x86_pmu.num_topdown_events = 4;
x86_pmu.update_topdown_event = icl_update_topdown_event;
x86_pmu.set_topdown_event_period = icl_set_topdown_event_period;
pr_cont("Icelake events, ");
name = "icelake";
break;
+ case INTEL_FAM6_SAPPHIRERAPIDS_X:
+ pmem = true;
+ x86_pmu.late_ack = true;
+ memcpy(hw_cache_event_ids, spr_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, spr_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+
+ x86_pmu.event_constraints = intel_spr_event_constraints;
+ x86_pmu.pebs_constraints = intel_spr_pebs_event_constraints;
+ x86_pmu.extra_regs = intel_spr_extra_regs;
+ x86_pmu.limit_period = spr_limit_period;
+ x86_pmu.pebs_aliases = NULL;
+ x86_pmu.pebs_prec_dist = true;
+ x86_pmu.pebs_block = true;
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+ x86_pmu.flags |= PMU_FL_PEBS_ALL;
+ x86_pmu.flags |= PMU_FL_INSTR_LATENCY;
+ x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX;
+
+ x86_pmu.hw_config = hsw_hw_config;
+ x86_pmu.get_event_constraints = spr_get_event_constraints;
+ extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
+ hsw_format_attr : nhm_format_attr;
+ extra_skl_attr = skl_format_attr;
+ mem_attr = spr_events_attrs;
+ td_attr = spr_td_events_attrs;
+ tsx_attr = spr_tsx_events_attrs;
+ x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xc9, .umask=0x04);
+ x86_pmu.lbr_pt_coexist = true;
+ intel_pmu_pebs_data_source_skl(pmem);
+ x86_pmu.num_topdown_events = 8;
+ x86_pmu.update_topdown_event = icl_update_topdown_event;
+ x86_pmu.set_topdown_event_period = icl_set_topdown_event_period;
+ pr_cont("Sapphire Rapids events, ");
+ name = "sapphire_rapids";
+ break;
+
default:
switch (x86_pmu.version) {
case 1:
@@ -5509,8 +5689,11 @@ __init int intel_pmu_init(void)
x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED;
}
- x86_pmu.intel_ctrl |=
- ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED;
+ x86_pmu.intel_ctrl |= (u64)fixed_mask << INTEL_PMC_IDX_FIXED;
+
+ /* AnyThread may be deprecated on arch perfmon v5 or later */
+ if (x86_pmu.intel_cap.anythread_deprecated)
+ x86_pmu.format_attrs = intel_arch_formats_attr;
if (x86_pmu.event_constraints) {
/*
@@ -5523,13 +5706,22 @@ __init int intel_pmu_init(void)
* events to the generic counters.
*/
if (c->idxmsk64 & INTEL_PMC_MSK_TOPDOWN) {
+ /*
+ * Disable topdown slots and metrics events
+ * if the slots event is not enumerated in CPUID.
+ */
+ if (!(INTEL_PMC_MSK_FIXED_SLOTS & x86_pmu.intel_ctrl))
+ c->idxmsk64 = 0;
c->weight = hweight64(c->idxmsk64);
continue;
}
- if (c->cmask == FIXED_EVENT_FLAGS
- && c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES) {
- c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
+ if (c->cmask == FIXED_EVENT_FLAGS) {
+ /* Disable fixed counters which are not enumerated in CPUID */
+ c->idxmsk64 &= x86_pmu.intel_ctrl;
+
+ if (c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES)
+ c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
}
c->idxmsk64 &=
~(~0ULL << (INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed));
@@ -5575,13 +5767,6 @@ __init int intel_pmu_init(void)
pr_cont("full-width counters, ");
}
- /*
- * For arch perfmon 4 use counter freezing to avoid
- * several MSR accesses in the PMI.
- */
- if (x86_pmu.counter_freezing)
- x86_pmu.handle_irq = intel_pmu_handle_irq_v4;
-
if (x86_pmu.intel_cap.perf_metrics)
x86_pmu.intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS;
diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c
index 442e1ed4acd4..407eee5f6f95 100644
--- a/arch/x86/events/intel/cstate.c
+++ b/arch/x86/events/intel/cstate.c
@@ -51,46 +51,46 @@
* perf code: 0x02
* Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,
* SKL,KNL,GLM,CNL,KBL,CML,ICL,TGL,
- * TNT
+ * TNT,RKL
* Scope: Core
* MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter
* perf code: 0x03
* Available model: SNB,IVB,HSW,BDW,SKL,CNL,KBL,CML,
- * ICL,TGL
+ * ICL,TGL,RKL
* Scope: Core
* MSR_PKG_C2_RESIDENCY: Package C2 Residency Counter.
* perf code: 0x00
* Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM,CNL,
- * KBL,CML,ICL,TGL,TNT
+ * KBL,CML,ICL,TGL,TNT,RKL
* Scope: Package (physical package)
* MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter.
* perf code: 0x01
* Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,KNL,
- * GLM,CNL,KBL,CML,ICL,TGL,TNT
+ * GLM,CNL,KBL,CML,ICL,TGL,TNT,RKL
* Scope: Package (physical package)
* MSR_PKG_C6_RESIDENCY: Package C6 Residency Counter.
* perf code: 0x02
* Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,
* SKL,KNL,GLM,CNL,KBL,CML,ICL,TGL,
- * TNT
+ * TNT,RKL
* Scope: Package (physical package)
* MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter.
* perf code: 0x03
* Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,CNL,
- * KBL,CML,ICL,TGL
+ * KBL,CML,ICL,TGL,RKL
* Scope: Package (physical package)
* MSR_PKG_C8_RESIDENCY: Package C8 Residency Counter.
* perf code: 0x04
- * Available model: HSW ULT,KBL,CNL,CML,ICL,TGL
+ * Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL
* Scope: Package (physical package)
* MSR_PKG_C9_RESIDENCY: Package C9 Residency Counter.
* perf code: 0x05
- * Available model: HSW ULT,KBL,CNL,CML,ICL,TGL
+ * Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL
* Scope: Package (physical package)
* MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter.
* perf code: 0x06
* Available model: HSW ULT,KBL,GLM,CNL,CML,ICL,TGL,
- * TNT
+ * TNT,RKL
* Scope: Package (physical package)
*
*/
@@ -107,14 +107,14 @@
MODULE_LICENSE("GPL");
#define DEFINE_CSTATE_FORMAT_ATTR(_var, _name, _format) \
-static ssize_t __cstate_##_var##_show(struct kobject *kobj, \
- struct kobj_attribute *attr, \
+static ssize_t __cstate_##_var##_show(struct device *dev, \
+ struct device_attribute *attr, \
char *page) \
{ \
BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \
return sprintf(page, _format "\n"); \
} \
-static struct kobj_attribute format_attr_##_var = \
+static struct device_attribute format_attr_##_var = \
__ATTR(_name, 0444, __cstate_##_var##_show, NULL)
static ssize_t cstate_get_attr_cpumask(struct device *dev,
@@ -649,6 +649,7 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &icl_cstates),
X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &icl_cstates),
X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &icl_cstates),
+ X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, &icl_cstates),
{ },
};
MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match);
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 404315df1e16..7ebae1826403 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -36,7 +36,9 @@ union intel_x86_pebs_dse {
unsigned int ld_dse:4;
unsigned int ld_stlb_miss:1;
unsigned int ld_locked:1;
- unsigned int ld_reserved:26;
+ unsigned int ld_data_blk:1;
+ unsigned int ld_addr_blk:1;
+ unsigned int ld_reserved:24;
};
struct {
unsigned int st_l1d_hit:1;
@@ -45,6 +47,12 @@ union intel_x86_pebs_dse {
unsigned int st_locked:1;
unsigned int st_reserved2:26;
};
+ struct {
+ unsigned int st_lat_dse:4;
+ unsigned int st_lat_stlb_miss:1;
+ unsigned int st_lat_locked:1;
+ unsigned int ld_reserved3:26;
+ };
};
@@ -198,6 +206,63 @@ static u64 load_latency_data(u64 status)
if (dse.ld_locked)
val |= P(LOCK, LOCKED);
+ /*
+ * Ice Lake and earlier models do not support block infos.
+ */
+ if (!x86_pmu.pebs_block) {
+ val |= P(BLK, NA);
+ return val;
+ }
+ /*
+ * bit 6: load was blocked since its data could not be forwarded
+ * from a preceding store
+ */
+ if (dse.ld_data_blk)
+ val |= P(BLK, DATA);
+
+ /*
+ * bit 7: load was blocked due to potential address conflict with
+ * a preceding store
+ */
+ if (dse.ld_addr_blk)
+ val |= P(BLK, ADDR);
+
+ if (!dse.ld_data_blk && !dse.ld_addr_blk)
+ val |= P(BLK, NA);
+
+ return val;
+}
+
+static u64 store_latency_data(u64 status)
+{
+ union intel_x86_pebs_dse dse;
+ u64 val;
+
+ dse.val = status;
+
+ /*
+ * use the mapping table for bit 0-3
+ */
+ val = pebs_data_source[dse.st_lat_dse];
+
+ /*
+ * bit 4: TLB access
+ * 0 = did not miss 2nd level TLB
+ * 1 = missed 2nd level TLB
+ */
+ if (dse.st_lat_stlb_miss)
+ val |= P(TLB, MISS) | P(TLB, L2);
+ else
+ val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2);
+
+ /*
+ * bit 5: locked prefix
+ */
+ if (dse.st_lat_locked)
+ val |= P(LOCK, LOCKED);
+
+ val |= P(BLK, NA);
+
return val;
}
@@ -642,8 +707,8 @@ int intel_pmu_drain_bts_buffer(void)
rcu_read_lock();
perf_prepare_sample(&header, &data, event, &regs);
- if (perf_output_begin(&handle, event, header.size *
- (top - base - skip)))
+ if (perf_output_begin(&handle, &data, event,
+ header.size * (top - base - skip)))
goto unlock;
for (at = base; at < top; at++) {
@@ -670,7 +735,9 @@ unlock:
static inline void intel_pmu_drain_pebs_buffer(void)
{
- x86_pmu.drain_pebs(NULL);
+ struct perf_sample_data data;
+
+ x86_pmu.drain_pebs(NULL, &data);
}
/*
@@ -868,6 +935,28 @@ struct event_constraint intel_icl_pebs_event_constraints[] = {
EVENT_CONSTRAINT_END
};
+struct event_constraint intel_spr_pebs_event_constraints[] = {
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x100000000ULL),
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x800000000ULL),
+
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xfe),
+ INTEL_PLD_CONSTRAINT(0x1cd, 0xfe),
+ INTEL_PSD_CONSTRAINT(0x2cd, 0x1),
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x1d0, 0xf),
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x2d0, 0xf),
+
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD_RANGE(0xd1, 0xd4, 0xf),
+
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xd0, 0xf),
+
+ /*
+ * Everything else is handled by PMU_FL_PEBS_ALL, because we
+ * need the full constraints from the main table.
+ */
+
+ EVENT_CONSTRAINT_END
+};
+
struct event_constraint *intel_pebs_constraints(struct perf_event *event)
{
struct event_constraint *c;
@@ -958,8 +1047,10 @@ static void adaptive_pebs_record_size_update(void)
}
#define PERF_PEBS_MEMINFO_TYPE (PERF_SAMPLE_ADDR | PERF_SAMPLE_DATA_SRC | \
- PERF_SAMPLE_PHYS_ADDR | PERF_SAMPLE_WEIGHT | \
- PERF_SAMPLE_TRANSACTION)
+ PERF_SAMPLE_PHYS_ADDR | \
+ PERF_SAMPLE_WEIGHT_TYPE | \
+ PERF_SAMPLE_TRANSACTION | \
+ PERF_SAMPLE_DATA_PAGE_SIZE)
static u64 pebs_update_adaptive_cfg(struct perf_event *event)
{
@@ -984,7 +1075,7 @@ static u64 pebs_update_adaptive_cfg(struct perf_event *event)
gprs = (sample_type & PERF_SAMPLE_REGS_INTR) &&
(attr->sample_regs_intr & PEBS_GP_REGS);
- tsx_weight = (sample_type & PERF_SAMPLE_WEIGHT) &&
+ tsx_weight = (sample_type & PERF_SAMPLE_WEIGHT_TYPE) &&
((attr->config & INTEL_ARCH_EVENT_MASK) ==
x86_pmu.rtm_abort_event);
@@ -1259,7 +1350,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
old_to = to;
#ifdef CONFIG_X86_64
- is_64bit = kernel_ip(to) || !test_thread_flag(TIF_IA32);
+ is_64bit = kernel_ip(to) || any_64bit_mode(regs);
#endif
insn_init(&insn, kaddr, size, is_64bit);
insn_get_length(&insn);
@@ -1328,6 +1419,8 @@ static u64 get_data_src(struct perf_event *event, u64 aux)
if (fl & PERF_X86_EVENT_PEBS_LDLAT)
val = load_latency_data(aux);
+ else if (fl & PERF_X86_EVENT_PEBS_STLAT)
+ val = store_latency_data(aux);
else if (fst && (fl & PERF_X86_EVENT_PEBS_HSW_PREC))
val = precise_datala_hsw(event, aux);
else if (fst)
@@ -1335,6 +1428,10 @@ static u64 get_data_src(struct perf_event *event, u64 aux)
return val;
}
+#define PERF_SAMPLE_ADDR_TYPE (PERF_SAMPLE_ADDR | \
+ PERF_SAMPLE_PHYS_ADDR | \
+ PERF_SAMPLE_DATA_PAGE_SIZE)
+
static void setup_pebs_fixed_sample_data(struct perf_event *event,
struct pt_regs *iregs, void *__pebs,
struct perf_sample_data *data,
@@ -1362,8 +1459,8 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event,
/*
* Use latency for weight (only avail with PEBS-LL)
*/
- if (fll && (sample_type & PERF_SAMPLE_WEIGHT))
- data->weight = pebs->lat;
+ if (fll && (sample_type & PERF_SAMPLE_WEIGHT_TYPE))
+ data->weight.full = pebs->lat;
/*
* data.data_src encodes the data source
@@ -1449,14 +1546,14 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event,
}
- if ((sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR)) &&
+ if ((sample_type & PERF_SAMPLE_ADDR_TYPE) &&
x86_pmu.intel_cap.pebs_format >= 1)
data->addr = pebs->dla;
if (x86_pmu.intel_cap.pebs_format >= 2) {
/* Only set the TSX weight when no memory weight. */
- if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll)
- data->weight = intel_get_tsx_weight(pebs->tsx_tuning);
+ if ((sample_type & PERF_SAMPLE_WEIGHT_TYPE) && !fll)
+ data->weight.full = intel_get_tsx_weight(pebs->tsx_tuning);
if (sample_type & PERF_SAMPLE_TRANSACTION)
data->txn = intel_get_tsx_transaction(pebs->tsx_tuning,
@@ -1500,6 +1597,9 @@ static void adaptive_pebs_save_regs(struct pt_regs *regs,
#endif
}
+#define PEBS_LATENCY_MASK 0xffff
+#define PEBS_CACHE_LATENCY_OFFSET 32
+
/*
* With adaptive PEBS the layout depends on what fields are configured.
*/
@@ -1570,14 +1670,32 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
}
if (format_size & PEBS_DATACFG_MEMINFO) {
- if (sample_type & PERF_SAMPLE_WEIGHT)
- data->weight = meminfo->latency ?:
- intel_get_tsx_weight(meminfo->tsx_tuning);
+ if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) {
+ u64 weight = meminfo->latency;
+
+ if (x86_pmu.flags & PMU_FL_INSTR_LATENCY) {
+ data->weight.var2_w = weight & PEBS_LATENCY_MASK;
+ weight >>= PEBS_CACHE_LATENCY_OFFSET;
+ }
+
+ /*
+ * Although meminfo::latency is defined as a u64,
+ * only the lower 32 bits include the valid data
+ * in practice on Ice Lake and earlier platforms.
+ */
+ if (sample_type & PERF_SAMPLE_WEIGHT) {
+ data->weight.full = weight ?:
+ intel_get_tsx_weight(meminfo->tsx_tuning);
+ } else {
+ data->weight.var1_dw = (u32)(weight & PEBS_LATENCY_MASK) ?:
+ intel_get_tsx_weight(meminfo->tsx_tuning);
+ }
+ }
if (sample_type & PERF_SAMPLE_DATA_SRC)
data->data_src.val = get_data_src(event, meminfo->aux);
- if (sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR))
+ if (sample_type & PERF_SAMPLE_ADDR_TYPE)
data->addr = meminfo->address;
if (sample_type & PERF_SAMPLE_TRANSACTION)
@@ -1719,23 +1837,24 @@ intel_pmu_save_and_restart_reload(struct perf_event *event, int count)
return 0;
}
-static void __intel_pmu_pebs_event(struct perf_event *event,
- struct pt_regs *iregs,
- void *base, void *top,
- int bit, int count,
- void (*setup_sample)(struct perf_event *,
- struct pt_regs *,
- void *,
- struct perf_sample_data *,
- struct pt_regs *))
+static __always_inline void
+__intel_pmu_pebs_event(struct perf_event *event,
+ struct pt_regs *iregs,
+ struct perf_sample_data *data,
+ void *base, void *top,
+ int bit, int count,
+ void (*setup_sample)(struct perf_event *,
+ struct pt_regs *,
+ void *,
+ struct perf_sample_data *,
+ struct pt_regs *))
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
- struct perf_sample_data data;
struct x86_perf_regs perf_regs;
struct pt_regs *regs = &perf_regs.regs;
void *at = get_next_pebs_record_by_bit(base, top, bit);
- struct pt_regs dummy_iregs;
+ static struct pt_regs dummy_iregs;
if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
/*
@@ -1752,14 +1871,14 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
iregs = &dummy_iregs;
while (count > 1) {
- setup_sample(event, iregs, at, &data, regs);
- perf_event_output(event, &data, regs);
+ setup_sample(event, iregs, at, data, regs);
+ perf_event_output(event, data, regs);
at += cpuc->pebs_record_size;
at = get_next_pebs_record_by_bit(at, top, bit);
count--;
}
- setup_sample(event, iregs, at, &data, regs);
+ setup_sample(event, iregs, at, data, regs);
if (iregs == &dummy_iregs) {
/*
* The PEBS records may be drained in the non-overflow context,
@@ -1767,18 +1886,18 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
* last record the same as other PEBS records, and doesn't
* invoke the generic overflow handler.
*/
- perf_event_output(event, &data, regs);
+ perf_event_output(event, data, regs);
} else {
/*
* All but the last records are processed.
* The last one is left to be able to call the overflow handler.
*/
- if (perf_event_overflow(event, &data, regs))
+ if (perf_event_overflow(event, data, regs))
x86_pmu_stop(event, 0);
}
}
-static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
+static void intel_pmu_drain_pebs_core(struct pt_regs *iregs, struct perf_sample_data *data)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct debug_store *ds = cpuc->ds;
@@ -1812,7 +1931,7 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
return;
}
- __intel_pmu_pebs_event(event, iregs, at, top, 0, n,
+ __intel_pmu_pebs_event(event, iregs, data, at, top, 0, n,
setup_pebs_fixed_sample_data);
}
@@ -1835,7 +1954,7 @@ static void intel_pmu_pebs_event_update_no_drain(struct cpu_hw_events *cpuc, int
}
}
-static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
+static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_data *data)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct debug_store *ds = cpuc->ds;
@@ -1913,7 +2032,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
* that caused the PEBS record. It's called collision.
* If collision happened, the record will be dropped.
*/
- if (p->status != (1ULL << bit)) {
+ if (pebs_status != (1ULL << bit)) {
for_each_set_bit(i, (unsigned long *)&pebs_status, size)
error[i]++;
continue;
@@ -1937,19 +2056,19 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
if (error[bit]) {
perf_log_lost_samples(event, error[bit]);
- if (perf_event_account_interrupt(event))
+ if (iregs && perf_event_account_interrupt(event))
x86_pmu_stop(event, 0);
}
if (counts[bit]) {
- __intel_pmu_pebs_event(event, iregs, base,
+ __intel_pmu_pebs_event(event, iregs, data, base,
top, bit, counts[bit],
setup_pebs_fixed_sample_data);
}
}
}
-static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs)
+static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_data *data)
{
short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {};
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -1997,7 +2116,7 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs)
if (WARN_ON_ONCE(!event->attr.precise_ip))
continue;
- __intel_pmu_pebs_event(event, iregs, base,
+ __intel_pmu_pebs_event(event, iregs, data, base,
top, bit, counts[bit],
setup_pebs_adaptive_sample_data);
}
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 8961653c5dd2..21890dacfcfe 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -919,7 +919,7 @@ static __always_inline bool get_lbr_predicted(u64 info)
return !(info & LBR_INFO_MISPRED);
}
-static __always_inline bool get_lbr_cycles(u64 info)
+static __always_inline u16 get_lbr_cycles(u64 info)
{
if (static_cpu_has(X86_FEATURE_ARCH_LBR) &&
!(x86_pmu.lbr_timed_lbr && info & LBR_INFO_CYC_CNT_VALID))
@@ -1221,7 +1221,7 @@ static int branch_type(unsigned long from, unsigned long to, int abort)
* on 64-bit systems running 32-bit apps
*/
#ifdef CONFIG_X86_64
- is64 = kernel_ip((unsigned long)addr) || !test_thread_flag(TIF_IA32);
+ is64 = kernel_ip((unsigned long)addr) || any_64bit_mode(current_pt_regs());
#endif
insn_init(&insn, addr, bytes_read, is64);
insn_get_opcode(&insn);
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 86d012b3e0b4..33c8180d5a87 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -31,21 +31,21 @@ struct event_constraint uncore_constraint_empty =
MODULE_LICENSE("GPL");
-int uncore_pcibus_to_physid(struct pci_bus *bus)
+int uncore_pcibus_to_dieid(struct pci_bus *bus)
{
struct pci2phy_map *map;
- int phys_id = -1;
+ int die_id = -1;
raw_spin_lock(&pci2phy_map_lock);
list_for_each_entry(map, &pci2phy_map_head, list) {
if (map->segment == pci_domain_nr(bus)) {
- phys_id = map->pbus_to_physid[bus->number];
+ die_id = map->pbus_to_dieid[bus->number];
break;
}
}
raw_spin_unlock(&pci2phy_map_lock);
- return phys_id;
+ return die_id;
}
static void uncore_free_pcibus_map(void)
@@ -86,7 +86,7 @@ lookup:
alloc = NULL;
map->segment = segment;
for (i = 0; i < 256; i++)
- map->pbus_to_physid[i] = -1;
+ map->pbus_to_dieid[i] = -1;
list_add_tail(&map->list, &pci2phy_map_head);
end:
@@ -94,8 +94,8 @@ end:
return map;
}
-ssize_t uncore_event_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
+ssize_t uncore_event_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
struct uncore_event_desc *event =
container_of(attr, struct uncore_event_desc, attr);
@@ -332,7 +332,6 @@ static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type,
uncore_pmu_init_hrtimer(box);
box->cpu = -1;
- box->pci_phys_id = -1;
box->dieid = -1;
/* set default hrtimer timeout */
@@ -993,18 +992,11 @@ uncore_types_init(struct intel_uncore_type **types, bool setid)
/*
* Get the die information of a PCI device.
* @pdev: The PCI device.
- * @phys_id: The physical socket id which the device maps to.
* @die: The die id which the device maps to.
*/
-static int uncore_pci_get_dev_die_info(struct pci_dev *pdev,
- int *phys_id, int *die)
+static int uncore_pci_get_dev_die_info(struct pci_dev *pdev, int *die)
{
- *phys_id = uncore_pcibus_to_physid(pdev->bus);
- if (*phys_id < 0)
- return -ENODEV;
-
- *die = (topology_max_die_per_package() > 1) ? *phys_id :
- topology_phys_to_logical_pkg(*phys_id);
+ *die = uncore_pcibus_to_dieid(pdev->bus);
if (*die < 0)
return -EINVAL;
@@ -1046,13 +1038,12 @@ uncore_pci_find_dev_pmu(struct pci_dev *pdev, const struct pci_device_id *ids)
* @pdev: The PCI device.
* @type: The corresponding PMU type of the device.
* @pmu: The corresponding PMU of the device.
- * @phys_id: The physical socket id which the device maps to.
* @die: The die id which the device maps to.
*/
static int uncore_pci_pmu_register(struct pci_dev *pdev,
struct intel_uncore_type *type,
struct intel_uncore_pmu *pmu,
- int phys_id, int die)
+ int die)
{
struct intel_uncore_box *box;
int ret;
@@ -1070,7 +1061,6 @@ static int uncore_pci_pmu_register(struct pci_dev *pdev,
WARN_ON_ONCE(pmu->func_id != pdev->devfn);
atomic_inc(&box->refcnt);
- box->pci_phys_id = phys_id;
box->dieid = die;
box->pci_dev = pdev;
box->pmu = pmu;
@@ -1097,9 +1087,9 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id
{
struct intel_uncore_type *type;
struct intel_uncore_pmu *pmu = NULL;
- int phys_id, die, ret;
+ int die, ret;
- ret = uncore_pci_get_dev_die_info(pdev, &phys_id, &die);
+ ret = uncore_pci_get_dev_die_info(pdev, &die);
if (ret)
return ret;
@@ -1132,7 +1122,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id
pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)];
}
- ret = uncore_pci_pmu_register(pdev, type, pmu, phys_id, die);
+ ret = uncore_pci_pmu_register(pdev, type, pmu, die);
pci_set_drvdata(pdev, pmu->boxes[die]);
@@ -1142,17 +1132,12 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id
/*
* Unregister the PMU of a PCI device
* @pmu: The corresponding PMU is unregistered.
- * @phys_id: The physical socket id which the device maps to.
* @die: The die id which the device maps to.
*/
-static void uncore_pci_pmu_unregister(struct intel_uncore_pmu *pmu,
- int phys_id, int die)
+static void uncore_pci_pmu_unregister(struct intel_uncore_pmu *pmu, int die)
{
struct intel_uncore_box *box = pmu->boxes[die];
- if (WARN_ON_ONCE(phys_id != box->pci_phys_id))
- return;
-
pmu->boxes[die] = NULL;
if (atomic_dec_return(&pmu->activeboxes) == 0)
uncore_pmu_unregister(pmu);
@@ -1164,9 +1149,9 @@ static void uncore_pci_remove(struct pci_dev *pdev)
{
struct intel_uncore_box *box;
struct intel_uncore_pmu *pmu;
- int i, phys_id, die;
+ int i, die;
- if (uncore_pci_get_dev_die_info(pdev, &phys_id, &die))
+ if (uncore_pci_get_dev_die_info(pdev, &die))
return;
box = pci_get_drvdata(pdev);
@@ -1185,7 +1170,7 @@ static void uncore_pci_remove(struct pci_dev *pdev)
pci_set_drvdata(pdev, NULL);
- uncore_pci_pmu_unregister(pmu, phys_id, die);
+ uncore_pci_pmu_unregister(pmu, die);
}
static int uncore_bus_notify(struct notifier_block *nb,
@@ -1194,7 +1179,7 @@ static int uncore_bus_notify(struct notifier_block *nb,
struct device *dev = data;
struct pci_dev *pdev = to_pci_dev(dev);
struct intel_uncore_pmu *pmu;
- int phys_id, die;
+ int die;
/* Unregister the PMU when the device is going to be deleted. */
if (action != BUS_NOTIFY_DEL_DEVICE)
@@ -1204,10 +1189,10 @@ static int uncore_bus_notify(struct notifier_block *nb,
if (!pmu)
return NOTIFY_DONE;
- if (uncore_pci_get_dev_die_info(pdev, &phys_id, &die))
+ if (uncore_pci_get_dev_die_info(pdev, &die))
return NOTIFY_DONE;
- uncore_pci_pmu_unregister(pmu, phys_id, die);
+ uncore_pci_pmu_unregister(pmu, die);
return NOTIFY_OK;
}
@@ -1224,7 +1209,7 @@ static void uncore_pci_sub_driver_init(void)
struct pci_dev *pci_sub_dev;
bool notify = false;
unsigned int devfn;
- int phys_id, die;
+ int die;
while (ids && ids->vendor) {
pci_sub_dev = NULL;
@@ -1244,12 +1229,11 @@ static void uncore_pci_sub_driver_init(void)
if (!pmu)
continue;
- if (uncore_pci_get_dev_die_info(pci_sub_dev,
- &phys_id, &die))
+ if (uncore_pci_get_dev_die_info(pci_sub_dev, &die))
continue;
if (!uncore_pci_pmu_register(pci_sub_dev, type, pmu,
- phys_id, die))
+ die))
notify = true;
}
ids++;
@@ -1636,6 +1620,11 @@ static const struct intel_uncore_init_fun tgl_l_uncore_init __initconst = {
.mmio_init = tgl_l_uncore_mmio_init,
};
+static const struct intel_uncore_init_fun rkl_uncore_init __initconst = {
+ .cpu_init = tgl_uncore_cpu_init,
+ .pci_init = skl_uncore_pci_init,
+};
+
static const struct intel_uncore_init_fun icx_uncore_init __initconst = {
.cpu_init = icx_uncore_cpu_init,
.pci_init = icx_uncore_pci_init,
@@ -1683,6 +1672,7 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &icx_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &tgl_l_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &tgl_uncore_init),
+ X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, &rkl_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &snr_uncore_init),
{},
};
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index 83d2a7d490e0..a3c6e1643ad2 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -124,7 +124,6 @@ struct intel_uncore_extra_reg {
};
struct intel_uncore_box {
- int pci_phys_id;
int dieid; /* Logical die ID */
int n_active; /* number of active events */
int n_events;
@@ -157,7 +156,7 @@ struct intel_uncore_box {
#define UNCORE_BOX_FLAG_CFL8_CBOX_MSR_OFFS 2
struct uncore_event_desc {
- struct kobj_attribute attr;
+ struct device_attribute attr;
const char *config;
};
@@ -173,14 +172,14 @@ struct freerunning_counters {
struct pci2phy_map {
struct list_head list;
int segment;
- int pbus_to_physid[256];
+ int pbus_to_dieid[256];
};
struct pci2phy_map *__find_pci2phy_map(int segment);
-int uncore_pcibus_to_physid(struct pci_bus *bus);
+int uncore_pcibus_to_dieid(struct pci_bus *bus);
-ssize_t uncore_event_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf);
+ssize_t uncore_event_show(struct device *dev,
+ struct device_attribute *attr, char *buf);
static inline struct intel_uncore_pmu *dev_to_uncore_pmu(struct device *dev)
{
@@ -201,14 +200,14 @@ extern int __uncore_max_dies;
}
#define DEFINE_UNCORE_FORMAT_ATTR(_var, _name, _format) \
-static ssize_t __uncore_##_var##_show(struct kobject *kobj, \
- struct kobj_attribute *attr, \
+static ssize_t __uncore_##_var##_show(struct device *dev, \
+ struct device_attribute *attr, \
char *page) \
{ \
BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \
return sprintf(page, _format "\n"); \
} \
-static struct kobj_attribute format_attr_##_var = \
+static struct device_attribute format_attr_##_var = \
__ATTR(_name, 0444, __uncore_##_var##_show, NULL)
static inline bool uncore_pmc_fixed(int idx)
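For reference, a sketch of what DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7") expands to after the switch from kobj_attribute to device_attribute above; the include lines are indicative only and the expansion is shown stand-alone rather than as it sits in uncore.h:

#include <linux/build_bug.h>
#include <linux/device.h>
#include <linux/kernel.h>
#include <linux/mm.h>

/* Expansion of DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"). */
static ssize_t __uncore_event_show(struct device *dev,
                                   struct device_attribute *attr, char *page)
{
        BUILD_BUG_ON(sizeof("config:0-7") >= PAGE_SIZE);
        return sprintf(page, "config:0-7\n");
}

static struct device_attribute format_attr_event =
        __ATTR(event, 0444, __uncore_event_show, NULL);

The show callback now receives the PMU's struct device, so the attribute is served through the standard device sysfs machinery rather than raw kobjects.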
diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c
index 39e632ed6ca9..51271288499e 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -60,7 +60,8 @@
#define PCI_DEVICE_ID_INTEL_TGL_U3_IMC 0x9a12
#define PCI_DEVICE_ID_INTEL_TGL_U4_IMC 0x9a14
#define PCI_DEVICE_ID_INTEL_TGL_H_IMC 0x9a36
-
+#define PCI_DEVICE_ID_INTEL_RKL_1_IMC 0x4c43
+#define PCI_DEVICE_ID_INTEL_RKL_2_IMC 0x4c53
/* SNB event control */
#define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff
@@ -405,6 +406,12 @@ static struct intel_uncore_type *tgl_msr_uncores[] = {
NULL,
};
+static void rkl_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+ if (box->pmu->pmu_idx == 0)
+ wrmsrl(SKL_UNC_PERF_GLOBAL_CTL, SNB_UNC_GLOBAL_CTL_EN);
+}
+
void tgl_uncore_cpu_init(void)
{
uncore_msr_uncores = tgl_msr_uncores;
@@ -412,6 +419,7 @@ void tgl_uncore_cpu_init(void)
icl_uncore_cbox.ops = &skl_uncore_msr_ops;
icl_uncore_clockbox.ops = &skl_uncore_msr_ops;
snb_uncore_arb.ops = &skl_uncore_msr_ops;
+ skl_uncore_msr_ops.init_box = rkl_uncore_msr_init_box;
}
enum {
@@ -475,7 +483,7 @@ enum perf_snb_uncore_imc_freerunning_types {
static struct freerunning_counters snb_uncore_imc_freerunning[] = {
[SNB_PCI_UNCORE_IMC_DATA_READS] = { SNB_UNCORE_PCI_IMC_DATA_READS_BASE,
0x0, 0x0, 1, 32 },
- [SNB_PCI_UNCORE_IMC_DATA_READS] = { SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE,
+ [SNB_PCI_UNCORE_IMC_DATA_WRITES] = { SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE,
0x0, 0x0, 1, 32 },
[SNB_PCI_UNCORE_IMC_GT_REQUESTS] = { SNB_UNCORE_PCI_IMC_GT_REQUESTS_BASE,
0x0, 0x0, 1, 32 },
@@ -649,7 +657,7 @@ int snb_pci2phy_map_init(int devid)
pci_dev_put(dev);
return -ENOMEM;
}
- map->pbus_to_physid[bus] = 0;
+ map->pbus_to_dieid[bus] = 0;
raw_spin_unlock(&pci2phy_map_lock);
pci_dev_put(dev);
@@ -926,6 +934,14 @@ static const struct pci_device_id icl_uncore_pci_ids[] = {
PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICL_U2_IMC),
.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
},
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_RKL_1_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_RKL_2_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
{ /* end: all zeroes */ },
};
@@ -1019,6 +1035,8 @@ static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = {
IMC_DEV(CML_S5_IMC, &skl_uncore_pci_driver),
IMC_DEV(ICL_U_IMC, &icl_uncore_pci_driver), /* 10th Gen Core Mobile */
IMC_DEV(ICL_U2_IMC, &icl_uncore_pci_driver), /* 10th Gen Core Mobile */
+ IMC_DEV(RKL_1_IMC, &icl_uncore_pci_driver),
+ IMC_DEV(RKL_2_IMC, &icl_uncore_pci_driver),
{ /* end marker */ }
};
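The one-character index fix in snb_uncore_imc_freerunning[] above is worth a note: with designated initializers a repeated index quietly overrides the earlier entry, so the DATA_WRITES base had been landing in the DATA_READS slot while the DATA_WRITES slot stayed zero. A minimal userspace illustration (the array values are placeholders, not the real register offsets):

#include <stdio.h>

enum { DATA_READS, DATA_WRITES, GT_REQUESTS, NUM_COUNTERS };

int main(void)
{
        /* Mirrors the broken table: both initializers target DATA_READS. */
        int base[NUM_COUNTERS] = {
                [DATA_READS] = 0x100,           /* reads base */
                [DATA_READS] = 0x104,           /* meant to be [DATA_WRITES] */
        };

        printf("reads base  = 0x%x\n", base[DATA_READS]);   /* 0x104, wrong  */
        printf("writes base = 0x%x\n", base[DATA_WRITES]);  /* 0x0, missing  */
        return 0;
}

Most builds emit no diagnostic for the override unless -Woverride-init is enabled, which is how a slip like this can survive review.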
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 7bdb1821215d..b79951d0707c 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -1359,7 +1359,7 @@ static struct pci_driver snbep_uncore_pci_driver = {
static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool reverse)
{
struct pci_dev *ubox_dev = NULL;
- int i, bus, nodeid, segment;
+ int i, bus, nodeid, segment, die_id;
struct pci2phy_map *map;
int err = 0;
u32 config = 0;
@@ -1370,36 +1370,77 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool
if (!ubox_dev)
break;
bus = ubox_dev->bus->number;
- /* get the Node ID of the local register */
- err = pci_read_config_dword(ubox_dev, nodeid_loc, &config);
- if (err)
- break;
- nodeid = config & NODE_ID_MASK;
- /* get the Node ID mapping */
- err = pci_read_config_dword(ubox_dev, idmap_loc, &config);
- if (err)
- break;
+ /*
+ * The nodeid and idmap registers only contain enough
+ * information to handle 8 nodes. On systems with more
+ * than 8 nodes, we need to rely on NUMA information,
+ * filled in from BIOS supplied information, to determine
+ * the topology.
+ */
+ if (nr_node_ids <= 8) {
+ /* get the Node ID of the local register */
+ err = pci_read_config_dword(ubox_dev, nodeid_loc, &config);
+ if (err)
+ break;
+ nodeid = config & NODE_ID_MASK;
+ /* get the Node ID mapping */
+ err = pci_read_config_dword(ubox_dev, idmap_loc, &config);
+ if (err)
+ break;
- segment = pci_domain_nr(ubox_dev->bus);
- raw_spin_lock(&pci2phy_map_lock);
- map = __find_pci2phy_map(segment);
- if (!map) {
+ segment = pci_domain_nr(ubox_dev->bus);
+ raw_spin_lock(&pci2phy_map_lock);
+ map = __find_pci2phy_map(segment);
+ if (!map) {
+ raw_spin_unlock(&pci2phy_map_lock);
+ err = -ENOMEM;
+ break;
+ }
+
+ /*
+ * every three bits in the Node ID mapping register maps
+ * to a particular node.
+ */
+ for (i = 0; i < 8; i++) {
+ if (nodeid == ((config >> (3 * i)) & 0x7)) {
+ if (topology_max_die_per_package() > 1)
+ die_id = i;
+ else
+ die_id = topology_phys_to_logical_pkg(i);
+ map->pbus_to_dieid[bus] = die_id;
+ break;
+ }
+ }
raw_spin_unlock(&pci2phy_map_lock);
- err = -ENOMEM;
- break;
- }
+ } else {
+ int node = pcibus_to_node(ubox_dev->bus);
+ int cpu;
+
+ segment = pci_domain_nr(ubox_dev->bus);
+ raw_spin_lock(&pci2phy_map_lock);
+ map = __find_pci2phy_map(segment);
+ if (!map) {
+ raw_spin_unlock(&pci2phy_map_lock);
+ err = -ENOMEM;
+ break;
+ }
- /*
- * every three bits in the Node ID mapping register maps
- * to a particular node.
- */
- for (i = 0; i < 8; i++) {
- if (nodeid == ((config >> (3 * i)) & 0x7)) {
- map->pbus_to_physid[bus] = i;
+ die_id = -1;
+ for_each_cpu(cpu, cpumask_of_pcibus(ubox_dev->bus)) {
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ if (c->initialized && cpu_to_node(cpu) == node) {
+ map->pbus_to_dieid[bus] = die_id = c->logical_die_id;
+ break;
+ }
+ }
+ raw_spin_unlock(&pci2phy_map_lock);
+
+ if (WARN_ON_ONCE(die_id == -1)) {
+ err = -EINVAL;
break;
}
}
- raw_spin_unlock(&pci2phy_map_lock);
}
if (!err) {
@@ -1412,17 +1453,17 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool
i = -1;
if (reverse) {
for (bus = 255; bus >= 0; bus--) {
- if (map->pbus_to_physid[bus] >= 0)
- i = map->pbus_to_physid[bus];
+ if (map->pbus_to_dieid[bus] >= 0)
+ i = map->pbus_to_dieid[bus];
else
- map->pbus_to_physid[bus] = i;
+ map->pbus_to_dieid[bus] = i;
}
} else {
for (bus = 0; bus <= 255; bus++) {
- if (map->pbus_to_physid[bus] >= 0)
- i = map->pbus_to_physid[bus];
+ if (map->pbus_to_dieid[bus] >= 0)
+ i = map->pbus_to_dieid[bus];
else
- map->pbus_to_physid[bus] = i;
+ map->pbus_to_dieid[bus] = i;
}
}
}
@@ -4646,19 +4687,14 @@ int snr_uncore_pci_init(void)
static struct pci_dev *snr_uncore_get_mc_dev(int id)
{
struct pci_dev *mc_dev = NULL;
- int phys_id, pkg;
+ int pkg;
while (1) {
mc_dev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3451, mc_dev);
if (!mc_dev)
break;
- phys_id = uncore_pcibus_to_physid(mc_dev->bus);
- if (phys_id < 0)
- continue;
- pkg = topology_phys_to_logical_pkg(phys_id);
- if (pkg < 0)
- continue;
- else if (pkg == id)
+ pkg = uncore_pcibus_to_dieid(mc_dev->bus);
+ if (pkg == id)
break;
}
return mc_dev;
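For illustration, a small userspace model of the fill pass at the end of snbep_pci2phy_map_init() as changed above: buses that never saw a UBOX device inherit the die ID of the nearest mapped bus, scanning upward or downward depending on the reverse flag. The example bus numbers and die IDs are made up:

#include <stdio.h>

int main(void)
{
        int pbus_to_dieid[256];
        int bus, i, reverse = 0;

        for (bus = 0; bus < 256; bus++)
                pbus_to_dieid[bus] = -1;
        pbus_to_dieid[0x17] = 0;        /* example: die 0 seen on bus 0x17 */
        pbus_to_dieid[0x85] = 1;        /* example: die 1 seen on bus 0x85 */

        i = -1;
        if (reverse) {
                for (bus = 255; bus >= 0; bus--) {
                        if (pbus_to_dieid[bus] >= 0)
                                i = pbus_to_dieid[bus];
                        else
                                pbus_to_dieid[bus] = i;
                }
        } else {
                for (bus = 0; bus <= 255; bus++) {
                        if (pbus_to_dieid[bus] >= 0)
                                i = pbus_to_dieid[bus];
                        else
                                pbus_to_dieid[bus] = i;
                }
        }

        printf("bus 0x20 -> die %d\n", pbus_to_dieid[0x20]);  /* 0 */
        printf("bus 0x90 -> die %d\n", pbus_to_dieid[0x90]);  /* 1 */
        return 0;
}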
diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c
index 4be8f9cabd07..680404c58cb1 100644
--- a/arch/x86/events/msr.c
+++ b/arch/x86/events/msr.c
@@ -99,6 +99,7 @@ static bool test_intel(int idx, void *data)
case INTEL_FAM6_ICELAKE_D:
case INTEL_FAM6_TIGERLAKE_L:
case INTEL_FAM6_TIGERLAKE:
+ case INTEL_FAM6_ROCKETLAKE:
if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF)
return true;
break;
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index ee2b9b9fc2a5..53b2b5fc23bc 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -80,6 +80,7 @@ static inline bool constraint_match(struct event_constraint *c, u64 ecode)
#define PERF_X86_EVENT_PAIR 0x1000 /* Large Increment per Cycle */
#define PERF_X86_EVENT_LBR_SELECT 0x2000 /* Save/Restore MSR_LBR_SELECT */
#define PERF_X86_EVENT_TOPDOWN 0x4000 /* Count Topdown slots/metrics events */
+#define PERF_X86_EVENT_PEBS_STLAT 0x8000 /* st+stlat data address sampling */
static inline bool is_topdown_count(struct perf_event *event)
{
@@ -132,7 +133,7 @@ struct amd_nb {
PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \
PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \
PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER | \
- PERF_SAMPLE_PERIOD)
+ PERF_SAMPLE_PERIOD | PERF_SAMPLE_CODE_PAGE_SIZE)
#define PEBS_GP_REGS \
((1ULL << PERF_REG_X86_AX) | \
@@ -443,6 +444,10 @@ struct cpu_hw_events {
__EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT)
+#define INTEL_PSD_CONSTRAINT(c, n) \
+ __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_STLAT)
+
#define INTEL_PST_CONSTRAINT(c, n) \
__EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST)
@@ -585,6 +590,7 @@ union perf_capabilities {
u64 pebs_baseline:1;
u64 perf_metrics:1;
u64 pebs_output_pt_available:1;
+ u64 anythread_deprecated:1;
};
u64 capabilities;
};
@@ -681,8 +687,7 @@ struct x86_pmu {
/* PMI handler bits */
unsigned int late_ack :1,
- enabled_ack :1,
- counter_freezing :1;
+ enabled_ack :1;
/*
* sysfs attrs
*/
@@ -723,11 +728,12 @@ struct x86_pmu {
pebs_broken :1,
pebs_prec_dist :1,
pebs_no_tlb :1,
- pebs_no_isolation :1;
+ pebs_no_isolation :1,
+ pebs_block :1;
int pebs_record_size;
int pebs_buffer_size;
int max_pebs_events;
- void (*drain_pebs)(struct pt_regs *regs);
+ void (*drain_pebs)(struct pt_regs *regs, struct perf_sample_data *data);
struct event_constraint *pebs_constraints;
void (*pebs_aliases)(struct perf_event *event);
unsigned long large_pebs_flags;
@@ -775,6 +781,7 @@ struct x86_pmu {
/*
* Intel perf metrics
*/
+ int num_topdown_events;
u64 (*update_topdown_event)(struct perf_event *event);
int (*set_topdown_event_period)(struct perf_event *event);
@@ -870,6 +877,8 @@ do { \
#define PMU_FL_PEBS_ALL 0x10 /* all events are valid PEBS events */
#define PMU_FL_TFA 0x20 /* deal with TSX force abort */
#define PMU_FL_PAIR 0x40 /* merge counters for large incr. events */
+#define PMU_FL_INSTR_LATENCY 0x80 /* Support Instruction Latency in PEBS Memory Info Record */
+#define PMU_FL_MEM_LOADS_AUX 0x100 /* Require an auxiliary event for the complete memory info */
#define EVENT_VAR(_id) event_attr_##_id
#define EVENT_PTR(_id) &event_attr_##_id.attr.attr
@@ -1059,6 +1068,11 @@ ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr,
ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr,
char *page);
+static inline bool fixed_counter_disabled(int i)
+{
+ return !(x86_pmu.intel_ctrl >> (i + INTEL_PMC_IDX_FIXED));
+}
+
#ifdef CONFIG_CPU_SUP_AMD
int amd_pmu_init(void);
@@ -1156,6 +1170,8 @@ extern struct event_constraint intel_skl_pebs_event_constraints[];
extern struct event_constraint intel_icl_pebs_event_constraints[];
+extern struct event_constraint intel_spr_pebs_event_constraints[];
+
struct event_constraint *intel_pebs_constraints(struct perf_event *event);
void intel_pmu_pebs_add(struct perf_event *event);
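A userspace model of the fixed_counter_disabled() helper introduced above, with x86_pmu.intel_ctrl passed in explicitly: a fixed counter is reported as disabled when intel_ctrl has no bit set at or above its position (INTEL_PMC_IDX_FIXED is 32):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define INTEL_PMC_IDX_FIXED 32

static bool fixed_counter_disabled(int i, uint64_t intel_ctrl)
{
        return !(intel_ctrl >> (i + INTEL_PMC_IDX_FIXED));
}

int main(void)
{
        /* Example: fixed counters 0-2 enabled (bits 32-34). */
        uint64_t intel_ctrl = 0x7ULL << INTEL_PMC_IDX_FIXED;

        printf("fixed 1 disabled: %d\n", fixed_counter_disabled(1, intel_ctrl)); /* 0 */
        printf("fixed 3 disabled: %d\n", fixed_counter_disabled(3, intel_ctrl)); /* 1 */
        return 0;
}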
diff --git a/arch/x86/events/probe.c b/arch/x86/events/probe.c
index 136a1e847254..600bf8d15c0c 100644
--- a/arch/x86/events/probe.c
+++ b/arch/x86/events/probe.c
@@ -28,6 +28,7 @@ perf_msr_probe(struct perf_msr *msr, int cnt, bool zero, void *data)
for (bit = 0; bit < cnt; bit++) {
if (!msr[bit].no_check) {
struct attribute_group *grp = msr[bit].grp;
+ u64 mask;
/* skip entry with no group */
if (!grp)
@@ -44,8 +45,12 @@ perf_msr_probe(struct perf_msr *msr, int cnt, bool zero, void *data)
/* Virt sucks; you cannot tell if a R/O MSR is present :/ */
if (rdmsrl_safe(msr[bit].msr, &val))
continue;
+
+ mask = msr[bit].mask;
+ if (!mask)
+ mask = ~0ULL;
/* Disable zero counters if requested. */
- if (!zero && !val)
+ if (!zero && !(val & mask))
continue;
grp->is_visible = NULL;
diff --git a/arch/x86/events/probe.h b/arch/x86/events/probe.h
index 4c8e0afc5fb5..261b9bda24e3 100644
--- a/arch/x86/events/probe.h
+++ b/arch/x86/events/probe.h
@@ -4,10 +4,11 @@
#include <linux/sysfs.h>
struct perf_msr {
- u64 msr;
- struct attribute_group *grp;
+ u64 msr;
+ struct attribute_group *grp;
bool (*test)(int idx, void *data);
- bool no_check;
+ bool no_check;
+ u64 mask;
};
unsigned long
diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c
index 7c0120e2e957..f42a70496a24 100644
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -93,18 +93,6 @@ static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
* any other bit is reserved
*/
#define RAPL_EVENT_MASK 0xFFULL
-
-#define DEFINE_RAPL_FORMAT_ATTR(_var, _name, _format) \
-static ssize_t __rapl_##_var##_show(struct kobject *kobj, \
- struct kobj_attribute *attr, \
- char *page) \
-{ \
- BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \
- return sprintf(page, _format "\n"); \
-} \
-static struct kobj_attribute format_attr_##_var = \
- __ATTR(_name, 0444, __rapl_##_var##_show, NULL)
-
#define RAPL_CNTR_WIDTH 32
#define RAPL_EVENT_ATTR_STR(_name, v, str) \
@@ -441,7 +429,7 @@ static struct attribute_group rapl_pmu_events_group = {
.attrs = attrs_empty,
};
-DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7");
+PMU_FORMAT_ATTR(event, "config:0-7");
static struct attribute *rapl_formats_attr[] = {
&format_attr_event.attr,
NULL,
@@ -466,16 +454,9 @@ static struct attribute *rapl_events_cores[] = {
NULL,
};
-static umode_t
-rapl_not_visible(struct kobject *kobj, struct attribute *attr, int i)
-{
- return 0;
-}
-
static struct attribute_group rapl_events_cores_group = {
.name = "events",
.attrs = rapl_events_cores,
- .is_visible = rapl_not_visible,
};
static struct attribute *rapl_events_pkg[] = {
@@ -488,7 +469,6 @@ static struct attribute *rapl_events_pkg[] = {
static struct attribute_group rapl_events_pkg_group = {
.name = "events",
.attrs = rapl_events_pkg,
- .is_visible = rapl_not_visible,
};
static struct attribute *rapl_events_ram[] = {
@@ -501,7 +481,6 @@ static struct attribute *rapl_events_ram[] = {
static struct attribute_group rapl_events_ram_group = {
.name = "events",
.attrs = rapl_events_ram,
- .is_visible = rapl_not_visible,
};
static struct attribute *rapl_events_gpu[] = {
@@ -514,7 +493,6 @@ static struct attribute *rapl_events_gpu[] = {
static struct attribute_group rapl_events_gpu_group = {
.name = "events",
.attrs = rapl_events_gpu,
- .is_visible = rapl_not_visible,
};
static struct attribute *rapl_events_psys[] = {
@@ -527,7 +505,6 @@ static struct attribute *rapl_events_psys[] = {
static struct attribute_group rapl_events_psys_group = {
.name = "events",
.attrs = rapl_events_psys,
- .is_visible = rapl_not_visible,
};
static bool test_msr(int idx, void *data)
@@ -535,12 +512,23 @@ static bool test_msr(int idx, void *data)
return test_bit(idx, (unsigned long *) data);
}
+/* Only lower 32bits of the MSR represents the energy counter */
+#define RAPL_MSR_MASK 0xFFFFFFFF
+
static struct perf_msr intel_rapl_msrs[] = {
- [PERF_RAPL_PP0] = { MSR_PP0_ENERGY_STATUS, &rapl_events_cores_group, test_msr },
- [PERF_RAPL_PKG] = { MSR_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr },
- [PERF_RAPL_RAM] = { MSR_DRAM_ENERGY_STATUS, &rapl_events_ram_group, test_msr },
- [PERF_RAPL_PP1] = { MSR_PP1_ENERGY_STATUS, &rapl_events_gpu_group, test_msr },
- [PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group, test_msr },
+ [PERF_RAPL_PP0] = { MSR_PP0_ENERGY_STATUS, &rapl_events_cores_group, test_msr, false, RAPL_MSR_MASK },
+ [PERF_RAPL_PKG] = { MSR_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK },
+ [PERF_RAPL_RAM] = { MSR_DRAM_ENERGY_STATUS, &rapl_events_ram_group, test_msr, false, RAPL_MSR_MASK },
+ [PERF_RAPL_PP1] = { MSR_PP1_ENERGY_STATUS, &rapl_events_gpu_group, test_msr, false, RAPL_MSR_MASK },
+ [PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group, test_msr, false, RAPL_MSR_MASK },
+};
+
+static struct perf_msr intel_rapl_spr_msrs[] = {
+ [PERF_RAPL_PP0] = { MSR_PP0_ENERGY_STATUS, &rapl_events_cores_group, test_msr, false, RAPL_MSR_MASK },
+ [PERF_RAPL_PKG] = { MSR_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK },
+ [PERF_RAPL_RAM] = { MSR_DRAM_ENERGY_STATUS, &rapl_events_ram_group, test_msr, false, RAPL_MSR_MASK },
+ [PERF_RAPL_PP1] = { MSR_PP1_ENERGY_STATUS, &rapl_events_gpu_group, test_msr, false, RAPL_MSR_MASK },
+ [PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group, test_msr, true, RAPL_MSR_MASK },
};
/*
@@ -773,7 +761,7 @@ static struct rapl_model model_spr = {
BIT(PERF_RAPL_PSYS),
.unit_quirk = RAPL_UNIT_QUIRK_INTEL_SPR,
.msr_power_unit = MSR_RAPL_POWER_UNIT,
- .rapl_msrs = intel_rapl_msrs,
+ .rapl_msrs = intel_rapl_spr_msrs,
};
static struct rapl_model model_amd_fam17h = {
diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c
index 40e0e322161d..284e73661a18 100644
--- a/arch/x86/hyperv/hv_apic.c
+++ b/arch/x86/hyperv/hv_apic.c
@@ -273,11 +273,15 @@ void __init hv_apic_init(void)
pr_info("Hyper-V: Using enlightened APIC (%s mode)",
x2apic_enabled() ? "x2apic" : "xapic");
/*
- * With x2apic, architectural x2apic MSRs are equivalent to the
- * respective synthetic MSRs, so there's no need to override
- * the apic accessors. The only exception is
- * hv_apic_eoi_write, because it benefits from lazy EOI when
- * available, but it works for both xapic and x2apic modes.
+ * When in x2apic mode, don't use the Hyper-V specific APIC
+ * accessors since the field layout in the ICR register is
+ * different in x2apic mode. Furthermore, the architectural
+ * x2apic MSRs function just as well as the Hyper-V
+ * synthetic APIC MSRs, so there's no benefit in having
+ * separate Hyper-V accessors for x2apic mode. The only
+ * exception is hv_apic_eoi_write, because it benefits from
+ * lazy EOI when available, but the same accessor works for
+ * both xapic and x2apic because the field layout is the same.
*/
apic_set_eoi_write(hv_apic_eoi_write);
if (!x2apic_enabled()) {
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index e04d90af4c27..6375967a8244 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -16,6 +16,7 @@
#include <asm/hyperv-tlfs.h>
#include <asm/mshyperv.h>
#include <asm/idtentry.h>
+#include <linux/kexec.h>
#include <linux/version.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>
@@ -26,6 +27,8 @@
#include <linux/syscore_ops.h>
#include <clocksource/hyperv_timer.h>
+int hyperv_init_cpuhp;
+
void *hv_hypercall_pg;
EXPORT_SYMBOL_GPL(hv_hypercall_pg);
@@ -312,6 +315,25 @@ static struct syscore_ops hv_syscore_ops = {
.resume = hv_resume,
};
+static void (* __initdata old_setup_percpu_clockev)(void);
+
+static void __init hv_stimer_setup_percpu_clockev(void)
+{
+ /*
+ * Ignore any errors in setting up stimer clockevents
+ * as we can run with the LAPIC timer as a fallback.
+ */
+ (void)hv_stimer_alloc();
+
+ /*
+ * Still register the LAPIC timer, because the direct-mode STIMER is
+ * not supported by old versions of Hyper-V. This also allows users
+ * to switch to LAPIC timer via /sys, if they want to.
+ */
+ if (old_setup_percpu_clockev)
+ old_setup_percpu_clockev();
+}
+
/*
* This function is to be invoked early in the boot sequence after the
* hypervisor has been detected.
@@ -390,10 +412,14 @@ void __init hyperv_init(void)
wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
/*
- * Ignore any errors in setting up stimer clockevents
- * as we can run with the LAPIC timer as a fallback.
+ * hyperv_init() is called before LAPIC is initialized: see
+ * apic_intr_mode_init() -> x86_platform.apic_post_init() and
+ * apic_bsp_setup() -> setup_local_APIC(). The direct-mode STIMER
+ * depends on LAPIC, so hv_stimer_alloc() should be called from
+ * x86_init.timers.setup_percpu_clockev.
*/
- (void)hv_stimer_alloc();
+ old_setup_percpu_clockev = x86_init.timers.setup_percpu_clockev;
+ x86_init.timers.setup_percpu_clockev = hv_stimer_setup_percpu_clockev;
hv_apic_init();
@@ -401,6 +427,7 @@ void __init hyperv_init(void)
register_syscore_ops(&hv_syscore_ops);
+ hyperv_init_cpuhp = cpuhp;
return;
remove_cpuhp_state:
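The hv_init.c change above defers hv_stimer_alloc() by chaining into x86_init.timers.setup_percpu_clockev. A generic userspace sketch of that save-old-hook, install-wrapper pattern; the names mirror the patch but everything here is illustrative:

#include <stdio.h>

static void (*old_setup_percpu_clockev)(void);

static void lapic_setup_percpu_clockev(void)    /* stands in for the LAPIC hook */
{
        puts("LAPIC per-cpu clockevent registered");
}

static void hv_stimer_setup_percpu_clockev(void)
{
        puts("Hyper-V stimer clockevents set up");      /* new work first */
        if (old_setup_percpu_clockev)
                old_setup_percpu_clockev();             /* then the original hook */
}

struct { void (*setup_percpu_clockev)(void); } x86_init_timers = {
        .setup_percpu_clockev = lapic_setup_percpu_clockev,
};

int main(void)
{
        /* What hyperv_init() does: swap the hook, keeping the old one. */
        old_setup_percpu_clockev = x86_init_timers.setup_percpu_clockev;
        x86_init_timers.setup_percpu_clockev = hv_stimer_setup_percpu_clockev;

        /* Later, when the timer init path runs the hook: */
        x86_init_timers.setup_percpu_clockev();
        return 0;
}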
diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c
index 5208ba49c89a..2c87350c1fb0 100644
--- a/arch/x86/hyperv/mmu.c
+++ b/arch/x86/hyperv/mmu.c
@@ -66,11 +66,17 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus,
if (!hv_hypercall_pg)
goto do_native;
- if (cpumask_empty(cpus))
- return;
-
local_irq_save(flags);
+ /*
+ * Only check the mask _after_ interrupt has been disabled to avoid the
+ * mask changing under our feet.
+ */
+ if (cpumask_empty(cpus)) {
+ local_irq_restore(flags);
+ return;
+ }
+
flush_pcpu = (struct hv_tlb_flush **)
this_cpu_ptr(hyperv_pcpu_input_arg);
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index 81cf22398cd1..5e3d9b7fd5fb 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -347,7 +347,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig,
*/
unsafe_put_user(*((u64 *)&code), (u64 __user *)frame->retcode, Efault);
unsafe_put_sigcontext32(&frame->uc.uc_mcontext, fp, regs, set, Efault);
- unsafe_put_user(*(__u64 *)set, (__u64 *)&frame->uc.uc_sigmask, Efault);
+ unsafe_put_user(*(__u64 *)set, (__u64 __user *)&frame->uc.uc_sigmask, Efault);
user_access_end();
if (__copy_siginfo_to_user32(&frame->info, &ksig->info))
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 6d2df1ee427b..65064d9f7fa6 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -159,6 +159,8 @@ static inline u64 x86_default_get_root_pointer(void)
extern int x86_acpi_numa_init(void);
#endif /* CONFIG_ACPI_NUMA */
+struct cper_ia_proc_ctx;
+
#ifdef CONFIG_ACPI_APEI
static inline pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr)
{
@@ -177,6 +179,15 @@ static inline pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr)
*/
return PAGE_KERNEL_NOENC;
}
+
+int arch_apei_report_x86_error(struct cper_ia_proc_ctx *ctx_info,
+ u64 lapic_id);
+#else
+static inline int arch_apei_report_x86_error(struct cper_ia_proc_ctx *ctx_info,
+ u64 lapic_id)
+{
+ return -EINVAL;
+}
#endif
#define ACPI_TABLE_UPGRADE_MAX_PHYS (max_low_pfn_mapped << PAGE_SHIFT)
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 4e3099d9ae62..412b51e059c8 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -197,16 +197,6 @@ static inline bool apic_needs_pit(void) { return true; }
#endif /* !CONFIG_X86_LOCAL_APIC */
#ifdef CONFIG_X86_X2APIC
-/*
- * Make previous memory operations globally visible before
- * sending the IPI through x2apic wrmsr. We need a serializing instruction or
- * mfence for this.
- */
-static inline void x2apic_wrmsr_fence(void)
-{
- asm volatile("mfence" : : : "memory");
-}
-
static inline void native_apic_msr_write(u32 reg, u32 v)
{
if (reg == APIC_DFR || reg == APIC_ID || reg == APIC_LDR ||
@@ -259,6 +249,7 @@ static inline u64 native_x2apic_icr_read(void)
extern int x2apic_mode;
extern int x2apic_phys;
+extern void __init x2apic_set_max_apicid(u32 apicid);
extern void __init check_x2apic(void);
extern void x2apic_setup(void);
static inline int x2apic_enabled(void)
@@ -305,11 +296,10 @@ struct apic {
void (*send_IPI_all)(int vector);
void (*send_IPI_self)(int vector);
- /* dest_logical is used by the IPI functions */
- u32 dest_logical;
u32 disable_esr;
- u32 irq_delivery_mode;
- u32 irq_dest_mode;
+
+ enum apic_delivery_modes delivery_mode;
+ bool dest_mode_logical;
u32 (*calc_dest_apicid)(unsigned int cpu);
@@ -520,12 +510,10 @@ static inline void apic_smt_update(void) { }
#endif
struct msi_msg;
+struct irq_cfg;
-#ifdef CONFIG_PCI_MSI
-void x86_vector_msi_compose_msg(struct irq_data *data, struct msi_msg *msg);
-#else
-# define x86_vector_msi_compose_msg NULL
-#endif
+extern void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg,
+ bool dmar);
extern void ioapic_zap_locks(void);
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 05e694ed8386..5716f22f81ac 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -432,15 +432,13 @@ struct local_apic {
#define BAD_APICID 0xFFFFu
#endif
-enum ioapic_irq_destination_types {
- dest_Fixed = 0,
- dest_LowestPrio = 1,
- dest_SMI = 2,
- dest__reserved_1 = 3,
- dest_NMI = 4,
- dest_INIT = 5,
- dest__reserved_2 = 6,
- dest_ExtINT = 7
+enum apic_delivery_modes {
+ APIC_DELIVERY_MODE_FIXED = 0,
+ APIC_DELIVERY_MODE_LOWESTPRIO = 1,
+ APIC_DELIVERY_MODE_SMI = 2,
+ APIC_DELIVERY_MODE_NMI = 4,
+ APIC_DELIVERY_MODE_INIT = 5,
+ APIC_DELIVERY_MODE_EXTINT = 7,
};
#endif /* _ASM_X86_APICDEF_H */
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index b6cac6e9bb70..f732741ad7c7 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -199,7 +199,7 @@ static __always_inline int arch_atomic_cmpxchg(atomic_t *v, int old, int new)
static __always_inline bool arch_atomic_try_cmpxchg(atomic_t *v, int *old, int new)
{
- return try_cmpxchg(&v->counter, old, new);
+ return arch_try_cmpxchg(&v->counter, old, new);
}
#define arch_atomic_try_cmpxchg arch_atomic_try_cmpxchg
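The rename to arch_try_cmpxchg() does not change the calling convention, which is built around a retry loop where the stale value is refreshed in place on failure. A userspace sketch of that idiom using the equivalent compiler builtin:

#include <stdbool.h>
#include <stdio.h>

static int counter = 5;

static bool try_cmpxchg_int(int *ptr, int *old, int new)
{
        /* GCC/Clang builtin with the same update-old-on-failure behaviour. */
        return __atomic_compare_exchange_n(ptr, old, new, false,
                                           __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
}

static void atomic_add_unless_negative(int *v, int a)
{
        int old = *v;

        do {
                if (old < 0)
                        return;         /* give up, keep current value */
        } while (!try_cmpxchg_int(v, &old, old + a));
}

int main(void)
{
        atomic_add_unless_negative(&counter, 3);
        printf("counter = %d\n", counter);      /* 8 */
        return 0;
}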
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index 809bd010a751..7886d0578fc9 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -187,7 +187,7 @@ static inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new)
static __always_inline bool arch_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new)
{
- return try_cmpxchg(&v->counter, old, new);
+ return arch_try_cmpxchg(&v->counter, old, new);
}
#define arch_atomic64_try_cmpxchg arch_atomic64_try_cmpxchg
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index 7f828fe49797..4819d5e5a335 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -84,4 +84,22 @@ do { \
#include <asm-generic/barrier.h>
+/*
+ * Make previous memory operations globally visible before
+ * a WRMSR.
+ *
+ * MFENCE makes writes visible, but only affects load/store
+ * instructions. WRMSR is unfortunately not a load/store
+ * instruction and is unaffected by MFENCE. The LFENCE ensures
+ * that the WRMSR is not reordered.
+ *
+ * Most WRMSRs are full serializing instructions themselves and
+ * do not require this barrier. This is only required for the
+ * IA32_TSC_DEADLINE and X2APIC MSRs.
+ */
+static inline void weak_wrmsr_fence(void)
+{
+ asm volatile("mfence; lfence" : : : "memory");
+}
+
#endif /* _ASM_X86_BARRIER_H */
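A sketch of the intended weak_wrmsr_fence() call shape, modelled on the x2APIC IPI path that previously relied on x2apic_wrmsr_fence(); the wrapper function name here is illustrative:

#include <asm/apicdef.h>        /* APIC_BASE_MSR, APIC_ICR */
#include <asm/barrier.h>        /* weak_wrmsr_fence() */
#include <asm/msr.h>            /* wrmsrl() */
#include <linux/types.h>

static void x2apic_icr_write_sketch(u32 low, u32 dest_apicid)
{
        /* Earlier stores must be globally visible before the non-serializing WRMSR. */
        weak_wrmsr_fence();
        wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((u64)dest_apicid << 32) | low);
}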
diff --git a/arch/x86/include/asm/cacheinfo.h b/arch/x86/include/asm/cacheinfo.h
index 86b63c7feab7..86b2e0dcc4bf 100644
--- a/arch/x86/include/asm/cacheinfo.h
+++ b/arch/x86/include/asm/cacheinfo.h
@@ -2,7 +2,7 @@
#ifndef _ASM_X86_CACHEINFO_H
#define _ASM_X86_CACHEINFO_H
-void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id);
-void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id);
+void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu);
+void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c, int cpu);
#endif /* _ASM_X86_CACHEINFO_H */
diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h
index a8bfac131256..4d4ec5cbdc51 100644
--- a/arch/x86/include/asm/cmpxchg.h
+++ b/arch/x86/include/asm/cmpxchg.h
@@ -221,7 +221,7 @@ extern void __add_wrong_size(void)
#define __try_cmpxchg(ptr, pold, new, size) \
__raw_try_cmpxchg((ptr), (pold), (new), (size), LOCK_PREFIX)
-#define try_cmpxchg(ptr, pold, new) \
+#define arch_try_cmpxchg(ptr, pold, new) \
__try_cmpxchg((ptr), (pold), (new), sizeof(*(ptr)))
/*
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index 0e327a01f50f..be09c7eac89f 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -159,17 +159,6 @@ struct compat_shmid64_ds {
compat_ulong_t __unused5;
};
-/*
- * The type of struct elf_prstatus.pr_reg in compatible core dumps.
- */
-typedef struct user_regs_struct compat_elf_gregset_t;
-
-/* Full regset -- prstatus on x32, otherwise on ia32 */
-#define PRSTATUS_SIZE(S, R) (R != sizeof(S.pr_reg) ? 144 : 296)
-#define SET_PR_FPVALID(S, V, R) \
- do { *(int *) (((void *) &((S)->pr_reg)) + R) = (V); } \
- while (0)
-
#ifdef CONFIG_X86_X32_ABI
#define COMPAT_USE_64BIT_TIME \
(!!(task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT))
@@ -177,14 +166,13 @@ typedef struct user_regs_struct compat_elf_gregset_t;
static inline void __user *arch_compat_alloc_user_space(long len)
{
- compat_uptr_t sp;
-
- if (test_thread_flag(TIF_IA32)) {
- sp = task_pt_regs(current)->sp;
- } else {
- /* -128 for the x32 ABI redzone */
- sp = task_pt_regs(current)->sp - 128;
- }
+ compat_uptr_t sp = task_pt_regs(current)->sp;
+
+ /*
+ * -128 for the x32 ABI redzone. For IA32, it is not strictly
+ * necessary, but not harmful.
+ */
+ sp -= 128;
return (void __user *)round_down(sp - len, 16);
}
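A userspace model of the simplified arch_compat_alloc_user_space() arithmetic above: step below a possible 128-byte x32 redzone, reserve len bytes, and keep the result 16-byte aligned (the stack pointer value is arbitrary):

#include <stdint.h>
#include <stdio.h>

static uint32_t compat_alloc_below_sp(uint32_t sp, uint32_t len)
{
        sp -= 128;                      /* x32 ABI redzone, harmless for ia32 */
        return (sp - len) & ~15u;       /* round_down(sp - len, 16) */
}

int main(void)
{
        printf("0x%x\n", compat_alloc_below_sp(0xffffd000u, 64));  /* 0xffffcf40 */
        return 0;
}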
diff --git a/arch/x86/include/asm/copy_mc_test.h b/arch/x86/include/asm/copy_mc_test.h
deleted file mode 100644
index e4991ba96726..000000000000
--- a/arch/x86/include/asm/copy_mc_test.h
+++ /dev/null
@@ -1,75 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _COPY_MC_TEST_H_
-#define _COPY_MC_TEST_H_
-
-#ifndef __ASSEMBLY__
-#ifdef CONFIG_COPY_MC_TEST
-extern unsigned long copy_mc_test_src;
-extern unsigned long copy_mc_test_dst;
-
-static inline void copy_mc_inject_src(void *addr)
-{
- if (addr)
- copy_mc_test_src = (unsigned long) addr;
- else
- copy_mc_test_src = ~0UL;
-}
-
-static inline void copy_mc_inject_dst(void *addr)
-{
- if (addr)
- copy_mc_test_dst = (unsigned long) addr;
- else
- copy_mc_test_dst = ~0UL;
-}
-#else /* CONFIG_COPY_MC_TEST */
-static inline void copy_mc_inject_src(void *addr)
-{
-}
-
-static inline void copy_mc_inject_dst(void *addr)
-{
-}
-#endif /* CONFIG_COPY_MC_TEST */
-
-#else /* __ASSEMBLY__ */
-#include <asm/export.h>
-
-#ifdef CONFIG_COPY_MC_TEST
-.macro COPY_MC_TEST_CTL
- .pushsection .data
- .align 8
- .globl copy_mc_test_src
- copy_mc_test_src:
- .quad 0
- EXPORT_SYMBOL_GPL(copy_mc_test_src)
- .globl copy_mc_test_dst
- copy_mc_test_dst:
- .quad 0
- EXPORT_SYMBOL_GPL(copy_mc_test_dst)
- .popsection
-.endm
-
-.macro COPY_MC_TEST_SRC reg count target
- leaq \count(\reg), %r9
- cmp copy_mc_test_src, %r9
- ja \target
-.endm
-
-.macro COPY_MC_TEST_DST reg count target
- leaq \count(\reg), %r9
- cmp copy_mc_test_dst, %r9
- ja \target
-.endm
-#else
-.macro COPY_MC_TEST_CTL
-.endm
-
-.macro COPY_MC_TEST_SRC reg count target
-.endm
-
-.macro COPY_MC_TEST_DST reg count target
-.endm
-#endif /* CONFIG_COPY_MC_TEST */
-#endif /* __ASSEMBLY__ */
-#endif /* _COPY_MC_TEST_H_ */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 59bf91c57aa8..1728d4ce5730 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -30,6 +30,7 @@ enum cpuid_leafs
CPUID_7_ECX,
CPUID_8000_0007_EBX,
CPUID_7_EDX,
+ CPUID_8000_001F_EAX,
};
#ifdef CONFIG_X86_FEATURE_NAMES
@@ -88,8 +89,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 16, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 17, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 18, feature_bit) || \
+ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 19, feature_bit) || \
REQUIRED_MASK_CHECK || \
- BUILD_BUG_ON_ZERO(NCAPINTS != 19))
+ BUILD_BUG_ON_ZERO(NCAPINTS != 20))
#define DISABLED_MASK_BIT_SET(feature_bit) \
( CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 0, feature_bit) || \
@@ -111,8 +113,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 16, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 17, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 18, feature_bit) || \
+ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 19, feature_bit) || \
DISABLED_MASK_CHECK || \
- BUILD_BUG_ON_ZERO(NCAPINTS != 19))
+ BUILD_BUG_ON_ZERO(NCAPINTS != 20))
#define cpu_has(c, bit) \
(__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index dad350d42ecf..1feb6c089ba2 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -13,7 +13,7 @@
/*
* Defines x86 CPU feature bits
*/
-#define NCAPINTS 19 /* N 32-bit words worth of info */
+#define NCAPINTS 20 /* N 32-bit words worth of info */
#define NBUGINTS 1 /* N 32-bit bug flags */
/*
@@ -96,7 +96,7 @@
#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in IA32 userspace */
#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */
#define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */
-#define X86_FEATURE_SME_COHERENT ( 3*32+17) /* "" AMD hardware-enforced cache coherency */
+/* FREE! ( 3*32+17) */
#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" LFENCE synchronizes RDTSC */
#define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */
#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */
@@ -201,7 +201,7 @@
#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */
#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
-#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */
+/* FREE! ( 7*32+10) */
#define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */
#define X86_FEATURE_RETPOLINE ( 7*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */
#define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* "" AMD Retpoline mitigation for Spectre variant 2 */
@@ -211,7 +211,7 @@
#define X86_FEATURE_SSBD ( 7*32+17) /* Speculative Store Bypass Disable */
#define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */
#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* "" Fill RSB on context switches */
-#define X86_FEATURE_SEV ( 7*32+20) /* AMD Secure Encrypted Virtualization */
+/* FREE! ( 7*32+20) */
#define X86_FEATURE_USE_IBPB ( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */
#define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* "" Use IBRS during runtime firmware calls */
#define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* "" Disable Speculative Store Bypass. */
@@ -236,11 +236,11 @@
#define X86_FEATURE_EPT_AD ( 8*32+17) /* Intel Extended Page Table access-dirty bit */
#define X86_FEATURE_VMCALL ( 8*32+18) /* "" Hypervisor supports the VMCALL instruction */
#define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* "" VMware prefers VMMCALL hypercall instruction */
-#define X86_FEATURE_SEV_ES ( 8*32+20) /* AMD Secure Encrypted Virtualization - Encrypted State */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */
#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/
#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3B */
+#define X86_FEATURE_SGX ( 9*32+ 2) /* Software Guard Extensions */
#define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */
#define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */
#define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */
@@ -356,6 +356,7 @@
#define X86_FEATURE_MOVDIRI (16*32+27) /* MOVDIRI instruction */
#define X86_FEATURE_MOVDIR64B (16*32+28) /* MOVDIR64B instruction */
#define X86_FEATURE_ENQCMD (16*32+29) /* ENQCMD and ENQCMDS instructions */
+#define X86_FEATURE_SGX_LC (16*32+30) /* Software Guard Extensions Launch Control */
/* AMD-defined CPU features, CPUID level 0x80000007 (EBX), word 17 */
#define X86_FEATURE_OVERFLOW_RECOV (17*32+ 0) /* MCA overflow recovery support */
@@ -374,6 +375,7 @@
#define X86_FEATURE_TSXLDTRK (18*32+16) /* TSX Suspend Load Address Tracking */
#define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */
#define X86_FEATURE_ARCH_LBR (18*32+19) /* Intel ARCH LBR */
+#define X86_FEATURE_AVX512_FP16 (18*32+23) /* AVX512 FP16 */
#define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */
#define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */
#define X86_FEATURE_FLUSH_L1D (18*32+28) /* Flush L1D cache */
@@ -381,6 +383,13 @@
#define X86_FEATURE_CORE_CAPABILITIES (18*32+30) /* "" IA32_CORE_CAPABILITIES MSR */
#define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* "" Speculative Store Bypass Disable */
+/* AMD-defined memory encryption features, CPUID level 0x8000001f (EAX), word 19 */
+#define X86_FEATURE_SME (19*32+ 0) /* AMD Secure Memory Encryption */
+#define X86_FEATURE_SEV (19*32+ 1) /* AMD Secure Encrypted Virtualization */
+#define X86_FEATURE_VM_PAGE_FLUSH (19*32+ 2) /* "" VM Page Flush MSR is supported */
+#define X86_FEATURE_SEV_ES (19*32+ 3) /* AMD Secure Encrypted Virtualization - Encrypted State */
+#define X86_FEATURE_SME_COHERENT (19*32+10) /* "" AMD hardware-enforced cache coherency */
+
/*
* BUG word(s)
*/
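A sketch of an assumed call site consuming the relocated memory-encryption bits: after the move to word 19 they are still tested through the ordinary feature helpers, backed by the word-19 entries added to the required/disabled mask checks above:

#include <asm/cpufeature.h>
#include <linux/types.h>

static bool memory_encryption_available(void)
{
        /* Same helpers as before; only the word/bit positions moved. */
        return boot_cpu_has(X86_FEATURE_SME) || boot_cpu_has(X86_FEATURE_SEV);
}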
diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index 5861d34f9771..b7dd944dc867 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -62,6 +62,12 @@
# define DISABLE_ENQCMD (1 << (X86_FEATURE_ENQCMD & 31))
#endif
+#ifdef CONFIG_X86_SGX
+# define DISABLE_SGX 0
+#else
+# define DISABLE_SGX (1 << (X86_FEATURE_SGX & 31))
+#endif
+
/*
* Make sure to add features to the correct mask
*/
@@ -74,7 +80,7 @@
#define DISABLED_MASK6 0
#define DISABLED_MASK7 (DISABLE_PTI)
#define DISABLED_MASK8 0
-#define DISABLED_MASK9 (DISABLE_SMAP)
+#define DISABLED_MASK9 (DISABLE_SMAP|DISABLE_SGX)
#define DISABLED_MASK10 0
#define DISABLED_MASK11 0
#define DISABLED_MASK12 0
@@ -85,6 +91,7 @@
DISABLE_ENQCMD)
#define DISABLED_MASK17 0
#define DISABLED_MASK18 0
-#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19)
+#define DISABLED_MASK19 0
+#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 20)
#endif /* _ASM_X86_DISABLED_FEATURES_H */
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index bc9758ef292e..4d0b126835b8 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -12,6 +12,7 @@
#include <linux/pgtable.h>
extern unsigned long efi_fw_vendor, efi_config_table;
+extern unsigned long efi_mixed_mode_stack_pa;
/*
* We map the EFI regions needed for runtime services non-contiguously,
@@ -68,17 +69,33 @@ extern unsigned long efi_fw_vendor, efi_config_table;
#f " called with too many arguments (" #p ">" #n ")"); \
})
+static inline void efi_fpu_begin(void)
+{
+ /*
+ * The UEFI calling convention (UEFI spec 2.3.2 and 2.3.4) requires
+ * that FCW and MXCSR (64-bit) must be initialized prior to calling
+ * UEFI code. (Oddly the spec does not require that the FPU stack
+ * be empty.)
+ */
+ kernel_fpu_begin_mask(KFPU_387 | KFPU_MXCSR);
+}
+
+static inline void efi_fpu_end(void)
+{
+ kernel_fpu_end();
+}
+
#ifdef CONFIG_X86_32
#define arch_efi_call_virt_setup() \
({ \
- kernel_fpu_begin(); \
+ efi_fpu_begin(); \
firmware_restrict_branch_speculation_start(); \
})
#define arch_efi_call_virt_teardown() \
({ \
firmware_restrict_branch_speculation_end(); \
- kernel_fpu_end(); \
+ efi_fpu_end(); \
})
#define arch_efi_call_virt(p, f, args...) p->f(args)
@@ -94,22 +111,12 @@ extern asmlinkage u64 __efi_call(void *fp, ...);
__efi_call(__VA_ARGS__); \
})
-/*
- * struct efi_scratch - Scratch space used while switching to/from efi_mm
- * @phys_stack: stack used during EFI Mixed Mode
- * @prev_mm: store/restore stolen mm_struct while switching to/from efi_mm
- */
-struct efi_scratch {
- u64 phys_stack;
- struct mm_struct *prev_mm;
-} __packed;
-
#define arch_efi_call_virt_setup() \
({ \
efi_sync_low_kernel_mappings(); \
- kernel_fpu_begin(); \
+ efi_fpu_begin(); \
firmware_restrict_branch_speculation_start(); \
- efi_switch_mm(&efi_mm); \
+ efi_enter_mm(); \
})
#define arch_efi_call_virt(p, f, args...) \
@@ -117,9 +124,9 @@ struct efi_scratch {
#define arch_efi_call_virt_teardown() \
({ \
- efi_switch_mm(efi_scratch.prev_mm); \
+ efi_leave_mm(); \
firmware_restrict_branch_speculation_end(); \
- kernel_fpu_end(); \
+ efi_fpu_end(); \
})
#ifdef CONFIG_KASAN
@@ -136,7 +143,6 @@ struct efi_scratch {
#endif /* CONFIG_X86_32 */
-extern struct efi_scratch efi_scratch;
extern int __init efi_memblock_x86_reserve_range(void);
extern void __init efi_print_memmap(void);
extern void __init efi_map_region(efi_memory_desc_t *md);
@@ -149,10 +155,12 @@ extern void __init efi_dump_pagetable(void);
extern void __init efi_apply_memmap_quirks(void);
extern int __init efi_reuse_config(u64 tables, int nr_tables);
extern void efi_delete_dummy_variable(void);
-extern void efi_switch_mm(struct mm_struct *mm);
-extern void efi_recover_from_page_fault(unsigned long phys_addr);
+extern void efi_crash_gracefully_on_page_fault(unsigned long phys_addr);
extern void efi_free_boot_services(void);
+void efi_enter_mm(void);
+void efi_leave_mm(void);
+
/* kexec external ABI */
struct efi_setup_data {
u64 fw_vendor;
@@ -213,8 +221,6 @@ static inline bool efi_is_64bit(void)
static inline bool efi_is_native(void)
{
- if (!IS_ENABLED(CONFIG_X86_64))
- return true;
return efi_is_64bit();
}
@@ -382,4 +388,7 @@ static inline void efi_fake_memmap_early(void)
}
#endif
+#define arch_ima_efi_boot_mode \
+ ({ extern struct boot_params boot_params; boot_params.secure_boot; })
+
#endif /* _ASM_X86_EFI_H */
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index b9a5d488f1a5..9224d40cdefe 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -186,8 +186,9 @@ static inline void elf_common_init(struct thread_struct *t,
#define COMPAT_ELF_PLAT_INIT(regs, load_addr) \
elf_common_init(&current->thread, regs, __USER_DS)
-void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp);
-#define compat_start_thread compat_start_thread
+void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp, bool x32);
+#define COMPAT_START_THREAD(ex, regs, new_ip, new_sp) \
+ compat_start_thread(regs, new_ip, new_sp, ex->e_machine == EM_X86_64)
void set_personality_ia32(bool);
#define COMPAT_SET_PERSONALITY(ex) \
@@ -361,9 +362,9 @@ do { \
#define AT_SYSINFO 32
#define COMPAT_ARCH_DLINFO \
-if (test_thread_flag(TIF_X32)) \
+if (exec->e_machine == EM_X86_64) \
ARCH_DLINFO_X32; \
-else \
+else if (IS_ENABLED(CONFIG_IA32_EMULATION)) \
ARCH_DLINFO_IA32
#define COMPAT_ELF_ET_DYN_BASE (TASK_UNMAPPED_BASE + 0x1000000)
@@ -382,8 +383,12 @@ struct linux_binprm;
extern int arch_setup_additional_pages(struct linux_binprm *bprm,
int uses_interp);
extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
- int uses_interp);
-#define compat_arch_setup_additional_pages compat_arch_setup_additional_pages
+ int uses_interp, bool x32);
+#define COMPAT_ARCH_SETUP_ADDITIONAL_PAGES(bprm, ex, interpreter) \
+ compat_arch_setup_additional_pages(bprm, interpreter, \
+ (ex->e_machine == EM_X86_64))
+
+extern bool arch_syscall_is_vdso_sigreturn(struct pt_regs *regs);
/* Do not change the values. See get_align_mask() */
enum align_flags {
diff --git a/arch/x86/include/asm/elfcore-compat.h b/arch/x86/include/asm/elfcore-compat.h
new file mode 100644
index 000000000000..f1b6c7a8d8fc
--- /dev/null
+++ b/arch/x86/include/asm/elfcore-compat.h
@@ -0,0 +1,31 @@
+#ifndef _ASM_X86_ELFCORE_COMPAT_H
+#define _ASM_X86_ELFCORE_COMPAT_H
+
+#include <asm/user32.h>
+
+/*
+ * On amd64 we have two 32bit ABIs - i386 and x32. The latter
+ * has bigger registers, so we use it for compat_elf_regset_t.
+ * The former uses i386_elf_prstatus and PRSTATUS_SIZE/SET_PR_FPVALID
+ * are used to choose the size and location of ->pr_fpvalid of
+ * the layout actually used.
+ */
+typedef struct user_regs_struct compat_elf_gregset_t;
+
+struct i386_elf_prstatus
+{
+ struct compat_elf_prstatus_common common;
+ struct user_regs_struct32 pr_reg;
+ compat_int_t pr_fpvalid;
+};
+
+#define PRSTATUS_SIZE \
+ (user_64bit_mode(task_pt_regs(current)) \
+ ? sizeof(struct compat_elf_prstatus) \
+ : sizeof(struct i386_elf_prstatus))
+#define SET_PR_FPVALID(S) \
+ (*(user_64bit_mode(task_pt_regs(current)) \
+ ? &(S)->pr_fpvalid \
+ : &((struct i386_elf_prstatus *)(S))->pr_fpvalid) = 1)
+
+#endif
diff --git a/arch/x86/include/asm/enclu.h b/arch/x86/include/asm/enclu.h
new file mode 100644
index 000000000000..b1314e41a744
--- /dev/null
+++ b/arch/x86/include/asm/enclu.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_ENCLU_H
+#define _ASM_X86_ENCLU_H
+
+#define EENTER 0x02
+#define ERESUME 0x03
+#define EEXIT 0x04
+
+#endif /* _ASM_X86_ENCLU_H */
diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h
index 6fe54b2813c1..2b87b191b3b8 100644
--- a/arch/x86/include/asm/entry-common.h
+++ b/arch/x86/include/asm/entry-common.h
@@ -43,8 +43,6 @@ static __always_inline void arch_check_user_regs(struct pt_regs *regs)
}
#define arch_check_user_regs arch_check_user_regs
-#define ARCH_SYSCALL_EXIT_WORK (_TIF_SINGLESTEP)
-
static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
unsigned long ti_work)
{
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 77217bd292bd..9f1a0a987e5e 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -14,13 +14,20 @@
#ifndef _ASM_X86_FIXMAP_H
#define _ASM_X86_FIXMAP_H
+#include <asm/kmap_size.h>
+
/*
* Exposed to assembly code for setting up initial page tables. Cannot be
* calculated in assembly code (fixmap entries are an enum), but is sanity
* checked in the actual fixmap C code to make sure that the fixmap is
* covered fully.
*/
-#define FIXMAP_PMD_NUM 2
+#ifndef CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP
+# define FIXMAP_PMD_NUM 2
+#else
+# define KM_PMDS (KM_MAX_IDX * ((CONFIG_NR_CPUS + 511) / 512))
+# define FIXMAP_PMD_NUM (KM_PMDS + 2)
+#endif
/* fixmap starts downwards from the 507th entry in level2_fixmap_pgt */
#define FIXMAP_PMD_TOP 507
@@ -31,7 +38,6 @@
#include <asm/pgtable_types.h>
#ifdef CONFIG_X86_32
#include <linux/threads.h>
-#include <asm/kmap_types.h>
#else
#include <uapi/asm/vsyscall.h>
#endif
@@ -92,9 +98,9 @@ enum fixed_addresses {
FIX_IO_APIC_BASE_0,
FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
#endif
-#ifdef CONFIG_X86_32
+#ifdef CONFIG_KMAP_LOCAL
FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
- FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
+ FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_MAX_IDX * NR_CPUS) - 1,
#ifdef CONFIG_PCI_MMCONFIG
FIX_PCIE_MCFG,
#endif
@@ -151,7 +157,6 @@ extern void reserve_top_address(unsigned long reserve);
extern int fixmaps_set;
-extern pte_t *kmap_pte;
extern pte_t *pkmap_page_table;
void __native_set_fixmap(enum fixed_addresses idx, pte_t pte);
diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h
index dcd9503b1098..ed33a14188f6 100644
--- a/arch/x86/include/asm/fpu/api.h
+++ b/arch/x86/include/asm/fpu/api.h
@@ -16,30 +16,68 @@
* Use kernel_fpu_begin/end() if you intend to use FPU in kernel context. It
* disables preemption so be careful if you intend to use it for long periods
* of time.
- * If you intend to use the FPU in softirq you need to check first with
+ * If you intend to use the FPU in irq/softirq you need to check first with
* irq_fpu_usable() if it is possible.
*/
-extern void kernel_fpu_begin(void);
+
+/* Kernel FPU states to initialize in kernel_fpu_begin_mask() */
+#define KFPU_387 _BITUL(0) /* 387 state will be initialized */
+#define KFPU_MXCSR _BITUL(1) /* MXCSR will be initialized */
+
+extern void kernel_fpu_begin_mask(unsigned int kfpu_mask);
extern void kernel_fpu_end(void);
extern bool irq_fpu_usable(void);
extern void fpregs_mark_activate(void);
+/* Code that is unaware of kernel_fpu_begin_mask() can use this */
+static inline void kernel_fpu_begin(void)
+{
+#ifdef CONFIG_X86_64
+ /*
+ * Any 64-bit code that uses 387 instructions must explicitly request
+ * KFPU_387.
+ */
+ kernel_fpu_begin_mask(KFPU_MXCSR);
+#else
+ /*
+ * 32-bit kernel code may use 387 operations as well as SSE2, etc,
+ * as long as it checks that the CPU has the required capability.
+ */
+ kernel_fpu_begin_mask(KFPU_387 | KFPU_MXCSR);
+#endif
+}
+
/*
* Use fpregs_lock() while editing CPU's FPU registers or fpu->state.
* A context switch will (and softirq might) save CPU's FPU registers to
* fpu->state and set TIF_NEED_FPU_LOAD leaving CPU's FPU registers in
* a random state.
+ *
+ * local_bh_disable() protects against both preemption and soft interrupts
+ * on !RT kernels.
+ *
+ * On RT kernels local_bh_disable() is not sufficient because it only
+ * serializes soft interrupt related sections via a local lock, but stays
+ * preemptible. Disabling preemption is the right choice here as bottom
+ * half processing is always in thread context on RT kernels so it
+ * implicitly prevents bottom half processing as well.
+ *
+ * Disabling preemption also serializes against kernel_fpu_begin().
*/
static inline void fpregs_lock(void)
{
- preempt_disable();
- local_bh_disable();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_bh_disable();
+ else
+ preempt_disable();
}
static inline void fpregs_unlock(void)
{
- local_bh_enable();
- preempt_enable();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_bh_enable();
+ else
+ preempt_enable();
}
#ifdef CONFIG_X86_DEBUG_FPU
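A sketch of an assumed caller of the new kernel_fpu_begin_mask() interface, mirroring the efi_fpu_begin() hunk earlier in this series: callers now state which pieces of FPU state must be initialized before they touch the FPU in kernel context:

#include <asm/fpu/api.h>

static void firmware_call_prepare(void)
{
        /* UEFI wants both the x87 control word and MXCSR set up. */
        kernel_fpu_begin_mask(KFPU_387 | KFPU_MXCSR);
}

static void firmware_call_finish(void)
{
        kernel_fpu_end();
}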
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index 84b9449be080..9f3130f40807 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -41,6 +41,24 @@ static inline void arch_ftrace_set_direct_caller(struct pt_regs *regs, unsigned
regs->orig_ax = addr;
}
+#ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS
+struct ftrace_regs {
+ struct pt_regs regs;
+};
+
+static __always_inline struct pt_regs *
+arch_ftrace_get_regs(struct ftrace_regs *fregs)
+{
+ /* Only when FL_SAVE_REGS is set, cs will be non zero */
+ if (!fregs->regs.cs)
+ return NULL;
+ return &fregs->regs;
+}
+
+#define ftrace_instruction_pointer_set(fregs, _ip) \
+ do { (fregs)->regs.ip = (_ip); } while (0)
+#endif
+
#ifdef CONFIG_DYNAMIC_FTRACE
struct dyn_arch_ftrace {
diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h
index 0f420b24e0fc..032e020853aa 100644
--- a/arch/x86/include/asm/highmem.h
+++ b/arch/x86/include/asm/highmem.h
@@ -23,7 +23,6 @@
#include <linux/interrupt.h>
#include <linux/threads.h>
-#include <asm/kmap_types.h>
#include <asm/tlbflush.h>
#include <asm/paravirt.h>
#include <asm/fixmap.h>
@@ -58,11 +57,17 @@ extern unsigned long highstart_pfn, highend_pfn;
#define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT)
#define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT))
-void *kmap_atomic_pfn(unsigned long pfn);
-void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot);
-
#define flush_cache_kmaps() do { } while (0)
+#define arch_kmap_local_post_map(vaddr, pteval) \
+ arch_flush_lazy_mmu_mode()
+
+#define arch_kmap_local_post_unmap(vaddr) \
+ do { \
+ flush_tlb_one_kernel((vaddr)); \
+ arch_flush_lazy_mmu_mode(); \
+ } while (0)
+
extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn,
unsigned long end_pfn);
diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h
index 6352dee37cda..ab9f3dd87c80 100644
--- a/arch/x86/include/asm/hpet.h
+++ b/arch/x86/include/asm/hpet.h
@@ -74,17 +74,6 @@ extern void hpet_disable(void);
extern unsigned int hpet_readl(unsigned int a);
extern void force_hpet_resume(void);
-struct irq_data;
-struct hpet_channel;
-struct irq_domain;
-
-extern void hpet_msi_unmask(struct irq_data *data);
-extern void hpet_msi_mask(struct irq_data *data);
-extern void hpet_msi_write(struct hpet_channel *hc, struct msi_msg *msg);
-extern struct irq_domain *hpet_create_irq_domain(int hpet_id);
-extern int hpet_assign_irq(struct irq_domain *domain,
- struct hpet_channel *hc, int dev_num);
-
#ifdef CONFIG_HPET_EMULATE_RTC
#include <linux/interrupt.h>
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index a4aeeaace040..d465ece58151 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -39,18 +39,16 @@ enum irq_alloc_type {
X86_IRQ_ALLOC_TYPE_PCI_MSI,
X86_IRQ_ALLOC_TYPE_PCI_MSIX,
X86_IRQ_ALLOC_TYPE_DMAR,
+ X86_IRQ_ALLOC_TYPE_AMDVI,
X86_IRQ_ALLOC_TYPE_UV,
- X86_IRQ_ALLOC_TYPE_IOAPIC_GET_PARENT,
- X86_IRQ_ALLOC_TYPE_HPET_GET_PARENT,
};
struct ioapic_alloc_info {
- int pin;
- int node;
- u32 trigger : 1;
- u32 polarity : 1;
- u32 valid : 1;
- struct IO_APIC_route_entry *entry;
+ int pin;
+ int node;
+ u32 is_level : 1;
+ u32 active_low : 1;
+ u32 valid : 1;
};
struct uv_alloc_info {
diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
index 0ed20e8bba9e..6bf42aed387e 100644
--- a/arch/x86/include/asm/hyperv-tlfs.h
+++ b/arch/x86/include/asm/hyperv-tlfs.h
@@ -23,6 +23,13 @@
#define HYPERV_CPUID_IMPLEMENT_LIMITS 0x40000005
#define HYPERV_CPUID_NESTED_FEATURES 0x4000000A
+#define HYPERV_CPUID_VIRT_STACK_INTERFACE 0x40000081
+#define HYPERV_VS_INTERFACE_EAX_SIGNATURE 0x31235356 /* "VS#1" */
+
+#define HYPERV_CPUID_VIRT_STACK_PROPERTIES 0x40000082
+/* Support for the extended IOAPIC RTE format */
+#define HYPERV_VS_PROPERTIES_EAX_EXTENDED_IOAPIC_RTE BIT(2)
+
#define HYPERV_HYPERVISOR_PRESENT_BIT 0x80000000
#define HYPERV_CPUID_MIN 0x40000005
#define HYPERV_CPUID_MAX 0x4000ffff
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index b2442eb0ac2f..41e2e2e1b439 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -11,9 +11,6 @@
#include <asm/irq_stack.h>
-bool idtentry_enter_nmi(struct pt_regs *regs);
-void idtentry_exit_nmi(struct pt_regs *regs, bool irq_state);
-
/**
* DECLARE_IDTENTRY - Declare functions for simple IDT entry points
* No error code pushed by hardware
@@ -588,6 +585,9 @@ DECLARE_IDTENTRY_MCE(X86_TRAP_MC, exc_machine_check);
#else
DECLARE_IDTENTRY_RAW(X86_TRAP_MC, exc_machine_check);
#endif
+#ifdef CONFIG_XEN_PV
+DECLARE_IDTENTRY_RAW(X86_TRAP_MC, xenpv_exc_machine_check);
+#endif
#endif
/* NMI */
@@ -608,6 +608,9 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_DB, xenpv_exc_debug);
/* #DF */
DECLARE_IDTENTRY_DF(X86_TRAP_DF, exc_double_fault);
+#ifdef CONFIG_XEN_PV
+DECLARE_IDTENTRY_RAW_ERRORCODE(X86_TRAP_DF, xenpv_exc_double_fault);
+#endif
/* #VC */
#ifdef CONFIG_AMD_MEM_ENCRYPT
@@ -616,6 +619,7 @@ DECLARE_IDTENTRY_VC(X86_TRAP_VC, exc_vmm_communication);
#ifdef CONFIG_XEN_PV
DECLARE_IDTENTRY_XENCB(X86_TRAP_OTHER, exc_xen_hypervisor_callback);
+DECLARE_IDTENTRY_RAW(X86_TRAP_OTHER, exc_xen_unknown_trap);
#endif
/* Device interrupts common/spurious */
diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h
index 5c1ae3eff9d4..a8c3d284fa46 100644
--- a/arch/x86/include/asm/insn.h
+++ b/arch/x86/include/asm/insn.h
@@ -201,6 +201,21 @@ static inline int insn_offset_immediate(struct insn *insn)
return insn_offset_displacement(insn) + insn->displacement.nbytes;
}
+/**
+ * for_each_insn_prefix() -- Iterate prefixes in the instruction
+ * @insn: Pointer to struct insn.
+ * @idx: Index storage.
+ * @prefix: Prefix byte.
+ *
+ * Iterate prefix bytes of given @insn. Each prefix byte is stored in @prefix
+ * and the index is stored in @idx (note that this @idx is just for a cursor,
+ * do not change it.)
+ * Since prefixes.nbytes can be bigger than 4 if some prefixes
+ * are repeated, it cannot be used for looping over the prefixes.
+ */
+#define for_each_insn_prefix(insn, idx, prefix) \
+ for (idx = 0; idx < ARRAY_SIZE(insn->prefixes.bytes) && (prefix = insn->prefixes.bytes[idx]) != 0; idx++)
+
#define POP_SS_OPCODE 0x1f
#define MOV_SREG_OPCODE 0x8e
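A sketch of a for_each_insn_prefix() caller; the helper function is assumed, but the loop shape matches the macro's intended use of scanning the decoded prefix bytes for a specific value such as the 0x67 address-size override:

#include <asm/insn.h>
#include <linux/types.h>

static bool insn_has_addr_size_prefix(struct insn *insn)
{
        insn_byte_t prefix;
        int i;

        for_each_insn_prefix(insn, i, prefix) {
                if (prefix == 0x67)     /* address-size override prefix */
                        return true;
        }
        return false;
}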
diff --git a/arch/x86/include/asm/inst.h b/arch/x86/include/asm/inst.h
index bd7f02480ca1..438ccd4f3cc4 100644
--- a/arch/x86/include/asm/inst.h
+++ b/arch/x86/include/asm/inst.h
@@ -143,21 +143,6 @@
.macro MODRM mod opd1 opd2
.byte \mod | (\opd1 & 7) | ((\opd2 & 7) << 3)
.endm
-
-.macro RDPID opd
- REG_TYPE rdpid_opd_type \opd
- .if rdpid_opd_type == REG_TYPE_R64
- R64_NUM rdpid_opd \opd
- .else
- R32_NUM rdpid_opd \opd
- .endif
- .byte 0xf3
- .if rdpid_opd > 7
- PFX_REX rdpid_opd 0
- .endif
- .byte 0x0f, 0xc7
- MODRM 0xc0 rdpid_opd 0x7
-.endm
#endif
#endif
diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h
index 5e658ba2654a..9abe842dbd84 100644
--- a/arch/x86/include/asm/intel-family.h
+++ b/arch/x86/include/asm/intel-family.h
@@ -97,6 +97,7 @@
#define INTEL_FAM6_LAKEFIELD 0x8A
#define INTEL_FAM6_ALDERLAKE 0x97
+#define INTEL_FAM6_ALDERLAKE_L 0x9A
/* "Small Core" Processors (Atom) */
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index a1a26f6d3aa4..437aa8d00e53 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -13,15 +13,6 @@
* Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar
*/
-/* I/O Unit Redirection Table */
-#define IO_APIC_REDIR_VECTOR_MASK 0x000FF
-#define IO_APIC_REDIR_DEST_LOGICAL 0x00800
-#define IO_APIC_REDIR_DEST_PHYSICAL 0x00000
-#define IO_APIC_REDIR_SEND_PENDING (1 << 12)
-#define IO_APIC_REDIR_REMOTE_IRR (1 << 14)
-#define IO_APIC_REDIR_LEVEL_TRIGGER (1 << 15)
-#define IO_APIC_REDIR_MASKED (1 << 16)
-
/*
* The structure of the IO-APIC:
*/
@@ -65,52 +56,40 @@ union IO_APIC_reg_03 {
};
struct IO_APIC_route_entry {
- __u32 vector : 8,
- delivery_mode : 3, /* 000: FIXED
- * 001: lowest prio
- * 111: ExtINT
- */
- dest_mode : 1, /* 0: physical, 1: logical */
- delivery_status : 1,
- polarity : 1,
- irr : 1,
- trigger : 1, /* 0: edge, 1: level */
- mask : 1, /* 0: enabled, 1: disabled */
- __reserved_2 : 15;
-
- __u32 __reserved_3 : 24,
- dest : 8;
-} __attribute__ ((packed));
-
-struct IR_IO_APIC_route_entry {
- __u64 vector : 8,
- zero : 3,
- index2 : 1,
- delivery_status : 1,
- polarity : 1,
- irr : 1,
- trigger : 1,
- mask : 1,
- reserved : 31,
- format : 1,
- index : 15;
+ union {
+ struct {
+ u64 vector : 8,
+ delivery_mode : 3,
+ dest_mode_logical : 1,
+ delivery_status : 1,
+ active_low : 1,
+ irr : 1,
+ is_level : 1,
+ masked : 1,
+ reserved_0 : 15,
+ reserved_1 : 17,
+ virt_destid_8_14 : 7,
+ destid_0_7 : 8;
+ };
+ struct {
+ u64 ir_shared_0 : 8,
+ ir_zero : 3,
+ ir_index_15 : 1,
+ ir_shared_1 : 5,
+ ir_reserved_0 : 31,
+ ir_format : 1,
+ ir_index_0_14 : 15;
+ };
+ struct {
+ u64 w1 : 32,
+ w2 : 32;
+ };
+ };
} __attribute__ ((packed));
struct irq_alloc_info;
struct ioapic_domain_cfg;
-#define IOAPIC_EDGE 0
-#define IOAPIC_LEVEL 1
-
-#define IOAPIC_MASKED 1
-#define IOAPIC_UNMASKED 0
-
-#define IOAPIC_POL_HIGH 0
-#define IOAPIC_POL_LOW 1
-
-#define IOAPIC_DEST_MODE_PHYSICAL 0
-#define IOAPIC_DEST_MODE_LOGICAL 1
-
#define IOAPIC_MAP_ALLOC 0x1
#define IOAPIC_MAP_CHECK 0x2
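With the IO_APIC_REDIR_ and IOAPIC_ constants gone, an entry is now built through the named bitfields and written to the chip as the two 32-bit words of the same union. A simplified sketch of the pattern used in arch/x86/kernel/apic/io_apic.c (io_apic_write() is that file's internal register accessor; apic, pin, vector and dest are placeholders):

	struct IO_APIC_route_entry entry = { };

	entry.vector	 = vector;
	entry.masked	 = true;	/* was IOAPIC_MASKED   */
	entry.is_level	 = true;	/* was IOAPIC_LEVEL    */
	entry.active_low = false;	/* was IOAPIC_POL_HIGH */
	entry.destid_0_7 = dest & 0xff;

	io_apic_write(apic, 0x11 + 2 * pin, entry.w2);
	io_apic_write(apic, 0x10 + 2 * pin, entry.w1);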
diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h
index bacf68c4d70e..e2de092fc38c 100644
--- a/arch/x86/include/asm/iomap.h
+++ b/arch/x86/include/asm/iomap.h
@@ -9,19 +9,14 @@
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/uaccess.h>
+#include <linux/highmem.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
-void __iomem *
-iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot);
+void __iomem *__iomap_local_pfn_prot(unsigned long pfn, pgprot_t prot);
-void
-iounmap_atomic(void __iomem *kvaddr);
+int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot);
-int
-iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot);
-
-void
-iomap_free(resource_size_t base, unsigned long size);
+void iomap_free(resource_size_t base, unsigned long size);
#endif /* _ASM_X86_IOMAP_H */
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index 528c8a71fe7f..76d389691b5b 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -40,8 +40,6 @@ extern void native_init_IRQ(void);
extern void __handle_irq(struct irq_desc *desc, struct pt_regs *regs);
-extern __visible void do_IRQ(struct pt_regs *regs, unsigned long vector);
-
extern void init_ISA_irqs(void);
extern void __init init_IRQ(void);
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index af4a151d70b3..7cc49432187f 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -44,9 +44,6 @@ extern int irq_remapping_reenable(int);
extern int irq_remap_enable_fault_handling(void);
extern void panic_if_irq_remap(const char *msg);
-extern struct irq_domain *
-irq_remapping_get_irq_domain(struct irq_alloc_info *info);
-
/* Create PCI MSI/MSIx irqdomain, use @parent as the parent irqdomain. */
extern struct irq_domain *
arch_create_remap_msi_irq_domain(struct irq_domain *par, const char *n, int id);
@@ -71,11 +68,5 @@ static inline void panic_if_irq_remap(const char *msg)
{
}
-static inline struct irq_domain *
-irq_remapping_get_irq_domain(struct irq_alloc_info *info)
-{
- return NULL;
-}
-
#endif /* CONFIG_IRQ_REMAP */
#endif /* __X86_IRQ_REMAPPING_H */
diff --git a/arch/x86/include/asm/irqdomain.h b/arch/x86/include/asm/irqdomain.h
index cd684d45cb5f..125c23b7bad3 100644
--- a/arch/x86/include/asm/irqdomain.h
+++ b/arch/x86/include/asm/irqdomain.h
@@ -12,6 +12,9 @@ enum {
X86_IRQ_ALLOC_LEGACY = 0x2,
};
+extern int x86_fwspec_is_ioapic(struct irq_fwspec *fwspec);
+extern int x86_fwspec_is_hpet(struct irq_fwspec *fwspec);
+
extern struct irq_domain *x86_vector_domain;
extern void init_irq_alloc_info(struct irq_alloc_info *info,
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index 2dfc8d380dab..144d70ea4393 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -35,15 +35,6 @@ extern __always_inline unsigned long native_save_fl(void)
return flags;
}
-extern inline void native_restore_fl(unsigned long flags);
-extern inline void native_restore_fl(unsigned long flags)
-{
- asm volatile("push %0 ; popf"
- : /* no output */
- :"g" (flags)
- :"memory", "cc");
-}
-
static __always_inline void native_irq_disable(void)
{
asm volatile("cli": : :"memory");
@@ -79,11 +70,6 @@ static __always_inline unsigned long arch_local_save_flags(void)
return native_save_fl();
}
-static __always_inline void arch_local_irq_restore(unsigned long flags)
-{
- native_restore_fl(flags);
-}
-
static __always_inline void arch_local_irq_disable(void)
{
native_irq_disable();
@@ -131,25 +117,7 @@ static __always_inline unsigned long arch_local_irq_save(void)
#define SAVE_FLAGS(x) pushfq; popq %rax
#endif
-#define SWAPGS swapgs
-/*
- * Currently paravirt can't handle swapgs nicely when we
- * don't have a stack we can rely on (such as a user space
- * stack). So we either find a way around these or just fault
- * and emulate if a guest tries to call swapgs directly.
- *
- * Either way, this is a good way to document that we don't
- * have a reliable stack. x86_64 only.
- */
-#define SWAPGS_UNSAFE_STACK swapgs
-
#define INTERRUPT_RETURN jmp native_iret
-#define USERGS_SYSRET64 \
- swapgs; \
- sysretq;
-#define USERGS_SYSRET32 \
- swapgs; \
- sysretl
#else
#define INTERRUPT_RETURN iret
@@ -170,6 +138,20 @@ static __always_inline int arch_irqs_disabled(void)
return arch_irqs_disabled_flags(flags);
}
+
+static __always_inline void arch_local_irq_restore(unsigned long flags)
+{
+ if (!arch_irqs_disabled_flags(flags))
+ arch_local_irq_enable();
+}
+#else
+#ifdef CONFIG_X86_64
+#ifdef CONFIG_XEN_PV
+#define SWAPGS ALTERNATIVE "swapgs", "", X86_FEATURE_XENPV
+#else
+#define SWAPGS swapgs
+#endif
+#endif
#endif /* !__ASSEMBLY__ */
#endif
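Dropping native_restore_fl() means a flags restore no longer replays the saved EFLAGS with push/popf; callers are unchanged, but restore now only re-enables interrupts when IF was set in the saved value. The caller-visible pattern, for reference:

	unsigned long flags;

	local_irq_save(flags);		/* save EFLAGS.IF state, then cli    */
	/* ... critical section ... */
	local_irq_restore(flags);	/* sti only if IF was set in 'flags' */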
diff --git a/arch/x86/include/asm/kmap_types.h b/arch/x86/include/asm/kmap_types.h
deleted file mode 100644
index 04ab8266e347..000000000000
--- a/arch/x86/include/asm/kmap_types.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_X86_KMAP_TYPES_H
-#define _ASM_X86_KMAP_TYPES_H
-
-#if defined(CONFIG_X86_32) && defined(CONFIG_DEBUG_HIGHMEM)
-#define __WITH_KM_FENCE
-#endif
-
-#include <asm-generic/kmap_types.h>
-
-#undef __WITH_KM_FENCE
-
-#endif /* _ASM_X86_KMAP_TYPES_H */
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index 991a7ad540c7..d20a3d6be36e 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -58,14 +58,17 @@ struct arch_specific_insn {
/* copy of the original instruction */
kprobe_opcode_t *insn;
/*
- * boostable = false: This instruction type is not boostable.
- * boostable = true: This instruction has been boosted: we have
+ * boostable = 0: This instruction type is not boostable.
+ * boostable = 1: This instruction has been boosted: we have
* added a relative jump after the instruction copy in insn,
* so no single-step and fixup are needed (unless there's
* a post_handler).
*/
- bool boostable;
- bool if_modifier;
+ unsigned boostable:1;
+ unsigned if_modifier:1;
+ unsigned is_call:1;
+ unsigned is_pushf:1;
+ unsigned is_abs_ip:1;
/* Number of bytes of text poked */
int tp_len;
};
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d44858b69353..3d6616f6f6ef 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -614,6 +614,7 @@ struct kvm_vcpu_arch {
struct kvm_pio_request pio;
void *pio_data;
+ void *guest_ins_data;
u8 event_exit_inst_len;
@@ -639,6 +640,7 @@ struct kvm_vcpu_arch {
int cpuid_nent;
struct kvm_cpuid_entry2 *cpuid_entries;
+ unsigned long cr3_lm_rsvd_bits;
int maxphyaddr;
int max_tdp_level;
@@ -804,6 +806,9 @@ struct kvm_vcpu_arch {
*/
bool enforce;
} pv_cpuid;
+
+ /* Protected Guests */
+ bool guest_state_protected;
};
struct kvm_lpage_info {
@@ -1005,9 +1010,21 @@ struct kvm_arch {
*/
bool tdp_mmu_enabled;
- /* List of struct tdp_mmu_pages being used as roots */
+ /*
+ * List of struct kvmp_mmu_pages being used as roots.
+ * All struct kvm_mmu_pages in the list should have
+ * tdp_mmu_page set.
+ * All struct kvm_mmu_pages in the list should have a positive
+ * root_count except when a thread holds the MMU lock and is removing
+ * an entry from the list.
+ */
struct list_head tdp_mmu_roots;
- /* List of struct tdp_mmu_pages not being used as roots */
+
+ /*
+ * List of struct kvmp_mmu_pages not being used as roots.
+ * All struct kvm_mmu_pages in the list should have
+ * tdp_mmu_page set and a root_count of 0.
+ */
struct list_head tdp_mmu_pages;
};
@@ -1087,7 +1104,7 @@ struct kvm_x86_ops {
void (*hardware_disable)(void);
void (*hardware_unsetup)(void);
bool (*cpu_has_accelerated_tpr)(void);
- bool (*has_emulated_msr)(u32 index);
+ bool (*has_emulated_msr)(struct kvm *kvm, u32 index);
void (*vcpu_after_set_cpuid)(struct kvm_vcpu *vcpu);
unsigned int vm_size;
@@ -1114,7 +1131,8 @@ struct kvm_x86_ops {
struct kvm_segment *var, int seg);
void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
- int (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
+ bool (*is_valid_cr4)(struct kvm_vcpu *vcpu, unsigned long cr0);
+ void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
int (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
void (*get_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
@@ -1230,6 +1248,7 @@ struct kvm_x86_ops {
void (*enable_log_dirty_pt_masked)(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t offset, unsigned long mask);
+ int (*cpu_dirty_log_size)(void);
/* pmu operations of sub-arch */
const struct kvm_pmu_ops *pmu_ops;
@@ -1279,6 +1298,9 @@ struct kvm_x86_ops {
void (*migrate_timers)(struct kvm_vcpu *vcpu);
void (*msr_filter_changed)(struct kvm_vcpu *vcpu);
+ int (*complete_emulated_msr)(struct kvm_vcpu *vcpu, int err);
+
+ void (*vcpu_deliver_sipi_vector)(struct kvm_vcpu *vcpu, u8 vector);
};
struct kvm_x86_nested_ops {
@@ -1460,6 +1482,7 @@ int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in);
int kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
int kvm_emulate_halt(struct kvm_vcpu *vcpu);
int kvm_vcpu_halt(struct kvm_vcpu *vcpu);
+int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu);
int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
@@ -1469,6 +1492,10 @@ void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
int reason, bool has_error_code, u32 error_code);
+void kvm_free_guest_fpu(struct kvm_vcpu *vcpu);
+
+void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0);
+void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4);
int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
@@ -1655,6 +1682,7 @@ int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
+int kvm_cpu_has_extint(struct kvm_vcpu *v);
int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
@@ -1694,7 +1722,8 @@ void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu);
int kvm_is_in_guest(void);
-int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size);
+void __user *__x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
+ u32 size);
bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu);
bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu);
@@ -1741,4 +1770,6 @@ static inline int kvm_cpu_get_apicid(int mps_cpu)
#define GET_SMSTATE(type, buf, offset) \
(*(type *)((buf) + (offset) - 0x7e00))
+int kvm_cpu_dirty_log_size(void);
+
#endif /* _ASM_X86_KVM_HOST_H */
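Among the KVM changes above, __x86_set_memory_region() now returns the userspace address of the region instead of an int, so callers switch to IS_ERR()/PTR_ERR(). A hedged sketch of the updated call pattern, using the APIC access page slot as the example:

	void __user *hva;

	hva = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
				      APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);
	if (IS_ERR(hva))
		return PTR_ERR(hva);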
diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h
index 1fde1ab6559e..7c5cc6660e4b 100644
--- a/arch/x86/include/asm/livepatch.h
+++ b/arch/x86/include/asm/livepatch.h
@@ -12,9 +12,9 @@
#include <asm/setup.h>
#include <linux/ftrace.h>
-static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip)
+static inline void klp_arch_set_pc(struct ftrace_regs *fregs, unsigned long ip)
{
- regs->ip = ip;
+ ftrace_instruction_pointer_set(fregs, ip);
}
#endif /* _ASM_X86_LIVEPATCH_H */
diff --git a/arch/x86/include/asm/local64.h b/arch/x86/include/asm/local64.h
deleted file mode 100644
index 36c93b5cc239..000000000000
--- a/arch/x86/include/asm/local64.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/local64.h>
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index a0f147893a04..ddfb3cad8dff 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -177,7 +177,8 @@ enum mce_notifier_prios {
MCE_PRIO_EXTLOG,
MCE_PRIO_UC,
MCE_PRIO_EARLY,
- MCE_PRIO_CEC
+ MCE_PRIO_CEC,
+ MCE_PRIO_HIGHEST = MCE_PRIO_CEC
};
struct notifier_block;
@@ -198,16 +199,22 @@ static inline void enable_copy_mc_fragile(void)
}
#endif
+struct cper_ia_proc_ctx;
+
#ifdef CONFIG_X86_MCE
int mcheck_init(void);
void mcheck_cpu_init(struct cpuinfo_x86 *c);
void mcheck_cpu_clear(struct cpuinfo_x86 *c);
void mcheck_vendor_init_severity(void);
+int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info,
+ u64 lapic_id);
#else
static inline int mcheck_init(void) { return 0; }
static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {}
static inline void mcheck_cpu_clear(struct cpuinfo_x86 *c) {}
static inline void mcheck_vendor_init_severity(void) {}
+static inline int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info,
+ u64 lapic_id) { return -EINVAL; }
#endif
#ifdef CONFIG_X86_ANCIENT_MCE
@@ -282,28 +289,6 @@ extern void (*mce_threshold_vector)(void);
extern void (*deferred_error_int_vector)(void);
/*
- * Thermal handler
- */
-
-void intel_init_thermal(struct cpuinfo_x86 *c);
-
-/* Interrupt Handler for core thermal thresholds */
-extern int (*platform_thermal_notify)(__u64 msr_val);
-
-/* Interrupt Handler for package thermal thresholds */
-extern int (*platform_thermal_package_notify)(__u64 msr_val);
-
-/* Callback support of rate control, return true, if
- * callback has rate control */
-extern bool (*platform_thermal_package_rate_control)(void);
-
-#ifdef CONFIG_X86_THERMAL_VECTOR
-extern void mcheck_intel_therm_init(void);
-#else
-static inline void mcheck_intel_therm_init(void) { }
-#endif
-
-/*
* Used by APEI to report memory error via /dev/mcelog
*/
diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
index 2f62bbdd9d12..31c4df123aa0 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -37,6 +37,7 @@ void __init sme_map_bootdata(char *real_mode_data);
void __init sme_unmap_bootdata(char *real_mode_data);
void __init sme_early_init(void);
+void __init sev_setup_arch(void);
void __init sme_encrypt_kernel(struct boot_params *bp);
void __init sme_enable(struct boot_params *bp);
@@ -69,6 +70,7 @@ static inline void __init sme_map_bootdata(char *real_mode_data) { }
static inline void __init sme_unmap_bootdata(char *real_mode_data) { }
static inline void __init sme_early_init(void) { }
+static inline void __init sev_setup_arch(void) { }
static inline void __init sme_encrypt_kernel(struct boot_params *bp) { }
static inline void __init sme_enable(struct boot_params *bp) { }
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index 2b7cc5397f80..ab45a220fac4 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -127,14 +127,12 @@ static inline unsigned int x86_cpuid_family(void)
}
#ifdef CONFIG_MICROCODE
-int __init microcode_init(void);
extern void __init load_ucode_bsp(void);
extern void load_ucode_ap(void);
void reload_early_microcode(void);
extern bool get_builtin_firmware(struct cpio_data *cd, const char *name);
extern bool initrd_gone;
#else
-static inline int __init microcode_init(void) { return 0; };
static inline void __init load_ucode_bsp(void) { }
static inline void load_ucode_ap(void) { }
static inline void reload_early_microcode(void) { }
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 9257667d13c5..5d7494631ea9 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -6,6 +6,12 @@
#include <linux/rwsem.h>
#include <linux/mutex.h>
#include <linux/atomic.h>
+#include <linux/bits.h>
+
+/* Uprobes on this MM assume 32-bit code */
+#define MM_CONTEXT_UPROBE_IA32 BIT(0)
+/* vsyscall page is accessible on this MM */
+#define MM_CONTEXT_HAS_VSYSCALL BIT(1)
/*
* x86 has arch-specific MMU state beyond what lives in mm_struct.
@@ -33,8 +39,7 @@ typedef struct {
#endif
#ifdef CONFIG_X86_64
- /* True if mm supports a task running in 32 bit compatibility mode. */
- unsigned short ia32_compat;
+ unsigned short flags;
#endif
struct mutex lock;
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index d98016b83755..27516046117a 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -91,12 +91,14 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
}
#endif
+#define enter_lazy_tlb enter_lazy_tlb
extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
/*
* Init a new mm. Used on mm copies, like at fork()
* and on mm's that are brand-new, like at execve().
*/
+#define init_new_context init_new_context
static inline int init_new_context(struct task_struct *tsk,
struct mm_struct *mm)
{
@@ -116,6 +118,8 @@ static inline int init_new_context(struct task_struct *tsk,
init_new_context_ldt(mm);
return 0;
}
+
+#define destroy_context destroy_context
static inline void destroy_context(struct mm_struct *mm)
{
destroy_context_ldt(mm);
@@ -177,7 +181,7 @@ static inline void arch_exit_mmap(struct mm_struct *mm)
static inline bool is_64bit_mm(struct mm_struct *mm)
{
return !IS_ENABLED(CONFIG_IA32_EMULATION) ||
- !(mm->context.ia32_compat == TIF_IA32);
+ !(mm->context.flags & MM_CONTEXT_UPROBE_IA32);
}
#else
static inline bool is_64bit_mm(struct mm_struct *mm)
@@ -214,4 +218,6 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
unsigned long __get_current_cr3_fast(void);
+#include <asm-generic/mmu_context.h>
+
#endif /* _ASM_X86_MMU_CONTEXT_H */
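The single ia32_compat value becomes a flags word, so users test and set individual bits. A minimal, illustrative fragment (where the flags are actually set and checked is not shown here):

	/* exec of a 32-bit binary: uprobes must decode 32-bit code */
	mm->context.flags |= MM_CONTEXT_UPROBE_IA32;

	/* e.g. in the vsyscall emulation path */
	if (!(mm->context.flags & MM_CONTEXT_HAS_VSYSCALL))
		return false;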
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index ffc289992d1b..30f76b966857 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -74,6 +74,8 @@ static inline void hv_disable_stimer0_percpu_irq(int irq) {}
#if IS_ENABLED(CONFIG_HYPERV)
+extern int hyperv_init_cpuhp;
+
extern void *hv_hypercall_pg;
extern void __percpu **hyperv_pcpu_input_arg;
diff --git a/arch/x86/include/asm/msi.h b/arch/x86/include/asm/msi.h
index cd30013d15d3..b85147d75626 100644
--- a/arch/x86/include/asm/msi.h
+++ b/arch/x86/include/asm/msi.h
@@ -9,4 +9,54 @@ typedef struct irq_alloc_info msi_alloc_info_t;
int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec,
msi_alloc_info_t *arg);
+/* Structs and defines for the X86 specific MSI message format */
+
+typedef struct x86_msi_data {
+ u32 vector : 8,
+ delivery_mode : 3,
+ dest_mode_logical : 1,
+ reserved : 2,
+ active_low : 1,
+ is_level : 1;
+
+ u32 dmar_subhandle;
+} __attribute__ ((packed)) arch_msi_msg_data_t;
+#define arch_msi_msg_data x86_msi_data
+
+typedef struct x86_msi_addr_lo {
+ union {
+ struct {
+ u32 reserved_0 : 2,
+ dest_mode_logical : 1,
+ redirect_hint : 1,
+ reserved_1 : 1,
+ virt_destid_8_14 : 7,
+ destid_0_7 : 8,
+ base_address : 12;
+ };
+ struct {
+ u32 dmar_reserved_0 : 2,
+ dmar_index_15 : 1,
+ dmar_subhandle_valid : 1,
+ dmar_format : 1,
+ dmar_index_0_14 : 15,
+ dmar_base_address : 12;
+ };
+ };
+} __attribute__ ((packed)) arch_msi_msg_addr_lo_t;
+#define arch_msi_msg_addr_lo x86_msi_addr_lo
+
+#define X86_MSI_BASE_ADDRESS_LOW (0xfee00000 >> 20)
+
+typedef struct x86_msi_addr_hi {
+ u32 reserved : 8,
+ destid_8_31 : 24;
+} __attribute__ ((packed)) arch_msi_msg_addr_hi_t;
+#define arch_msi_msg_addr_hi x86_msi_addr_hi
+
+#define X86_MSI_BASE_ADDRESS_HIGH (0)
+
+struct msi_msg;
+u32 x86_msi_msg_get_destid(struct msi_msg *msg, bool extid);
+
#endif /* _ASM_X86_MSI_H */
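These structs replace the open-coded MSI_ADDR_ / MSI_DATA_ shift macros from msidef.h, which is deleted below. A simplified sketch of how a message is composed with the structured format, modeled on the core __irq_msi_compose_msg() and assuming the matching arch_addr_lo/arch_data union members in struct msi_msg from the accompanying generic header change (apicid and vector are placeholders; 0 is the fixed delivery mode):

#include <linux/msi.h>
#include <linux/string.h>

static void example_compose_msi_msg(struct msi_msg *msg, u32 apicid, u8 vector)
{
	memset(msg, 0, sizeof(*msg));

	msg->arch_addr_lo.base_address	    = X86_MSI_BASE_ADDRESS_LOW;
	msg->arch_addr_lo.dest_mode_logical = 0;	/* physical destination */
	msg->arch_addr_lo.destid_0_7	    = apicid & 0xFF;

	msg->arch_data.delivery_mode	    = 0;	/* fixed delivery */
	msg->arch_data.vector		    = vector;
}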
diff --git a/arch/x86/include/asm/msidef.h b/arch/x86/include/asm/msidef.h
deleted file mode 100644
index ee2f8ccc32d0..000000000000
--- a/arch/x86/include/asm/msidef.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_X86_MSIDEF_H
-#define _ASM_X86_MSIDEF_H
-
-/*
- * Constants for Intel APIC based MSI messages.
- */
-
-/*
- * Shifts for MSI data
- */
-
-#define MSI_DATA_VECTOR_SHIFT 0
-#define MSI_DATA_VECTOR_MASK 0x000000ff
-#define MSI_DATA_VECTOR(v) (((v) << MSI_DATA_VECTOR_SHIFT) & \
- MSI_DATA_VECTOR_MASK)
-
-#define MSI_DATA_DELIVERY_MODE_SHIFT 8
-#define MSI_DATA_DELIVERY_FIXED (0 << MSI_DATA_DELIVERY_MODE_SHIFT)
-#define MSI_DATA_DELIVERY_LOWPRI (1 << MSI_DATA_DELIVERY_MODE_SHIFT)
-
-#define MSI_DATA_LEVEL_SHIFT 14
-#define MSI_DATA_LEVEL_DEASSERT (0 << MSI_DATA_LEVEL_SHIFT)
-#define MSI_DATA_LEVEL_ASSERT (1 << MSI_DATA_LEVEL_SHIFT)
-
-#define MSI_DATA_TRIGGER_SHIFT 15
-#define MSI_DATA_TRIGGER_EDGE (0 << MSI_DATA_TRIGGER_SHIFT)
-#define MSI_DATA_TRIGGER_LEVEL (1 << MSI_DATA_TRIGGER_SHIFT)
-
-/*
- * Shift/mask fields for msi address
- */
-
-#define MSI_ADDR_BASE_HI 0
-#define MSI_ADDR_BASE_LO 0xfee00000
-
-#define MSI_ADDR_DEST_MODE_SHIFT 2
-#define MSI_ADDR_DEST_MODE_PHYSICAL (0 << MSI_ADDR_DEST_MODE_SHIFT)
-#define MSI_ADDR_DEST_MODE_LOGICAL (1 << MSI_ADDR_DEST_MODE_SHIFT)
-
-#define MSI_ADDR_REDIRECTION_SHIFT 3
-#define MSI_ADDR_REDIRECTION_CPU (0 << MSI_ADDR_REDIRECTION_SHIFT)
- /* dedicated cpu */
-#define MSI_ADDR_REDIRECTION_LOWPRI (1 << MSI_ADDR_REDIRECTION_SHIFT)
- /* lowest priority */
-
-#define MSI_ADDR_DEST_ID_SHIFT 12
-#define MSI_ADDR_DEST_ID_MASK 0x00ffff0
-#define MSI_ADDR_DEST_ID(dest) (((dest) << MSI_ADDR_DEST_ID_SHIFT) & \
- MSI_ADDR_DEST_ID_MASK)
-#define MSI_ADDR_EXT_DEST_ID(dest) ((dest) & 0xffffff00)
-
-#define MSI_ADDR_IR_EXT_INT (1 << 4)
-#define MSI_ADDR_IR_SHV (1 << 3)
-#define MSI_ADDR_IR_INDEX1(index) ((index & 0x8000) >> 13)
-#define MSI_ADDR_IR_INDEX2(index) ((index & 0x7fff) << 5)
-#endif /* _ASM_X86_MSIDEF_H */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 972a34d93505..546d6ecf0a35 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -139,6 +139,7 @@
#define MSR_IA32_MCG_CAP 0x00000179
#define MSR_IA32_MCG_STATUS 0x0000017a
#define MSR_IA32_MCG_CTL 0x0000017b
+#define MSR_ERROR_CONTROL 0x0000017f
#define MSR_IA32_MCG_EXT_CTL 0x000004d0
#define MSR_OFFCORE_RSP_0 0x000001a6
@@ -326,8 +327,9 @@
#define MSR_PP1_ENERGY_STATUS 0x00000641
#define MSR_PP1_POLICY 0x00000642
-#define MSR_AMD_PKG_ENERGY_STATUS 0xc001029b
#define MSR_AMD_RAPL_POWER_UNIT 0xc0010299
+#define MSR_AMD_CORE_ENERGY_STATUS 0xc001029a
+#define MSR_AMD_PKG_ENERGY_STATUS 0xc001029b
/* Config TDP MSRs */
#define MSR_CONFIG_TDP_NOMINAL 0x00000648
@@ -470,6 +472,7 @@
#define MSR_AMD64_ICIBSEXTDCTL 0xc001103c
#define MSR_AMD64_IBSOPDATA4 0xc001103d
#define MSR_AMD64_IBS_REG_COUNT_MAX 8 /* includes MSR_AMD64_IBSBRTARGET */
+#define MSR_AMD64_VM_PAGE_FLUSH 0xc001011e
#define MSR_AMD64_SEV_ES_GHCB 0xc0010130
#define MSR_AMD64_SEV 0xc0010131
#define MSR_AMD64_SEV_ENABLED_BIT 0
@@ -609,6 +612,8 @@
#define FEAT_CTL_LOCKED BIT(0)
#define FEAT_CTL_VMX_ENABLED_INSIDE_SMX BIT(1)
#define FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX BIT(2)
+#define FEAT_CTL_SGX_LC_ENABLED BIT(17)
+#define FEAT_CTL_SGX_ENABLED BIT(18)
#define FEAT_CTL_LMCE_ENABLED BIT(20)
#define MSR_IA32_TSC_ADJUST 0x0000003b
@@ -628,6 +633,12 @@
#define MSR_IA32_UCODE_WRITE 0x00000079
#define MSR_IA32_UCODE_REV 0x0000008b
+/* Intel SGX Launch Enclave Public Key Hash MSRs */
+#define MSR_IA32_SGXLEPUBKEYHASH0 0x0000008C
+#define MSR_IA32_SGXLEPUBKEYHASH1 0x0000008D
+#define MSR_IA32_SGXLEPUBKEYHASH2 0x0000008E
+#define MSR_IA32_SGXLEPUBKEYHASH3 0x0000008F
+
#define MSR_IA32_SMM_MONITOR_CTL 0x0000009b
#define MSR_IA32_SMBASE 0x0000009e
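The newly added AMD RAPL MSRs are plain read-only energy counters; a minimal read sketch using rdmsrl_safe() (raw units, scaling via MSR_AMD_RAPL_POWER_UNIT omitted):

	u64 pkg, core;

	if (!rdmsrl_safe(MSR_AMD_PKG_ENERGY_STATUS, &pkg) &&
	    !rdmsrl_safe(MSR_AMD_CORE_ENERGY_STATUS, &core))
		pr_debug("RAPL: pkg=%llu core=%llu (raw units)\n", pkg, core);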
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 0b4920a7238e..e16cccdd0420 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -86,7 +86,7 @@ static inline void do_trace_rdpmc(unsigned int msr, u64 val, int failed) {}
* think of extending them - you will be slapped with a stinking trout or a frozen
* shark will reach you, wherever you are! You've been warned.
*/
-static inline unsigned long long notrace __rdmsr(unsigned int msr)
+static __always_inline unsigned long long __rdmsr(unsigned int msr)
{
DECLARE_ARGS(val, low, high);
@@ -98,7 +98,7 @@ static inline unsigned long long notrace __rdmsr(unsigned int msr)
return EAX_EDX_VAL(val, low, high);
}
-static inline void notrace __wrmsr(unsigned int msr, u32 low, u32 high)
+static __always_inline void __wrmsr(unsigned int msr, u32 low, u32 high)
{
asm volatile("1: wrmsr\n"
"2:\n"
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index e039a933aca3..29dd27b5a339 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -88,8 +88,6 @@ static inline void __mwaitx(unsigned long eax, unsigned long ebx,
static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
{
- trace_hardirqs_on();
-
mds_idle_clear_cpu_buffers();
/* "mwait %eax, %ecx;" */
asm volatile("sti; .byte 0x0f, 0x01, 0xc9;"
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index 9d5d949e662e..1cb9c17a4cb4 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -9,7 +9,6 @@
#ifdef CONFIG_X86_LOCAL_APIC
-extern int avail_to_resrv_perfctr_nmi_bit(unsigned int);
extern int reserve_perfctr_nmi(unsigned int);
extern void release_perfctr_nmi(unsigned int);
extern int reserve_evntsel_nmi(unsigned int);
diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h
index f462895a33e4..faf9cc1c14bb 100644
--- a/arch/x86/include/asm/page_32_types.h
+++ b/arch/x86/include/asm/page_32_types.h
@@ -53,7 +53,13 @@
#define STACK_TOP_MAX STACK_TOP
/*
- * Kernel image size is limited to 512 MB (see in arch/x86/kernel/head_32.S)
+ * In spite of the name, KERNEL_IMAGE_SIZE is a limit on the maximum virtual
+ * address for the kernel image, rather than the limit on the size itself. On
+ * 32-bit, this is not a strict limit, but this value is used to limit the
+ * link-time virtual address range of the kernel, and by KASLR to limit the
+ * randomized address from which the kernel is executed. A relocatable kernel
+ * can be loaded somewhat higher than KERNEL_IMAGE_SIZE as long as enough space
+ * remains for the vmalloc area.
*/
#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024)
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 3f49dac03617..64297eabad63 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -66,7 +66,7 @@
* On Intel CPUs, if a SYSCALL instruction is at the highest canonical
* address, then that syscall will enter the kernel with a
* non-canonical return address, and SYSRET will explode dangerously.
- * We avoid this particular problem by preventing anything executable
+ * We avoid this particular problem by preventing anything
* from being mapped at the maximum canonical address.
*
* On AMD CPUs in the Ryzen family, there's a nasty bug in which the
@@ -98,8 +98,10 @@
#define STACK_TOP_MAX TASK_SIZE_MAX
/*
- * Maximum kernel image size is limited to 1 GiB, due to the fixmap living
- * in the next 1 GiB (see level2_kernel_pgt in arch/x86/kernel/head_64.S).
+ * In spite of the name, KERNEL_IMAGE_SIZE is a limit on the maximum virtual
+ * address for the kernel image, rather than the limit on the size itself.
+ * This can be at most 1 GiB, due to the fixmap living in the next 1 GiB (see
+ * level2_kernel_pgt in arch/x86/kernel/head_64.S).
*
* On KASLR use 1 GiB by default, leaving 1 GiB for modules once the
* page tables are fully set up.
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index d25cc6830e89..4abf110e2243 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -648,11 +648,6 @@ static inline notrace unsigned long arch_local_save_flags(void)
return PVOP_CALLEE0(unsigned long, irq.save_fl);
}
-static inline notrace void arch_local_irq_restore(unsigned long f)
-{
- PVOP_VCALLEE1(irq.restore_fl, f);
-}
-
static inline notrace void arch_local_irq_disable(void)
{
PVOP_VCALLEE0(irq.irq_disable);
@@ -776,31 +771,6 @@ extern void default_banner(void);
#ifdef CONFIG_X86_64
#ifdef CONFIG_PARAVIRT_XXL
-/*
- * If swapgs is used while the userspace stack is still current,
- * there's no way to call a pvop. The PV replacement *must* be
- * inlined, or the swapgs instruction must be trapped and emulated.
- */
-#define SWAPGS_UNSAFE_STACK \
- PARA_SITE(PARA_PATCH(PV_CPU_swapgs), swapgs)
-
-/*
- * Note: swapgs is very special, and in practise is either going to be
- * implemented with a single "swapgs" instruction or something very
- * special. Either way, we don't need to save any registers for
- * it.
- */
-#define SWAPGS \
- PARA_SITE(PARA_PATCH(PV_CPU_swapgs), \
- ANNOTATE_RETPOLINE_SAFE; \
- call PARA_INDIRECT(pv_ops+PV_CPU_swapgs); \
- )
-
-#define USERGS_SYSRET64 \
- PARA_SITE(PARA_PATCH(PV_CPU_usergs_sysret64), \
- ANNOTATE_RETPOLINE_SAFE; \
- jmp PARA_INDIRECT(pv_ops+PV_CPU_usergs_sysret64);)
-
#ifdef CONFIG_DEBUG_ENTRY
#define SAVE_FLAGS(clobbers) \
PARA_SITE(PARA_PATCH(PV_IRQ_save_fl), \
@@ -812,17 +782,6 @@ extern void default_banner(void);
#endif /* CONFIG_PARAVIRT_XXL */
#endif /* CONFIG_X86_64 */
-#ifdef CONFIG_PARAVIRT_XXL
-
-#define GET_CR2_INTO_AX \
- PARA_SITE(PARA_PATCH(PV_MMU_read_cr2), \
- ANNOTATE_RETPOLINE_SAFE; \
- call PARA_INDIRECT(pv_ops+PV_MMU_read_cr2); \
- )
-
-#endif /* CONFIG_PARAVIRT_XXL */
-
-
#endif /* __ASSEMBLY__ */
#else /* CONFIG_PARAVIRT */
# define default_banner x86_init_noop
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 0fad9f61c76a..de87087d3bde 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -41,7 +41,6 @@
#ifndef __ASSEMBLY__
#include <asm/desc_defs.h>
-#include <asm/kmap_types.h>
#include <asm/pgtable_types.h>
#include <asm/nospec-branch.h>
@@ -157,20 +156,10 @@ struct pv_cpu_ops {
u64 (*read_pmc)(int counter);
- /*
- * Switch to usermode gs and return to 64-bit usermode using
- * sysret. Only used in 64-bit kernels to return to 64-bit
- * processes. Usermode register state, including %rsp, must
- * already be restored.
- */
- void (*usergs_sysret64)(void);
-
/* Normal iret. Jump to this with the standard iret stack
frame set up. */
void (*iret)(void);
- void (*swapgs)(void);
-
void (*start_context_switch)(struct task_struct *prev);
void (*end_context_switch)(struct task_struct *next);
#endif
@@ -179,16 +168,13 @@ struct pv_cpu_ops {
struct pv_irq_ops {
#ifdef CONFIG_PARAVIRT_XXL
/*
- * Get/set interrupt state. save_fl and restore_fl are only
- * expected to use X86_EFLAGS_IF; all other bits
- * returned from save_fl are undefined, and may be ignored by
- * restore_fl.
+ * Get/set interrupt state. save_fl is expected to use X86_EFLAGS_IF;
+ * all other bits returned from save_fl are undefined.
*
* NOTE: These functions callers expect the callee to preserve
* more registers than the standard C calling convention.
*/
struct paravirt_callee_save save_fl;
- struct paravirt_callee_save restore_fl;
struct paravirt_callee_save irq_disable;
struct paravirt_callee_save irq_enable;
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 6960cd6d1f23..544f41a179fb 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -137,7 +137,9 @@ union cpuid10_edx {
struct {
unsigned int num_counters_fixed:5;
unsigned int bit_width_fixed:8;
- unsigned int reserved:19;
+ unsigned int reserved1:2;
+ unsigned int anythread_deprecated:1;
+ unsigned int reserved2:16;
} split;
unsigned int full;
};
@@ -259,8 +261,12 @@ struct x86_pmu_capability {
#define INTEL_PMC_IDX_TD_BAD_SPEC (INTEL_PMC_IDX_METRIC_BASE + 1)
#define INTEL_PMC_IDX_TD_FE_BOUND (INTEL_PMC_IDX_METRIC_BASE + 2)
#define INTEL_PMC_IDX_TD_BE_BOUND (INTEL_PMC_IDX_METRIC_BASE + 3)
-#define INTEL_PMC_IDX_METRIC_END INTEL_PMC_IDX_TD_BE_BOUND
-#define INTEL_PMC_MSK_TOPDOWN ((0xfull << INTEL_PMC_IDX_METRIC_BASE) | \
+#define INTEL_PMC_IDX_TD_HEAVY_OPS (INTEL_PMC_IDX_METRIC_BASE + 4)
+#define INTEL_PMC_IDX_TD_BR_MISPREDICT (INTEL_PMC_IDX_METRIC_BASE + 5)
+#define INTEL_PMC_IDX_TD_FETCH_LAT (INTEL_PMC_IDX_METRIC_BASE + 6)
+#define INTEL_PMC_IDX_TD_MEM_BOUND (INTEL_PMC_IDX_METRIC_BASE + 7)
+#define INTEL_PMC_IDX_METRIC_END INTEL_PMC_IDX_TD_MEM_BOUND
+#define INTEL_PMC_MSK_TOPDOWN ((0xffull << INTEL_PMC_IDX_METRIC_BASE) | \
INTEL_PMC_MSK_FIXED_SLOTS)
/*
@@ -278,8 +284,14 @@ struct x86_pmu_capability {
#define INTEL_TD_METRIC_BAD_SPEC 0x8100 /* Bad speculation metric */
#define INTEL_TD_METRIC_FE_BOUND 0x8200 /* FE bound metric */
#define INTEL_TD_METRIC_BE_BOUND 0x8300 /* BE bound metric */
-#define INTEL_TD_METRIC_MAX INTEL_TD_METRIC_BE_BOUND
-#define INTEL_TD_METRIC_NUM 4
+/* Level 2 metrics */
+#define INTEL_TD_METRIC_HEAVY_OPS 0x8400 /* Heavy Operations metric */
+#define INTEL_TD_METRIC_BR_MISPREDICT 0x8500 /* Branch Mispredict metric */
+#define INTEL_TD_METRIC_FETCH_LAT 0x8600 /* Fetch Latency metric */
+#define INTEL_TD_METRIC_MEM_BOUND 0x8700 /* Memory bound metric */
+
+#define INTEL_TD_METRIC_MAX INTEL_TD_METRIC_MEM_BOUND
+#define INTEL_TD_METRIC_NUM 8
static inline bool is_metric_idx(int idx)
{
@@ -481,11 +493,7 @@ static inline void perf_check_microcode(void) { }
extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr);
extern int x86_perf_get_lbr(struct x86_pmu_lbr *lbr);
#else
-static inline struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
-{
- *nr = 0;
- return NULL;
-}
+struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr);
static inline int x86_perf_get_lbr(struct x86_pmu_lbr *lbr)
{
return -1;
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index d7acae4120d5..7c9c968a42ef 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -57,19 +57,13 @@ do { \
#endif
/*
- * This is how much memory in addition to the memory covered up to
- * and including _end we need mapped initially.
- * We need:
- * (KERNEL_IMAGE_SIZE/4096) / 1024 pages (worst case, non PAE)
- * (KERNEL_IMAGE_SIZE/4096) / 512 + 4 pages (worst case for PAE)
+ * This is used to calculate the .brk reservation for initial pagetables.
+ * Enough space is reserved to allocate pagetables sufficient to cover all
+ * of LOWMEM_PAGES, which is an upper bound on the size of the direct map of
+ * lowmem.
*
- * Modulo rounding, each megabyte assigned here requires a kilobyte of
- * memory, which is currently unreclaimed.
- *
- * This should be a multiple of a page.
- *
- * KERNEL_IMAGE_SIZE should be greater than pa(_end)
- * and small than max_low_pfn, otherwise will waste some page table entries
+ * With PAE paging (PTRS_PER_PMD > 1), we allocate PTRS_PER_PGD == 4 pages for
+ * the PMD's in addition to the pages required for the last level pagetables.
*/
#if PTRS_PER_PMD > 1
#define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) + PTRS_PER_PGD)
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 52e5f5f2240d..91ac10654570 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -143,7 +143,11 @@ extern unsigned int ptrs_per_p4d;
#define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE)
/* The module sections ends with the start of the fixmap */
-#define MODULES_END _AC(0xffffffffff000000, UL)
+#ifndef CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP
+# define MODULES_END _AC(0xffffffffff000000, UL)
+#else
+# define MODULES_END _AC(0xfffffffffe000000, UL)
+#endif
#define MODULES_LEN (MODULES_END - MODULES_VADDR)
#define ESPFIX_PGD_ENTRY _AC(-2, UL)
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 816b31c68550..f24d7ef8fffa 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -155,6 +155,7 @@ enum page_cache_mode {
#define _PAGE_ENC (_AT(pteval_t, sme_me_mask))
#define _PAGE_CACHE_MASK (_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)
+#define _PAGE_LARGE_CACHE_MASK (_PAGE_PWT | _PAGE_PCD | _PAGE_PAT_LARGE)
#define _PAGE_NOCACHE (cachemode2protval(_PAGE_CACHE_MODE_UC))
#define _PAGE_CACHE_WP (cachemode2protval(_PAGE_CACHE_MODE_WP))
@@ -176,8 +177,6 @@ enum page_cache_mode {
#define __pgprot(x) ((pgprot_t) { (x) } )
#define __pg(x) __pgprot(x)
-#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
-
#define PAGE_NONE __pg( 0| 0| 0|___A| 0| 0| 0|___G)
#define PAGE_SHARED __pg(__PP|__RW|_USR|___A|__NX| 0| 0| 0)
#define PAGE_SHARED_EXEC __pg(__PP|__RW|_USR|___A| 0| 0| 0| 0)
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index 69485ca13665..f8cb8af4de5c 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -5,6 +5,7 @@
#include <asm/rmwcc.h>
#include <asm/percpu.h>
#include <linux/thread_info.h>
+#include <linux/static_call_types.h>
DECLARE_PER_CPU(int, __preempt_count);
@@ -103,16 +104,45 @@ static __always_inline bool should_resched(int preempt_offset)
}
#ifdef CONFIG_PREEMPTION
- extern asmlinkage void preempt_schedule_thunk(void);
-# define __preempt_schedule() \
- asm volatile ("call preempt_schedule_thunk" : ASM_CALL_CONSTRAINT)
- extern asmlinkage void preempt_schedule(void);
- extern asmlinkage void preempt_schedule_notrace_thunk(void);
-# define __preempt_schedule_notrace() \
- asm volatile ("call preempt_schedule_notrace_thunk" : ASM_CALL_CONSTRAINT)
+extern asmlinkage void preempt_schedule(void);
+extern asmlinkage void preempt_schedule_thunk(void);
- extern asmlinkage void preempt_schedule_notrace(void);
-#endif
+#define __preempt_schedule_func preempt_schedule_thunk
+
+extern asmlinkage void preempt_schedule_notrace(void);
+extern asmlinkage void preempt_schedule_notrace_thunk(void);
+
+#define __preempt_schedule_notrace_func preempt_schedule_notrace_thunk
+
+#ifdef CONFIG_PREEMPT_DYNAMIC
+
+DECLARE_STATIC_CALL(preempt_schedule, __preempt_schedule_func);
+
+#define __preempt_schedule() \
+do { \
+ __STATIC_CALL_MOD_ADDRESSABLE(preempt_schedule); \
+ asm volatile ("call " STATIC_CALL_TRAMP_STR(preempt_schedule) : ASM_CALL_CONSTRAINT); \
+} while (0)
+
+DECLARE_STATIC_CALL(preempt_schedule_notrace, __preempt_schedule_notrace_func);
+
+#define __preempt_schedule_notrace() \
+do { \
+ __STATIC_CALL_MOD_ADDRESSABLE(preempt_schedule_notrace); \
+ asm volatile ("call " STATIC_CALL_TRAMP_STR(preempt_schedule_notrace) : ASM_CALL_CONSTRAINT); \
+} while (0)
+
+#else /* PREEMPT_DYNAMIC */
+
+#define __preempt_schedule() \
+ asm volatile ("call preempt_schedule_thunk" : ASM_CALL_CONSTRAINT);
+
+#define __preempt_schedule_notrace() \
+ asm volatile ("call preempt_schedule_notrace_thunk" : ASM_CALL_CONSTRAINT);
+
+#endif /* PREEMPT_DYNAMIC */
+
+#endif /* PREEMPTION */
#endif /* __ASM_PREEMPT_H */
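The dynamic variant routes __preempt_schedule() through a static call so the preemption model can be switched at boot. For reference, a hedged sketch of the generic static_call pattern these macros specialize (the names here are made up):

#include <linux/static_call.h>
#include <linux/printk.h>

static void loud(const char *s)  { pr_info("%s\n", s); }
static void quiet(const char *s) { }

DEFINE_STATIC_CALL(example_log, loud);

static void example(void)
{
	static_call(example_log)("hello");	/* patched direct call */
	static_call_update(example_log, quiet);	/* retarget at runtime */
	static_call(example_log)("now a no-op");
}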
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 82a08b585818..c20a52b5534b 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -813,10 +813,8 @@ extern int set_tsc_mode(unsigned int val);
DECLARE_PER_CPU(u64, msr_misc_features_shadow);
#ifdef CONFIG_CPU_SUP_AMD
-extern u16 amd_get_nb_id(int cpu);
extern u32 amd_get_nodes_per_socket(void);
#else
-static inline u16 amd_get_nb_id(int cpu) { return 0; }
static inline u32 amd_get_nodes_per_socket(void) { return 0; }
#endif
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
index 3ff0d48469f2..b2d504f11937 100644
--- a/arch/x86/include/asm/required-features.h
+++ b/arch/x86/include/asm/required-features.h
@@ -101,6 +101,7 @@
#define REQUIRED_MASK16 0
#define REQUIRED_MASK17 0
#define REQUIRED_MASK18 0
-#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19)
+#define REQUIRED_MASK19 0
+#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 20)
#endif /* _ASM_X86_REQUIRED_FEATURES_H */
diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h
index 07603064df8f..d60ed0668a59 100644
--- a/arch/x86/include/asm/resctrl.h
+++ b/arch/x86/include/asm/resctrl.h
@@ -56,19 +56,22 @@ static void __resctrl_sched_in(void)
struct resctrl_pqr_state *state = this_cpu_ptr(&pqr_state);
u32 closid = state->default_closid;
u32 rmid = state->default_rmid;
+ u32 tmp;
/*
* If this task has a closid/rmid assigned, use it.
* Else use the closid/rmid assigned to this cpu.
*/
if (static_branch_likely(&rdt_alloc_enable_key)) {
- if (current->closid)
- closid = current->closid;
+ tmp = READ_ONCE(current->closid);
+ if (tmp)
+ closid = tmp;
}
if (static_branch_likely(&rdt_mon_enable_key)) {
- if (current->rmid)
- rmid = current->rmid;
+ tmp = READ_ONCE(current->rmid);
+ if (tmp)
+ rmid = tmp;
}
if (closid != state->cur_closid || rmid != state->cur_rmid) {
diff --git a/arch/x86/include/asm/seccomp.h b/arch/x86/include/asm/seccomp.h
index 2bd1338de236..fef16e398161 100644
--- a/arch/x86/include/asm/seccomp.h
+++ b/arch/x86/include/asm/seccomp.h
@@ -16,6 +16,26 @@
#define __NR_seccomp_sigreturn_32 __NR_ia32_sigreturn
#endif
+#ifdef CONFIG_X86_64
+# define SECCOMP_ARCH_NATIVE AUDIT_ARCH_X86_64
+# define SECCOMP_ARCH_NATIVE_NR NR_syscalls
+# define SECCOMP_ARCH_NATIVE_NAME "x86_64"
+# ifdef CONFIG_COMPAT
+# define SECCOMP_ARCH_COMPAT AUDIT_ARCH_I386
+# define SECCOMP_ARCH_COMPAT_NR IA32_NR_syscalls
+# define SECCOMP_ARCH_COMPAT_NAME "ia32"
+# endif
+/*
+ * x32 will have __X32_SYSCALL_BIT set in syscall number. We don't support
+ * caching them and they are treated as out of range syscalls, which will
+ * always pass through the BPF filter.
+ */
+#else /* !CONFIG_X86_64 */
+# define SECCOMP_ARCH_NATIVE AUDIT_ARCH_I386
+# define SECCOMP_ARCH_NATIVE_NR NR_syscalls
+# define SECCOMP_ARCH_NATIVE_NAME "ia32"
+#endif
+
#include <asm-generic/seccomp.h>
#endif /* _ASM_X86_SECCOMP_H */
diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h
index 5948218f35c5..4352f08bfbb5 100644
--- a/arch/x86/include/asm/set_memory.h
+++ b/arch/x86/include/asm/set_memory.h
@@ -82,6 +82,7 @@ int set_pages_rw(struct page *page, int numpages);
int set_direct_map_invalid_noflush(struct page *page);
int set_direct_map_default_noflush(struct page *page);
+bool kernel_page_present(struct page *page);
extern int kernel_set_to_readonly;
diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h
index 6bfc878f6771..6a9ccc1b2be5 100644
--- a/arch/x86/include/asm/sparsemem.h
+++ b/arch/x86/include/asm/sparsemem.h
@@ -28,4 +28,14 @@
#endif
#endif /* CONFIG_SPARSEMEM */
+
+#ifndef __ASSEMBLY__
+#ifdef CONFIG_NUMA_KEEP_MEMINFO
+extern int phys_to_target_node(phys_addr_t start);
+#define phys_to_target_node phys_to_target_node
+extern int memory_add_physaddr_to_nid(u64 start);
+#define memory_add_physaddr_to_nid memory_add_physaddr_to_nid
+#endif
+#endif /* __ASSEMBLY__ */
+
#endif /* _ASM_X86_SPARSEMEM_H */
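These declarations back NUMA-aware memory hotplug; a hedged usage fragment (res is a placeholder struct resource for the range being added):

	int nid = phys_to_target_node(res->start);

	if (nid == NUMA_NO_NODE)
		nid = memory_add_physaddr_to_nid(res->start);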
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index cc177b4431ae..1d3cbaef4bb7 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -243,10 +243,10 @@ static inline void serialize(void)
}
/* The dst parameter must be 64-bytes aligned */
-static inline void movdir64b(void *dst, const void *src)
+static inline void movdir64b(void __iomem *dst, const void *src)
{
const struct { char _[64]; } *__src = src;
- struct { char _[64]; } *__dst = dst;
+ struct { char _[64]; } __iomem *__dst = dst;
/*
* MOVDIR64B %(rdx), rax.
@@ -286,7 +286,7 @@ static inline void movdir64b(void *dst, const void *src)
static inline int enqcmds(void __iomem *dst, const void *src)
{
const struct { char _[64]; } *__src = src;
- struct { char _[64]; } *__dst = dst;
+ struct { char _[64]; } __iomem *__dst = dst;
int zf;
/*
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index 49600643faba..f248eb2ac2d4 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -88,9 +88,6 @@ get_stack_pointer(struct task_struct *task, struct pt_regs *regs)
return (unsigned long *)task->thread.sp;
}
-void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, const char *log_lvl);
-
/* The form of the top of the frame on the stack */
struct stack_frame {
struct stack_frame *next_frame;
diff --git a/arch/x86/include/asm/static_call.h b/arch/x86/include/asm/static_call.h
index c37f11999d0c..cbb67b6030f9 100644
--- a/arch/x86/include/asm/static_call.h
+++ b/arch/x86/include/asm/static_call.h
@@ -37,4 +37,11 @@
#define ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name) \
__ARCH_DEFINE_STATIC_CALL_TRAMP(name, "ret; nop; nop; nop; nop")
+
+#define ARCH_ADD_TRAMP_KEY(name) \
+ asm(".pushsection .static_call_tramp_key, \"a\" \n" \
+ ".long " STATIC_CALL_TRAMP_STR(name) " - . \n" \
+ ".long " STATIC_CALL_KEY_STR(name) " - . \n" \
+ ".popsection \n")
+
#endif /* _ASM_STATIC_CALL_H */
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 71d630bb5e08..1c561945b426 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -98,6 +98,16 @@ enum {
INTERCEPT_MWAIT_COND,
INTERCEPT_XSETBV,
INTERCEPT_RDPRU,
+ TRAP_EFER_WRITE,
+ TRAP_CR0_WRITE,
+ TRAP_CR1_WRITE,
+ TRAP_CR2_WRITE,
+ TRAP_CR3_WRITE,
+ TRAP_CR4_WRITE,
+ TRAP_CR5_WRITE,
+ TRAP_CR6_WRITE,
+ TRAP_CR7_WRITE,
+ TRAP_CR8_WRITE,
/* Byte offset 014h (word 5) */
INTERCEPT_INVLPGB = 160,
INTERCEPT_INVLPGB_ILLEGAL,
@@ -130,7 +140,7 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
u32 exit_int_info_err;
u64 nested_ctl;
u64 avic_vapic_bar;
- u8 reserved_4[8];
+ u64 ghcb_gpa;
u32 event_inj;
u32 event_inj_err;
u64 nested_cr3;
@@ -144,6 +154,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
u8 reserved_6[8]; /* Offset 0xe8 */
u64 avic_logical_id; /* Offset 0xf0 */
u64 avic_physical_id; /* Offset 0xf8 */
+ u8 reserved_7[8];
+ u64 vmsa_pa; /* Used for an SEV-ES guest */
};
@@ -178,7 +190,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
#define LBR_CTL_ENABLE_MASK BIT_ULL(0)
#define VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK BIT_ULL(1)
-#define SVM_INTERRUPT_SHADOW_MASK 1
+#define SVM_INTERRUPT_SHADOW_MASK BIT_ULL(0)
+#define SVM_GUEST_INTERRUPT_MASK BIT_ULL(1)
#define SVM_IOIO_STR_SHIFT 2
#define SVM_IOIO_REP_SHIFT 3
@@ -197,6 +210,7 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
#define SVM_NESTED_CTL_NP_ENABLE BIT(0)
#define SVM_NESTED_CTL_SEV_ENABLE BIT(1)
+#define SVM_NESTED_CTL_SEV_ES_ENABLE BIT(2)
struct vmcb_seg {
u16 selector;
@@ -220,7 +234,8 @@ struct vmcb_save_area {
u8 cpl;
u8 reserved_2[4];
u64 efer;
- u8 reserved_3[112];
+ u8 reserved_3[104];
+ u64 xss; /* Valid for SEV-ES only */
u64 cr4;
u64 cr3;
u64 cr0;
@@ -251,9 +266,12 @@ struct vmcb_save_area {
/*
* The following part of the save area is valid only for
- * SEV-ES guests when referenced through the GHCB.
+ * SEV-ES guests when referenced through the GHCB or for
+ * saving to the host save area.
*/
- u8 reserved_7[104];
+ u8 reserved_7[80];
+ u32 pkru;
+ u8 reserved_7a[20];
u64 reserved_8; /* rax already available at 0x01f8 */
u64 rcx;
u64 rdx;
@@ -294,7 +312,7 @@ struct ghcb {
#define EXPECTED_VMCB_SAVE_AREA_SIZE 1032
-#define EXPECTED_VMCB_CONTROL_AREA_SIZE 256
+#define EXPECTED_VMCB_CONTROL_AREA_SIZE 272
#define EXPECTED_GHCB_SIZE PAGE_SIZE
static inline void __unused_size_checks(void)
@@ -379,6 +397,16 @@ struct vmcb {
(unsigned long *)&ghcb->save.valid_bitmap); \
} \
\
+ static inline u64 ghcb_get_##field(struct ghcb *ghcb) \
+ { \
+ return ghcb->save.field; \
+ } \
+ \
+ static inline u64 ghcb_get_##field##_if_valid(struct ghcb *ghcb) \
+ { \
+ return ghcb_##field##_is_valid(ghcb) ? ghcb->save.field : 0; \
+ } \
+ \
static inline void ghcb_set_##field(struct ghcb *ghcb, u64 value) \
{ \
__set_bit(GHCB_BITMAP_IDX(field), \
diff --git a/arch/x86/include/asm/sync_core.h b/arch/x86/include/asm/sync_core.h
index 0fd4a9dfb29c..ab7382f92aff 100644
--- a/arch/x86/include/asm/sync_core.h
+++ b/arch/x86/include/asm/sync_core.h
@@ -98,12 +98,13 @@ static inline void sync_core_before_usermode(void)
/* With PTI, we unconditionally serialize before running user code. */
if (static_cpu_has(X86_FEATURE_PTI))
return;
+
/*
- * Return from interrupt and NMI is done through iret, which is core
- * serializing.
+ * Even if we're in an interrupt, we might reschedule before returning,
+ * in which case we could switch to a different thread in the same mm
+ * and return using SYSRET or SYSEXIT. Instead of trying to keep
+ * track of our need to sync the core, just sync right away.
*/
- if (in_irq() || in_nmi())
- return;
sync_core();
}
diff --git a/arch/x86/include/asm/thermal.h b/arch/x86/include/asm/thermal.h
new file mode 100644
index 000000000000..ddbdefd5b94f
--- /dev/null
+++ b/arch/x86/include/asm/thermal.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_THERMAL_H
+#define _ASM_X86_THERMAL_H
+
+#ifdef CONFIG_X86_THERMAL_VECTOR
+void intel_init_thermal(struct cpuinfo_x86 *c);
+bool x86_thermal_enabled(void);
+void intel_thermal_interrupt(void);
+#else
+static inline void intel_init_thermal(struct cpuinfo_x86 *c) { }
+#endif
+
+#endif /* _ASM_X86_THERMAL_H */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 44733a4bfc42..0d751d5da702 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -55,6 +55,7 @@ struct task_struct;
struct thread_info {
unsigned long flags; /* low level flags */
+ unsigned long syscall_work; /* SYSCALL_WORK_ flags */
u32 status; /* thread synchronous flags */
};
@@ -74,15 +75,11 @@ struct thread_info {
* - these are process state flags that various assembly files
* may need to access
*/
-#define TIF_SYSCALL_TRACE 0 /* syscall trace active */
#define TIF_NOTIFY_RESUME 1 /* callback before returning to user */
#define TIF_SIGPENDING 2 /* signal pending */
#define TIF_NEED_RESCHED 3 /* rescheduling necessary */
#define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/
#define TIF_SSBD 5 /* Speculative store bypass disable */
-#define TIF_SYSCALL_EMU 6 /* syscall emulation active */
-#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
-#define TIF_SECCOMP 8 /* secure computing */
#define TIF_SPEC_IB 9 /* Indirect branch speculation mitigation */
#define TIF_SPEC_FORCE_UPDATE 10 /* Force speculation MSR update in context switch */
#define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
@@ -91,7 +88,7 @@ struct thread_info {
#define TIF_NEED_FPU_LOAD 14 /* load FPU on return to userspace */
#define TIF_NOCPUID 15 /* CPUID is not accessible in userland */
#define TIF_NOTSC 16 /* TSC is not accessible in userland */
-#define TIF_IA32 17 /* IA32 compatibility process */
+#define TIF_NOTIFY_SIGNAL 17 /* signal notifications exist */
#define TIF_SLD 18 /* Restore split lock detection on context switch */
#define TIF_MEMDIE 20 /* is terminating due to OOM killer */
#define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */
@@ -99,19 +96,13 @@ struct thread_info {
#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */
#define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */
#define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */
-#define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */
#define TIF_ADDR32 29 /* 32-bit address space on 64 bits */
-#define TIF_X32 30 /* 32-bit native x86-64 binary */
-#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
#define _TIF_SIGPENDING (1 << TIF_SIGPENDING)
#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
#define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP)
#define _TIF_SSBD (1 << TIF_SSBD)
-#define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU)
-#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
-#define _TIF_SECCOMP (1 << TIF_SECCOMP)
#define _TIF_SPEC_IB (1 << TIF_SPEC_IB)
#define _TIF_SPEC_FORCE_UPDATE (1 << TIF_SPEC_FORCE_UPDATE)
#define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY)
@@ -120,16 +111,14 @@ struct thread_info {
#define _TIF_NEED_FPU_LOAD (1 << TIF_NEED_FPU_LOAD)
#define _TIF_NOCPUID (1 << TIF_NOCPUID)
#define _TIF_NOTSC (1 << TIF_NOTSC)
-#define _TIF_IA32 (1 << TIF_IA32)
+#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL)
#define _TIF_SLD (1 << TIF_SLD)
#define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG)
#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP)
#define _TIF_FORCED_TF (1 << TIF_FORCED_TF)
#define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP)
#define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES)
-#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT)
#define _TIF_ADDR32 (1 << TIF_ADDR32)
-#define _TIF_X32 (1 << TIF_X32)
/* flags to check in __switch_to() */
#define _TIF_WORK_CTXSW_BASE \
diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h
index 820082bd6880..1bfe979bb9bc 100644
--- a/arch/x86/include/asm/tlb.h
+++ b/arch/x86/include/asm/tlb.h
@@ -4,7 +4,6 @@
#define tlb_start_vma(tlb, vma) do { } while (0)
#define tlb_end_vma(tlb, vma) do { } while (0)
-#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
#define tlb_flush tlb_flush
static inline void tlb_flush(struct mmu_gather *tlb);
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index f4234575f3fd..9239399e5491 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -110,6 +110,8 @@ extern const struct cpumask *cpu_coregroup_mask(int cpu);
#define topology_die_id(cpu) (cpu_data(cpu).cpu_die_id)
#define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id)
+extern unsigned int __max_die_per_package;
+
#ifdef CONFIG_SMP
#define topology_die_cpumask(cpu) (per_cpu(cpu_die_map, cpu))
#define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu))
@@ -118,8 +120,6 @@ extern const struct cpumask *cpu_coregroup_mask(int cpu);
extern unsigned int __max_logical_packages;
#define topology_max_packages() (__max_logical_packages)
-extern unsigned int __max_die_per_package;
-
static inline int topology_max_die_per_package(void)
{
return __max_die_per_package;
@@ -218,4 +218,9 @@ static inline void arch_set_max_freq_ratio(bool turbo_disabled)
}
#endif
+#ifdef CONFIG_ACPI_CPPC_LIB
+void init_freq_invariance_cppc(void);
+#define init_freq_invariance_cppc init_freq_invariance_cppc
+#endif
+
#endif /* _ASM_X86_TOPOLOGY_H */
diff --git a/arch/x86/include/asm/trap_pf.h b/arch/x86/include/asm/trap_pf.h
index 305bc1214aef..10b1de500ab1 100644
--- a/arch/x86/include/asm/trap_pf.h
+++ b/arch/x86/include/asm/trap_pf.h
@@ -11,6 +11,7 @@
* bit 3 == 1: use of reserved bit detected
* bit 4 == 1: fault was an instruction fetch
* bit 5 == 1: protection keys block access
+ * bit 15 == 1: SGX MMU page-fault
*/
enum x86_pf_error_code {
X86_PF_PROT = 1 << 0,
@@ -19,6 +20,7 @@ enum x86_pf_error_code {
X86_PF_RSVD = 1 << 3,
X86_PF_INSTR = 1 << 4,
X86_PF_PK = 1 << 5,
+ X86_PF_SGX = 1 << 15,
};
#endif /* _ASM_X86_TRAP_PF_H */
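Bit 15 lets the #PF handler distinguish SGX/EPCM-induced faults, which cannot be resolved through the page tables. A simplified sketch of how a handler keys on it (address is a placeholder; SEGV_SGXERR comes from the SGX uapi additions):

	if (error_code & X86_PF_SGX) {
		/* Enclave fault: nothing to fix up, report it to the task. */
		force_sig_fault(SIGSEGV, SEGV_SGXERR, (void __user *)address);
		return;
	}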
diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h
index 08b3d810dfba..1b6455f881f9 100644
--- a/arch/x86/include/asm/uv/bios.h
+++ b/arch/x86/include/asm/uv/bios.h
@@ -28,6 +28,20 @@ enum uv_bios_cmd {
UV_BIOS_SET_LEGACY_VGA_TARGET
};
+#define UV_BIOS_EXTRA 0x10000
+#define UV_BIOS_GET_PCI_TOPOLOGY 0x10001
+#define UV_BIOS_GET_GEOINFO 0x10003
+
+#define UV_BIOS_EXTRA_OP_MEM_COPYIN 0x1000
+#define UV_BIOS_EXTRA_OP_MEM_COPYOUT 0x2000
+#define UV_BIOS_EXTRA_OP_MASK 0x0fff
+#define UV_BIOS_EXTRA_GET_HEAPSIZE 1
+#define UV_BIOS_EXTRA_INSTALL_HEAP 2
+#define UV_BIOS_EXTRA_MASTER_NASID 3
+#define UV_BIOS_EXTRA_OBJECT_COUNT (10|UV_BIOS_EXTRA_OP_MEM_COPYOUT)
+#define UV_BIOS_EXTRA_ENUM_OBJECTS (12|UV_BIOS_EXTRA_OP_MEM_COPYOUT)
+#define UV_BIOS_EXTRA_ENUM_PORTS (13|UV_BIOS_EXTRA_OP_MEM_COPYOUT)
+
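The extra-op word packs a small operation index together with a copy-direction flag, as the OBJECT_COUNT/ENUM_* definitions above show. A quick stand-alone illustration of the encoding (the constants simply mirror the defines above):

    #include <stdio.h>

    #define UV_BIOS_EXTRA_OP_MEM_COPYOUT 0x2000
    #define UV_BIOS_EXTRA_OP_MASK        0x0fff

    int main(void)
    {
            unsigned int op = 10 | UV_BIOS_EXTRA_OP_MEM_COPYOUT; /* OBJECT_COUNT */

            printf("raw op word: 0x%04x\n", op);                     /* 0x200a */
            printf("op index:    %u\n", op & UV_BIOS_EXTRA_OP_MASK); /* 10 */
            printf("copies out:  %s\n",
                   op & UV_BIOS_EXTRA_OP_MEM_COPYOUT ? "yes" : "no");
            return 0;
    }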
/*
* Status values returned from a BIOS call.
*/
@@ -109,6 +123,32 @@ struct uv_systab {
} entry[1]; /* additional entries follow */
};
extern struct uv_systab *uv_systab;
+
+#define UV_BIOS_MAXSTRING 128
+struct uv_bios_hub_info {
+ unsigned int id;
+ union {
+ struct {
+ unsigned long long this_part:1;
+ unsigned long long is_shared:1;
+ unsigned long long is_disabled:1;
+ } fields;
+ struct {
+ unsigned long long flags;
+ unsigned long long reserved;
+ } b;
+ } f;
+ char name[UV_BIOS_MAXSTRING];
+ char location[UV_BIOS_MAXSTRING];
+ unsigned int ports;
+};
+
+struct uv_bios_port_info {
+ unsigned int port;
+ unsigned int conn_id;
+ unsigned int conn_port;
+};
+
/* (... end of definitions from UV BIOS ...) */
enum {
@@ -142,6 +182,15 @@ extern s64 uv_bios_change_memprotect(u64, u64, enum uv_memprotect);
extern s64 uv_bios_reserved_page_pa(u64, u64 *, u64 *, u64 *);
extern int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus);
+extern s64 uv_bios_get_master_nasid(u64 sz, u64 *nasid);
+extern s64 uv_bios_get_heapsize(u64 nasid, u64 sz, u64 *heap_sz);
+extern s64 uv_bios_install_heap(u64 nasid, u64 sz, u64 *heap);
+extern s64 uv_bios_obj_count(u64 nasid, u64 sz, u64 *objcnt);
+extern s64 uv_bios_enum_objs(u64 nasid, u64 sz, u64 *objbuf);
+extern s64 uv_bios_enum_ports(u64 nasid, u64 obj_id, u64 sz, u64 *portbuf);
+extern s64 uv_bios_get_geoinfo(u64 nasid, u64 sz, u64 *geo);
+extern s64 uv_bios_get_pci_topology(u64 sz, u64 *buf);
+
extern int uv_bios_init(void);
extern unsigned long get_uv_systab_phys(bool msg);
@@ -151,6 +200,8 @@ extern long sn_partition_id;
extern long sn_coherency_id;
extern long sn_region_size;
extern long system_serial_number;
+extern ssize_t uv_get_archtype(char *buf, int len);
+extern int uv_get_hubless_system(void);
extern struct kobject *sgi_uv_kobj; /* /sys/firmware/sgi_uv */
diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h
index 172d3e4a9e4b..648eb23fe7f0 100644
--- a/arch/x86/include/asm/uv/uv.h
+++ b/arch/x86/include/asm/uv/uv.h
@@ -2,14 +2,8 @@
#ifndef _ASM_X86_UV_UV_H
#define _ASM_X86_UV_UV_H
-#include <asm/tlbflush.h>
-
enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC};
-struct cpumask;
-struct mm_struct;
-struct flush_tlb_info;
-
#ifdef CONFIG_X86_UV
#include <linux/efi.h>
@@ -44,10 +38,6 @@ static inline int is_uv_system(void) { return 0; }
static inline int is_uv_hubbed(int uv) { return 0; }
static inline void uv_cpu_init(void) { }
static inline void uv_system_init(void) { }
-static inline const struct cpumask *
-uv_flush_tlb_others(const struct cpumask *cpumask,
- const struct flush_tlb_info *info)
-{ return cpumask; }
#endif /* X86_UV */
diff --git a/arch/x86/include/asm/uv/uv_geo.h b/arch/x86/include/asm/uv/uv_geo.h
new file mode 100644
index 000000000000..f241451035fb
--- /dev/null
+++ b/arch/x86/include/asm/uv/uv_geo.h
@@ -0,0 +1,103 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 2020 Hewlett Packard Enterprise Development LP. All rights reserved.
+ */
+
+#ifndef _ASM_UV_GEO_H
+#define _ASM_UV_GEO_H
+
+/* Type declarations */
+
+/* Size of a geoid_s structure (must be before decl. of geoid_u) */
+#define GEOID_SIZE 8
+
+/* Fields common to all substructures */
+struct geo_common_s {
+ unsigned char type; /* What type of h/w is named by this geoid_s */
+ unsigned char blade;
+ unsigned char slot; /* slot is IRU */
+ unsigned char upos;
+ unsigned char rack;
+};
+
+/* Additional fields for particular types of hardware */
+struct geo_node_s {
+ struct geo_common_s common; /* No additional fields needed */
+};
+
+struct geo_rtr_s {
+ struct geo_common_s common; /* No additional fields needed */
+};
+
+struct geo_iocntl_s {
+ struct geo_common_s common; /* No additional fields needed */
+};
+
+struct geo_pcicard_s {
+ struct geo_iocntl_s common;
+ char bus; /* Bus/widget number */
+ char slot; /* PCI slot number */
+};
+
+/* Subcomponents of a node */
+struct geo_cpu_s {
+ struct geo_node_s node;
+ unsigned char socket:4, /* Which CPU on the node */
+ thread:4;
+ unsigned char core;
+};
+
+struct geo_mem_s {
+ struct geo_node_s node;
+ char membus; /* The memory bus on the node */
+ char memslot; /* The memory slot on the bus */
+};
+
+union geoid_u {
+ struct geo_common_s common;
+ struct geo_node_s node;
+ struct geo_iocntl_s iocntl;
+ struct geo_pcicard_s pcicard;
+ struct geo_rtr_s rtr;
+ struct geo_cpu_s cpu;
+ struct geo_mem_s mem;
+ char padsize[GEOID_SIZE];
+};
+
+/* Defined constants */
+
+#define GEO_MAX_LEN 48
+
+#define GEO_TYPE_INVALID 0
+#define GEO_TYPE_MODULE 1
+#define GEO_TYPE_NODE 2
+#define GEO_TYPE_RTR 3
+#define GEO_TYPE_IOCNTL 4
+#define GEO_TYPE_IOCARD 5
+#define GEO_TYPE_CPU 6
+#define GEO_TYPE_MEM 7
+#define GEO_TYPE_MAX (GEO_TYPE_MEM+1)
+
+static inline int geo_rack(union geoid_u g)
+{
+ return (g.common.type == GEO_TYPE_INVALID) ?
+ -1 : g.common.rack;
+}
+
+static inline int geo_slot(union geoid_u g)
+{
+ return (g.common.type == GEO_TYPE_INVALID) ?
+ -1 : g.common.upos;
+}
+
+static inline int geo_blade(union geoid_u g)
+{
+ return (g.common.type == GEO_TYPE_INVALID) ?
+ -1 : g.common.blade * 2 + g.common.slot;
+}
+
+#endif /* _ASM_UV_GEO_H */
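A minimal sketch of how the accessors above might be used; uv_print_location() is a hypothetical helper and assumes a kernel context where this header is available.

    #include <linux/kernel.h>
    #include <asm/uv/uv_geo.h>

    static void uv_print_location(union geoid_u g)
    {
            if (g.common.type == GEO_TYPE_INVALID) {
                    pr_info("invalid geoid\n");
                    return;
            }
            /* geo_blade() folds blade and slot into one index, see above */
            pr_info("rack %d, slot %d, blade %d\n",
                    geo_rack(g), geo_slot(g), geo_blade(g));
    }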
diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
index bbcdc7b8f963..98aa103eb4ab 100644
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -15,6 +15,8 @@ struct vdso_image {
unsigned long size; /* Always a multiple of PAGE_SIZE */
unsigned long alt, alt_len;
+ unsigned long extable_base, extable_len;
+ const void *extable;
long sym_vvar_start; /* Negative offset to the vvar area */
@@ -27,6 +29,8 @@ struct vdso_image {
long sym___kernel_rt_sigreturn;
long sym___kernel_vsyscall;
long sym_int80_landing_pad;
+ long sym_vdso32_sigreturn_landing_pad;
+ long sym_vdso32_rt_sigreturn_landing_pad;
};
#ifdef CONFIG_X86_64
@@ -45,6 +49,9 @@ extern void __init init_vdso_image(const struct vdso_image *image);
extern int map_vdso_once(const struct vdso_image *image, unsigned long addr);
+extern bool fixup_vdso_exception(struct pt_regs *regs, int trapnr,
+ unsigned long error_code,
+ unsigned long fault_addr);
#endif /* __ASSEMBLER__ */
#endif /* _ASM_X86_VDSO_H */
diff --git a/arch/x86/include/asm/vm86.h b/arch/x86/include/asm/vm86.h
index 26efbec94448..9e8ac5073ecb 100644
--- a/arch/x86/include/asm/vm86.h
+++ b/arch/x86/include/asm/vm86.h
@@ -36,7 +36,6 @@ struct vm86 {
unsigned long saved_sp0;
unsigned long flags;
- unsigned long screen_bitmap;
unsigned long cpu_type;
struct revectored_struct int_revectored;
struct revectored_struct int21_revectored;
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index f8ba5289ecb0..38ca445a8429 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -113,6 +113,7 @@
#define VMX_MISC_PREEMPTION_TIMER_RATE_MASK 0x0000001f
#define VMX_MISC_SAVE_EFER_LMA 0x00000020
#define VMX_MISC_ACTIVITY_HLT 0x00000040
+#define VMX_MISC_ACTIVITY_WAIT_SIPI 0x00000100
#define VMX_MISC_ZERO_LEN_INS 0x40000000
#define VMX_MISC_MSR_LIST_MULTIPLIER 512
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index dde5b3f1e7cd..5c69f7eb5d47 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -116,6 +116,7 @@ struct x86_init_pci {
* @init_platform: platform setup
* @guest_late_init: guest late init
* @x2apic_available: X2APIC detection
+ * @msi_ext_dest_id: MSI supports 15-bit APIC IDs
* @init_mem_mapping: setup early mappings during init_mem_mapping()
* @init_after_bootmem: guest init after boot allocator is finished
*/
@@ -123,6 +124,7 @@ struct x86_hyper_init {
void (*init_platform)(void);
void (*guest_late_init)(void);
bool (*x2apic_available)(void);
+ bool (*msi_ext_dest_id)(void);
void (*init_mem_mapping)(void);
void (*init_after_bootmem)(void);
};
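A guest platform that knows its hypervisor honours 15-bit MSI destination IDs would advertise that through the new hook. The sketch below is only an outline: both function names are made up, and a real implementation would query the hypervisor (for instance via a CPUID feature bit) rather than return true unconditionally.

    #include <linux/init.h>
    #include <asm/x86_init.h>

    /* Placeholder detection; a real guest queries its hypervisor here. */
    static bool __init example_detect_ext_dest_id(void)
    {
            return true;
    }

    static void __init example_guest_platform_setup(void)
    {
            x86_init.hyper.msi_ext_dest_id = example_detect_ext_dest_id;
    }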
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 5941e18edd5a..1a162e559753 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -355,7 +355,7 @@ unsigned long arbitrary_virt_to_mfn(void *vaddr);
void make_lowmem_page_readonly(void *vaddr);
void make_lowmem_page_readwrite(void *vaddr);
-#define xen_remap(cookie, size) ioremap((cookie), (size));
+#define xen_remap(cookie, size) ioremap((cookie), (size))
#define xen_unmap(cookie) iounmap((cookie))
static inline bool xen_arch_need_swiotlb(struct device *dev,
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 89e5f3d1bba8..8e76d3701db3 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -12,6 +12,7 @@
#define KVM_PIO_PAGE_OFFSET 1
#define KVM_COALESCED_MMIO_PAGE_OFFSET 2
+#define KVM_DIRTY_LOG_PAGE_OFFSET 64
#define DE_VECTOR 0
#define DB_VECTOR 1
diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
index 812e9b4c1114..950afebfba88 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -32,6 +32,7 @@
#define KVM_FEATURE_POLL_CONTROL 12
#define KVM_FEATURE_PV_SCHED_YIELD 13
#define KVM_FEATURE_ASYNC_PF_INT 14
+#define KVM_FEATURE_MSI_EXT_DEST_ID 15
#define KVM_HINTS_REALTIME 0
diff --git a/arch/x86/include/uapi/asm/sgx.h b/arch/x86/include/uapi/asm/sgx.h
new file mode 100644
index 000000000000..9034f3007c4e
--- /dev/null
+++ b/arch/x86/include/uapi/asm/sgx.h
@@ -0,0 +1,168 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Copyright(c) 2016-20 Intel Corporation.
+ */
+#ifndef _UAPI_ASM_X86_SGX_H
+#define _UAPI_ASM_X86_SGX_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+/**
+ * enum sgx_page_flags - page control flags
+ * %SGX_PAGE_MEASURE: Measure the page contents with a sequence of
+ * ENCLS[EEXTEND] operations.
+ */
+enum sgx_page_flags {
+ SGX_PAGE_MEASURE = 0x01,
+};
+
+#define SGX_MAGIC 0xA4
+
+#define SGX_IOC_ENCLAVE_CREATE \
+ _IOW(SGX_MAGIC, 0x00, struct sgx_enclave_create)
+#define SGX_IOC_ENCLAVE_ADD_PAGES \
+ _IOWR(SGX_MAGIC, 0x01, struct sgx_enclave_add_pages)
+#define SGX_IOC_ENCLAVE_INIT \
+ _IOW(SGX_MAGIC, 0x02, struct sgx_enclave_init)
+#define SGX_IOC_ENCLAVE_PROVISION \
+ _IOW(SGX_MAGIC, 0x03, struct sgx_enclave_provision)
+
+/**
+ * struct sgx_enclave_create - parameter structure for the
+ * %SGX_IOC_ENCLAVE_CREATE ioctl
+ * @src: address for the SECS page data
+ */
+struct sgx_enclave_create {
+ __u64 src;
+};
+
+/**
+ * struct sgx_enclave_add_pages - parameter structure for the
+ * %SGX_IOC_ENCLAVE_ADD_PAGES ioctl
+ * @src: start address for the page data
+ * @offset: starting page offset
+ * @length: length of the data (multiple of the page size)
+ * @secinfo: address for the SECINFO data
+ * @flags: page control flags
+ * @count: number of bytes added (multiple of the page size)
+ */
+struct sgx_enclave_add_pages {
+ __u64 src;
+ __u64 offset;
+ __u64 length;
+ __u64 secinfo;
+ __u64 flags;
+ __u64 count;
+};
+
+/**
+ * struct sgx_enclave_init - parameter structure for the
+ * %SGX_IOC_ENCLAVE_INIT ioctl
+ * @sigstruct: address for the SIGSTRUCT data
+ */
+struct sgx_enclave_init {
+ __u64 sigstruct;
+};
+
+/**
+ * struct sgx_enclave_provision - parameter structure for the
+ * %SGX_IOC_ENCLAVE_PROVISION ioctl
+ * @fd: file handle of /dev/sgx_provision
+ */
+struct sgx_enclave_provision {
+ __u64 fd;
+};
+
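As a rough user-space illustration of the ioctl flow, an enclave is created by opening the SGX device and issuing %SGX_IOC_ENCLAVE_CREATE with a pointer to a prepared SECS page. The device path /dev/sgx_enclave and the create_enclave() helper are assumptions of this sketch; only the ioctl and struct sgx_enclave_create come from this header, and building the SECS page is not shown.

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <asm/sgx.h>

    int create_enclave(const void *secs)
    {
            struct sgx_enclave_create parms = {
                    .src = (__u64)(uintptr_t)secs,
            };
            int fd = open("/dev/sgx_enclave", O_RDWR);

            if (fd < 0)
                    return -1;
            if (ioctl(fd, SGX_IOC_ENCLAVE_CREATE, &parms) < 0) {
                    perror("SGX_IOC_ENCLAVE_CREATE");
                    close(fd);
                    return -1;
            }
            return fd;      /* later ioctls operate on this fd */
    }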
+struct sgx_enclave_run;
+
+/**
+ * typedef sgx_enclave_user_handler_t - Exit handler function accepted by
+ * __vdso_sgx_enter_enclave()
+ * @run: The run instance given by the caller
+ *
+ * The register parameters contain the snapshot of their values at enclave
+ * exit. An invalid ENCLU function number will cause -EINVAL to be returned
+ * to the caller.
+ *
+ * Return:
+ * - <= 0: The given value is returned back to the caller.
+ * - > 0: ENCLU function to invoke, either EENTER or ERESUME.
+ */
+typedef int (*sgx_enclave_user_handler_t)(long rdi, long rsi, long rdx,
+ long rsp, long r8, long r9,
+ struct sgx_enclave_run *run);
+
+/**
+ * struct sgx_enclave_run - the execution context of __vdso_sgx_enter_enclave()
+ * @tcs: TCS used to enter the enclave
+ * @function: The last seen ENCLU function (EENTER, ERESUME or EEXIT)
+ * @exception_vector: The interrupt vector of the exception
+ * @exception_error_code: The exception error code pulled out of the stack
+ * @exception_addr: The address that triggered the exception
+ * @user_handler: User provided callback run on exception
+ * @user_data: Data passed to the user handler
+ * @reserved: Reserved for future extensions
+ *
+ * If @user_handler is provided, the handler will be invoked on all return paths
+ * of the normal flow. The user handler may transfer control, e.g. via a
+ * longjmp() call or a C++ exception, without returning to
+ * __vdso_sgx_enter_enclave().
+ */
+struct sgx_enclave_run {
+ __u64 tcs;
+ __u32 function;
+ __u16 exception_vector;
+ __u16 exception_error_code;
+ __u64 exception_addr;
+ __u64 user_handler;
+ __u64 user_data;
+ __u8 reserved[216];
+};
+
+/**
+ * typedef vdso_sgx_enter_enclave_t - Prototype for __vdso_sgx_enter_enclave(),
+ * a vDSO function to enter an SGX enclave.
+ * @rdi: Pass-through value for RDI
+ * @rsi: Pass-through value for RSI
+ * @rdx: Pass-through value for RDX
+ * @function: ENCLU function, must be EENTER or ERESUME
+ * @r8: Pass-through value for R8
+ * @r9: Pass-through value for R9
+ * @run: struct sgx_enclave_run, must be non-NULL
+ *
+ * NOTE: __vdso_sgx_enter_enclave() does not ensure full compliance with the
+ * x86-64 ABI, e.g. doesn't handle XSAVE state. Except for non-volatile
+ * general purpose registers, EFLAGS.DF, and RSP alignment, preserving/setting
+ * state in accordance with the x86-64 ABI is the responsibility of the enclave
+ * and its runtime, i.e. __vdso_sgx_enter_enclave() cannot be called from C
+ * code without careful consideration by both the enclave and its runtime.
+ *
+ * All general purpose registers except RAX, RBX and RCX are passed as-is to the
+ * enclave. RAX, RBX and RCX are consumed by EENTER and ERESUME and are loaded
+ * with @function, asynchronous exit pointer, and @run.tcs respectively.
+ *
+ * RBP and the stack are used to anchor __vdso_sgx_enter_enclave() to the
+ * pre-enclave state, e.g. to retrieve @run.exception and @run.user_handler
+ * after an enclave exit. All other registers are available for use by the
+ * enclave and its runtime, e.g. an enclave can push additional data onto the
+ * stack (and modify RSP) to pass information to the optional user handler (see
+ * below).
+ *
+ * Most exceptions reported on ENCLU, including those that occur within the
+ * enclave, are fixed up and reported synchronously instead of being delivered
+ * via a standard signal. Debug Exceptions (#DB) and Breakpoints (#BP) are
+ * never fixed up and are always delivered via standard signals. On synchronously
+ * reported exceptions, -EFAULT is returned and details about the exception are
+ * recorded in the exception fields of @run.
+ *
+ * Return:
+ * - 0: ENCLU function was successfully executed.
+ * - -EINVAL: Invalid ENCLU function number (neither EENTER nor ERESUME).
+ */
+typedef int (*vdso_sgx_enter_enclave_t)(unsigned long rdi, unsigned long rsi,
+ unsigned long rdx, unsigned int function,
+ unsigned long r8, unsigned long r9,
+ struct sgx_enclave_run *run);
+
+#endif /* _UAPI_ASM_X86_SGX_H */
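Putting the pieces together, a runtime resolves __vdso_sgx_enter_enclave from the vDSO and calls it through this typedef. The sketch below is an assumption-laden outline: resolving the symbol via dlopen()/dlsym() on linux-vdso.so.1 works with common C libraries but is not mandated here, the EENTER leaf value comes from the Intel SDM rather than this header, and setting up the enclave and its TCS is not shown.

    #include <dlfcn.h>
    #include <string.h>
    #include <asm/sgx.h>

    #define EENTER 2        /* ENCLU leaf number for EENTER (Intel SDM) */

    static int enter_enclave(unsigned long tcs)
    {
            struct sgx_enclave_run run;
            vdso_sgx_enter_enclave_t enter;
            void *vdso = dlopen("linux-vdso.so.1", RTLD_NOW);

            if (!vdso)
                    return -1;
            enter = (vdso_sgx_enter_enclave_t)dlsym(vdso, "__vdso_sgx_enter_enclave");
            if (!enter)
                    return -1;

            memset(&run, 0, sizeof(run));
            run.tcs = tcs;

            /* rdi/rsi/rdx/r8/r9 are passed through to the enclave untouched */
            return enter(0, 0, 0, EENTER, 0, 0, &run);
    }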
diff --git a/arch/x86/include/uapi/asm/signal.h b/arch/x86/include/uapi/asm/signal.h
index e5745d593dc7..164a22a72984 100644
--- a/arch/x86/include/uapi/asm/signal.h
+++ b/arch/x86/include/uapi/asm/signal.h
@@ -62,30 +62,6 @@ typedef unsigned long sigset_t;
#define SIGRTMIN 32
#define SIGRTMAX _NSIG
-/*
- * SA_FLAGS values:
- *
- * SA_ONSTACK indicates that a registered stack_t will be used.
- * SA_RESTART flag to get restarting signals (which were the default long ago)
- * SA_NOCLDSTOP flag to turn off SIGCHLD when children stop.
- * SA_RESETHAND clears the handler when the signal is delivered.
- * SA_NOCLDWAIT flag on SIGCHLD to inhibit zombies.
- * SA_NODEFER prevents the current signal from being masked in the handler.
- *
- * SA_ONESHOT and SA_NOMASK are the historical Linux names for the Single
- * Unix names RESETHAND and NODEFER respectively.
- */
-#define SA_NOCLDSTOP 0x00000001u
-#define SA_NOCLDWAIT 0x00000002u
-#define SA_SIGINFO 0x00000004u
-#define SA_ONSTACK 0x08000000u
-#define SA_RESTART 0x10000000u
-#define SA_NODEFER 0x40000000u
-#define SA_RESETHAND 0x80000000u
-
-#define SA_NOMASK SA_NODEFER
-#define SA_ONESHOT SA_RESETHAND
-
#define SA_RESTORER 0x04000000
#define MINSIGSTKSZ 2048
diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h
index f1d8307454e0..554f75fe013c 100644
--- a/arch/x86/include/uapi/asm/svm.h
+++ b/arch/x86/include/uapi/asm/svm.h
@@ -77,10 +77,28 @@
#define SVM_EXIT_MWAIT_COND 0x08c
#define SVM_EXIT_XSETBV 0x08d
#define SVM_EXIT_RDPRU 0x08e
+#define SVM_EXIT_EFER_WRITE_TRAP 0x08f
+#define SVM_EXIT_CR0_WRITE_TRAP 0x090
+#define SVM_EXIT_CR1_WRITE_TRAP 0x091
+#define SVM_EXIT_CR2_WRITE_TRAP 0x092
+#define SVM_EXIT_CR3_WRITE_TRAP 0x093
+#define SVM_EXIT_CR4_WRITE_TRAP 0x094
+#define SVM_EXIT_CR5_WRITE_TRAP 0x095
+#define SVM_EXIT_CR6_WRITE_TRAP 0x096
+#define SVM_EXIT_CR7_WRITE_TRAP 0x097
+#define SVM_EXIT_CR8_WRITE_TRAP 0x098
+#define SVM_EXIT_CR9_WRITE_TRAP 0x099
+#define SVM_EXIT_CR10_WRITE_TRAP 0x09a
+#define SVM_EXIT_CR11_WRITE_TRAP 0x09b
+#define SVM_EXIT_CR12_WRITE_TRAP 0x09c
+#define SVM_EXIT_CR13_WRITE_TRAP 0x09d
+#define SVM_EXIT_CR14_WRITE_TRAP 0x09e
+#define SVM_EXIT_CR15_WRITE_TRAP 0x09f
#define SVM_EXIT_INVPCID 0x0a2
#define SVM_EXIT_NPF 0x400
#define SVM_EXIT_AVIC_INCOMPLETE_IPI 0x401
#define SVM_EXIT_AVIC_UNACCELERATED_ACCESS 0x402
+#define SVM_EXIT_VMGEXIT 0x403
/* SEV-ES software-defined VMGEXIT events */
#define SVM_VMGEXIT_MMIO_READ 0x80000001
@@ -183,10 +201,20 @@
{ SVM_EXIT_MONITOR, "monitor" }, \
{ SVM_EXIT_MWAIT, "mwait" }, \
{ SVM_EXIT_XSETBV, "xsetbv" }, \
+ { SVM_EXIT_EFER_WRITE_TRAP, "write_efer_trap" }, \
+ { SVM_EXIT_CR0_WRITE_TRAP, "write_cr0_trap" }, \
+ { SVM_EXIT_CR4_WRITE_TRAP, "write_cr4_trap" }, \
+ { SVM_EXIT_CR8_WRITE_TRAP, "write_cr8_trap" }, \
{ SVM_EXIT_INVPCID, "invpcid" }, \
{ SVM_EXIT_NPF, "npf" }, \
{ SVM_EXIT_AVIC_INCOMPLETE_IPI, "avic_incomplete_ipi" }, \
{ SVM_EXIT_AVIC_UNACCELERATED_ACCESS, "avic_unaccelerated_access" }, \
+ { SVM_EXIT_VMGEXIT, "vmgexit" }, \
+ { SVM_VMGEXIT_MMIO_READ, "vmgexit_mmio_read" }, \
+ { SVM_VMGEXIT_MMIO_WRITE, "vmgexit_mmio_write" }, \
+ { SVM_VMGEXIT_NMI_COMPLETE, "vmgexit_nmi_complete" }, \
+ { SVM_VMGEXIT_AP_HLT_LOOP, "vmgexit_ap_hlt_loop" }, \
+ { SVM_VMGEXIT_AP_JUMP_TABLE, "vmgexit_ap_jump_table" }, \
{ SVM_EXIT_ERR, "invalid_guest_state" }
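Since the new CR-write trap codes are contiguous from CR0 to CR15, a handler can derive the control-register number arithmetically. A small sketch (svm_cr_trap_to_cr() is a made-up helper; the two constants mirror the defines above):

    #define SVM_EXIT_CR0_WRITE_TRAP  0x090
    #define SVM_EXIT_CR15_WRITE_TRAP 0x09f

    /* Return the CR number for a CR-write trap exit code, or -1. */
    static int svm_cr_trap_to_cr(unsigned int exit_code)
    {
            if (exit_code < SVM_EXIT_CR0_WRITE_TRAP ||
                exit_code > SVM_EXIT_CR15_WRITE_TRAP)
                    return -1;
            return exit_code - SVM_EXIT_CR0_WRITE_TRAP;
    }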
diff --git a/arch/x86/include/uapi/asm/vm86.h b/arch/x86/include/uapi/asm/vm86.h
index d2ee4e307ef8..18909b8050bc 100644
--- a/arch/x86/include/uapi/asm/vm86.h
+++ b/arch/x86/include/uapi/asm/vm86.h
@@ -97,7 +97,7 @@ struct revectored_struct {
struct vm86_struct {
struct vm86_regs regs;
unsigned long flags;
- unsigned long screen_bitmap;
+ unsigned long screen_bitmap; /* unused, preserved by vm86() */
unsigned long cpu_type;
struct revectored_struct int_revectored;
struct revectored_struct int21_revectored;
@@ -106,7 +106,7 @@ struct vm86_struct {
/*
* flags masks
*/
-#define VM86_SCREEN_BITMAP 0x0001
+#define VM86_SCREEN_BITMAP 0x0001 /* no longer supported */
struct vm86plus_info_struct {
unsigned long force_return_for_pic:1;
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index b8ff9e8ac0d5..ada955c5ebb6 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -32,6 +32,7 @@
#define EXIT_REASON_EXTERNAL_INTERRUPT 1
#define EXIT_REASON_TRIPLE_FAULT 2
#define EXIT_REASON_INIT_SIGNAL 3
+#define EXIT_REASON_SIPI_SIGNAL 4
#define EXIT_REASON_INTERRUPT_WINDOW 7
#define EXIT_REASON_NMI_WINDOW 8
@@ -94,6 +95,7 @@
{ EXIT_REASON_EXTERNAL_INTERRUPT, "EXTERNAL_INTERRUPT" }, \
{ EXIT_REASON_TRIPLE_FAULT, "TRIPLE_FAULT" }, \
{ EXIT_REASON_INIT_SIGNAL, "INIT_SIGNAL" }, \
+ { EXIT_REASON_SIPI_SIGNAL, "SIPI_SIGNAL" }, \
{ EXIT_REASON_INTERRUPT_WINDOW, "INTERRUPT_WINDOW" }, \
{ EXIT_REASON_NMI_WINDOW, "NMI_WINDOW" }, \
{ EXIT_REASON_TASK_SWITCH, "TASK_SWITCH" }, \
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 68608bd892c0..5eeb808eb024 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -161,5 +161,3 @@ ifeq ($(CONFIG_X86_64),y)
obj-$(CONFIG_MMCONF_FAM10H) += mmconf-fam10h_64.o
obj-y += vsmp_64.o
endif
-
-obj-$(CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT) += ima_arch.o
diff --git a/arch/x86/kernel/acpi/apei.c b/arch/x86/kernel/acpi/apei.c
index c22fb55abcfd..0916f00a992e 100644
--- a/arch/x86/kernel/acpi/apei.c
+++ b/arch/x86/kernel/acpi/apei.c
@@ -43,3 +43,8 @@ void arch_apei_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
apei_mce_report_mem_error(sev, mem_err);
#endif
}
+
+int arch_apei_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id)
+{
+ return apei_smca_report_x86_error(ctx_info, lapic_id);
+}
diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
index c8daa92f38dc..5d3a0b8fd379 100644
--- a/arch/x86/kernel/acpi/wakeup_64.S
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -112,7 +112,7 @@ SYM_FUNC_START(do_suspend_lowlevel)
movq pt_regs_r14(%rax), %r14
movq pt_regs_r15(%rax), %r15
-#ifdef CONFIG_KASAN
+#if defined(CONFIG_KASAN) && CONFIG_KASAN_STACK
/*
* The suspend path may have poisoned some areas deeper in the stack,
* which we now need to unpoison.
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 2400ad62f330..8d778e46725d 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1374,7 +1374,7 @@ void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const voi
* @addr: address to patch
* @opcode: opcode of new instruction
* @len: length to copy
- * @handler: address to jump to when the temporary breakpoint is hit
+ * @emulate: instruction to be emulated
*
* Update a single instruction with the vector in the stack, avoiding
* dynamically allocated memory. This function should be used when it is
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 18f6b7c4bd79..b4396952c9a6 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -384,7 +384,7 @@ struct resource *amd_get_mmconfig_range(struct resource *res)
int amd_get_subcaches(int cpu)
{
- struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link;
+ struct pci_dev *link = node_to_amd_nb(topology_die_id(cpu))->link;
unsigned int mask;
if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
@@ -398,7 +398,7 @@ int amd_get_subcaches(int cpu)
int amd_set_subcaches(int cpu, unsigned long mask)
{
static unsigned int reset, ban;
- struct amd_northbridge *nb = node_to_amd_nb(amd_get_nb_id(cpu));
+ struct amd_northbridge *nb = node_to_amd_nb(topology_die_id(cpu));
unsigned int reg;
int cuid;
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index b3eef1d5c903..7f4c081f59f0 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -41,6 +41,7 @@
#include <asm/perf_event.h>
#include <asm/x86_init.h>
#include <linux/atomic.h>
+#include <asm/barrier.h>
#include <asm/mpspec.h>
#include <asm/i8259.h>
#include <asm/proto.h>
@@ -94,6 +95,11 @@ static unsigned int disabled_cpu_apicid __ro_after_init = BAD_APICID;
static int apic_extnmi __ro_after_init = APIC_EXTNMI_BSP;
/*
+ * Hypervisor supports 15 bits of APIC ID in MSI Extended Destination ID
+ */
+static bool virt_ext_dest_id __ro_after_init;
+
+/*
* Map cpu index to physical APIC ID
*/
DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID);
@@ -472,6 +478,9 @@ static int lapic_next_deadline(unsigned long delta,
{
u64 tsc;
+ /* This MSR is special and needs a special fence: */
+ weak_wrmsr_fence();
+
tsc = rdtsc();
wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR));
return 0;
@@ -1591,7 +1600,7 @@ static void setup_local_APIC(void)
apic->init_apic_ldr();
#ifdef CONFIG_X86_32
- if (apic->dest_logical) {
+ if (apic->dest_mode_logical) {
int logical_apicid, ldr_apicid;
/*
@@ -1841,20 +1850,34 @@ static __init void try_to_enable_x2apic(int remap_mode)
return;
if (remap_mode != IRQ_REMAP_X2APIC_MODE) {
- /* IR is required if there is APIC ID > 255 even when running
- * under KVM
+ u32 apic_limit = 255;
+
+ /*
+ * Using X2APIC without IR is not architecturally supported
+ * on bare metal but may be supported in guests.
*/
- if (max_physical_apicid > 255 ||
- !x86_init.hyper.x2apic_available()) {
+ if (!x86_init.hyper.x2apic_available()) {
pr_info("x2apic: IRQ remapping doesn't support X2APIC mode\n");
x2apic_disable();
return;
}
/*
- * without IR all CPUs can be addressed by IOAPIC/MSI
- * only in physical mode
+ * If the hypervisor supports extended destination ID in
+ * MSI, that increases the maximum APIC ID that can be
+ * used for non-remapped IRQ domains.
*/
+ if (x86_init.hyper.msi_ext_dest_id()) {
+ virt_ext_dest_id = 1;
+ apic_limit = 32767;
+ }
+
+ /*
+ * Without IR, all CPUs can be addressed by IOAPIC/MSI only
+ * in physical mode, and CPUs with an APIC ID that cannot
+ * be addressed must not be brought online.
+ */
+ x2apic_set_max_apicid(apic_limit);
x2apic_phys = 1;
}
x2apic_enable();
@@ -2478,6 +2501,46 @@ int hard_smp_processor_id(void)
return read_apic_id();
}
+void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg,
+ bool dmar)
+{
+ memset(msg, 0, sizeof(*msg));
+
+ msg->arch_addr_lo.base_address = X86_MSI_BASE_ADDRESS_LOW;
+ msg->arch_addr_lo.dest_mode_logical = apic->dest_mode_logical;
+ msg->arch_addr_lo.destid_0_7 = cfg->dest_apicid & 0xFF;
+
+ msg->arch_data.delivery_mode = APIC_DELIVERY_MODE_FIXED;
+ msg->arch_data.vector = cfg->vector;
+
+ msg->address_hi = X86_MSI_BASE_ADDRESS_HIGH;
+ /*
+ * Only the IOMMU itself can use the trick of putting destination
+ * APIC ID into the high bits of the address. Anything else would
+ * just be writing to memory if it tried that, and needs IR to
+ * address APICs which can't be addressed in the normal 32-bit
+ * address range at 0xFFExxxxx. That is typically just 8 bits, but
+ * some hypervisors allow the extended destination ID field in bits
+ * 5-11 to be used, giving support for 15 bits of APIC IDs in total.
+ */
+ if (dmar)
+ msg->arch_addr_hi.destid_8_31 = cfg->dest_apicid >> 8;
+ else if (virt_ext_dest_id && cfg->dest_apicid < 0x8000)
+ msg->arch_addr_lo.virt_destid_8_14 = cfg->dest_apicid >> 8;
+ else
+ WARN_ON_ONCE(cfg->dest_apicid > 0xFF);
+}
+
+u32 x86_msi_msg_get_destid(struct msi_msg *msg, bool extid)
+{
+ u32 dest = msg->arch_addr_lo.destid_0_7;
+
+ if (extid)
+ dest |= msg->arch_addr_hi.destid_8_31 << 8;
+ return dest;
+}
+EXPORT_SYMBOL_GPL(x86_msi_msg_get_destid);
+
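To make the destination-ID split concrete: on the extended-destination-ID path an APIC ID below 0x8000 is carved into a low byte plus seven high bits, and reassembled the same way x86_msi_msg_get_destid() does for the DMAR case. A stand-alone illustration with an arbitrary example value:

    #include <stdio.h>

    int main(void)
    {
            unsigned int apicid = 0x1234;                   /* arbitrary, < 0x8000 */
            unsigned int destid_0_7 = apicid & 0xFF;        /* 0x34 */
            unsigned int virt_destid_8_14 = apicid >> 8;    /* 0x12 */

            /* Reassemble as the readback helper does. */
            printf("0x%x\n", destid_0_7 | (virt_destid_8_14 << 8)); /* 0x1234 */
            return 0;
    }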
/*
* Override the generic EOI implementation with an optimized version.
* Only called during early boot when only one CPU is active and with
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 7862b152a052..8f72b4351c9f 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -53,7 +53,7 @@ static void _flat_send_IPI_mask(unsigned long mask, int vector)
unsigned long flags;
local_irq_save(flags);
- __default_send_IPI_dest_field(mask, vector, apic->dest_logical);
+ __default_send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL);
local_irq_restore(flags);
}
@@ -113,15 +113,13 @@ static struct apic apic_flat __ro_after_init = {
.apic_id_valid = default_apic_id_valid,
.apic_id_registered = flat_apic_id_registered,
- .irq_delivery_mode = dest_Fixed,
- .irq_dest_mode = 1, /* logical */
+ .delivery_mode = APIC_DELIVERY_MODE_FIXED,
+ .dest_mode_logical = true,
.disable_esr = 0,
- .dest_logical = APIC_DEST_LOGICAL,
- .check_apicid_used = NULL,
+ .check_apicid_used = NULL,
.init_apic_ldr = flat_init_apic_ldr,
-
.ioapic_phys_id_map = NULL,
.setup_apic_routing = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
@@ -206,15 +204,13 @@ static struct apic apic_physflat __ro_after_init = {
.apic_id_valid = default_apic_id_valid,
.apic_id_registered = flat_apic_id_registered,
- .irq_delivery_mode = dest_Fixed,
- .irq_dest_mode = 0, /* physical */
+ .delivery_mode = APIC_DELIVERY_MODE_FIXED,
+ .dest_mode_logical = false,
.disable_esr = 0,
- .dest_logical = 0,
- .check_apicid_used = NULL,
+ .check_apicid_used = NULL,
.init_apic_ldr = physflat_init_apic_ldr,
-
.ioapic_phys_id_map = NULL,
.setup_apic_routing = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index 780c702969b7..fe78319e0f7a 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -95,19 +95,15 @@ struct apic apic_noop __ro_after_init = {
.apic_id_valid = default_apic_id_valid,
.apic_id_registered = noop_apic_id_registered,
- .irq_delivery_mode = dest_Fixed,
- /* logical delivery broadcast to all CPUs: */
- .irq_dest_mode = 1,
+ .delivery_mode = APIC_DELIVERY_MODE_FIXED,
+ .dest_mode_logical = true,
.disable_esr = 0,
- .dest_logical = APIC_DEST_LOGICAL,
- .check_apicid_used = default_check_apicid_used,
+ .check_apicid_used = default_check_apicid_used,
.init_apic_ldr = noop_init_apic_ldr,
-
.ioapic_phys_id_map = default_ioapic_phys_id_map,
.setup_apic_routing = NULL,
-
.cpu_present_to_apicid = default_cpu_present_to_apicid,
.apicid_to_cpu_present = physid_set_mask_of_physid,
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index 35edd57f064a..a54d817eb4b6 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -246,15 +246,13 @@ static const struct apic apic_numachip1 __refconst = {
.apic_id_valid = numachip_apic_id_valid,
.apic_id_registered = numachip_apic_id_registered,
- .irq_delivery_mode = dest_Fixed,
- .irq_dest_mode = 0, /* physical */
+ .delivery_mode = APIC_DELIVERY_MODE_FIXED,
+ .dest_mode_logical = false,
.disable_esr = 0,
- .dest_logical = 0,
- .check_apicid_used = NULL,
+ .check_apicid_used = NULL,
.init_apic_ldr = flat_init_apic_ldr,
-
.ioapic_phys_id_map = NULL,
.setup_apic_routing = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
@@ -295,15 +293,13 @@ static const struct apic apic_numachip2 __refconst = {
.apic_id_valid = numachip_apic_id_valid,
.apic_id_registered = numachip_apic_id_registered,
- .irq_delivery_mode = dest_Fixed,
- .irq_dest_mode = 0, /* physical */
+ .delivery_mode = APIC_DELIVERY_MODE_FIXED,
+ .dest_mode_logical = false,
.disable_esr = 0,
- .dest_logical = 0,
- .check_apicid_used = NULL,
+ .check_apicid_used = NULL,
.init_apic_ldr = flat_init_apic_ldr,
-
.ioapic_phys_id_map = NULL,
.setup_apic_routing = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 98d015a4405a..77555f66c14d 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -127,16 +127,13 @@ static struct apic apic_bigsmp __ro_after_init = {
.apic_id_valid = default_apic_id_valid,
.apic_id_registered = bigsmp_apic_id_registered,
- .irq_delivery_mode = dest_Fixed,
- /* phys delivery to target CPU: */
- .irq_dest_mode = 0,
+ .delivery_mode = APIC_DELIVERY_MODE_FIXED,
+ .dest_mode_logical = false,
.disable_esr = 1,
- .dest_logical = 0,
- .check_apicid_used = bigsmp_check_apicid_used,
+ .check_apicid_used = bigsmp_check_apicid_used,
.init_apic_ldr = bigsmp_init_apic_ldr,
-
.ioapic_phys_id_map = bigsmp_ioapic_phys_id_map,
.setup_apic_routing = bigsmp_setup_apic_routing,
.cpu_present_to_apicid = bigsmp_cpu_present_to_apicid,
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 7b3c7e0d4a09..e4ab4804b20d 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -48,6 +48,7 @@
#include <linux/jiffies.h> /* time_after() */
#include <linux/slab.h>
#include <linux/memblock.h>
+#include <linux/msi.h>
#include <asm/irqdomain.h>
#include <asm/io.h>
@@ -63,7 +64,6 @@
#include <asm/setup.h>
#include <asm/irq_remapping.h>
#include <asm/hw_irq.h>
-
#include <asm/apic.h>
#define for_each_ioapic(idx) \
@@ -89,12 +89,12 @@ struct irq_pin_list {
};
struct mp_chip_data {
- struct list_head irq_2_pin;
- struct IO_APIC_route_entry entry;
- int trigger;
- int polarity;
+ struct list_head irq_2_pin;
+ struct IO_APIC_route_entry entry;
+ bool is_level;
+ bool active_low;
+ bool isa_irq;
u32 count;
- bool isa_irq;
};
struct mp_ioapic_gsi {
@@ -286,31 +286,26 @@ static void io_apic_write(unsigned int apic, unsigned int reg,
writel(value, &io_apic->data);
}
-union entry_union {
- struct { u32 w1, w2; };
- struct IO_APIC_route_entry entry;
-};
-
static struct IO_APIC_route_entry __ioapic_read_entry(int apic, int pin)
{
- union entry_union eu;
+ struct IO_APIC_route_entry entry;
- eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
- eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
+ entry.w1 = io_apic_read(apic, 0x10 + 2 * pin);
+ entry.w2 = io_apic_read(apic, 0x11 + 2 * pin);
- return eu.entry;
+ return entry;
}
static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
{
- union entry_union eu;
+ struct IO_APIC_route_entry entry;
unsigned long flags;
raw_spin_lock_irqsave(&ioapic_lock, flags);
- eu.entry = __ioapic_read_entry(apic, pin);
+ entry = __ioapic_read_entry(apic, pin);
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
- return eu.entry;
+ return entry;
}
/*
@@ -321,11 +316,8 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
*/
static void __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
{
- union entry_union eu = {{0, 0}};
-
- eu.entry = e;
- io_apic_write(apic, 0x11 + 2*pin, eu.w2);
- io_apic_write(apic, 0x10 + 2*pin, eu.w1);
+ io_apic_write(apic, 0x11 + 2*pin, e.w2);
+ io_apic_write(apic, 0x10 + 2*pin, e.w1);
}
static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
@@ -344,12 +336,12 @@ static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
*/
static void ioapic_mask_entry(int apic, int pin)
{
+ struct IO_APIC_route_entry e = { .masked = true };
unsigned long flags;
- union entry_union eu = { .entry.mask = IOAPIC_MASKED };
raw_spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(apic, 0x10 + 2*pin, eu.w1);
- io_apic_write(apic, 0x11 + 2*pin, eu.w2);
+ io_apic_write(apic, 0x10 + 2*pin, e.w1);
+ io_apic_write(apic, 0x11 + 2*pin, e.w2);
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
@@ -422,20 +414,15 @@ static void __init replace_pin_at_irq_node(struct mp_chip_data *data, int node,
add_pin_to_irq_node(data, node, newapic, newpin);
}
-static void io_apic_modify_irq(struct mp_chip_data *data,
- int mask_and, int mask_or,
+static void io_apic_modify_irq(struct mp_chip_data *data, bool masked,
void (*final)(struct irq_pin_list *entry))
{
- union entry_union eu;
struct irq_pin_list *entry;
- eu.entry = data->entry;
- eu.w1 &= mask_and;
- eu.w1 |= mask_or;
- data->entry = eu.entry;
+ data->entry.masked = masked;
for_each_irq_pin(entry, data->irq_2_pin) {
- io_apic_write(entry->apic, 0x10 + 2 * entry->pin, eu.w1);
+ io_apic_write(entry->apic, 0x10 + 2 * entry->pin, data->entry.w1);
if (final)
final(entry);
}
@@ -459,13 +446,13 @@ static void mask_ioapic_irq(struct irq_data *irq_data)
unsigned long flags;
raw_spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_modify_irq(data, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
+ io_apic_modify_irq(data, true, &io_apic_sync);
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
static void __unmask_ioapic(struct mp_chip_data *data)
{
- io_apic_modify_irq(data, ~IO_APIC_REDIR_MASKED, 0, NULL);
+ io_apic_modify_irq(data, false, NULL);
}
static void unmask_ioapic_irq(struct irq_data *irq_data)
@@ -506,8 +493,8 @@ static void __eoi_ioapic_pin(int apic, int pin, int vector)
/*
* Mask the entry and change the trigger mode to edge.
*/
- entry1.mask = IOAPIC_MASKED;
- entry1.trigger = IOAPIC_EDGE;
+ entry1.masked = true;
+ entry1.is_level = false;
__ioapic_write_entry(apic, pin, entry1);
@@ -535,15 +522,15 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
/* Check delivery_mode to be sure we're not clearing an SMI pin */
entry = ioapic_read_entry(apic, pin);
- if (entry.delivery_mode == dest_SMI)
+ if (entry.delivery_mode == APIC_DELIVERY_MODE_SMI)
return;
/*
* Make sure the entry is masked and re-read the contents to check
* if it is a level triggered pin and if the remote-IRR is set.
*/
- if (entry.mask == IOAPIC_UNMASKED) {
- entry.mask = IOAPIC_MASKED;
+ if (!entry.masked) {
+ entry.masked = true;
ioapic_write_entry(apic, pin, entry);
entry = ioapic_read_entry(apic, pin);
}
@@ -556,8 +543,8 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
* doesn't clear the remote-IRR if the trigger mode is not
* set to level.
*/
- if (entry.trigger == IOAPIC_EDGE) {
- entry.trigger = IOAPIC_LEVEL;
+ if (!entry.is_level) {
+ entry.is_level = true;
ioapic_write_entry(apic, pin, entry);
}
raw_spin_lock_irqsave(&ioapic_lock, flags);
@@ -659,8 +646,8 @@ void mask_ioapic_entries(void)
struct IO_APIC_route_entry entry;
entry = ioapics[apic].saved_registers[pin];
- if (entry.mask == IOAPIC_UNMASKED) {
- entry.mask = IOAPIC_MASKED;
+ if (!entry.masked) {
+ entry.masked = true;
ioapic_write_entry(apic, pin, entry);
}
}
@@ -745,44 +732,7 @@ static int __init find_isa_irq_apic(int irq, int type)
return -1;
}
-#ifdef CONFIG_EISA
-/*
- * EISA Edge/Level control register, ELCR
- */
-static int EISA_ELCR(unsigned int irq)
-{
- if (irq < nr_legacy_irqs()) {
- unsigned int port = 0x4d0 + (irq >> 3);
- return (inb(port) >> (irq & 7)) & 1;
- }
- apic_printk(APIC_VERBOSE, KERN_INFO
- "Broken MPtable reports ISA irq %d\n", irq);
- return 0;
-}
-
-#endif
-
-/* ISA interrupts are always active high edge triggered,
- * when listed as conforming in the MP table. */
-
-#define default_ISA_trigger(idx) (IOAPIC_EDGE)
-#define default_ISA_polarity(idx) (IOAPIC_POL_HIGH)
-
-/* EISA interrupts are always polarity zero and can be edge or level
- * trigger depending on the ELCR value. If an interrupt is listed as
- * EISA conforming in the MP table, that means its trigger type must
- * be read in from the ELCR */
-
-#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].srcbusirq))
-#define default_EISA_polarity(idx) default_ISA_polarity(idx)
-
-/* PCI interrupts are always active low level triggered,
- * when listed as conforming in the MP table. */
-
-#define default_PCI_trigger(idx) (IOAPIC_LEVEL)
-#define default_PCI_polarity(idx) (IOAPIC_POL_LOW)
-
-static int irq_polarity(int idx)
+static bool irq_active_low(int idx)
{
int bus = mp_irqs[idx].srcbus;
@@ -791,90 +741,139 @@ static int irq_polarity(int idx)
*/
switch (mp_irqs[idx].irqflag & MP_IRQPOL_MASK) {
case MP_IRQPOL_DEFAULT:
- /* conforms to spec, ie. bus-type dependent polarity */
- if (test_bit(bus, mp_bus_not_pci))
- return default_ISA_polarity(idx);
- else
- return default_PCI_polarity(idx);
+ /*
+ * Conforms to spec, ie. bus-type dependent polarity. PCI
+ * defaults to low active. [E]ISA defaults to high active.
+ */
+ return !test_bit(bus, mp_bus_not_pci);
case MP_IRQPOL_ACTIVE_HIGH:
- return IOAPIC_POL_HIGH;
+ return false;
case MP_IRQPOL_RESERVED:
pr_warn("IOAPIC: Invalid polarity: 2, defaulting to low\n");
fallthrough;
case MP_IRQPOL_ACTIVE_LOW:
default: /* Pointless default required due to gcc stupidity */
- return IOAPIC_POL_LOW;
+ return true;
}
}
#ifdef CONFIG_EISA
-static int eisa_irq_trigger(int idx, int bus, int trigger)
+/*
+ * EISA Edge/Level control register, ELCR
+ */
+static bool EISA_ELCR(unsigned int irq)
+{
+ if (irq < nr_legacy_irqs()) {
+ unsigned int port = 0x4d0 + (irq >> 3);
+ return (inb(port) >> (irq & 7)) & 1;
+ }
+ apic_printk(APIC_VERBOSE, KERN_INFO
+ "Broken MPtable reports ISA irq %d\n", irq);
+ return false;
+}
+
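For reference, the ELCR lookup above is plain arithmetic: IRQ n lives in port 0x4d0 + (n >> 3) at bit (n & 7), and a set bit means level triggered. A tiny stand-alone check for IRQ 11:

    #include <stdio.h>

    int main(void)
    {
            unsigned int irq = 11;

            /* -> port 0x4d1, bit 3 */
            printf("port 0x%x, bit %u\n", 0x4d0 + (irq >> 3), irq & 7);
            return 0;
    }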
+/*
+ * EISA interrupts are always active high and can be edge or level
+ * triggered depending on the ELCR value. If an interrupt is listed as
+ * EISA conforming in the MP table, that means its trigger type must be
+ * read in from the ELCR.
+ */
+static bool eisa_irq_is_level(int idx, int bus, bool level)
{
switch (mp_bus_id_to_type[bus]) {
case MP_BUS_PCI:
case MP_BUS_ISA:
- return trigger;
+ return level;
case MP_BUS_EISA:
- return default_EISA_trigger(idx);
+ return EISA_ELCR(mp_irqs[idx].srcbusirq);
}
pr_warn("IOAPIC: Invalid srcbus: %d defaulting to level\n", bus);
- return IOAPIC_LEVEL;
+ return true;
}
#else
-static inline int eisa_irq_trigger(int idx, int bus, int trigger)
+static inline bool eisa_irq_is_level(int idx, int bus, bool level)
{
- return trigger;
+ return level;
}
#endif
-static int irq_trigger(int idx)
+static bool irq_is_level(int idx)
{
int bus = mp_irqs[idx].srcbus;
- int trigger;
+ bool level;
/*
* Determine IRQ trigger mode (edge or level sensitive):
*/
switch (mp_irqs[idx].irqflag & MP_IRQTRIG_MASK) {
case MP_IRQTRIG_DEFAULT:
- /* conforms to spec, ie. bus-type dependent trigger mode */
- if (test_bit(bus, mp_bus_not_pci))
- trigger = default_ISA_trigger(idx);
- else
- trigger = default_PCI_trigger(idx);
+ /*
+ * Conforms to spec, ie. bus-type dependent trigger
+ * mode. PCI defaults to level, ISA to edge.
+ */
+ level = !test_bit(bus, mp_bus_not_pci);
/* Take EISA into account */
- return eisa_irq_trigger(idx, bus, trigger);
+ return eisa_irq_is_level(idx, bus, level);
case MP_IRQTRIG_EDGE:
- return IOAPIC_EDGE;
+ return false;
case MP_IRQTRIG_RESERVED:
pr_warn("IOAPIC: Invalid trigger mode 2 defaulting to level\n");
fallthrough;
case MP_IRQTRIG_LEVEL:
default: /* Pointless default required due to gcc stupidity */
- return IOAPIC_LEVEL;
+ return true;
}
}
+static int __acpi_get_override_irq(u32 gsi, bool *trigger, bool *polarity)
+{
+ int ioapic, pin, idx;
+
+ if (skip_ioapic_setup)
+ return -1;
+
+ ioapic = mp_find_ioapic(gsi);
+ if (ioapic < 0)
+ return -1;
+
+ pin = mp_find_ioapic_pin(ioapic, gsi);
+ if (pin < 0)
+ return -1;
+
+ idx = find_irq_entry(ioapic, pin, mp_INT);
+ if (idx < 0)
+ return -1;
+
+ *trigger = irq_is_level(idx);
+ *polarity = irq_active_low(idx);
+ return 0;
+}
+
+#ifdef CONFIG_ACPI
+int acpi_get_override_irq(u32 gsi, int *is_level, int *active_low)
+{
+ *is_level = *active_low = 0;
+ return __acpi_get_override_irq(gsi, (bool *)is_level,
+ (bool *)active_low);
+}
+#endif
+
void ioapic_set_alloc_attr(struct irq_alloc_info *info, int node,
int trigger, int polarity)
{
init_irq_alloc_info(info, NULL);
info->type = X86_IRQ_ALLOC_TYPE_IOAPIC;
info->ioapic.node = node;
- info->ioapic.trigger = trigger;
- info->ioapic.polarity = polarity;
+ info->ioapic.is_level = trigger;
+ info->ioapic.active_low = polarity;
info->ioapic.valid = 1;
}
-#ifndef CONFIG_ACPI
-int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity);
-#endif
-
static void ioapic_copy_alloc_attr(struct irq_alloc_info *dst,
struct irq_alloc_info *src,
u32 gsi, int ioapic_idx, int pin)
{
- int trigger, polarity;
+ bool level, pol_low;
copy_irq_alloc_info(dst, src);
dst->type = X86_IRQ_ALLOC_TYPE_IOAPIC;
@@ -883,20 +882,20 @@ static void ioapic_copy_alloc_attr(struct irq_alloc_info *dst,
dst->ioapic.valid = 1;
if (src && src->ioapic.valid) {
dst->ioapic.node = src->ioapic.node;
- dst->ioapic.trigger = src->ioapic.trigger;
- dst->ioapic.polarity = src->ioapic.polarity;
+ dst->ioapic.is_level = src->ioapic.is_level;
+ dst->ioapic.active_low = src->ioapic.active_low;
} else {
dst->ioapic.node = NUMA_NO_NODE;
- if (acpi_get_override_irq(gsi, &trigger, &polarity) >= 0) {
- dst->ioapic.trigger = trigger;
- dst->ioapic.polarity = polarity;
+ if (__acpi_get_override_irq(gsi, &level, &pol_low) >= 0) {
+ dst->ioapic.is_level = level;
+ dst->ioapic.active_low = pol_low;
} else {
/*
* PCI interrupts are always active low level
* triggered.
*/
- dst->ioapic.trigger = IOAPIC_LEVEL;
- dst->ioapic.polarity = IOAPIC_POL_LOW;
+ dst->ioapic.is_level = true;
+ dst->ioapic.active_low = true;
}
}
}
@@ -906,12 +905,12 @@ static int ioapic_alloc_attr_node(struct irq_alloc_info *info)
return (info && info->ioapic.valid) ? info->ioapic.node : NUMA_NO_NODE;
}
-static void mp_register_handler(unsigned int irq, unsigned long trigger)
+static void mp_register_handler(unsigned int irq, bool level)
{
irq_flow_handler_t hdl;
bool fasteoi;
- if (trigger) {
+ if (level) {
irq_set_status_flags(irq, IRQ_LEVEL);
fasteoi = true;
} else {
@@ -933,14 +932,14 @@ static bool mp_check_pin_attr(int irq, struct irq_alloc_info *info)
* pin with real trigger and polarity attributes.
*/
if (irq < nr_legacy_irqs() && data->count == 1) {
- if (info->ioapic.trigger != data->trigger)
- mp_register_handler(irq, info->ioapic.trigger);
- data->entry.trigger = data->trigger = info->ioapic.trigger;
- data->entry.polarity = data->polarity = info->ioapic.polarity;
+ if (info->ioapic.is_level != data->is_level)
+ mp_register_handler(irq, info->ioapic.is_level);
+ data->entry.is_level = data->is_level = info->ioapic.is_level;
+ data->entry.active_low = data->active_low = info->ioapic.active_low;
}
- return data->trigger == info->ioapic.trigger &&
- data->polarity == info->ioapic.polarity;
+ return data->is_level == info->ioapic.is_level &&
+ data->active_low == info->ioapic.active_low;
}
static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi,
@@ -1219,10 +1218,9 @@ void ioapic_zap_locks(void)
static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries)
{
- int i;
- char buf[256];
struct IO_APIC_route_entry entry;
- struct IR_IO_APIC_route_entry *ir_entry = (void *)&entry;
+ char buf[256];
+ int i;
printk(KERN_DEBUG "IOAPIC %d:\n", apic);
for (i = 0; i <= nr_entries; i++) {
@@ -1230,20 +1228,21 @@ static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries)
snprintf(buf, sizeof(buf),
" pin%02x, %s, %s, %s, V(%02X), IRR(%1d), S(%1d)",
i,
- entry.mask == IOAPIC_MASKED ? "disabled" : "enabled ",
- entry.trigger == IOAPIC_LEVEL ? "level" : "edge ",
- entry.polarity == IOAPIC_POL_LOW ? "low " : "high",
+ entry.masked ? "disabled" : "enabled ",
+ entry.is_level ? "level" : "edge ",
+ entry.active_low ? "low " : "high",
entry.vector, entry.irr, entry.delivery_status);
- if (ir_entry->format)
+ if (entry.ir_format) {
printk(KERN_DEBUG "%s, remapped, I(%04X), Z(%X)\n",
- buf, (ir_entry->index2 << 15) | ir_entry->index,
- ir_entry->zero);
- else
- printk(KERN_DEBUG "%s, %s, D(%02X), M(%1d)\n",
buf,
- entry.dest_mode == IOAPIC_DEST_MODE_LOGICAL ?
- "logical " : "physical",
- entry.dest, entry.delivery_mode);
+ (entry.ir_index_15 << 15) | entry.ir_index_0_14,
+ entry.ir_zero);
+ } else {
+ printk(KERN_DEBUG "%s, %s, D(%02X%02X), M(%1d)\n", buf,
+ entry.dest_mode_logical ? "logical " : "physical",
+ entry.virt_destid_8_14, entry.destid_0_7,
+ entry.delivery_mode);
+ }
}
}
@@ -1368,7 +1367,8 @@ void __init enable_IO_APIC(void)
/* If the interrupt line is enabled and in ExtInt mode
* I have found the pin where the i8259 is connected.
*/
- if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
+ if (!entry.masked &&
+ entry.delivery_mode == APIC_DELIVERY_MODE_EXTINT) {
ioapic_i8259.apic = apic;
ioapic_i8259.pin = pin;
goto found_i8259;
@@ -1410,14 +1410,16 @@ void native_restore_boot_irq_mode(void)
*/
if (ioapic_i8259.pin != -1) {
struct IO_APIC_route_entry entry;
+ u32 apic_id = read_apic_id();
memset(&entry, 0, sizeof(entry));
- entry.mask = IOAPIC_UNMASKED;
- entry.trigger = IOAPIC_EDGE;
- entry.polarity = IOAPIC_POL_HIGH;
- entry.dest_mode = IOAPIC_DEST_MODE_PHYSICAL;
- entry.delivery_mode = dest_ExtINT;
- entry.dest = read_apic_id();
+ entry.masked = false;
+ entry.is_level = false;
+ entry.active_low = false;
+ entry.dest_mode_logical = false;
+ entry.delivery_mode = APIC_DELIVERY_MODE_EXTINT;
+ entry.destid_0_7 = apic_id & 0xFF;
+ entry.virt_destid_8_14 = apic_id >> 8;
/*
* Add it to the IO-APIC irq-routing table:
@@ -1618,21 +1620,16 @@ static void __init delay_without_tsc(void)
static int __init timer_irq_works(void)
{
unsigned long t1 = jiffies;
- unsigned long flags;
if (no_timer_check)
return 1;
- local_save_flags(flags);
local_irq_enable();
-
if (boot_cpu_has(X86_FEATURE_TSC))
delay_with_tsc();
else
delay_without_tsc();
- local_irq_restore(flags);
-
/*
* Expect a few ticks at least, to be sure some possible
* glue logic does not lock up after one or two first
@@ -1641,10 +1638,10 @@ static int __init timer_irq_works(void)
* least one tick may be lost due to delays.
*/
- /* jiffies wrap? */
- if (time_after(jiffies, t1 + 4))
- return 1;
- return 0;
+ local_irq_disable();
+
+ /* Did jiffies advance? */
+ return time_after(jiffies, t1 + 4);
}
/*
@@ -1696,13 +1693,13 @@ static bool io_apic_level_ack_pending(struct mp_chip_data *data)
raw_spin_lock_irqsave(&ioapic_lock, flags);
for_each_irq_pin(entry, data->irq_2_pin) {
- unsigned int reg;
+ struct IO_APIC_route_entry e;
int pin;
pin = entry->pin;
- reg = io_apic_read(entry->apic, 0x10 + pin*2);
+ e.w1 = io_apic_read(entry->apic, 0x10 + pin*2);
/* Is the remote IRR bit set? */
- if (reg & IO_APIC_REDIR_REMOTE_IRR) {
+ if (e.irr) {
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
return true;
}
@@ -1849,21 +1846,62 @@ static void ioapic_ir_ack_level(struct irq_data *irq_data)
eoi_ioapic_pin(data->entry.vector, data);
}
+/*
+ * The I/OAPIC is just a device for generating MSI messages from legacy
+ * interrupt pins. Various fields of the RTE translate into bits of the
+ * resulting MSI which had a historical meaning.
+ *
+ * With interrupt remapping, many of those bits have different meanings
+ * in the underlying MSI, but the way that the I/OAPIC transforms them
+ * from its RTE to the MSI message is the same. This function allows
+ * the parent IRQ domain to compose the MSI message, then takes the
+ * relevant bits to put them in the appropriate places in the RTE in
+ * order to generate that message when the IRQ happens.
+ *
+ * The setup here relies on a preconfigured route entry (is_level,
+ * active_low, masked) because the parent domain is merely composing the
+ * generic message routing information which is used for the MSI.
+ */
+static void ioapic_setup_msg_from_msi(struct irq_data *irq_data,
+ struct IO_APIC_route_entry *entry)
+{
+ struct msi_msg msg;
+
+ /* Let the parent domain compose the MSI message */
+ irq_chip_compose_msi_msg(irq_data, &msg);
+
+ /*
+ * - Real vector
+ * - DMAR/IR: 8bit subhandle (ioapic.pin)
+ * - AMD/IR: 8bit IRTE index
+ */
+ entry->vector = msg.arch_data.vector;
+ /* Delivery mode (for DMAR/IR all 0) */
+ entry->delivery_mode = msg.arch_data.delivery_mode;
+ /* Destination mode or DMAR/IR index bit 15 */
+ entry->dest_mode_logical = msg.arch_addr_lo.dest_mode_logical;
+ /* DMAR/IR: 1, 0 for all other modes */
+ entry->ir_format = msg.arch_addr_lo.dmar_format;
+ /*
+ * - DMAR/IR: index bit 0-14.
+ *
+ * - Virt: If the host supports x2apic without a virtualized IR
+ * unit, then bits 0-6 of dmar_index_0_14 provide bits
+ * 8-14 of the destination id.
+ *
+ * All other modes have bits 0-6 of dmar_index_0_14 cleared and the
+ * topmost 8 bits are destination id bits 0-7 (entry::destid_0_7).
+ */
+ entry->ir_index_0_14 = msg.arch_addr_lo.dmar_index_0_14;
+}
+
static void ioapic_configure_entry(struct irq_data *irqd)
{
struct mp_chip_data *mpd = irqd->chip_data;
- struct irq_cfg *cfg = irqd_cfg(irqd);
struct irq_pin_list *entry;
- /*
- * Only update when the parent is the vector domain, don't touch it
- * if the parent is the remapping domain. Check the installed
- * ioapic chip to verify that.
- */
- if (irqd->chip == &ioapic_chip) {
- mpd->entry.dest = cfg->dest_apicid;
- mpd->entry.vector = cfg->vector;
- }
+ ioapic_setup_msg_from_msi(irqd, &mpd->entry);
+
for_each_irq_pin(entry, mpd->irq_2_pin)
__ioapic_write_entry(entry->apic, entry->pin, mpd->entry);
}
@@ -1919,7 +1957,7 @@ static int ioapic_irq_get_chip_state(struct irq_data *irqd,
* irrelevant because the IO-APIC treats them as fire and
* forget.
*/
- if (rentry.irr && rentry.trigger) {
+ if (rentry.irr && rentry.is_level) {
*state = true;
break;
}
@@ -2027,6 +2065,7 @@ static inline void __init unlock_ExtINT_logic(void)
int apic, pin, i;
struct IO_APIC_route_entry entry0, entry1;
unsigned char save_control, save_freq_select;
+ u32 apic_id;
pin = find_isa_irq_pin(8, mp_INT);
if (pin == -1) {
@@ -2042,14 +2081,16 @@ static inline void __init unlock_ExtINT_logic(void)
entry0 = ioapic_read_entry(apic, pin);
clear_IO_APIC_pin(apic, pin);
+ apic_id = hard_smp_processor_id();
memset(&entry1, 0, sizeof(entry1));
- entry1.dest_mode = IOAPIC_DEST_MODE_PHYSICAL;
- entry1.mask = IOAPIC_UNMASKED;
- entry1.dest = hard_smp_processor_id();
- entry1.delivery_mode = dest_ExtINT;
- entry1.polarity = entry0.polarity;
- entry1.trigger = IOAPIC_EDGE;
+ entry1.dest_mode_logical = true;
+ entry1.masked = false;
+ entry1.destid_0_7 = apic_id & 0xFF;
+ entry1.virt_destid_8_14 = apic_id >> 8;
+ entry1.delivery_mode = APIC_DELIVERY_MODE_EXTINT;
+ entry1.active_low = entry0.active_low;
+ entry1.is_level = false;
entry1.vector = 0;
ioapic_write_entry(apic, pin, entry1);
@@ -2117,13 +2158,12 @@ static inline void __init check_timer(void)
struct irq_cfg *cfg = irqd_cfg(irq_data);
int node = cpu_to_node(0);
int apic1, pin1, apic2, pin2;
- unsigned long flags;
int no_pin1 = 0;
if (!global_clock_event)
return;
- local_irq_save(flags);
+ local_irq_disable();
/*
* get/set the timer IRQ vector:
@@ -2178,9 +2218,9 @@ static inline void __init check_timer(void)
* so only need to unmask if it is level-trigger
* do we really have level trigger timer?
*/
- int idx;
- idx = find_irq_entry(apic1, pin1, mp_INT);
- if (idx != -1 && irq_trigger(idx))
+ int idx = find_irq_entry(apic1, pin1, mp_INT);
+
+ if (idx != -1 && irq_is_level(idx))
unmask_ioapic_irq(irq_get_irq_data(0));
}
irq_domain_deactivate_irq(irq_data);
@@ -2191,7 +2231,6 @@ static inline void __init check_timer(void)
goto out;
}
panic_if_irq_remap("timer doesn't work through Interrupt-remapped IO-APIC");
- local_irq_disable();
clear_IO_APIC_pin(apic1, pin1);
if (!no_pin1)
apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
@@ -2215,7 +2254,6 @@ static inline void __init check_timer(void)
/*
* Cleanup, just in case ...
*/
- local_irq_disable();
legacy_pic->mask(0);
clear_IO_APIC_pin(apic2, pin2);
apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
@@ -2232,7 +2270,6 @@ static inline void __init check_timer(void)
apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
goto out;
}
- local_irq_disable();
legacy_pic->mask(0);
apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
@@ -2251,7 +2288,6 @@ static inline void __init check_timer(void)
apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
goto out;
}
- local_irq_disable();
apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
if (apic_is_x2apic_enabled())
apic_printk(APIC_QUIET, KERN_INFO
@@ -2260,7 +2296,7 @@ static inline void __init check_timer(void)
panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
"report. Then try booting with the 'noapic' option.\n");
out:
- local_irq_restore(flags);
+ local_irq_enable();
}
/*
@@ -2284,36 +2320,37 @@ out:
static int mp_irqdomain_create(int ioapic)
{
- struct irq_alloc_info info;
struct irq_domain *parent;
int hwirqs = mp_ioapic_pin_count(ioapic);
struct ioapic *ip = &ioapics[ioapic];
struct ioapic_domain_cfg *cfg = &ip->irqdomain_cfg;
struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic);
struct fwnode_handle *fn;
- char *name = "IO-APIC";
+ struct irq_fwspec fwspec;
if (cfg->type == IOAPIC_DOMAIN_INVALID)
return 0;
- init_irq_alloc_info(&info, NULL);
- info.type = X86_IRQ_ALLOC_TYPE_IOAPIC_GET_PARENT;
- info.devid = mpc_ioapic_id(ioapic);
- parent = irq_remapping_get_irq_domain(&info);
- if (!parent)
- parent = x86_vector_domain;
- else
- name = "IO-APIC-IR";
-
/* Handle device tree enumerated APICs proper */
if (cfg->dev) {
fn = of_node_to_fwnode(cfg->dev);
} else {
- fn = irq_domain_alloc_named_id_fwnode(name, ioapic);
+ fn = irq_domain_alloc_named_id_fwnode("IO-APIC", mpc_ioapic_id(ioapic));
if (!fn)
return -ENOMEM;
}
+ fwspec.fwnode = fn;
+ fwspec.param_count = 1;
+ fwspec.param[0] = mpc_ioapic_id(ioapic);
+
+ parent = irq_find_matching_fwspec(&fwspec, DOMAIN_BUS_ANY);
+ if (!parent) {
+ if (!cfg->dev)
+ irq_domain_free_fwnode(fn);
+ return -ENODEV;
+ }
+
ip->irqdomain = irq_domain_create_linear(fn, hwirqs, cfg->ops,
(void *)(long)ioapic);
@@ -2587,30 +2624,6 @@ static int io_apic_get_version(int ioapic)
return reg_01.bits.version;
}
-int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity)
-{
- int ioapic, pin, idx;
-
- if (skip_ioapic_setup)
- return -1;
-
- ioapic = mp_find_ioapic(gsi);
- if (ioapic < 0)
- return -1;
-
- pin = mp_find_ioapic_pin(ioapic, gsi);
- if (pin < 0)
- return -1;
-
- idx = find_irq_entry(ioapic, pin, mp_INT);
- if (idx < 0)
- return -1;
-
- *trigger = irq_trigger(idx);
- *polarity = irq_polarity(idx);
- return 0;
-}
-
/*
* This function updates target affinity of IOAPIC interrupts to include
* the CPUs which came online during SMP bringup.
@@ -2934,44 +2947,49 @@ static void mp_irqdomain_get_attr(u32 gsi, struct mp_chip_data *data,
struct irq_alloc_info *info)
{
if (info && info->ioapic.valid) {
- data->trigger = info->ioapic.trigger;
- data->polarity = info->ioapic.polarity;
- } else if (acpi_get_override_irq(gsi, &data->trigger,
- &data->polarity) < 0) {
+ data->is_level = info->ioapic.is_level;
+ data->active_low = info->ioapic.active_low;
+ } else if (__acpi_get_override_irq(gsi, &data->is_level,
+ &data->active_low) < 0) {
/* PCI interrupts are always active low level triggered. */
- data->trigger = IOAPIC_LEVEL;
- data->polarity = IOAPIC_POL_LOW;
+ data->is_level = true;
+ data->active_low = true;
}
}
-static void mp_setup_entry(struct irq_cfg *cfg, struct mp_chip_data *data,
- struct IO_APIC_route_entry *entry)
+/*
+ * Configure the I/O-APIC specific fields in the routing entry.
+ *
+ * This is important to set up the I/O-APIC specific bits (is_level,
+ * active_low, masked) because the underlying parent domain will only
+ * provide the routing information and is oblivious of the I/O-APIC
+ * specific bits.
+ *
+ * The entry is just preconfigured at this point and not written into the
+ * RTE. This happens later during activation which will fill in the actual
+ * routing information.
+ */
+static void mp_preconfigure_entry(struct mp_chip_data *data)
{
+ struct IO_APIC_route_entry *entry = &data->entry;
+
memset(entry, 0, sizeof(*entry));
- entry->delivery_mode = apic->irq_delivery_mode;
- entry->dest_mode = apic->irq_dest_mode;
- entry->dest = cfg->dest_apicid;
- entry->vector = cfg->vector;
- entry->trigger = data->trigger;
- entry->polarity = data->polarity;
+ entry->is_level = data->is_level;
+ entry->active_low = data->active_low;
/*
* Mask level triggered irqs. Edge triggered irqs are masked
* by the irq core code in case they fire.
*/
- if (data->trigger == IOAPIC_LEVEL)
- entry->mask = IOAPIC_MASKED;
- else
- entry->mask = IOAPIC_UNMASKED;
+ entry->masked = data->is_level;
}
int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
unsigned int nr_irqs, void *arg)
{
- int ret, ioapic, pin;
- struct irq_cfg *cfg;
- struct irq_data *irq_data;
- struct mp_chip_data *data;
struct irq_alloc_info *info = arg;
+ struct mp_chip_data *data;
+ struct irq_data *irq_data;
+ int ret, ioapic, pin;
unsigned long flags;
if (!info || nr_irqs > 1)
@@ -2989,7 +3007,6 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
if (!data)
return -ENOMEM;
- info->ioapic.entry = &data->entry;
ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info);
if (ret < 0) {
kfree(data);
@@ -3003,22 +3020,20 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
irq_data->chip_data = data;
mp_irqdomain_get_attr(mp_pin_to_gsi(ioapic, pin), data, info);
- cfg = irqd_cfg(irq_data);
add_pin_to_irq_node(data, ioapic_alloc_attr_node(info), ioapic, pin);
+ mp_preconfigure_entry(data);
+ mp_register_handler(virq, data->is_level);
+
local_irq_save(flags);
- if (info->ioapic.entry)
- mp_setup_entry(cfg, data, info->ioapic.entry);
- mp_register_handler(virq, data->trigger);
if (virq < nr_legacy_irqs())
legacy_pic->mask(virq);
local_irq_restore(flags);
apic_printk(APIC_VERBOSE, KERN_DEBUG
- "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i Dest:%d)\n",
- ioapic, mpc_ioapic_id(ioapic), pin, cfg->vector,
- virq, data->trigger, data->polarity, cfg->dest_apicid);
-
+ "IOAPIC[%d]: Preconfigured routing entry (%d-%d -> IRQ %d Level:%i ActiveLow:%i)\n",
+ ioapic, mpc_ioapic_id(ioapic), pin, virq,
+ data->is_level, data->active_low);
return 0;
}
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
index 387154e39e08..d1fb874fbe64 100644
--- a/arch/x86/kernel/apic/ipi.c
+++ b/arch/x86/kernel/apic/ipi.c
@@ -260,7 +260,7 @@ void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
for_each_cpu(query_cpu, mask)
__default_send_IPI_dest_field(
early_per_cpu(x86_cpu_to_logical_apicid, query_cpu),
- vector, apic->dest_logical);
+ vector, APIC_DEST_LOGICAL);
local_irq_restore(flags);
}
@@ -279,7 +279,7 @@ void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask,
continue;
__default_send_IPI_dest_field(
early_per_cpu(x86_cpu_to_logical_apicid, query_cpu),
- vector, apic->dest_logical);
+ vector, APIC_DEST_LOGICAL);
}
local_irq_restore(flags);
}
@@ -297,7 +297,7 @@ void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector)
local_irq_save(flags);
WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]);
- __default_send_IPI_dest_field(mask, vector, apic->dest_logical);
+ __default_send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL);
local_irq_restore(flags);
}
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index 6313f0a05db7..44ebe25e7703 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -15,7 +15,6 @@
#include <linux/hpet.h>
#include <linux/msi.h>
#include <asm/irqdomain.h>
-#include <asm/msidef.h>
#include <asm/hpet.h>
#include <asm/hw_irq.h>
#include <asm/apic.h>
@@ -23,38 +22,11 @@
struct irq_domain *x86_pci_msi_default_domain __ro_after_init;
-static void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg)
-{
- msg->address_hi = MSI_ADDR_BASE_HI;
-
- if (x2apic_enabled())
- msg->address_hi |= MSI_ADDR_EXT_DEST_ID(cfg->dest_apicid);
-
- msg->address_lo =
- MSI_ADDR_BASE_LO |
- ((apic->irq_dest_mode == 0) ?
- MSI_ADDR_DEST_MODE_PHYSICAL :
- MSI_ADDR_DEST_MODE_LOGICAL) |
- MSI_ADDR_REDIRECTION_CPU |
- MSI_ADDR_DEST_ID(cfg->dest_apicid);
-
- msg->data =
- MSI_DATA_TRIGGER_EDGE |
- MSI_DATA_LEVEL_ASSERT |
- MSI_DATA_DELIVERY_FIXED |
- MSI_DATA_VECTOR(cfg->vector);
-}
-
-void x86_vector_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
-{
- __irq_msi_compose_msg(irqd_cfg(data), msg);
-}
-
static void irq_msi_update_msg(struct irq_data *irqd, struct irq_cfg *cfg)
{
struct msi_msg msg[2] = { [1] = { }, };
- __irq_msi_compose_msg(cfg, msg);
+ __irq_msi_compose_msg(cfg, msg, false);
irq_data_get_irq_chip(irqd)->irq_write_msi_msg(irqd, msg);
}
@@ -276,6 +248,17 @@ struct irq_domain *arch_create_remap_msi_irq_domain(struct irq_domain *parent,
#endif
#ifdef CONFIG_DMAR_TABLE
+/*
+ * The Intel IOMMU (ab)uses the high bits of the MSI address to contain the
+ * high bits of the destination APIC ID. This can't be done in the general
+ * case for MSIs as it would be targeting real memory above 4GiB not the
+ * APIC.
+ */
+static void dmar_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
+{
+ __irq_msi_compose_msg(irqd_cfg(data), msg, true);
+}
+
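For context on the new @dmar argument: the shared __irq_msi_compose_msg() helper (moved out of this file by this series) places only the low 8 bits of the destination APIC ID into the MSI address, while the DMAR path may carry the remaining bits in address_hi, as the comment above explains. Below is a minimal standalone sketch of that packing; the 0xfee00000 base is the architectural MSI target window, but the exact destination-ID bit positions are illustrative assumptions, not the kernel's msidef macros.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct msi_msg_sketch {
	uint32_t address_lo;
	uint32_t address_hi;
	uint32_t data;
};

/*
 * Illustrative packing only: bits 19:12 of address_lo carry the low 8 bits
 * of the destination APIC ID; the DMAR-only path additionally stores the
 * remaining APIC ID bits in address_hi, which plain MSI cannot do because
 * a non-zero address_hi would target real memory above 4GiB.
 */
static void compose_msi_sketch(struct msi_msg_sketch *msg, uint32_t apicid,
			       uint8_t vector, bool dmar)
{
	msg->address_lo = 0xfee00000u | ((apicid & 0xffu) << 12);
	msg->address_hi = dmar ? (apicid >> 8) : 0;
	msg->data = vector;
}

int main(void)
{
	struct msi_msg_sketch m;

	compose_msi_sketch(&m, 0x12345, 0x41, true);
	printf("lo=%08x hi=%08x data=%02x\n", (unsigned)m.address_lo,
	       (unsigned)m.address_hi, (unsigned)m.data);
	return 0;
}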
static void dmar_msi_write_msg(struct irq_data *data, struct msi_msg *msg)
{
dmar_msi_write(data->irq, msg);
@@ -288,6 +271,7 @@ static struct irq_chip dmar_msi_controller = {
.irq_ack = irq_chip_ack_parent,
.irq_set_affinity = msi_domain_set_affinity,
.irq_retrigger = irq_chip_retrigger_hierarchy,
+ .irq_compose_msi_msg = dmar_msi_compose_msg,
.irq_write_msi_msg = dmar_msi_write_msg,
.flags = IRQCHIP_SKIP_SET_WAKE,
};
@@ -356,114 +340,3 @@ void dmar_free_hwirq(int irq)
irq_domain_free_irqs(irq, 1);
}
#endif
-
-/*
- * MSI message composition
- */
-#ifdef CONFIG_HPET_TIMER
-static inline int hpet_dev_id(struct irq_domain *domain)
-{
- struct msi_domain_info *info = msi_get_domain_info(domain);
-
- return (int)(long)info->data;
-}
-
-static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg)
-{
- hpet_msi_write(irq_data_get_irq_handler_data(data), msg);
-}
-
-static struct irq_chip hpet_msi_controller __ro_after_init = {
- .name = "HPET-MSI",
- .irq_unmask = hpet_msi_unmask,
- .irq_mask = hpet_msi_mask,
- .irq_ack = irq_chip_ack_parent,
- .irq_set_affinity = msi_domain_set_affinity,
- .irq_retrigger = irq_chip_retrigger_hierarchy,
- .irq_write_msi_msg = hpet_msi_write_msg,
- .flags = IRQCHIP_SKIP_SET_WAKE,
-};
-
-static int hpet_msi_init(struct irq_domain *domain,
- struct msi_domain_info *info, unsigned int virq,
- irq_hw_number_t hwirq, msi_alloc_info_t *arg)
-{
- irq_set_status_flags(virq, IRQ_MOVE_PCNTXT);
- irq_domain_set_info(domain, virq, arg->hwirq, info->chip, NULL,
- handle_edge_irq, arg->data, "edge");
-
- return 0;
-}
-
-static void hpet_msi_free(struct irq_domain *domain,
- struct msi_domain_info *info, unsigned int virq)
-{
- irq_clear_status_flags(virq, IRQ_MOVE_PCNTXT);
-}
-
-static struct msi_domain_ops hpet_msi_domain_ops = {
- .msi_init = hpet_msi_init,
- .msi_free = hpet_msi_free,
-};
-
-static struct msi_domain_info hpet_msi_domain_info = {
- .ops = &hpet_msi_domain_ops,
- .chip = &hpet_msi_controller,
- .flags = MSI_FLAG_USE_DEF_DOM_OPS,
-};
-
-struct irq_domain *hpet_create_irq_domain(int hpet_id)
-{
- struct msi_domain_info *domain_info;
- struct irq_domain *parent, *d;
- struct irq_alloc_info info;
- struct fwnode_handle *fn;
-
- if (x86_vector_domain == NULL)
- return NULL;
-
- domain_info = kzalloc(sizeof(*domain_info), GFP_KERNEL);
- if (!domain_info)
- return NULL;
-
- *domain_info = hpet_msi_domain_info;
- domain_info->data = (void *)(long)hpet_id;
-
- init_irq_alloc_info(&info, NULL);
- info.type = X86_IRQ_ALLOC_TYPE_HPET_GET_PARENT;
- info.devid = hpet_id;
- parent = irq_remapping_get_irq_domain(&info);
- if (parent == NULL)
- parent = x86_vector_domain;
- else
- hpet_msi_controller.name = "IR-HPET-MSI";
-
- fn = irq_domain_alloc_named_id_fwnode(hpet_msi_controller.name,
- hpet_id);
- if (!fn) {
- kfree(domain_info);
- return NULL;
- }
-
- d = msi_create_irq_domain(fn, domain_info, parent);
- if (!d) {
- irq_domain_free_fwnode(fn);
- kfree(domain_info);
- }
- return d;
-}
-
-int hpet_assign_irq(struct irq_domain *domain, struct hpet_channel *hc,
- int dev_num)
-{
- struct irq_alloc_info info;
-
- init_irq_alloc_info(&info, NULL);
- info.type = X86_IRQ_ALLOC_TYPE_HPET;
- info.data = hc;
- info.devid = hpet_dev_id(domain);
- info.hwirq = dev_num;
-
- return irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, &info);
-}
-#endif
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 67b6f7c049ec..a61f642b1b90 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -69,16 +69,13 @@ static struct apic apic_default __ro_after_init = {
.apic_id_valid = default_apic_id_valid,
.apic_id_registered = default_apic_id_registered,
- .irq_delivery_mode = dest_Fixed,
- /* logical delivery broadcast to all CPUs: */
- .irq_dest_mode = 1,
+ .delivery_mode = APIC_DELIVERY_MODE_FIXED,
+ .dest_mode_logical = true,
.disable_esr = 0,
- .dest_logical = APIC_DEST_LOGICAL,
- .check_apicid_used = default_check_apicid_used,
+ .check_apicid_used = default_check_apicid_used,
.init_apic_ldr = default_init_apic_ldr,
-
.ioapic_phys_id_map = default_ioapic_phys_id_map,
.setup_apic_routing = setup_apic_flat_routing,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 1eac53632786..3c9c7492252f 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -273,20 +273,24 @@ static int assign_irq_vector_any_locked(struct irq_data *irqd)
const struct cpumask *affmsk = irq_data_get_affinity_mask(irqd);
int node = irq_data_get_node(irqd);
- if (node == NUMA_NO_NODE)
- goto all;
- /* Try the intersection of @affmsk and node mask */
- cpumask_and(vector_searchmask, cpumask_of_node(node), affmsk);
- if (!assign_vector_locked(irqd, vector_searchmask))
- return 0;
- /* Try the node mask */
- if (!assign_vector_locked(irqd, cpumask_of_node(node)))
- return 0;
-all:
+ if (node != NUMA_NO_NODE) {
+ /* Try the intersection of @affmsk and node mask */
+ cpumask_and(vector_searchmask, cpumask_of_node(node), affmsk);
+ if (!assign_vector_locked(irqd, vector_searchmask))
+ return 0;
+ }
+
/* Try the full affinity mask */
cpumask_and(vector_searchmask, affmsk, cpu_online_mask);
if (!assign_vector_locked(irqd, vector_searchmask))
return 0;
+
+ if (node != NUMA_NO_NODE) {
+ /* Try the node mask */
+ if (!assign_vector_locked(irqd, cpumask_of_node(node)))
+ return 0;
+ }
+
/* Try the full online mask */
return assign_vector_locked(irqd, cpu_online_mask);
}
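The hunk above changes the fallback order when picking a target CPU for a vector: the intersection of the affinity and node masks is tried first, then the online-restricted affinity mask, then the node mask, and only then the full online mask. A small standalone sketch of that search order, using plain bitmasks in place of cpumasks (the masks and the success criterion here are made up for illustration):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Pretend a vector can be assigned if the candidate mask contains a free CPU. */
static bool try_assign(uint64_t candidate, uint64_t free_cpus, const char *what)
{
	if (candidate & free_cpus) {
		printf("assigned using %s mask\n", what);
		return true;
	}
	return false;
}

static bool assign_any(uint64_t affinity, uint64_t node, uint64_t online,
		       uint64_t free_cpus, bool have_node)
{
	if (have_node && try_assign(affinity & node, free_cpus, "affinity & node"))
		return true;
	if (try_assign(affinity & online, free_cpus, "affinity & online"))
		return true;
	if (have_node && try_assign(node, free_cpus, "node"))
		return true;
	return try_assign(online, free_cpus, "online");
}

int main(void)
{
	/* CPUs 0-3 online, node covers CPUs 0-1, affinity asks for 2-3, only CPU 1 free. */
	assign_any(0xc, 0x3, 0xf, 0x2, true);	/* falls through to the node mask */
	return 0;
}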
@@ -636,7 +640,50 @@ static void x86_vector_debug_show(struct seq_file *m, struct irq_domain *d,
}
#endif
+int x86_fwspec_is_ioapic(struct irq_fwspec *fwspec)
+{
+ if (fwspec->param_count != 1)
+ return 0;
+
+ if (is_fwnode_irqchip(fwspec->fwnode)) {
+ const char *fwname = fwnode_get_name(fwspec->fwnode);
+ return fwname && !strncmp(fwname, "IO-APIC-", 8) &&
+ simple_strtol(fwname+8, NULL, 10) == fwspec->param[0];
+ }
+ return to_of_node(fwspec->fwnode) &&
+ of_device_is_compatible(to_of_node(fwspec->fwnode),
+ "intel,ce4100-ioapic");
+}
+
+int x86_fwspec_is_hpet(struct irq_fwspec *fwspec)
+{
+ if (fwspec->param_count != 1)
+ return 0;
+
+ if (is_fwnode_irqchip(fwspec->fwnode)) {
+ const char *fwname = fwnode_get_name(fwspec->fwnode);
+ return fwname && !strncmp(fwname, "HPET-MSI-", 9) &&
+ simple_strtol(fwname+9, NULL, 10) == fwspec->param[0];
+ }
+ return 0;
+}
+
+static int x86_vector_select(struct irq_domain *d, struct irq_fwspec *fwspec,
+ enum irq_domain_bus_token bus_token)
+{
+ /*
+ * HPET and I/OAPIC cannot be parented in the vector domain
+ * if IRQ remapping is enabled. APIC IDs above 15 bits are
+ * only permitted if IRQ remapping is enabled, so check that.
+ */
+ if (apic->apic_id_valid(32768))
+ return 0;
+
+ return x86_fwspec_is_ioapic(fwspec) || x86_fwspec_is_hpet(fwspec);
+}
+
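The two matcher helpers above identify a parent candidate purely by its firmware-node name prefix plus the encoded id, and the new .select() callback uses them to decide whether the vector domain may serve as the parent (only when IRQ remapping is not in use). A user-space sketch of the same string test, with the prefixes taken from the helpers above and everything else illustrative:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Return true if @name looks like "<prefix><id>" for the given id. */
static bool fwname_matches(const char *name, const char *prefix, long id)
{
	size_t plen = strlen(prefix);

	if (!name || strncmp(name, prefix, plen) != 0)
		return false;
	return strtol(name + plen, NULL, 10) == id;
}

int main(void)
{
	printf("%d\n", fwname_matches("IO-APIC-2", "IO-APIC-", 2));	/* 1 */
	printf("%d\n", fwname_matches("HPET-MSI-0", "HPET-MSI-", 3));	/* 0 */
	return 0;
}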
static const struct irq_domain_ops x86_vector_domain_ops = {
+ .select = x86_vector_select,
.alloc = x86_vector_alloc_irqs,
.free = x86_vector_free_irqs,
.activate = x86_vector_activate,
@@ -818,6 +865,12 @@ void apic_ack_edge(struct irq_data *irqd)
apic_ack_irq(irqd);
}
+static void x86_vector_msi_compose_msg(struct irq_data *data,
+ struct msi_msg *msg)
+{
+ __irq_msi_compose_msg(irqd_cfg(data), msg, false);
+}
+
static struct irq_chip lapic_controller = {
.name = "APIC",
.irq_ack = apic_ack_edge,
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index b0889c48a2ac..f4da9bb69a88 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -29,7 +29,8 @@ static void x2apic_send_IPI(int cpu, int vector)
{
u32 dest = per_cpu(x86_cpu_to_logical_apicid, cpu);
- x2apic_wrmsr_fence();
+ /* x2apic MSRs are special and need a special fence: */
+ weak_wrmsr_fence();
__x2apic_send_IPI_dest(dest, vector, APIC_DEST_LOGICAL);
}
@@ -41,7 +42,8 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
unsigned long flags;
u32 dest;
- x2apic_wrmsr_fence();
+ /* x2apic MSRs are special and need a special fence: */
+ weak_wrmsr_fence();
local_irq_save(flags);
tmpmsk = this_cpu_cpumask_var_ptr(ipi_mask);
@@ -61,7 +63,7 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
if (!dest)
continue;
- __x2apic_send_IPI_dest(dest, vector, apic->dest_logical);
+ __x2apic_send_IPI_dest(dest, vector, APIC_DEST_LOGICAL);
/* Remove cluster CPUs from tmpmask */
cpumask_andnot(tmpmsk, tmpmsk, &cmsk->mask);
}
@@ -184,15 +186,13 @@ static struct apic apic_x2apic_cluster __ro_after_init = {
.apic_id_valid = x2apic_apic_id_valid,
.apic_id_registered = x2apic_apic_id_registered,
- .irq_delivery_mode = dest_Fixed,
- .irq_dest_mode = 1, /* logical */
+ .delivery_mode = APIC_DELIVERY_MODE_FIXED,
+ .dest_mode_logical = true,
.disable_esr = 0,
- .dest_logical = APIC_DEST_LOGICAL,
- .check_apicid_used = NULL,
+ .check_apicid_used = NULL,
.init_apic_ldr = init_x2apic_ldr,
-
.ioapic_phys_id_map = NULL,
.setup_apic_routing = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index bc9693841353..6bde05a86b4e 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -8,6 +8,12 @@
int x2apic_phys;
static struct apic apic_x2apic_phys;
+static u32 x2apic_max_apicid __ro_after_init;
+
+void __init x2apic_set_max_apicid(u32 apicid)
+{
+ x2apic_max_apicid = apicid;
+}
static int __init set_x2apic_phys_mode(char *arg)
{
@@ -37,7 +43,8 @@ static void x2apic_send_IPI(int cpu, int vector)
{
u32 dest = per_cpu(x86_cpu_to_apicid, cpu);
- x2apic_wrmsr_fence();
+ /* x2apic MSRs are special and need a special fence: */
+ weak_wrmsr_fence();
__x2apic_send_IPI_dest(dest, vector, APIC_DEST_PHYSICAL);
}
@@ -48,7 +55,8 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
unsigned long this_cpu;
unsigned long flags;
- x2apic_wrmsr_fence();
+ /* x2apic MSRs are special and need a special fence: */
+ weak_wrmsr_fence();
local_irq_save(flags);
@@ -98,6 +106,9 @@ static int x2apic_phys_probe(void)
/* Common x2apic functions, also used by x2apic_cluster */
int x2apic_apic_id_valid(u32 apicid)
{
+ if (x2apic_max_apicid && apicid > x2apic_max_apicid)
+ return 0;
+
return 1;
}
@@ -116,7 +127,8 @@ void __x2apic_send_IPI_shorthand(int vector, u32 which)
{
unsigned long cfg = __prepare_ICR(which, vector, 0);
- x2apic_wrmsr_fence();
+ /* x2apic MSRs are special and need a special fence: */
+ weak_wrmsr_fence();
native_x2apic_icr_write(cfg, 0);
}
@@ -148,15 +160,13 @@ static struct apic apic_x2apic_phys __ro_after_init = {
.apic_id_valid = x2apic_apic_id_valid,
.apic_id_registered = x2apic_apic_id_registered,
- .irq_delivery_mode = dest_Fixed,
- .irq_dest_mode = 0, /* physical */
+ .delivery_mode = APIC_DELIVERY_MODE_FIXED,
+ .dest_mode_logical = false,
.disable_esr = 0,
- .dest_logical = 0,
- .check_apicid_used = NULL,
+ .check_apicid_used = NULL,
.init_apic_ldr = init_x2apic_ldr,
-
.ioapic_phys_id_map = NULL,
.setup_apic_routing = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 714233cee0b5..52bc217ca8c3 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -33,7 +33,7 @@ static union uvh_apicid uvh_apicid;
static int uv_node_id;
/* Unpack AT/OEM/TABLE ID's to be NULL terminated strings */
-static u8 uv_archtype[UV_AT_SIZE];
+static u8 uv_archtype[UV_AT_SIZE + 1];
static u8 oem_id[ACPI_OEM_ID_SIZE + 1];
static u8 oem_table_id[ACPI_OEM_TABLE_ID_SIZE + 1];
@@ -161,7 +161,7 @@ static int __init early_set_hub_type(void)
/* UV4/4A only have a revision difference */
case UV4_HUB_PART_NUMBER:
uv_min_hub_revision_id = node_id.s.revision
- + UV4_HUB_REVISION_BASE;
+ + UV4_HUB_REVISION_BASE - 1;
uv_hub_type_set(UV4);
if (uv_min_hub_revision_id == UV4A_HUB_REVISION_BASE)
uv_hub_type_set(UV4|UV4A);
@@ -290,6 +290,9 @@ static void __init uv_stringify(int len, char *to, char *from)
{
/* Relies on 'to' being NULL chars so result will be NULL terminated */
strncpy(to, from, len-1);
+
+ /* Trim trailing spaces */
+ (void)strim(to);
}
/* Find UV arch type entry in UVsystab */
@@ -317,7 +320,7 @@ static int __init decode_arch_type(unsigned long ptr)
if (n > 0 && n < sizeof(uv_ate->archtype)) {
pr_info("UV: UVarchtype received from BIOS\n");
- uv_stringify(UV_AT_SIZE, uv_archtype, uv_ate->archtype);
+ uv_stringify(sizeof(uv_archtype), uv_archtype, uv_ate->archtype);
return 1;
}
return 0;
@@ -366,7 +369,7 @@ static int __init early_get_arch_type(void)
return ret;
}
-static int __init uv_set_system_type(char *_oem_id)
+static int __init uv_set_system_type(char *_oem_id, char *_oem_table_id)
{
/* Save OEM_ID passed from ACPI MADT */
uv_stringify(sizeof(oem_id), oem_id, _oem_id);
@@ -375,7 +378,7 @@ static int __init uv_set_system_type(char *_oem_id)
if (!early_get_arch_type())
/* If not, use OEM ID for UVarchtype */
- uv_stringify(UV_AT_SIZE, uv_archtype, _oem_id);
+ uv_stringify(sizeof(uv_archtype), uv_archtype, oem_id);
/* Check if not hubbed */
if (strncmp(uv_archtype, "SGI", 3) != 0) {
@@ -386,13 +389,23 @@ static int __init uv_set_system_type(char *_oem_id)
/* (Not hubless), not a UV */
return 0;
+ /* Is UV hubless system */
+ uv_hubless_system = 0x01;
+
+ /* UV5 Hubless */
+ if (strncmp(uv_archtype, "NSGI5", 5) == 0)
+ uv_hubless_system |= 0x20;
+
/* UV4 Hubless: CH */
- if (strncmp(uv_archtype, "NSGI4", 5) == 0)
- uv_hubless_system = 0x11;
+ else if (strncmp(uv_archtype, "NSGI4", 5) == 0)
+ uv_hubless_system |= 0x10;
/* UV3 Hubless: UV300/MC990X w/o hub */
else
- uv_hubless_system = 0x9;
+ uv_hubless_system |= 0x8;
+
+ /* Copy APIC type */
+ uv_stringify(sizeof(oem_table_id), oem_table_id, _oem_table_id);
pr_info("UV: OEM IDs %s/%s, SystemType %d, HUBLESS ID %x\n",
oem_id, oem_table_id, uv_system_type, uv_hubless_system);
@@ -456,7 +469,7 @@ static int __init uv_acpi_madt_oem_check(char *_oem_id, char *_oem_table_id)
uv_cpu_info->p_uv_hub_info = &uv_hub_info_node0;
/* If not UV, return. */
- if (likely(uv_set_system_type(_oem_id) == 0))
+ if (uv_set_system_type(_oem_id, _oem_table_id) == 0)
return 0;
/* Save and Decode OEM Table ID */
@@ -489,6 +502,18 @@ enum uv_system_type get_uv_system_type(void)
return uv_system_type;
}
+int uv_get_hubless_system(void)
+{
+ return uv_hubless_system;
+}
+EXPORT_SYMBOL_GPL(uv_get_hubless_system);
+
+ssize_t uv_get_archtype(char *buf, int len)
+{
+ return scnprintf(buf, len, "%s/%s", uv_archtype, oem_table_id);
+}
+EXPORT_SYMBOL_GPL(uv_get_archtype);
+
int is_uv_system(void)
{
return uv_system_type != UV_NONE;
@@ -703,9 +728,9 @@ static void uv_send_IPI_one(int cpu, int vector)
unsigned long dmode, val;
if (vector == NMI_VECTOR)
- dmode = dest_NMI;
+ dmode = APIC_DELIVERY_MODE_NMI;
else
- dmode = dest_Fixed;
+ dmode = APIC_DELIVERY_MODE_FIXED;
val = (1UL << UVH_IPI_INT_SEND_SHFT) |
(apicid << UVH_IPI_INT_APIC_ID_SHFT) |
@@ -807,15 +832,13 @@ static struct apic apic_x2apic_uv_x __ro_after_init = {
.apic_id_valid = uv_apic_id_valid,
.apic_id_registered = uv_apic_id_registered,
- .irq_delivery_mode = dest_Fixed,
- .irq_dest_mode = 0, /* Physical */
+ .delivery_mode = APIC_DELIVERY_MODE_FIXED,
+ .dest_mode_logical = false,
.disable_esr = 0,
- .dest_logical = APIC_DEST_LOGICAL,
- .check_apicid_used = NULL,
+ .check_apicid_used = NULL,
.init_apic_ldr = uv_init_apic_ldr,
-
.ioapic_phys_id_map = NULL,
.setup_apic_routing = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
@@ -1590,21 +1613,30 @@ static void check_efi_reboot(void)
reboot_type = BOOT_ACPI;
}
-/* Setup user proc fs files */
+/*
+ * User proc fs file handling is now deprecated.
+ * Recommend using /sys/firmware/sgi_uv/... instead.
+ */
static int __maybe_unused proc_hubbed_show(struct seq_file *file, void *data)
{
+ pr_notice_once("%s: using deprecated /proc/sgi_uv/hubbed, use /sys/firmware/sgi_uv/hub_type\n",
+ current->comm);
seq_printf(file, "0x%x\n", uv_hubbed_system);
return 0;
}
static int __maybe_unused proc_hubless_show(struct seq_file *file, void *data)
{
+ pr_notice_once("%s: using deprecated /proc/sgi_uv/hubless, use /sys/firmware/sgi_uv/hubless\n",
+ current->comm);
seq_printf(file, "0x%x\n", uv_hubless_system);
return 0;
}
static int __maybe_unused proc_archtype_show(struct seq_file *file, void *data)
{
+ pr_notice_once("%s: using deprecated /proc/sgi_uv/archtype, use /sys/firmware/sgi_uv/archtype\n",
+ current->comm);
seq_printf(file, "%s/%s\n", uv_archtype, oem_table_id);
return 0;
}
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 70b7154f4bdd..60b9f42ce3c1 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -66,7 +66,6 @@ static void __used common(void)
OFFSET(PV_IRQ_irq_disable, paravirt_patch_template, irq.irq_disable);
OFFSET(PV_IRQ_irq_enable, paravirt_patch_template, irq.irq_enable);
OFFSET(PV_CPU_iret, paravirt_patch_template, cpu.iret);
- OFFSET(PV_MMU_read_cr2, paravirt_patch_template, mmu.read_cr2);
#endif
#ifdef CONFIG_XEN
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 828be792231e..b14533af7676 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -13,9 +13,6 @@ int main(void)
{
#ifdef CONFIG_PARAVIRT
#ifdef CONFIG_PARAVIRT_XXL
- OFFSET(PV_CPU_usergs_sysret64, paravirt_patch_template,
- cpu.usergs_sysret64);
- OFFSET(PV_CPU_swapgs, paravirt_patch_template, cpu.swapgs);
#ifdef CONFIG_DEBUG_ENTRY
OFFSET(PV_IRQ_save_fl, paravirt_patch_template, irq.save_fl);
#endif
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 93792b457b81..637b499450d1 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -48,6 +48,7 @@ obj-$(CONFIG_X86_MCE) += mce/
obj-$(CONFIG_MTRR) += mtrr/
obj-$(CONFIG_MICROCODE) += microcode/
obj-$(CONFIG_X86_CPU_RESCTRL) += resctrl/
+obj-$(CONFIG_X86_SGX) += sgx/
obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 6062ce586b95..347a956f71ca 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -23,7 +23,6 @@
#ifdef CONFIG_X86_64
# include <asm/mmconfig.h>
-# include <asm/set_memory.h>
#endif
#include "cpu.h"
@@ -330,7 +329,6 @@ static void legacy_fixup_core_id(struct cpuinfo_x86 *c)
*/
static void amd_get_topology(struct cpuinfo_x86 *c)
{
- u8 node_id;
int cpu = smp_processor_id();
/* get information required for multi-node processors */
@@ -340,7 +338,7 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
- node_id = ecx & 0xff;
+ c->cpu_die_id = ecx & 0xff;
if (c->x86 == 0x15)
c->cu_id = ebx & 0xff;
@@ -360,15 +358,15 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
if (!err)
c->x86_coreid_bits = get_count_order(c->x86_max_cores);
- cacheinfo_amd_init_llc_id(c, cpu, node_id);
+ cacheinfo_amd_init_llc_id(c, cpu);
} else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) {
u64 value;
rdmsrl(MSR_FAM10H_NODE_ID, value);
- node_id = value & 7;
+ c->cpu_die_id = value & 7;
- per_cpu(cpu_llc_id, cpu) = node_id;
+ per_cpu(cpu_llc_id, cpu) = c->cpu_die_id;
} else
return;
@@ -393,7 +391,7 @@ static void amd_detect_cmp(struct cpuinfo_x86 *c)
/* Convert the initial APIC ID into the socket ID */
c->phys_proc_id = c->initial_apicid >> bits;
/* use socket ID also for last level cache */
- per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
+ per_cpu(cpu_llc_id, cpu) = c->cpu_die_id = c->phys_proc_id;
}
static void amd_detect_ppin(struct cpuinfo_x86 *c)
@@ -425,12 +423,6 @@ clear_ppin:
clear_cpu_cap(c, X86_FEATURE_AMD_PPIN);
}
-u16 amd_get_nb_id(int cpu)
-{
- return per_cpu(cpu_llc_id, cpu);
-}
-EXPORT_SYMBOL_GPL(amd_get_nb_id);
-
u32 amd_get_nodes_per_socket(void)
{
return nodes_per_socket;
@@ -516,26 +508,6 @@ static void early_init_amd_mc(struct cpuinfo_x86 *c)
static void bsp_init_amd(struct cpuinfo_x86 *c)
{
-
-#ifdef CONFIG_X86_64
- if (c->x86 >= 0xf) {
- unsigned long long tseg;
-
- /*
- * Split up direct mapping around the TSEG SMM area.
- * Don't do it for gbpages because there seems very little
- * benefit in doing so.
- */
- if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
- unsigned long pfn = tseg >> PAGE_SHIFT;
-
- pr_debug("tseg: %010llx\n", tseg);
- if (pfn_range_is_mapped(pfn, pfn + 1))
- set_memory_4k((unsigned long)__va(tseg), 1);
- }
- }
-#endif
-
if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) {
if (c->x86 > 0x10 ||
@@ -570,12 +542,12 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
u32 ecx;
ecx = cpuid_ecx(0x8000001e);
- nodes_per_socket = ((ecx >> 8) & 7) + 1;
+ __max_die_per_package = nodes_per_socket = ((ecx >> 8) & 7) + 1;
} else if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) {
u64 value;
rdmsrl(MSR_FAM10H_NODE_ID, value);
- nodes_per_socket = ((value >> 3) & 7) + 1;
+ __max_die_per_package = nodes_per_socket = ((value >> 3) & 7) + 1;
}
if (!boot_cpu_has(X86_FEATURE_AMD_SSBD) &&
diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c
index e2f319dc992d..22911deacb6e 100644
--- a/arch/x86/kernel/cpu/aperfmperf.c
+++ b/arch/x86/kernel/cpu/aperfmperf.c
@@ -14,11 +14,13 @@
#include <linux/cpufreq.h>
#include <linux/smp.h>
#include <linux/sched/isolation.h>
+#include <linux/rcupdate.h>
#include "cpu.h"
struct aperfmperf_sample {
unsigned int khz;
+ atomic_t scfpending;
ktime_t time;
u64 aperf;
u64 mperf;
@@ -62,17 +64,20 @@ static void aperfmperf_snapshot_khz(void *dummy)
s->aperf = aperf;
s->mperf = mperf;
s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta);
+ atomic_set_release(&s->scfpending, 0);
}
static bool aperfmperf_snapshot_cpu(int cpu, ktime_t now, bool wait)
{
s64 time_delta = ktime_ms_delta(now, per_cpu(samples.time, cpu));
+ struct aperfmperf_sample *s = per_cpu_ptr(&samples, cpu);
/* Don't bother re-computing within the cache threshold time. */
if (time_delta < APERFMPERF_CACHE_THRESHOLD_MS)
return true;
- smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, wait);
+ if (!atomic_xchg(&s->scfpending, 1) || wait)
+ smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, wait);
/* Return false if the previous iteration was too long ago. */
return time_delta <= APERFMPERF_STALE_THRESHOLD_MS;
@@ -89,6 +94,9 @@ unsigned int aperfmperf_get_khz(int cpu)
if (!housekeeping_cpu(cpu, HK_FLAG_MISC))
return 0;
+ if (rcu_is_idle_cpu(cpu))
+ return 0; /* Idle CPUs are completely uninteresting. */
+
aperfmperf_snapshot_cpu(cpu, ktime_get(), true);
return per_cpu(samples.khz, cpu);
}
@@ -108,6 +116,8 @@ void arch_freq_prepare_all(void)
for_each_online_cpu(cpu) {
if (!housekeeping_cpu(cpu, HK_FLAG_MISC))
continue;
+ if (rcu_is_idle_cpu(cpu))
+ continue; /* Idle CPUs are completely uninteresting. */
if (!aperfmperf_snapshot_cpu(cpu, now, false))
wait = true;
}
@@ -118,6 +128,8 @@ void arch_freq_prepare_all(void)
unsigned int arch_freq_get_on_cpu(int cpu)
{
+ struct aperfmperf_sample *s = per_cpu_ptr(&samples, cpu);
+
if (!cpu_khz)
return 0;
@@ -131,6 +143,8 @@ unsigned int arch_freq_get_on_cpu(int cpu)
return per_cpu(samples.khz, cpu);
msleep(APERFMPERF_REFRESH_DELAY_MS);
+ atomic_set(&s->scfpending, 1);
+ smp_mb(); /* ->scfpending before smp_call_function_single(). */
smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1);
return per_cpu(samples.khz, cpu);
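The scfpending flag added in this file acts as an "at most one snapshot request in flight per CPU" latch: a non-waiting caller only issues the cross-CPU call if it wins the atomic 0->1 exchange, and the sampling side releases the latch after publishing the new value. A standalone sketch of that pattern with C11 atomics (the worker here is a plain function call standing in for smp_call_function_single()):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct sample {
	atomic_int scfpending;	/* 1 while a snapshot request is outstanding */
	int khz;
};

static void snapshot(struct sample *s)
{
	s->khz = 2400000;	/* pretend measurement */
	atomic_store_explicit(&s->scfpending, 0, memory_order_release);
}

static void request_snapshot(struct sample *s, bool wait)
{
	/* Only the caller that flips 0 -> 1 (or any waiting caller) does the work. */
	if (!atomic_exchange(&s->scfpending, 1) || wait)
		snapshot(s);
}

int main(void)
{
	struct sample s = { .khz = 0 };

	atomic_init(&s.scfpending, 0);
	request_snapshot(&s, false);
	printf("khz=%d pending=%d\n", s.khz, atomic_load(&s.scfpending));
	return 0;
}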
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index d3f0db463f96..d41b70fe4918 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -739,11 +739,13 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)
if (boot_cpu_has(X86_FEATURE_IBPB)) {
setup_force_cpu_cap(X86_FEATURE_USE_IBPB);
+ spectre_v2_user_ibpb = mode;
switch (cmd) {
case SPECTRE_V2_USER_CMD_FORCE:
case SPECTRE_V2_USER_CMD_PRCTL_IBPB:
case SPECTRE_V2_USER_CMD_SECCOMP_IBPB:
static_branch_enable(&switch_mm_always_ibpb);
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT;
break;
case SPECTRE_V2_USER_CMD_PRCTL:
case SPECTRE_V2_USER_CMD_AUTO:
@@ -757,8 +759,6 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)
pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n",
static_key_enabled(&switch_mm_always_ibpb) ?
"always-on" : "conditional");
-
- spectre_v2_user_ibpb = mode;
}
/*
@@ -1254,6 +1254,14 @@ static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl)
return 0;
}
+static bool is_spec_ib_user_controlled(void)
+{
+ return spectre_v2_user_ibpb == SPECTRE_V2_USER_PRCTL ||
+ spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_PRCTL ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP;
+}
+
static int ib_prctl_set(struct task_struct *task, unsigned long ctrl)
{
switch (ctrl) {
@@ -1261,16 +1269,26 @@ static int ib_prctl_set(struct task_struct *task, unsigned long ctrl)
if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE &&
spectre_v2_user_stibp == SPECTRE_V2_USER_NONE)
return 0;
+
/*
- * Indirect branch speculation is always disabled in strict
- * mode. It can neither be enabled if it was force-disabled
- * by a previous prctl call.
+ * With strict mode for both IBPB and STIBP, the instruction
+ * code paths avoid checking this task flag and instead,
+ * unconditionally run the instruction. However, STIBP and IBPB
+ * are independent and either can be set to conditionally
+ * enabled regardless of the mode of the other.
+ *
+ * If either is set to conditional, allow the task flag to be
+ * updated, unless it was force-disabled by a previous prctl
+ * call. Currently, this is possible on an AMD CPU which has the
+ * feature X86_FEATURE_AMD_STIBP_ALWAYS_ON. In this case, if the
+ * kernel is booted with 'spectre_v2_user=seccomp', then
+ * spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP and
+ * spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED.
*/
- if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT ||
- spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT ||
- spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED ||
+ if (!is_spec_ib_user_controlled() ||
task_spec_ib_force_disable(task))
return -EPERM;
+
task_clear_spec_ib_disable(task);
task_update_spec_tif(task);
break;
@@ -1283,10 +1301,10 @@ static int ib_prctl_set(struct task_struct *task, unsigned long ctrl)
if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE &&
spectre_v2_user_stibp == SPECTRE_V2_USER_NONE)
return -EPERM;
- if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT ||
- spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT ||
- spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED)
+
+ if (!is_spec_ib_user_controlled())
return 0;
+
task_set_spec_ib_disable(task);
if (ctrl == PR_SPEC_FORCE_DISABLE)
task_set_spec_ib_force_disable(task);
@@ -1351,20 +1369,17 @@ static int ib_prctl_get(struct task_struct *task)
if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE &&
spectre_v2_user_stibp == SPECTRE_V2_USER_NONE)
return PR_SPEC_ENABLE;
- else if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT ||
- spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT ||
- spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED)
- return PR_SPEC_DISABLE;
- else if (spectre_v2_user_ibpb == SPECTRE_V2_USER_PRCTL ||
- spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP ||
- spectre_v2_user_stibp == SPECTRE_V2_USER_PRCTL ||
- spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP) {
+ else if (is_spec_ib_user_controlled()) {
if (task_spec_ib_force_disable(task))
return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE;
if (task_spec_ib_disable(task))
return PR_SPEC_PRCTL | PR_SPEC_DISABLE;
return PR_SPEC_PRCTL | PR_SPEC_ENABLE;
- } else
+ } else if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED)
+ return PR_SPEC_DISABLE;
+ else
return PR_SPEC_NOT_AFFECTED;
}
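The reordering in ib_prctl_get() together with the new is_spec_ib_user_controlled() helper makes the precedence explicit: fully-off reports ENABLE, any per-task (prctl/seccomp) mode on either control wins over a strict mode on the other, and only when neither control is user-controlled does a strict mode report a plain DISABLE. A condensed standalone sketch of that decision order, with simplified enums standing in for the kernel's mitigation modes (STRICT_PREFERRED and the force-disable case are folded away here):

#include <stdbool.h>
#include <stdio.h>

enum mode { NONE, STRICT, PRCTL, SECCOMP };

static bool user_controlled(enum mode ibpb, enum mode stibp)
{
	return ibpb == PRCTL || ibpb == SECCOMP ||
	       stibp == PRCTL || stibp == SECCOMP;
}

static const char *ib_state(enum mode ibpb, enum mode stibp, bool task_disabled)
{
	if (ibpb == NONE && stibp == NONE)
		return "enable";
	if (user_controlled(ibpb, stibp))
		return task_disabled ? "prctl: disabled" : "prctl: enabled";
	if (ibpb == STRICT || stibp == STRICT)
		return "disable";
	return "not affected";
}

int main(void)
{
	/* Conditional IBPB combined with strict STIBP stays per-task controllable. */
	printf("%s\n", ib_state(SECCOMP, STRICT, false));
	return 0;
}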
diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c
index 57074cf3ad7c..3ca9be482a9e 100644
--- a/arch/x86/kernel/cpu/cacheinfo.c
+++ b/arch/x86/kernel/cpu/cacheinfo.c
@@ -580,7 +580,7 @@ static void amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index)
if (index < 3)
return;
- node = amd_get_nb_id(smp_processor_id());
+ node = topology_die_id(smp_processor_id());
this_leaf->nb = node_to_amd_nb(node);
if (this_leaf->nb && !this_leaf->nb->l3_cache.indices)
amd_calc_l3_indices(this_leaf->nb);
@@ -646,7 +646,7 @@ static int find_num_cache_leaves(struct cpuinfo_x86 *c)
return i;
}
-void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id)
+void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu)
{
/*
* We may have multiple LLCs if L3 caches exist, so check if we
@@ -657,7 +657,7 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id)
if (c->x86 < 0x17) {
/* LLC is at the node level. */
- per_cpu(cpu_llc_id, cpu) = node_id;
+ per_cpu(cpu_llc_id, cpu) = c->cpu_die_id;
} else if (c->x86 == 0x17 && c->x86_model <= 0x1F) {
/*
* LLC is at the core complex level.
@@ -684,7 +684,7 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id)
}
}
-void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id)
+void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c, int cpu)
{
/*
* We may have multiple LLCs if L3 caches exist, so check if we
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 35ad8480c464..9215b91bc044 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -960,6 +960,9 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
if (c->extended_cpuid_level >= 0x8000000a)
c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a);
+ if (c->extended_cpuid_level >= 0x8000001f)
+ c->x86_capability[CPUID_8000_001F_EAX] = cpuid_eax(0x8000001f);
+
init_scattered_cpuid_features(c);
init_speculation_control(c);
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
index d502241995a3..42af31b64c2c 100644
--- a/arch/x86/kernel/cpu/cpuid-deps.c
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -69,6 +69,7 @@ static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_CQM_MBM_TOTAL, X86_FEATURE_CQM_LLC },
{ X86_FEATURE_CQM_MBM_LOCAL, X86_FEATURE_CQM_LLC },
{ X86_FEATURE_AVX512_BF16, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_AVX512_FP16, X86_FEATURE_AVX512BW },
{ X86_FEATURE_ENQCMD, X86_FEATURE_XSAVES },
{ X86_FEATURE_PER_THREAD_MBA, X86_FEATURE_MBA },
{}
diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c
index 29a3bedabd06..3b1b01f2b248 100644
--- a/arch/x86/kernel/cpu/feat_ctl.c
+++ b/arch/x86/kernel/cpu/feat_ctl.c
@@ -93,16 +93,41 @@ static void init_vmx_capabilities(struct cpuinfo_x86 *c)
}
#endif /* CONFIG_X86_VMX_FEATURE_NAMES */
+static void clear_sgx_caps(void)
+{
+ setup_clear_cpu_cap(X86_FEATURE_SGX);
+ setup_clear_cpu_cap(X86_FEATURE_SGX_LC);
+}
+
+static int __init nosgx(char *str)
+{
+ clear_sgx_caps();
+
+ return 0;
+}
+
+early_param("nosgx", nosgx);
+
void init_ia32_feat_ctl(struct cpuinfo_x86 *c)
{
bool tboot = tboot_enabled();
+ bool enable_sgx;
u64 msr;
if (rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr)) {
clear_cpu_cap(c, X86_FEATURE_VMX);
+ clear_sgx_caps();
return;
}
+ /*
+ * Enable SGX if and only if the kernel supports SGX and Launch Control
+ * is supported, i.e. disable SGX if the LE hash MSRs can't be written.
+ */
+ enable_sgx = cpu_has(c, X86_FEATURE_SGX) &&
+ cpu_has(c, X86_FEATURE_SGX_LC) &&
+ IS_ENABLED(CONFIG_X86_SGX);
+
if (msr & FEAT_CTL_LOCKED)
goto update_caps;
@@ -124,13 +149,16 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c)
msr |= FEAT_CTL_VMX_ENABLED_INSIDE_SMX;
}
+ if (enable_sgx)
+ msr |= FEAT_CTL_SGX_ENABLED | FEAT_CTL_SGX_LC_ENABLED;
+
wrmsrl(MSR_IA32_FEAT_CTL, msr);
update_caps:
set_cpu_cap(c, X86_FEATURE_MSR_IA32_FEAT_CTL);
if (!cpu_has(c, X86_FEATURE_VMX))
- return;
+ goto update_sgx;
if ( (tboot && !(msr & FEAT_CTL_VMX_ENABLED_INSIDE_SMX)) ||
(!tboot && !(msr & FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX))) {
@@ -143,4 +171,12 @@ update_caps:
init_vmx_capabilities(c);
#endif
}
+
+update_sgx:
+ if (!(msr & FEAT_CTL_SGX_ENABLED) ||
+ !(msr & FEAT_CTL_SGX_LC_ENABLED) || !enable_sgx) {
+ if (enable_sgx)
+ pr_err_once("SGX disabled by BIOS\n");
+ clear_sgx_caps();
+ }
}
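The added flow boils down to: SGX survives only if the kernel was built with CONFIG_X86_SGX, the CPU reports both SGX and SGX_LC, and the (possibly firmware-locked) feature-control MSR ends up with both SGX enable bits set; otherwise the caps are cleared and, if the kernel wanted SGX, "SGX disabled by BIOS" is reported. A condensed standalone sketch of just that decision, with made-up bit values standing in for the FEAT_CTL_* constants:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative bit positions only; the real values come from the kernel headers. */
#define LOCKED		(1u << 0)
#define SGX_LC_ENABLED	(1u << 17)
#define SGX_ENABLED	(1u << 18)

static bool sgx_usable(uint32_t *msr, bool cpu_sgx, bool cpu_sgx_lc, bool kconfig_sgx)
{
	bool want = cpu_sgx && cpu_sgx_lc && kconfig_sgx;

	/* If firmware did not lock the MSR, the kernel may enable SGX itself. */
	if (!(*msr & LOCKED) && want)
		*msr |= SGX_ENABLED | SGX_LC_ENABLED;

	/* Either way, the caps survive only if both enable bits are actually set. */
	return want && (*msr & SGX_ENABLED) && (*msr & SGX_LC_ENABLED);
}

int main(void)
{
	uint32_t locked_off = LOCKED;	/* firmware locked the MSR with SGX disabled */
	uint32_t unlocked = 0;

	printf("%d %d\n", (int)sgx_usable(&locked_off, true, true, true),
	       (int)sgx_usable(&unlocked, true, true, true));	/* prints: 0 1 */
	return 0;
}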
diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c
index ac6c30e5801d..ae59115d18f9 100644
--- a/arch/x86/kernel/cpu/hygon.c
+++ b/arch/x86/kernel/cpu/hygon.c
@@ -14,9 +14,6 @@
#include <asm/cacheinfo.h>
#include <asm/spec-ctrl.h>
#include <asm/delay.h>
-#ifdef CONFIG_X86_64
-# include <asm/set_memory.h>
-#endif
#include "cpu.h"
@@ -65,7 +62,6 @@ static void hygon_get_topology_early(struct cpuinfo_x86 *c)
*/
static void hygon_get_topology(struct cpuinfo_x86 *c)
{
- u8 node_id;
int cpu = smp_processor_id();
/* get information required for multi-node processors */
@@ -75,7 +71,7 @@ static void hygon_get_topology(struct cpuinfo_x86 *c)
cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
- node_id = ecx & 0xff;
+ c->cpu_die_id = ecx & 0xff;
c->cpu_core_id = ebx & 0xff;
@@ -93,14 +89,14 @@ static void hygon_get_topology(struct cpuinfo_x86 *c)
/* Socket ID is ApicId[6] for these processors. */
c->phys_proc_id = c->apicid >> APICID_SOCKET_ID_BIT;
- cacheinfo_hygon_init_llc_id(c, cpu, node_id);
+ cacheinfo_hygon_init_llc_id(c, cpu);
} else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) {
u64 value;
rdmsrl(MSR_FAM10H_NODE_ID, value);
- node_id = value & 7;
+ c->cpu_die_id = value & 7;
- per_cpu(cpu_llc_id, cpu) = node_id;
+ per_cpu(cpu_llc_id, cpu) = c->cpu_die_id;
} else
return;
@@ -123,7 +119,7 @@ static void hygon_detect_cmp(struct cpuinfo_x86 *c)
/* Convert the initial APIC ID into the socket ID */
c->phys_proc_id = c->initial_apicid >> bits;
/* use socket ID also for last level cache */
- per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
+ per_cpu(cpu_llc_id, cpu) = c->cpu_die_id = c->phys_proc_id;
}
static void srat_detect_node(struct cpuinfo_x86 *c)
@@ -204,23 +200,6 @@ static void early_init_hygon_mc(struct cpuinfo_x86 *c)
static void bsp_init_hygon(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_X86_64
- unsigned long long tseg;
-
- /*
- * Split up direct mapping around the TSEG SMM area.
- * Don't do it for gbpages because there seems very little
- * benefit in doing so.
- */
- if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
- unsigned long pfn = tseg >> PAGE_SHIFT;
-
- pr_debug("tseg: %010llx\n", tseg);
- if (pfn_range_is_mapped(pfn, pfn + 1))
- set_memory_4k((unsigned long)__va(tseg), 1);
- }
-#endif
-
if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) {
u64 val;
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 59a1e3ce3f14..0e422a544835 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -24,6 +24,7 @@
#include <asm/traps.h>
#include <asm/resctrl.h>
#include <asm/numa.h>
+#include <asm/thermal.h>
#ifdef CONFIG_X86_64
#include <linux/topology.h>
@@ -719,6 +720,8 @@ static void init_intel(struct cpuinfo_x86 *c)
tsx_disable();
split_lock_init();
+
+ intel_init_thermal(c);
}
#ifdef CONFIG_X86_32
@@ -1159,6 +1162,7 @@ static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, 1),
X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, 1),
X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, 1),
+ X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, 1),
{}
};
diff --git a/arch/x86/kernel/cpu/mce/Makefile b/arch/x86/kernel/cpu/mce/Makefile
index 9f020c994154..015856abdbb1 100644
--- a/arch/x86/kernel/cpu/mce/Makefile
+++ b/arch/x86/kernel/cpu/mce/Makefile
@@ -9,8 +9,6 @@ obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
mce-inject-y := inject.o
obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o
-obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o
-
obj-$(CONFIG_ACPI_APEI) += apei.o
obj-$(CONFIG_X86_MCELOG_LEGACY) += dev-mcelog.o
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index 0c6b02dd744c..e486f96b3cb3 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -1341,7 +1341,7 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu,
return -ENODEV;
if (is_shared_bank(bank)) {
- nb = node_to_amd_nb(amd_get_nb_id(cpu));
+ nb = node_to_amd_nb(topology_die_id(cpu));
/* threshold descriptor already initialized on this node? */
if (nb && nb->bank4) {
@@ -1445,7 +1445,7 @@ static void threshold_remove_bank(struct threshold_bank *bank)
* The last CPU on this node using the shared bank is going
* away, remove that bank now.
*/
- nb = node_to_amd_nb(amd_get_nb_id(smp_processor_id()));
+ nb = node_to_amd_nb(topology_die_id(smp_processor_id()));
nb->bank4 = NULL;
}
diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c
index af8d37962586..b58b85380ddb 100644
--- a/arch/x86/kernel/cpu/mce/apei.c
+++ b/arch/x86/kernel/cpu/mce/apei.c
@@ -51,6 +51,67 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err)
}
EXPORT_SYMBOL_GPL(apei_mce_report_mem_error);
+int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id)
+{
+ const u64 *i_mce = ((const u64 *) (ctx_info + 1));
+ unsigned int cpu;
+ struct mce m;
+
+ if (!boot_cpu_has(X86_FEATURE_SMCA))
+ return -EINVAL;
+
+ /*
+ * The starting address of the register array extracted from BERT must
+ * match with the first expected register in the register layout of
+ * SMCA address space. This address corresponds to the bank's MCA_STATUS
+ * register.
+ *
+ * Match any MCi_STATUS register by turning off bank numbers.
+ */
+ if ((ctx_info->msr_addr & MSR_AMD64_SMCA_MC0_STATUS) !=
+ MSR_AMD64_SMCA_MC0_STATUS)
+ return -EINVAL;
+
+ /*
+ * The register array size must be large enough to include all the
+ * SMCA registers which need to be extracted.
+ *
+ * The number of registers in the register array is determined by
+ * Register Array Size/8 as defined in UEFI spec v2.8, sec N.2.4.2.2.
+ * The register layout is fixed and currently the raw data in the
+ * register array includes 6 SMCA registers which the kernel can
+ * extract.
+ */
+ if (ctx_info->reg_arr_size < 48)
+ return -EINVAL;
+
+ mce_setup(&m);
+
+ m.extcpu = -1;
+ m.socketid = -1;
+
+ for_each_possible_cpu(cpu) {
+ if (cpu_data(cpu).initial_apicid == lapic_id) {
+ m.extcpu = cpu;
+ m.socketid = cpu_data(m.extcpu).phys_proc_id;
+ break;
+ }
+ }
+
+ m.apicid = lapic_id;
+ m.bank = (ctx_info->msr_addr >> 4) & 0xFF;
+ m.status = *i_mce;
+ m.addr = *(i_mce + 1);
+ m.misc = *(i_mce + 2);
+ /* Skipping MCA_CONFIG */
+ m.ipid = *(i_mce + 4);
+ m.synd = *(i_mce + 5);
+
+ mce_log(&m);
+
+ return 0;
+}
+
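For reference, the shape of the payload the new helper consumes: the bank number is recovered from bits 11:4 of the starting MSR address, and the register array must hold at least six 64-bit registers laid out as STATUS, ADDR, MISC, CONFIG (skipped), IPID, SYND. A standalone sketch of that extraction over a plain array; the struct below is invented for illustration and is not the CPER or struct mce definition:

#include <stdint.h>
#include <stdio.h>

struct mce_sketch {
	uint64_t status, addr, misc, ipid, synd;
	int bank;
};

/*
 * @msr_addr: starting MSR of the register array (some bank's MCA_STATUS)
 * @regs:     at least six u64s: STATUS, ADDR, MISC, CONFIG, IPID, SYND
 */
static void extract_smca_regs(struct mce_sketch *m, uint64_t msr_addr,
			      const uint64_t regs[6])
{
	m->bank   = (msr_addr >> 4) & 0xff;	/* bank number from the MSR offset */
	m->status = regs[0];
	m->addr   = regs[1];
	m->misc   = regs[2];
	/* regs[3] is MCA_CONFIG and is not copied into struct mce */
	m->ipid   = regs[4];
	m->synd   = regs[5];
}

int main(void)
{
	const uint64_t regs[6] = { 0xdead, 1, 2, 3, 4, 5 };
	struct mce_sketch m;

	extract_smca_regs(&m, 0xc0002031, regs);	/* illustrative MCA_STATUS address; yields bank 3 */
	printf("bank=%d status=%#llx\n", m.bank, (unsigned long long)m.status);
	return 0;
}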
#define CPER_CREATOR_MCE \
GUID_INIT(0x75a574e3, 0x5052, 0x4b29, 0x8a, 0x8e, 0xbe, 0x2c, \
0x64, 0x90, 0xb8, 0x9d)
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 4102b866e7c0..7962355436da 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -162,7 +162,8 @@ EXPORT_SYMBOL_GPL(mce_log);
void mce_register_decode_chain(struct notifier_block *nb)
{
- if (WARN_ON(nb->priority > MCE_PRIO_MCELOG && nb->priority < MCE_PRIO_EDAC))
+ if (WARN_ON(nb->priority < MCE_PRIO_LOWEST ||
+ nb->priority > MCE_PRIO_HIGHEST))
return;
blocking_notifier_chain_register(&x86_mce_decoder_chain, nb);
@@ -877,6 +878,12 @@ static atomic_t mce_executing;
static atomic_t mce_callin;
/*
+ * Track which CPUs entered the MCA broadcast synchronization and which did
+ * not, in order to print holdouts.
+ */
+static cpumask_t mce_missing_cpus = CPU_MASK_ALL;
+
+/*
* Check if a timeout waiting for other CPUs happened.
*/
static int mce_timed_out(u64 *t, const char *msg)
@@ -893,8 +900,12 @@ static int mce_timed_out(u64 *t, const char *msg)
if (!mca_cfg.monarch_timeout)
goto out;
if ((s64)*t < SPINUNIT) {
- if (mca_cfg.tolerant <= 1)
+ if (mca_cfg.tolerant <= 1) {
+ if (cpumask_and(&mce_missing_cpus, cpu_online_mask, &mce_missing_cpus))
+ pr_emerg("CPUs not responding to MCE broadcast (may include false positives): %*pbl\n",
+ cpumask_pr_args(&mce_missing_cpus));
mce_panic(msg, NULL, NULL);
+ }
cpu_missing = 1;
return 1;
}
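The mce_missing_cpus mask added across these hunks is a simple holdout detector: the mask starts with every bit set, each CPU that reaches the broadcast rendezvous clears its own bit (see the mce_start() hunk below), and on timeout the bits that remain set, restricted to online CPUs, name the CPUs that never showed up. A standalone sketch of the same bookkeeping with a plain 64-bit mask:

#include <stdint.h>
#include <stdio.h>

static uint64_t missing = ~0ULL;	/* every CPU presumed missing initially */

static void cpu_checked_in(int cpu)
{
	missing &= ~(1ULL << cpu);	/* this CPU reached the rendezvous */
}

static void report_holdouts(uint64_t online)
{
	uint64_t holdouts = missing & online;

	if (holdouts)
		printf("CPUs not responding (may include false positives): %#llx\n",
		       (unsigned long long)holdouts);
}

int main(void)
{
	uint64_t online = 0xf;		/* CPUs 0-3 online */

	cpu_checked_in(0);
	cpu_checked_in(2);
	report_holdouts(online);	/* reports 0xa, i.e. CPUs 1 and 3 */
	return 0;
}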
@@ -1005,6 +1016,7 @@ static int mce_start(int *no_way_out)
* is updated before mce_callin.
*/
order = atomic_inc_return(&mce_callin);
+ cpumask_clear_cpu(smp_processor_id(), &mce_missing_cpus);
/*
* Wait for everyone.
@@ -1113,6 +1125,7 @@ static int mce_end(int order)
reset:
atomic_set(&global_nwo, 0);
atomic_set(&mce_callin, 0);
+ cpumask_setall(&mce_missing_cpus);
barrier();
/*
@@ -1265,14 +1278,14 @@ static void kill_me_maybe(struct callback_head *cb)
}
}
-static void queue_task_work(struct mce *m, int kill_it)
+static void queue_task_work(struct mce *m, int kill_current_task)
{
current->mce_addr = m->addr;
current->mce_kflags = m->kflags;
current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV);
current->mce_whole_page = whole_page(m);
- if (kill_it)
+ if (kill_current_task)
current->mce_kill_me.func = kill_me_now;
else
current->mce_kill_me.func = kill_me_maybe;
@@ -1320,10 +1333,10 @@ noinstr void do_machine_check(struct pt_regs *regs)
int no_way_out = 0;
/*
- * If kill_it gets set, there might be a way to recover from this
+ * If kill_current_task is not set, there might be a way to recover from this
* error.
*/
- int kill_it = 0;
+ int kill_current_task = 0;
/*
* MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES
@@ -1350,8 +1363,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
* severity is MCE_AR_SEVERITY we have other options.
*/
if (!(m.mcgstatus & MCG_STATUS_RIPV))
- kill_it = 1;
-
+ kill_current_task = (cfg->tolerant == 3) ? 0 : 1;
/*
* Check if this MCE is signaled to only this logical processor,
* on Intel, Zhaoxin only.
@@ -1368,7 +1380,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
* to see it will clear it.
*/
if (lmce) {
- if (no_way_out)
+ if (no_way_out && cfg->tolerant < 3)
mce_panic("Fatal local machine check", &m, msg);
} else {
order = mce_start(&no_way_out);
@@ -1384,8 +1396,13 @@ noinstr void do_machine_check(struct pt_regs *regs)
* When there's any problem use only local no_way_out state.
*/
if (!lmce) {
- if (mce_end(order) < 0)
- no_way_out = worst >= MCE_PANIC_SEVERITY;
+ if (mce_end(order) < 0) {
+ if (!no_way_out)
+ no_way_out = worst >= MCE_PANIC_SEVERITY;
+
+ if (no_way_out && cfg->tolerant < 3)
+ mce_panic("Fatal machine check on current CPU", &m, msg);
+ }
} else {
/*
* If there was a fatal machine check we should have
@@ -1401,19 +1418,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
}
}
- /*
- * If tolerant is at an insane level we drop requests to kill
- * processes and continue even when there is no way out.
- */
- if (cfg->tolerant == 3)
- kill_it = 0;
- else if (no_way_out)
- mce_panic("Fatal machine check on current CPU", &m, msg);
-
- if (worst > 0)
- irq_work_queue(&mce_irq_work);
-
- if (worst != MCE_AR_SEVERITY && !kill_it)
+ if (worst != MCE_AR_SEVERITY && !kill_current_task)
goto out;
/* Fault was in user mode and we need to take some action */
@@ -1421,7 +1426,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
/* If this triggers there is no way to recover. Die hard. */
BUG_ON(!on_thread_stack() || !user_mode(regs));
- queue_task_work(&m, kill_it);
+ queue_task_work(&m, kill_current_task);
} else {
/*
@@ -1439,7 +1444,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
}
if (m.kflags & MCE_IN_KERNEL_COPYIN)
- queue_task_work(&m, kill_it);
+ queue_task_work(&m, kill_current_task);
}
out:
mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
@@ -1581,7 +1586,7 @@ static void __mcheck_cpu_mce_banks_init(void)
* __mcheck_cpu_init_clear_banks() does the final bank setup.
*/
b->ctl = -1ULL;
- b->init = 1;
+ b->init = true;
}
}
@@ -1762,7 +1767,7 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
*/
if (c->x86 == 6 && c->x86_model < 0x1A && this_cpu_read(mce_num_banks) > 0)
- mce_banks[0].init = 0;
+ mce_banks[0].init = false;
/*
* All newer Intel systems support MCE broadcasting. Enable
@@ -1811,11 +1816,9 @@ static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
case X86_VENDOR_INTEL:
intel_p5_mcheck_init(c);
return 1;
- break;
case X86_VENDOR_CENTAUR:
winchip_mcheck_init(c);
return 1;
- break;
default:
return 0;
}
@@ -1983,7 +1986,7 @@ void (*machine_check_vector)(struct pt_regs *) = unexpected_machine_check;
static __always_inline void exc_machine_check_kernel(struct pt_regs *regs)
{
- bool irq_state;
+ irqentry_state_t irq_state;
WARN_ON_ONCE(user_mode(regs));
@@ -1995,25 +1998,26 @@ static __always_inline void exc_machine_check_kernel(struct pt_regs *regs)
mce_check_crashing_cpu())
return;
- irq_state = idtentry_enter_nmi(regs);
+ irq_state = irqentry_nmi_enter(regs);
/*
* The call targets are marked noinstr, but objtool can't figure
* that out because it's an indirect call. Annotate it.
*/
instrumentation_begin();
- trace_hardirqs_off_finish();
+
machine_check_vector(regs);
- if (regs->flags & X86_EFLAGS_IF)
- trace_hardirqs_on_prepare();
+
instrumentation_end();
- idtentry_exit_nmi(regs, irq_state);
+ irqentry_nmi_exit(regs, irq_state);
}
static __always_inline void exc_machine_check_user(struct pt_regs *regs)
{
irqentry_enter_from_user_mode(regs);
instrumentation_begin();
+
machine_check_vector(regs);
+
instrumentation_end();
irqentry_exit_to_user_mode(regs);
}
@@ -2186,7 +2190,6 @@ __setup("mce", mcheck_enable);
int __init mcheck_init(void)
{
- mcheck_intel_therm_init();
mce_register_decode_chain(&early_nb);
mce_register_decode_chain(&mce_uc_nb);
mce_register_decode_chain(&mce_default_nb);
@@ -2721,6 +2724,7 @@ static void mce_reset(void)
atomic_set(&mce_executing, 0);
atomic_set(&mce_callin, 0);
atomic_set(&global_nwo, 0);
+ cpumask_setall(&mce_missing_cpus);
}
static int fake_panic_get(void *data, u64 *val)
diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c
index 3a44346f2276..7b360731fc2d 100644
--- a/arch/x86/kernel/cpu/mce/inject.c
+++ b/arch/x86/kernel/cpu/mce/inject.c
@@ -522,8 +522,8 @@ static void do_inject(void)
if (boot_cpu_has(X86_FEATURE_AMD_DCM) &&
b == 4 &&
boot_cpu_data.x86 < 0x17) {
- toggle_nb_mca_mst_cpu(amd_get_nb_id(cpu));
- cpu = get_nbc_for_node(amd_get_nb_id(cpu));
+ toggle_nb_mca_mst_cpu(topology_die_id(cpu));
+ cpu = get_nbc_for_node(topology_die_id(cpu));
}
get_online_cpus();
diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
index abe9fe0fb851..e309476743b7 100644
--- a/arch/x86/kernel/cpu/mce/intel.c
+++ b/arch/x86/kernel/cpu/mce/intel.c
@@ -509,12 +509,32 @@ static void intel_ppin_init(struct cpuinfo_x86 *c)
}
}
+/*
+ * Enable additional error logs from the integrated
+ * memory controller on processors that support this.
+ */
+static void intel_imc_init(struct cpuinfo_x86 *c)
+{
+ u64 error_control;
+
+ switch (c->x86_model) {
+ case INTEL_FAM6_SANDYBRIDGE_X:
+ case INTEL_FAM6_IVYBRIDGE_X:
+ case INTEL_FAM6_HASWELL_X:
+ if (rdmsrl_safe(MSR_ERROR_CONTROL, &error_control))
+ return;
+ error_control |= 2;
+ wrmsrl_safe(MSR_ERROR_CONTROL, error_control);
+ break;
+ }
+}
+
void mce_intel_feature_init(struct cpuinfo_x86 *c)
{
- intel_init_thermal(c);
intel_init_cmci();
intel_init_lmce();
intel_ppin_init(c);
+ intel_imc_init(c);
}
void mce_intel_feature_clear(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/mce/therm_throt.c b/arch/x86/kernel/cpu/mce/therm_throt.c
deleted file mode 100644
index a7cd2d203ced..000000000000
--- a/arch/x86/kernel/cpu/mce/therm_throt.c
+++ /dev/null
@@ -1,739 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Thermal throttle event support code (such as syslog messaging and rate
- * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c).
- *
- * This allows consistent reporting of CPU thermal throttle events.
- *
- * Maintains a counter in /sys that keeps track of the number of thermal
- * events, such that the user knows how bad the thermal problem might be
- * (since the logging to syslog is rate limited).
- *
- * Author: Dmitriy Zavin (dmitriyz@google.com)
- *
- * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c.
- * Inspired by Ross Biro's and Al Borchers' counter code.
- */
-#include <linux/interrupt.h>
-#include <linux/notifier.h>
-#include <linux/jiffies.h>
-#include <linux/kernel.h>
-#include <linux/percpu.h>
-#include <linux/export.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-#include <linux/cpu.h>
-
-#include <asm/processor.h>
-#include <asm/traps.h>
-#include <asm/apic.h>
-#include <asm/mce.h>
-#include <asm/msr.h>
-#include <asm/trace/irq_vectors.h>
-
-#include "internal.h"
-
-/* How long to wait between reporting thermal events */
-#define CHECK_INTERVAL (300 * HZ)
-
-#define THERMAL_THROTTLING_EVENT 0
-#define POWER_LIMIT_EVENT 1
-
-/**
- * struct _thermal_state - Represent the current thermal event state
- * @next_check: Stores the next timestamp, when it is allowed
- * to log the next warning message.
- * @last_interrupt_time: Stores the timestamp for the last threshold
- * high event.
- * @therm_work: Delayed workqueue structure
- * @count: Stores the current running count for thermal
- * or power threshold interrupts.
- * @last_count: Stores the previous running count for thermal
- * or power threshold interrupts.
- * @max_time_ms: This shows the maximum amount of time CPU was
- * in throttled state for a single thermal
- * threshold high to low state.
- * @total_time_ms: This is a cumulative time during which CPU was
- * in the throttled state.
- * @rate_control_active: Set when a throttling message is logged.
- * This is used for the purpose of rate-control.
- * @new_event: Stores the last high/low status of the
- * THERM_STATUS_PROCHOT or
- * THERM_STATUS_POWER_LIMIT.
- * @level: Stores whether this _thermal_state instance is
- * for a CORE level or for PACKAGE level.
- * @sample_index: Index for storing the next sample in the buffer
- * temp_samples[].
- * @sample_count: Total number of samples collected in the buffer
- * temp_samples[].
- * @average: The last moving average of temperature samples
- * @baseline_temp: Temperature at which thermal threshold high
- * interrupt was generated.
- * @temp_samples: Storage for temperature samples to calculate
- * moving average.
- *
- * This structure is used to represent data related to thermal state for a CPU.
- * There is a separate storage for core and package level for each CPU.
- */
-struct _thermal_state {
- u64 next_check;
- u64 last_interrupt_time;
- struct delayed_work therm_work;
- unsigned long count;
- unsigned long last_count;
- unsigned long max_time_ms;
- unsigned long total_time_ms;
- bool rate_control_active;
- bool new_event;
- u8 level;
- u8 sample_index;
- u8 sample_count;
- u8 average;
- u8 baseline_temp;
- u8 temp_samples[3];
-};
-
-struct thermal_state {
- struct _thermal_state core_throttle;
- struct _thermal_state core_power_limit;
- struct _thermal_state package_throttle;
- struct _thermal_state package_power_limit;
- struct _thermal_state core_thresh0;
- struct _thermal_state core_thresh1;
- struct _thermal_state pkg_thresh0;
- struct _thermal_state pkg_thresh1;
-};
-
-/* Callback to handle core threshold interrupts */
-int (*platform_thermal_notify)(__u64 msr_val);
-EXPORT_SYMBOL(platform_thermal_notify);
-
-/* Callback to handle core package threshold_interrupts */
-int (*platform_thermal_package_notify)(__u64 msr_val);
-EXPORT_SYMBOL_GPL(platform_thermal_package_notify);
-
-/* Callback support of rate control, return true, if
- * callback has rate control */
-bool (*platform_thermal_package_rate_control)(void);
-EXPORT_SYMBOL_GPL(platform_thermal_package_rate_control);
-
-
-static DEFINE_PER_CPU(struct thermal_state, thermal_state);
-
-static atomic_t therm_throt_en = ATOMIC_INIT(0);
-
-static u32 lvtthmr_init __read_mostly;
-
-#ifdef CONFIG_SYSFS
-#define define_therm_throt_device_one_ro(_name) \
- static DEVICE_ATTR(_name, 0444, \
- therm_throt_device_show_##_name, \
- NULL) \
-
-#define define_therm_throt_device_show_func(event, name) \
- \
-static ssize_t therm_throt_device_show_##event##_##name( \
- struct device *dev, \
- struct device_attribute *attr, \
- char *buf) \
-{ \
- unsigned int cpu = dev->id; \
- ssize_t ret; \
- \
- preempt_disable(); /* CPU hotplug */ \
- if (cpu_online(cpu)) { \
- ret = sprintf(buf, "%lu\n", \
- per_cpu(thermal_state, cpu).event.name); \
- } else \
- ret = 0; \
- preempt_enable(); \
- \
- return ret; \
-}
-
-define_therm_throt_device_show_func(core_throttle, count);
-define_therm_throt_device_one_ro(core_throttle_count);
-
-define_therm_throt_device_show_func(core_power_limit, count);
-define_therm_throt_device_one_ro(core_power_limit_count);
-
-define_therm_throt_device_show_func(package_throttle, count);
-define_therm_throt_device_one_ro(package_throttle_count);
-
-define_therm_throt_device_show_func(package_power_limit, count);
-define_therm_throt_device_one_ro(package_power_limit_count);
-
-define_therm_throt_device_show_func(core_throttle, max_time_ms);
-define_therm_throt_device_one_ro(core_throttle_max_time_ms);
-
-define_therm_throt_device_show_func(package_throttle, max_time_ms);
-define_therm_throt_device_one_ro(package_throttle_max_time_ms);
-
-define_therm_throt_device_show_func(core_throttle, total_time_ms);
-define_therm_throt_device_one_ro(core_throttle_total_time_ms);
-
-define_therm_throt_device_show_func(package_throttle, total_time_ms);
-define_therm_throt_device_one_ro(package_throttle_total_time_ms);
-
-static struct attribute *thermal_throttle_attrs[] = {
- &dev_attr_core_throttle_count.attr,
- &dev_attr_core_throttle_max_time_ms.attr,
- &dev_attr_core_throttle_total_time_ms.attr,
- NULL
-};
-
-static const struct attribute_group thermal_attr_group = {
- .attrs = thermal_throttle_attrs,
- .name = "thermal_throttle"
-};
-#endif /* CONFIG_SYSFS */
-
-#define CORE_LEVEL 0
-#define PACKAGE_LEVEL 1
-
-#define THERM_THROT_POLL_INTERVAL HZ
-#define THERM_STATUS_PROCHOT_LOG BIT(1)
-
-#define THERM_STATUS_CLEAR_CORE_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11) | BIT(13) | BIT(15))
-#define THERM_STATUS_CLEAR_PKG_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11))
-
-static void clear_therm_status_log(int level)
-{
- int msr;
- u64 mask, msr_val;
-
- if (level == CORE_LEVEL) {
- msr = MSR_IA32_THERM_STATUS;
- mask = THERM_STATUS_CLEAR_CORE_MASK;
- } else {
- msr = MSR_IA32_PACKAGE_THERM_STATUS;
- mask = THERM_STATUS_CLEAR_PKG_MASK;
- }
-
- rdmsrl(msr, msr_val);
- msr_val &= mask;
- wrmsrl(msr, msr_val & ~THERM_STATUS_PROCHOT_LOG);
-}
-
-static void get_therm_status(int level, bool *proc_hot, u8 *temp)
-{
- int msr;
- u64 msr_val;
-
- if (level == CORE_LEVEL)
- msr = MSR_IA32_THERM_STATUS;
- else
- msr = MSR_IA32_PACKAGE_THERM_STATUS;
-
- rdmsrl(msr, msr_val);
- if (msr_val & THERM_STATUS_PROCHOT_LOG)
- *proc_hot = true;
- else
- *proc_hot = false;
-
- *temp = (msr_val >> 16) & 0x7F;
-}
-
-static void __maybe_unused throttle_active_work(struct work_struct *work)
-{
- struct _thermal_state *state = container_of(to_delayed_work(work),
- struct _thermal_state, therm_work);
- unsigned int i, avg, this_cpu = smp_processor_id();
- u64 now = get_jiffies_64();
- bool hot;
- u8 temp;
-
- get_therm_status(state->level, &hot, &temp);
- /* temperature value is offset from the max so lesser means hotter */
- if (!hot && temp > state->baseline_temp) {
- if (state->rate_control_active)
- pr_info("CPU%d: %s temperature/speed normal (total events = %lu)\n",
- this_cpu,
- state->level == CORE_LEVEL ? "Core" : "Package",
- state->count);
-
- state->rate_control_active = false;
- return;
- }
-
- if (time_before64(now, state->next_check) &&
- state->rate_control_active)
- goto re_arm;
-
- state->next_check = now + CHECK_INTERVAL;
-
- if (state->count != state->last_count) {
- /* There was one new thermal interrupt */
- state->last_count = state->count;
- state->average = 0;
- state->sample_count = 0;
- state->sample_index = 0;
- }
-
- state->temp_samples[state->sample_index] = temp;
- state->sample_count++;
- state->sample_index = (state->sample_index + 1) % ARRAY_SIZE(state->temp_samples);
- if (state->sample_count < ARRAY_SIZE(state->temp_samples))
- goto re_arm;
-
- avg = 0;
- for (i = 0; i < ARRAY_SIZE(state->temp_samples); ++i)
- avg += state->temp_samples[i];
-
- avg /= ARRAY_SIZE(state->temp_samples);
-
- if (state->average > avg) {
- pr_warn("CPU%d: %s temperature is above threshold, cpu clock is throttled (total events = %lu)\n",
- this_cpu,
- state->level == CORE_LEVEL ? "Core" : "Package",
- state->count);
- state->rate_control_active = true;
- }
-
- state->average = avg;
-
-re_arm:
- clear_therm_status_log(state->level);
- schedule_delayed_work_on(this_cpu, &state->therm_work, THERM_THROT_POLL_INTERVAL);
-}
-
-/***
- * therm_throt_process - Process thermal throttling event from interrupt
- * @curr: Whether the condition is current or not (boolean), since the
- * thermal interrupt normally gets called both when the thermal
- * event begins and once the event has ended.
- *
- * This function is called by the thermal interrupt after the
- * IRQ has been acknowledged.
- *
- * It will take care of rate limiting and printing messages to the syslog.
- */
-static void therm_throt_process(bool new_event, int event, int level)
-{
- struct _thermal_state *state;
- unsigned int this_cpu = smp_processor_id();
- bool old_event;
- u64 now;
- struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
-
- now = get_jiffies_64();
- if (level == CORE_LEVEL) {
- if (event == THERMAL_THROTTLING_EVENT)
- state = &pstate->core_throttle;
- else if (event == POWER_LIMIT_EVENT)
- state = &pstate->core_power_limit;
- else
- return;
- } else if (level == PACKAGE_LEVEL) {
- if (event == THERMAL_THROTTLING_EVENT)
- state = &pstate->package_throttle;
- else if (event == POWER_LIMIT_EVENT)
- state = &pstate->package_power_limit;
- else
- return;
- } else
- return;
-
- old_event = state->new_event;
- state->new_event = new_event;
-
- if (new_event)
- state->count++;
-
- if (event != THERMAL_THROTTLING_EVENT)
- return;
-
- if (new_event && !state->last_interrupt_time) {
- bool hot;
- u8 temp;
-
- get_therm_status(state->level, &hot, &temp);
- /*
- * Ignore short temperature spike as the system is not close
- * to PROCHOT. 10C offset is large enough to ignore. It is
- * already dropped from the high threshold temperature.
- */
- if (temp > 10)
- return;
-
- state->baseline_temp = temp;
- state->last_interrupt_time = now;
- schedule_delayed_work_on(this_cpu, &state->therm_work, THERM_THROT_POLL_INTERVAL);
- } else if (old_event && state->last_interrupt_time) {
- unsigned long throttle_time;
-
- throttle_time = jiffies_delta_to_msecs(now - state->last_interrupt_time);
- if (throttle_time > state->max_time_ms)
- state->max_time_ms = throttle_time;
- state->total_time_ms += throttle_time;
- state->last_interrupt_time = 0;
- }
-}
-
-static int thresh_event_valid(int level, int event)
-{
- struct _thermal_state *state;
- unsigned int this_cpu = smp_processor_id();
- struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
- u64 now = get_jiffies_64();
-
- if (level == PACKAGE_LEVEL)
- state = (event == 0) ? &pstate->pkg_thresh0 :
- &pstate->pkg_thresh1;
- else
- state = (event == 0) ? &pstate->core_thresh0 :
- &pstate->core_thresh1;
-
- if (time_before64(now, state->next_check))
- return 0;
-
- state->next_check = now + CHECK_INTERVAL;
-
- return 1;
-}
-
-static bool int_pln_enable;
-static int __init int_pln_enable_setup(char *s)
-{
- int_pln_enable = true;
-
- return 1;
-}
-__setup("int_pln_enable", int_pln_enable_setup);
-
-#ifdef CONFIG_SYSFS
-/* Add/Remove thermal_throttle interface for CPU device: */
-static int thermal_throttle_add_dev(struct device *dev, unsigned int cpu)
-{
- int err;
- struct cpuinfo_x86 *c = &cpu_data(cpu);
-
- err = sysfs_create_group(&dev->kobj, &thermal_attr_group);
- if (err)
- return err;
-
- if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) {
- err = sysfs_add_file_to_group(&dev->kobj,
- &dev_attr_core_power_limit_count.attr,
- thermal_attr_group.name);
- if (err)
- goto del_group;
- }
-
- if (cpu_has(c, X86_FEATURE_PTS)) {
- err = sysfs_add_file_to_group(&dev->kobj,
- &dev_attr_package_throttle_count.attr,
- thermal_attr_group.name);
- if (err)
- goto del_group;
-
- err = sysfs_add_file_to_group(&dev->kobj,
- &dev_attr_package_throttle_max_time_ms.attr,
- thermal_attr_group.name);
- if (err)
- goto del_group;
-
- err = sysfs_add_file_to_group(&dev->kobj,
- &dev_attr_package_throttle_total_time_ms.attr,
- thermal_attr_group.name);
- if (err)
- goto del_group;
-
- if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) {
- err = sysfs_add_file_to_group(&dev->kobj,
- &dev_attr_package_power_limit_count.attr,
- thermal_attr_group.name);
- if (err)
- goto del_group;
- }
- }
-
- return 0;
-
-del_group:
- sysfs_remove_group(&dev->kobj, &thermal_attr_group);
-
- return err;
-}
-
-static void thermal_throttle_remove_dev(struct device *dev)
-{
- sysfs_remove_group(&dev->kobj, &thermal_attr_group);
-}
-
-/* Get notified when a cpu comes on/off. Be hotplug friendly. */
-static int thermal_throttle_online(unsigned int cpu)
-{
- struct thermal_state *state = &per_cpu(thermal_state, cpu);
- struct device *dev = get_cpu_device(cpu);
- u32 l;
-
- state->package_throttle.level = PACKAGE_LEVEL;
- state->core_throttle.level = CORE_LEVEL;
-
- INIT_DELAYED_WORK(&state->package_throttle.therm_work, throttle_active_work);
- INIT_DELAYED_WORK(&state->core_throttle.therm_work, throttle_active_work);
-
- /* Unmask the thermal vector after the above workqueues are initialized. */
- l = apic_read(APIC_LVTTHMR);
- apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
-
- return thermal_throttle_add_dev(dev, cpu);
-}
-
-static int thermal_throttle_offline(unsigned int cpu)
-{
- struct thermal_state *state = &per_cpu(thermal_state, cpu);
- struct device *dev = get_cpu_device(cpu);
- u32 l;
-
- /* Mask the thermal vector before draining evtl. pending work */
- l = apic_read(APIC_LVTTHMR);
- apic_write(APIC_LVTTHMR, l | APIC_LVT_MASKED);
-
- cancel_delayed_work_sync(&state->package_throttle.therm_work);
- cancel_delayed_work_sync(&state->core_throttle.therm_work);
-
- state->package_throttle.rate_control_active = false;
- state->core_throttle.rate_control_active = false;
-
- thermal_throttle_remove_dev(dev);
- return 0;
-}
-
-static __init int thermal_throttle_init_device(void)
-{
- int ret;
-
- if (!atomic_read(&therm_throt_en))
- return 0;
-
- ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/therm:online",
- thermal_throttle_online,
- thermal_throttle_offline);
- return ret < 0 ? ret : 0;
-}
-device_initcall(thermal_throttle_init_device);
-
-#endif /* CONFIG_SYSFS */
-
-static void notify_package_thresholds(__u64 msr_val)
-{
- bool notify_thres_0 = false;
- bool notify_thres_1 = false;
-
- if (!platform_thermal_package_notify)
- return;
-
- /* lower threshold check */
- if (msr_val & THERM_LOG_THRESHOLD0)
- notify_thres_0 = true;
- /* higher threshold check */
- if (msr_val & THERM_LOG_THRESHOLD1)
- notify_thres_1 = true;
-
- if (!notify_thres_0 && !notify_thres_1)
- return;
-
- if (platform_thermal_package_rate_control &&
- platform_thermal_package_rate_control()) {
- /* Rate control is implemented in callback */
- platform_thermal_package_notify(msr_val);
- return;
- }
-
- /* lower threshold reached */
- if (notify_thres_0 && thresh_event_valid(PACKAGE_LEVEL, 0))
- platform_thermal_package_notify(msr_val);
- /* higher threshold reached */
- if (notify_thres_1 && thresh_event_valid(PACKAGE_LEVEL, 1))
- platform_thermal_package_notify(msr_val);
-}
-
-static void notify_thresholds(__u64 msr_val)
-{
- /* check whether the interrupt handler is defined;
- * otherwise simply return
- */
- if (!platform_thermal_notify)
- return;
-
- /* lower threshold reached */
- if ((msr_val & THERM_LOG_THRESHOLD0) &&
- thresh_event_valid(CORE_LEVEL, 0))
- platform_thermal_notify(msr_val);
- /* higher threshold reached */
- if ((msr_val & THERM_LOG_THRESHOLD1) &&
- thresh_event_valid(CORE_LEVEL, 1))
- platform_thermal_notify(msr_val);
-}
-
-/* Thermal transition interrupt handler */
-static void intel_thermal_interrupt(void)
-{
- __u64 msr_val;
-
- if (static_cpu_has(X86_FEATURE_HWP))
- wrmsrl_safe(MSR_HWP_STATUS, 0);
-
- rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
-
- /* Check for violation of core thermal thresholds*/
- notify_thresholds(msr_val);
-
- therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
- THERMAL_THROTTLING_EVENT,
- CORE_LEVEL);
-
- if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable)
- therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
- POWER_LIMIT_EVENT,
- CORE_LEVEL);
-
- if (this_cpu_has(X86_FEATURE_PTS)) {
- rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
- /* check violations of package thermal thresholds */
- notify_package_thresholds(msr_val);
- therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
- THERMAL_THROTTLING_EVENT,
- PACKAGE_LEVEL);
- if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable)
- therm_throt_process(msr_val &
- PACKAGE_THERM_STATUS_POWER_LIMIT,
- POWER_LIMIT_EVENT,
- PACKAGE_LEVEL);
- }
-}
-
-static void unexpected_thermal_interrupt(void)
-{
- pr_err("CPU%d: Unexpected LVT thermal interrupt!\n",
- smp_processor_id());
-}
-
-static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;
-
-DEFINE_IDTENTRY_SYSVEC(sysvec_thermal)
-{
- trace_thermal_apic_entry(THERMAL_APIC_VECTOR);
- inc_irq_stat(irq_thermal_count);
- smp_thermal_vector();
- trace_thermal_apic_exit(THERMAL_APIC_VECTOR);
- ack_APIC_irq();
-}
-
-/* Thermal monitoring depends on APIC, ACPI and clock modulation */
-static int intel_thermal_supported(struct cpuinfo_x86 *c)
-{
- if (!boot_cpu_has(X86_FEATURE_APIC))
- return 0;
- if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC))
- return 0;
- return 1;
-}
-
-void __init mcheck_intel_therm_init(void)
-{
- /*
- * This function is only called on boot CPU. Save the init thermal
- * LVT value on BSP and use that value to restore APs' thermal LVT
- * entry BIOS programmed later
- */
- if (intel_thermal_supported(&boot_cpu_data))
- lvtthmr_init = apic_read(APIC_LVTTHMR);
-}
-
-void intel_init_thermal(struct cpuinfo_x86 *c)
-{
- unsigned int cpu = smp_processor_id();
- int tm2 = 0;
- u32 l, h;
-
- if (!intel_thermal_supported(c))
- return;
-
- /*
- * First check if its enabled already, in which case there might
- * be some SMM goo which handles it, so we can't even put a handler
- * since it might be delivered via SMI already:
- */
- rdmsr(MSR_IA32_MISC_ENABLE, l, h);
-
- h = lvtthmr_init;
- /*
- * The initial value of thermal LVT entries on all APs always reads
- * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI
- * sequence to them and LVT registers are reset to 0s except for
- * the mask bits which are set to 1s when APs receive INIT IPI.
- * If BIOS takes over the thermal interrupt and sets its interrupt
- * delivery mode to SMI (not fixed), it restores the value that the
- * BIOS has programmed on AP based on BSP's info we saved since BIOS
- * is always setting the same value for all threads/cores.
- */
- if ((h & APIC_DM_FIXED_MASK) != APIC_DM_FIXED)
- apic_write(APIC_LVTTHMR, lvtthmr_init);
-
-
- if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
- if (system_state == SYSTEM_BOOTING)
- pr_debug("CPU%d: Thermal monitoring handled by SMI\n", cpu);
- return;
- }
-
- /* early Pentium M models use different method for enabling TM2 */
- if (cpu_has(c, X86_FEATURE_TM2)) {
- if (c->x86 == 6 && (c->x86_model == 9 || c->x86_model == 13)) {
- rdmsr(MSR_THERM2_CTL, l, h);
- if (l & MSR_THERM2_CTL_TM_SELECT)
- tm2 = 1;
- } else if (l & MSR_IA32_MISC_ENABLE_TM2)
- tm2 = 1;
- }
-
- /* We'll mask the thermal vector in the lapic till we're ready: */
- h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED;
- apic_write(APIC_LVTTHMR, h);
-
- rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
- if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable)
- wrmsr(MSR_IA32_THERM_INTERRUPT,
- (l | (THERM_INT_LOW_ENABLE
- | THERM_INT_HIGH_ENABLE)) & ~THERM_INT_PLN_ENABLE, h);
- else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
- wrmsr(MSR_IA32_THERM_INTERRUPT,
- l | (THERM_INT_LOW_ENABLE
- | THERM_INT_HIGH_ENABLE | THERM_INT_PLN_ENABLE), h);
- else
- wrmsr(MSR_IA32_THERM_INTERRUPT,
- l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);
-
- if (cpu_has(c, X86_FEATURE_PTS)) {
- rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
- if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable)
- wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
- (l | (PACKAGE_THERM_INT_LOW_ENABLE
- | PACKAGE_THERM_INT_HIGH_ENABLE))
- & ~PACKAGE_THERM_INT_PLN_ENABLE, h);
- else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
- wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
- l | (PACKAGE_THERM_INT_LOW_ENABLE
- | PACKAGE_THERM_INT_HIGH_ENABLE
- | PACKAGE_THERM_INT_PLN_ENABLE), h);
- else
- wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
- l | (PACKAGE_THERM_INT_LOW_ENABLE
- | PACKAGE_THERM_INT_HIGH_ENABLE), h);
- }
-
- smp_thermal_vector = intel_thermal_interrupt;
-
- rdmsr(MSR_IA32_MISC_ENABLE, l, h);
- wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
-
- pr_info_once("CPU0: Thermal monitoring enabled (%s)\n",
- tm2 ? "TM2" : "TM1");
-
- /* enable thermal throttle processing */
- atomic_set(&therm_throt_en, 1);
-}
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 3f6b137ef4e6..3d4a48336084 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -215,7 +215,6 @@ static unsigned int __verify_patch_size(u8 family, u32 sh_psize, size_t buf_size
default:
WARN(1, "%s: WTF family: 0x%x\n", __func__, family);
return 0;
- break;
}
if (sh_psize > min_t(u32, buf_size, max_size))
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index ec6f0415bc6d..b935e1b5f115 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -830,7 +830,7 @@ static const struct attribute_group cpu_root_microcode_group = {
.attrs = cpu_root_microcode_attrs,
};
-int __init microcode_init(void)
+static int __init microcode_init(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
int error;
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 6a99535d7f37..7e8e07bddd5f 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -100,53 +100,6 @@ static int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev
return find_matching_signature(mc, csig, cpf);
}
-/*
- * Given CPU signature and a microcode patch, this function finds if the
- * microcode patch has matching family and model with the CPU.
- *
- * %true - if there's a match
- * %false - otherwise
- */
-static bool microcode_matches(struct microcode_header_intel *mc_header,
- unsigned long sig)
-{
- unsigned long total_size = get_totalsize(mc_header);
- unsigned long data_size = get_datasize(mc_header);
- struct extended_sigtable *ext_header;
- unsigned int fam_ucode, model_ucode;
- struct extended_signature *ext_sig;
- unsigned int fam, model;
- int ext_sigcount, i;
-
- fam = x86_family(sig);
- model = x86_model(sig);
-
- fam_ucode = x86_family(mc_header->sig);
- model_ucode = x86_model(mc_header->sig);
-
- if (fam == fam_ucode && model == model_ucode)
- return true;
-
- /* Look for ext. headers: */
- if (total_size <= data_size + MC_HEADER_SIZE)
- return false;
-
- ext_header = (void *) mc_header + data_size + MC_HEADER_SIZE;
- ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
- ext_sigcount = ext_header->count;
-
- for (i = 0; i < ext_sigcount; i++) {
- fam_ucode = x86_family(ext_sig->sig);
- model_ucode = x86_model(ext_sig->sig);
-
- if (fam == fam_ucode && model == model_ucode)
- return true;
-
- ext_sig++;
- }
- return false;
-}
-
static struct ucode_patch *memdup_patch(void *data, unsigned int size)
{
struct ucode_patch *p;
@@ -164,7 +117,7 @@ static struct ucode_patch *memdup_patch(void *data, unsigned int size)
return p;
}
-static void save_microcode_patch(void *data, unsigned int size)
+static void save_microcode_patch(struct ucode_cpu_info *uci, void *data, unsigned int size)
{
struct microcode_header_intel *mc_hdr, *mc_saved_hdr;
struct ucode_patch *iter, *tmp, *p = NULL;
@@ -210,6 +163,9 @@ static void save_microcode_patch(void *data, unsigned int size)
if (!p)
return;
+ if (!find_matching_signature(p->data, uci->cpu_sig.sig, uci->cpu_sig.pf))
+ return;
+
/*
* Save for early loading. On 32-bit, that needs to be a physical
* address as the APs are running from physical addresses, before
@@ -344,13 +300,14 @@ scan_microcode(void *data, size_t size, struct ucode_cpu_info *uci, bool save)
size -= mc_size;
- if (!microcode_matches(mc_header, uci->cpu_sig.sig)) {
+ if (!find_matching_signature(data, uci->cpu_sig.sig,
+ uci->cpu_sig.pf)) {
data += mc_size;
continue;
}
if (save) {
- save_microcode_patch(data, mc_size);
+ save_microcode_patch(uci, data, mc_size);
goto next;
}
@@ -483,14 +440,14 @@ static void show_saved_mc(void)
* Save this microcode patch. It will be loaded early when a CPU is
* hot-added or resumes.
*/
-static void save_mc_for_early(u8 *mc, unsigned int size)
+static void save_mc_for_early(struct ucode_cpu_info *uci, u8 *mc, unsigned int size)
{
/* Synchronization during CPU hotplug. */
static DEFINE_MUTEX(x86_cpu_microcode_mutex);
mutex_lock(&x86_cpu_microcode_mutex);
- save_microcode_patch(mc, size);
+ save_microcode_patch(uci, mc, size);
show_saved_mc();
mutex_unlock(&x86_cpu_microcode_mutex);
@@ -935,7 +892,7 @@ static enum ucode_state generic_load_microcode(int cpu, struct iov_iter *iter)
* permanent memory. So it will be loaded early when a CPU is hot added
* or resumes.
*/
- save_mc_for_early(new_mc, new_mc_size);
+ save_mc_for_early(uci, new_mc, new_mc_size);
pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
cpu, new_rev, uci->cpu_sig.rev);
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 05ef1f4550cb..43b54bef5448 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -135,14 +135,32 @@ static void hv_machine_shutdown(void)
{
if (kexec_in_progress && hv_kexec_handler)
hv_kexec_handler();
+
+ /*
+ * Call hv_cpu_die() on all the CPUs, otherwise later the hypervisor
+ * corrupts the old VP Assist Pages and can crash the kexec kernel.
+ */
+ if (kexec_in_progress && hyperv_init_cpuhp > 0)
+ cpuhp_remove_state(hyperv_init_cpuhp);
+
+ /* The function calls stop_other_cpus(). */
native_machine_shutdown();
+
+ /* Disable the hypercall page when there is only 1 active CPU. */
+ if (kexec_in_progress)
+ hyperv_cleanup();
}
static void hv_machine_crash_shutdown(struct pt_regs *regs)
{
if (hv_crash_handler)
hv_crash_handler(regs);
+
+ /* The function calls crash_smp_send_stop(). */
native_machine_crash_shutdown(regs);
+
+ /* Disable the hypercall page when there is only 1 active CPU. */
+ hyperv_cleanup();
}
#endif /* CONFIG_KEXEC_CORE */
#endif /* CONFIG_HYPERV */
@@ -366,9 +384,38 @@ static void __init ms_hyperv_init_platform(void)
#endif
}
+static bool __init ms_hyperv_x2apic_available(void)
+{
+ return x2apic_supported();
+}
+
+/*
+ * If ms_hyperv_msi_ext_dest_id() returns true, hyperv_prepare_irq_remapping()
+ * returns -ENODEV and the Hyper-V IOMMU driver is not used; instead, the
+ * generic support of the 15-bit APIC ID is used: see __irq_msi_compose_msg().
+ *
+ * Note: for a VM on Hyper-V, the I/O-APIC is the only device which
+ * (logically) generates MSIs directly to the system APIC irq domain.
+ * There is no HPET, and PCI MSI/MSI-X interrupts are remapped by the
+ * pci-hyperv host bridge.
+ */
+static bool __init ms_hyperv_msi_ext_dest_id(void)
+{
+ u32 eax;
+
+ eax = cpuid_eax(HYPERV_CPUID_VIRT_STACK_INTERFACE);
+ if (eax != HYPERV_VS_INTERFACE_EAX_SIGNATURE)
+ return false;
+
+ eax = cpuid_eax(HYPERV_CPUID_VIRT_STACK_PROPERTIES);
+ return eax & HYPERV_VS_PROPERTIES_EAX_EXTENDED_IOAPIC_RTE;
+}
+
const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
.name = "Microsoft Hyper-V",
.detect = ms_hyperv_platform,
.type = X86_HYPER_MS_HYPERV,
+ .init.x2apic_available = ms_hyperv_x2apic_available,
+ .init.msi_ext_dest_id = ms_hyperv_msi_ext_dest_id,
.init.init_platform = ms_hyperv_init_platform,
};
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 5bd011737272..9231640782fa 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -537,9 +537,9 @@ static void __init print_out_mtrr_range_state(void)
if (!size_base)
continue;
- size_base = to_size_factor(size_base, &size_factor),
+ size_base = to_size_factor(size_base, &size_factor);
start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
- start_base = to_size_factor(start_base, &start_factor),
+ start_base = to_size_factor(start_base, &start_factor);
type = range_state[i].type;
pr_debug("reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 23ad8e953dfb..b90f3f437765 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -3,7 +3,6 @@
* This only handles 32bit MTRR on 32bit hosts. This is strictly wrong
* because MTRRs can span up to 40 bits (36bits on most modern x86)
*/
-#define DEBUG
#include <linux/export.h>
#include <linux/init.h>
@@ -167,9 +166,6 @@ static u8 mtrr_type_lookup_variable(u64 start, u64 end, u64 *partial_end,
*repeat = 0;
*uniform = 1;
- /* Make end inclusive instead of exclusive */
- end--;
-
prev_match = MTRR_TYPE_INVALID;
for (i = 0; i < num_var_ranges; ++i) {
unsigned short start_state, end_state, inclusive;
@@ -261,6 +257,9 @@ u8 mtrr_type_lookup(u64 start, u64 end, u8 *uniform)
int repeat;
u64 partial_end;
+ /* Make end inclusive instead of exclusive */
+ end--;
+
if (!mtrr_state_set)
return MTRR_TYPE_INVALID;
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c
index 6a80f36b5d59..28c8a23aa42e 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.c
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.c
@@ -31,8 +31,6 @@
System Programming Guide; Section 9.11. (1997 edition - PPro).
*/
-#define DEBUG
-
#include <linux/types.h> /* FIXME: kvm_para.h needs this */
#include <linux/stop_machine.h>
@@ -794,8 +792,6 @@ void mtrr_ap_init(void)
if (!use_intel() || mtrr_aps_delayed_init)
return;
- rcu_cpu_starting(smp_processor_id());
-
/*
* Ideally we should hold mtrr_mutex here to avoid mtrr entries
* changed, but this routine will be called in cpu boot time,
@@ -813,7 +809,8 @@ void mtrr_ap_init(void)
}
/**
- * Save current fixed-range MTRR state of the first cpu in cpu_online_mask.
+ * mtrr_save_state - Save current fixed-range MTRR state of the first
+ * cpu in cpu_online_mask.
*/
void mtrr_save_state(void)
{
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index a5ee607a3b89..3ef5868ac588 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -3,7 +3,7 @@
* local apic based NMI watchdog for various CPUs.
*
* This file also handles reservation of performance counters for coordination
- * with other users (like oprofile).
+ * with other users.
*
* Note that these events normally don't tick when the CPU idles. This means
* the frequency varies with CPU load.
@@ -105,15 +105,6 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
}
-/* checks for a bit availability (hack for oprofile) */
-int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
-{
- BUG_ON(counter > NMI_MAX_COUNTER_BITS);
-
- return !test_bit(counter, perfctr_nmi_owner);
-}
-EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
-
int reserve_perfctr_nmi(unsigned int msr)
{
unsigned int counter;
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index e5f4ee8f4c3b..698bb26aeb6e 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -570,6 +570,8 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r)
if (d) {
cpumask_set_cpu(cpu, &d->cpu_mask);
+ if (r->cache.arch_has_per_cpu_cfg)
+ rdt_domain_reconfigure_cdp(r);
return;
}
@@ -893,6 +895,10 @@ static __init void __check_quirks_intel(void)
set_rdt_options("!cmt,!mbmtotal,!mbmlocal,!l3cat");
else
set_rdt_options("!l3cat");
+ fallthrough;
+ case INTEL_FAM6_BROADWELL_X:
+ intel_rdt_mbm_apply_quirk();
+ break;
}
}
@@ -923,6 +929,7 @@ static __init void rdt_init_res_defs_intel(void)
r->rid == RDT_RESOURCE_L2CODE) {
r->cache.arch_has_sparse_bitmaps = false;
r->cache.arch_has_empty_bitmaps = false;
+ r->cache.arch_has_per_cpu_cfg = false;
} else if (r->rid == RDT_RESOURCE_MBA) {
r->msr_base = MSR_IA32_MBA_THRTL_BASE;
r->msr_update = mba_wrmsr_intel;
@@ -943,6 +950,7 @@ static __init void rdt_init_res_defs_amd(void)
r->rid == RDT_RESOURCE_L2CODE) {
r->cache.arch_has_sparse_bitmaps = true;
r->cache.arch_has_empty_bitmaps = true;
+ r->cache.arch_has_per_cpu_cfg = true;
} else if (r->rid == RDT_RESOURCE_MBA) {
r->msr_base = MSR_IA32_MBA_BW_BASE;
r->msr_update = mba_wrmsr_amd;
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index 80fa997fae60..c4d320d02fd5 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -264,7 +264,7 @@ void __exit rdtgroup_exit(void);
struct rftype {
char *name;
umode_t mode;
- struct kernfs_ops *kf_ops;
+ const struct kernfs_ops *kf_ops;
unsigned long flags;
unsigned long fflags;
@@ -360,6 +360,8 @@ struct msr_param {
* executing entities
* @arch_has_sparse_bitmaps: True if a bitmap like f00f is valid.
* @arch_has_empty_bitmaps: True if the '0' bitmap is valid.
+ * @arch_has_per_cpu_cfg: True if QOS_CFG register for this cache
+ * level has CPU scope.
*/
struct rdt_cache {
unsigned int cbm_len;
@@ -369,6 +371,7 @@ struct rdt_cache {
unsigned int shareable_bits;
bool arch_has_sparse_bitmaps;
bool arch_has_empty_bitmaps;
+ bool arch_has_per_cpu_cfg;
};
/**
@@ -569,6 +572,7 @@ union cpuid_0x10_x_edx {
void rdt_last_cmd_clear(void);
void rdt_last_cmd_puts(const char *s);
+__printf(1, 2)
void rdt_last_cmd_printf(const char *fmt, ...);
void rdt_ctrl_update(void *arg);
@@ -616,6 +620,7 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
void mbm_setup_overflow_handler(struct rdt_domain *dom,
unsigned long delay_ms);
void mbm_handle_overflow(struct work_struct *work);
+void __init intel_rdt_mbm_apply_quirk(void);
bool is_mba_sc(struct rdt_resource *r);
void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm);
u32 delay_bw_map(unsigned long bw, struct rdt_resource *r);
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index 54dffe574e67..7ac31210e452 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -64,6 +64,69 @@ unsigned int rdt_mon_features;
*/
unsigned int resctrl_cqm_threshold;
+#define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5))
+
+/*
+ * The correction factor table is documented in Documentation/x86/resctrl.rst.
+ * If rmid > rmid threshold, MBM total and local values should be multiplied
+ * by the correction factor.
+ *
+ * The original table is modified for better code:
+ *
+ * 1. The threshold 0 is changed to rmid count - 1 so that no correction
+ *    is done for that case.
+ * 2. The MBM total and local correction table is indexed by the core count,
+ *    which is equal to (x86_cache_max_rmid + 1) / 8 - 1 and ranges from 0 to 27.
+ * 3. The correction factor is normalized to 2^20 (1048576) so it's faster
+ * to calculate corrected value by shifting:
+ * corrected_value = (original_value * correction_factor) >> 20
+ */
+static const struct mbm_correction_factor_table {
+ u32 rmidthreshold;
+ u64 cf;
+} mbm_cf_table[] __initdata = {
+ {7, CF(1.000000)},
+ {15, CF(1.000000)},
+ {15, CF(0.969650)},
+ {31, CF(1.000000)},
+ {31, CF(1.066667)},
+ {31, CF(0.969650)},
+ {47, CF(1.142857)},
+ {63, CF(1.000000)},
+ {63, CF(1.185115)},
+ {63, CF(1.066553)},
+ {79, CF(1.454545)},
+ {95, CF(1.000000)},
+ {95, CF(1.230769)},
+ {95, CF(1.142857)},
+ {95, CF(1.066667)},
+ {127, CF(1.000000)},
+ {127, CF(1.254863)},
+ {127, CF(1.185255)},
+ {151, CF(1.000000)},
+ {127, CF(1.066667)},
+ {167, CF(1.000000)},
+ {159, CF(1.454334)},
+ {183, CF(1.000000)},
+ {127, CF(0.969744)},
+ {191, CF(1.280246)},
+ {191, CF(1.230921)},
+ {215, CF(1.000000)},
+ {191, CF(1.143118)},
+};
+
+static u32 mbm_cf_rmidthreshold __read_mostly = UINT_MAX;
+static u64 mbm_cf __read_mostly;
+
+static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val)
+{
+ /* Correct MBM value. */
+ if (rmid > mbm_cf_rmidthreshold)
+ val = (val * mbm_cf) >> 20;
+
+ return val;
+}
+
static inline struct rmid_entry *__rmid_entry(u32 rmid)
{
struct rmid_entry *entry;
@@ -260,7 +323,8 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr)
m->chunks += chunks;
m->prev_msr = tval;
- rr->val += m->chunks;
+ rr->val += get_corrected_mbm_count(rmid, m->chunks);
+
return 0;
}
@@ -279,8 +343,7 @@ static void mbm_bw_count(u32 rmid, struct rmid_read *rr)
return;
chunks = mbm_overflow_count(m->prev_bw_msr, tval, rr->r->mbm_width);
- m->chunks += chunks;
- cur_bw = (chunks * r->mon_scale) >> 20;
+ cur_bw = (get_corrected_mbm_count(rmid, chunks) * r->mon_scale) >> 20;
if (m->delta_comp)
m->delta_bw = abs(cur_bw - m->prev_bw);
@@ -450,15 +513,14 @@ static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid)
}
if (is_mbm_local_enabled()) {
rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
+ __mon_event_count(rmid, &rr);
/*
* Call the MBA software controller only for the
* control groups and when user has enabled
* the software controller explicitly.
*/
- if (!is_mba_sc(NULL))
- __mon_event_count(rmid, &rr);
- else
+ if (is_mba_sc(NULL))
mbm_bw_count(rmid, &rr);
}
}
@@ -644,3 +706,17 @@ int rdt_get_mon_l3_config(struct rdt_resource *r)
return 0;
}
+
+void __init intel_rdt_mbm_apply_quirk(void)
+{
+ int cf_index;
+
+ cf_index = (boot_cpu_data.x86_cache_max_rmid + 1) / 8 - 1;
+ if (cf_index >= ARRAY_SIZE(mbm_cf_table)) {
+ pr_info("No MBM correction factor available\n");
+ return;
+ }
+
+ mbm_cf_rmidthreshold = mbm_cf_table[cf_index].rmidthreshold;
+ mbm_cf = mbm_cf_table[cf_index].cf;
+}
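
For reference, the fixed-point scheme described in the correction-factor comment above can be checked with a few lines of standalone C. This is an illustrative user-space sketch, not part of the patch; CF() and the 20-bit shift mirror the kernel additions in this file, while main() and the sample values are made up.

#include <stdint.h>
#include <stdio.h>

/* Same scaling as the kernel macro: factor pre-multiplied by 2^20. */
#define CF(cf)	((uint64_t)(1048576 * (cf) + 0.5))

/* Mirrors get_corrected_mbm_count(): multiply, then shift the scale back out. */
static uint64_t corrected(uint64_t chunks, uint64_t cf)
{
	return (chunks * cf) >> 20;
}

int main(void)
{
	/*
	 * With x86_cache_max_rmid == 55, cf_index == (55 + 1) / 8 - 1 == 6,
	 * which selects {47, CF(1.142857)} from the table above: RMIDs above
	 * 47 get their MBM counts scaled by roughly 1.142857.
	 */
	uint64_t cf = CF(1.142857);	/* 1198372 after scaling by 2^20 */

	printf("cf=%llu corrected(1000000)=%llu\n",
	       (unsigned long long)cf,
	       (unsigned long long)corrected(1000000, cf));	/* prints 1142857 */
	return 0;
}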
diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
index 0daf2f1cf7a8..e916646adc69 100644
--- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
+++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
@@ -1458,7 +1458,7 @@ static int pseudo_lock_dev_release(struct inode *inode, struct file *filp)
return 0;
}
-static int pseudo_lock_dev_mremap(struct vm_area_struct *area)
+static int pseudo_lock_dev_mremap(struct vm_area_struct *area, unsigned long flags)
{
/* Not supported */
return -EINVAL;
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index af323e2e3100..f9190adc52cb 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -240,13 +240,13 @@ static ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf,
return -EINVAL;
}
-static struct kernfs_ops rdtgroup_kf_single_ops = {
+static const struct kernfs_ops rdtgroup_kf_single_ops = {
.atomic_write_len = PAGE_SIZE,
.write = rdtgroup_file_write,
.seq_show = rdtgroup_seqfile_show,
};
-static struct kernfs_ops kf_mondata_ops = {
+static const struct kernfs_ops kf_mondata_ops = {
.atomic_write_len = PAGE_SIZE,
.seq_show = rdtgroup_mondata_show,
};
@@ -507,89 +507,88 @@ unlock:
return ret ?: nbytes;
}
-struct task_move_callback {
- struct callback_head work;
- struct rdtgroup *rdtgrp;
-};
-
-static void move_myself(struct callback_head *head)
+/**
+ * rdtgroup_remove - the helper to remove resource group safely
+ * @rdtgrp: resource group to remove
+ *
+ * On resource group creation via a mkdir, an extra kernfs_node reference is
+ * taken to ensure that the rdtgroup structure remains accessible for the
+ * rdtgroup_kn_unlock() calls where it is removed.
+ *
+ * Drop the extra reference here, then free the rdtgroup structure.
+ *
+ * Return: void
+ */
+static void rdtgroup_remove(struct rdtgroup *rdtgrp)
{
- struct task_move_callback *callback;
- struct rdtgroup *rdtgrp;
-
- callback = container_of(head, struct task_move_callback, work);
- rdtgrp = callback->rdtgrp;
+ kernfs_put(rdtgrp->kn);
+ kfree(rdtgrp);
+}
+static void _update_task_closid_rmid(void *task)
+{
/*
- * If resource group was deleted before this task work callback
- * was invoked, then assign the task to root group and free the
- * resource group.
+ * If the task is still current on this CPU, update PQR_ASSOC MSR.
+ * Otherwise, the MSR is updated when the task is scheduled in.
*/
- if (atomic_dec_and_test(&rdtgrp->waitcount) &&
- (rdtgrp->flags & RDT_DELETED)) {
- current->closid = 0;
- current->rmid = 0;
- kfree(rdtgrp);
- }
-
- if (unlikely(current->flags & PF_EXITING))
- goto out;
-
- preempt_disable();
- /* update PQR_ASSOC MSR to make resource group go into effect */
- resctrl_sched_in();
- preempt_enable();
+ if (task == current)
+ resctrl_sched_in();
+}
-out:
- kfree(callback);
+static void update_task_closid_rmid(struct task_struct *t)
+{
+ if (IS_ENABLED(CONFIG_SMP) && task_curr(t))
+ smp_call_function_single(task_cpu(t), _update_task_closid_rmid, t, 1);
+ else
+ _update_task_closid_rmid(t);
}
static int __rdtgroup_move_task(struct task_struct *tsk,
struct rdtgroup *rdtgrp)
{
- struct task_move_callback *callback;
- int ret;
-
- callback = kzalloc(sizeof(*callback), GFP_KERNEL);
- if (!callback)
- return -ENOMEM;
- callback->work.func = move_myself;
- callback->rdtgrp = rdtgrp;
+ /* If the task is already in rdtgrp, no need to move the task. */
+ if ((rdtgrp->type == RDTCTRL_GROUP && tsk->closid == rdtgrp->closid &&
+ tsk->rmid == rdtgrp->mon.rmid) ||
+ (rdtgrp->type == RDTMON_GROUP && tsk->rmid == rdtgrp->mon.rmid &&
+ tsk->closid == rdtgrp->mon.parent->closid))
+ return 0;
/*
- * Take a refcount, so rdtgrp cannot be freed before the
- * callback has been invoked.
+ * Set the task's closid/rmid before the PQR_ASSOC MSR can be
+ * updated by them.
+ *
+ * For ctrl_mon groups, move both closid and rmid.
+ * For monitor groups, can move the tasks only from
+ * their parent CTRL group.
*/
- atomic_inc(&rdtgrp->waitcount);
- ret = task_work_add(tsk, &callback->work, TWA_RESUME);
- if (ret) {
- /*
- * Task is exiting. Drop the refcount and free the callback.
- * No need to check the refcount as the group cannot be
- * deleted before the write function unlocks rdtgroup_mutex.
- */
- atomic_dec(&rdtgrp->waitcount);
- kfree(callback);
- rdt_last_cmd_puts("Task exited\n");
- } else {
- /*
- * For ctrl_mon groups move both closid and rmid.
- * For monitor groups, can move the tasks only from
- * their parent CTRL group.
- */
- if (rdtgrp->type == RDTCTRL_GROUP) {
- tsk->closid = rdtgrp->closid;
- tsk->rmid = rdtgrp->mon.rmid;
- } else if (rdtgrp->type == RDTMON_GROUP) {
- if (rdtgrp->mon.parent->closid == tsk->closid) {
- tsk->rmid = rdtgrp->mon.rmid;
- } else {
- rdt_last_cmd_puts("Can't move task to different control group\n");
- ret = -EINVAL;
- }
+
+ if (rdtgrp->type == RDTCTRL_GROUP) {
+ WRITE_ONCE(tsk->closid, rdtgrp->closid);
+ WRITE_ONCE(tsk->rmid, rdtgrp->mon.rmid);
+ } else if (rdtgrp->type == RDTMON_GROUP) {
+ if (rdtgrp->mon.parent->closid == tsk->closid) {
+ WRITE_ONCE(tsk->rmid, rdtgrp->mon.rmid);
+ } else {
+ rdt_last_cmd_puts("Can't move task to different control group\n");
+ return -EINVAL;
}
}
- return ret;
+
+ /*
+	 * Ensure the task's closid and rmid are written before determining
+	 * whether the task is current; that check decides if it gets interrupted.
+ */
+ barrier();
+
+ /*
+ * By now, the task's closid and rmid are set. If the task is current
+ * on a CPU, the PQR_ASSOC MSR needs to be updated to make the resource
+ * group go into effect. If the task is not current, the MSR will be
+ * updated when the task is scheduled in.
+ */
+ update_task_closid_rmid(tsk);
+
+ return 0;
}
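
The ordering the comments above insist on (publish the new closid/rmid first, then interrupt the task only if it is currently running) can be sketched in user space. Everything below is an illustrative analogue, not kernel code: struct fake_task, refresh_pqr_assoc() and move_task() are made-up stand-ins for the kernel's task_struct fields, resctrl_sched_in() and __rdtgroup_move_task().

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_task {
	_Atomic unsigned int closid;
	_Atomic unsigned int rmid;
	bool running;			/* stands in for task_curr() */
};

static void refresh_pqr_assoc(struct fake_task *t)
{
	/* stands in for resctrl_sched_in() running on the task's CPU */
	printf("refresh: closid=%u rmid=%u\n",
	       atomic_load(&t->closid), atomic_load(&t->rmid));
}

static void move_task(struct fake_task *t, unsigned int closid, unsigned int rmid)
{
	/* already in the target group: nothing to do */
	if (atomic_load(&t->closid) == closid && atomic_load(&t->rmid) == rmid)
		return;

	/* publish first, so a concurrent context switch sees the new values */
	atomic_store(&t->closid, closid);
	atomic_store(&t->rmid, rmid);

	/* only a task that is on a CPU right now needs an immediate refresh */
	if (t->running)
		refresh_pqr_assoc(t);
	/* otherwise the next schedule-in picks the new values up */
}

int main(void)
{
	struct fake_task t = { .closid = 0, .rmid = 0, .running = true };

	move_task(&t, 3, 7);
	return 0;
}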
static bool is_closid_match(struct task_struct *t, struct rdtgroup *r)
@@ -1769,7 +1768,6 @@ static int rdtgroup_mkdir_info_resdir(struct rdt_resource *r, char *name,
if (IS_ERR(kn_subdir))
return PTR_ERR(kn_subdir);
- kernfs_get(kn_subdir);
ret = rdtgroup_kn_set_ugid(kn_subdir);
if (ret)
return ret;
@@ -1792,7 +1790,6 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL);
if (IS_ERR(kn_info))
return PTR_ERR(kn_info);
- kernfs_get(kn_info);
ret = rdtgroup_add_files(kn_info, RF_TOP_INFO);
if (ret)
@@ -1813,12 +1810,6 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
goto out_destroy;
}
- /*
- * This extra ref will be put in kernfs_remove() and guarantees
- * that @rdtgrp->kn is always accessible.
- */
- kernfs_get(kn_info);
-
ret = rdtgroup_kn_set_ugid(kn_info);
if (ret)
goto out_destroy;
@@ -1847,12 +1838,6 @@ mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp,
if (dest_kn)
*dest_kn = kn;
- /*
- * This extra ref will be put in kernfs_remove() and guarantees
- * that @rdtgrp->kn is always accessible.
- */
- kernfs_get(kn);
-
ret = rdtgroup_kn_set_ugid(kn);
if (ret)
goto out_destroy;
@@ -1905,8 +1890,13 @@ static int set_cache_qos_cfg(int level, bool enable)
r_l = &rdt_resources_all[level];
list_for_each_entry(d, &r_l->domains, list) {
- /* Pick one CPU from each domain instance to update MSR */
- cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
+ if (r_l->cache.arch_has_per_cpu_cfg)
+ /* Pick all the CPUs in the domain instance */
+ for_each_cpu(cpu, &d->cpu_mask)
+ cpumask_set_cpu(cpu, cpu_mask);
+ else
+ /* Pick one CPU from each domain instance to update MSR */
+ cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
}
cpu = get_cpu();
/* Update QOS_CFG MSR on this cpu if it's in cpu_mask. */
@@ -2079,8 +2069,7 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn)
rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
rdtgroup_pseudo_lock_remove(rdtgrp);
kernfs_unbreak_active_protection(kn);
- kernfs_put(rdtgrp->kn);
- kfree(rdtgrp);
+ rdtgroup_remove(rdtgrp);
} else {
kernfs_unbreak_active_protection(kn);
}
@@ -2139,13 +2128,11 @@ static int rdt_get_tree(struct fs_context *fc)
&kn_mongrp);
if (ret < 0)
goto out_info;
- kernfs_get(kn_mongrp);
ret = mkdir_mondata_all(rdtgroup_default.kn,
&rdtgroup_default, &kn_mondata);
if (ret < 0)
goto out_mongrp;
- kernfs_get(kn_mondata);
rdtgroup_default.mon.mon_data_kn = kn_mondata;
}
@@ -2323,22 +2310,18 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to,
for_each_process_thread(p, t) {
if (!from || is_closid_match(t, from) ||
is_rmid_match(t, from)) {
- t->closid = to->closid;
- t->rmid = to->mon.rmid;
+ WRITE_ONCE(t->closid, to->closid);
+ WRITE_ONCE(t->rmid, to->mon.rmid);
-#ifdef CONFIG_SMP
/*
- * This is safe on x86 w/o barriers as the ordering
- * of writing to task_cpu() and t->on_cpu is
- * reverse to the reading here. The detection is
- * inaccurate as tasks might move or schedule
- * before the smp function call takes place. In
- * such a case the function call is pointless, but
+ * If the task is on a CPU, set the CPU in the mask.
+ * The detection is inaccurate as tasks might move or
+ * schedule before the smp function call takes place.
+ * In such a case the function call is pointless, but
* there is no other side effect.
*/
- if (mask && t->on_cpu)
+ if (IS_ENABLED(CONFIG_SMP) && mask && task_curr(t))
cpumask_set_cpu(task_cpu(t), mask);
-#endif
}
}
read_unlock(&tasklist_lock);
@@ -2357,7 +2340,7 @@ static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp)
if (atomic_read(&sentry->waitcount) != 0)
sentry->flags = RDT_DELETED;
else
- kfree(sentry);
+ rdtgroup_remove(sentry);
}
}
@@ -2399,7 +2382,7 @@ static void rmdir_all_sub(void)
if (atomic_read(&rdtgrp->waitcount) != 0)
rdtgrp->flags = RDT_DELETED;
else
- kfree(rdtgrp);
+ rdtgroup_remove(rdtgrp);
}
/* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */
update_closid_rmid(cpu_online_mask, &rdtgroup_default);
@@ -2499,11 +2482,6 @@ static int mkdir_mondata_subdir(struct kernfs_node *parent_kn,
if (IS_ERR(kn))
return PTR_ERR(kn);
- /*
- * This extra ref will be put in kernfs_remove() and guarantees
- * that kn is always accessible.
- */
- kernfs_get(kn);
ret = rdtgroup_kn_set_ugid(kn);
if (ret)
goto out_destroy;
@@ -2838,8 +2816,8 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
/*
* kernfs_remove() will drop the reference count on "kn" which
* will free it. But we still need it to stick around for the
- * rdtgroup_kn_unlock(kn} call below. Take one extra reference
- * here, which will be dropped inside rdtgroup_kn_unlock().
+ * rdtgroup_kn_unlock(kn) call. Take one extra reference here,
+ * which will be dropped by kernfs_put() in rdtgroup_remove().
*/
kernfs_get(kn);
@@ -2880,6 +2858,7 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
out_idfree:
free_rmid(rdtgrp->mon.rmid);
out_destroy:
+ kernfs_put(rdtgrp->kn);
kernfs_remove(rdtgrp->kn);
out_free_rgrp:
kfree(rdtgrp);
@@ -2892,7 +2871,7 @@ static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp)
{
kernfs_remove(rgrp->kn);
free_rmid(rgrp->mon.rmid);
- kfree(rgrp);
+ rdtgroup_remove(rgrp);
}
/*
@@ -3021,8 +3000,7 @@ static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
return -EPERM;
}
-static int rdtgroup_rmdir_mon(struct kernfs_node *kn, struct rdtgroup *rdtgrp,
- cpumask_var_t tmpmask)
+static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
{
struct rdtgroup *prdtgrp = rdtgrp->mon.parent;
int cpu;
@@ -3049,33 +3027,21 @@ static int rdtgroup_rmdir_mon(struct kernfs_node *kn, struct rdtgroup *rdtgrp,
WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list));
list_del(&rdtgrp->mon.crdtgrp_list);
- /*
- * one extra hold on this, will drop when we kfree(rdtgrp)
- * in rdtgroup_kn_unlock()
- */
- kernfs_get(kn);
kernfs_remove(rdtgrp->kn);
return 0;
}
-static int rdtgroup_ctrl_remove(struct kernfs_node *kn,
- struct rdtgroup *rdtgrp)
+static int rdtgroup_ctrl_remove(struct rdtgroup *rdtgrp)
{
rdtgrp->flags = RDT_DELETED;
list_del(&rdtgrp->rdtgroup_list);
- /*
- * one extra hold on this, will drop when we kfree(rdtgrp)
- * in rdtgroup_kn_unlock()
- */
- kernfs_get(kn);
kernfs_remove(rdtgrp->kn);
return 0;
}
-static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp,
- cpumask_var_t tmpmask)
+static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
{
int cpu;
@@ -3102,7 +3068,7 @@ static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp,
closid_free(rdtgrp->closid);
free_rmid(rdtgrp->mon.rmid);
- rdtgroup_ctrl_remove(kn, rdtgrp);
+ rdtgroup_ctrl_remove(rdtgrp);
/*
* Free all the child monitor group rmids.
@@ -3139,13 +3105,13 @@ static int rdtgroup_rmdir(struct kernfs_node *kn)
rdtgrp != &rdtgroup_default) {
if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
- ret = rdtgroup_ctrl_remove(kn, rdtgrp);
+ ret = rdtgroup_ctrl_remove(rdtgrp);
} else {
- ret = rdtgroup_rmdir_ctrl(kn, rdtgrp, tmpmask);
+ ret = rdtgroup_rmdir_ctrl(rdtgrp, tmpmask);
}
} else if (rdtgrp->type == RDTMON_GROUP &&
is_mon_groups(parent_kn, kn->name)) {
- ret = rdtgroup_rmdir_mon(kn, rdtgrp, tmpmask);
+ ret = rdtgroup_rmdir_mon(rdtgrp, tmpmask);
} else {
ret = -EPERM;
}
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 866c9a9bcdee..972ec3bfa9c0 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -40,10 +40,6 @@ static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 },
{ X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 },
{ X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 },
- { X86_FEATURE_SME, CPUID_EAX, 0, 0x8000001f, 0 },
- { X86_FEATURE_SEV, CPUID_EAX, 1, 0x8000001f, 0 },
- { X86_FEATURE_SEV_ES, CPUID_EAX, 3, 0x8000001f, 0 },
- { X86_FEATURE_SME_COHERENT, CPUID_EAX, 10, 0x8000001f, 0 },
{ 0, 0, 0, 0, 0 }
};
diff --git a/arch/x86/kernel/cpu/sgx/Makefile b/arch/x86/kernel/cpu/sgx/Makefile
new file mode 100644
index 000000000000..91d3dc784a29
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/Makefile
@@ -0,0 +1,5 @@
+obj-y += \
+ driver.o \
+ encl.o \
+ ioctl.o \
+ main.o
diff --git a/arch/x86/kernel/cpu/sgx/arch.h b/arch/x86/kernel/cpu/sgx/arch.h
new file mode 100644
index 000000000000..dd7602c44c72
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/arch.h
@@ -0,0 +1,338 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/**
+ * Copyright(c) 2016-20 Intel Corporation.
+ *
+ * Contains data structures defined by the SGX architecture. Data structures
+ * defined by the Linux software stack should not be placed here.
+ */
+#ifndef _ASM_X86_SGX_ARCH_H
+#define _ASM_X86_SGX_ARCH_H
+
+#include <linux/bits.h>
+#include <linux/types.h>
+
+/* The SGX specific CPUID function. */
+#define SGX_CPUID 0x12
+/* EPC enumeration. */
+#define SGX_CPUID_EPC 2
+/* An invalid EPC section, i.e. the end marker. */
+#define SGX_CPUID_EPC_INVALID 0x0
+/* A valid EPC section. */
+#define SGX_CPUID_EPC_SECTION 0x1
+/* The bitmask for the EPC section type. */
+#define SGX_CPUID_EPC_MASK GENMASK(3, 0)
+
+/**
+ * enum sgx_return_code - The return code type for ENCLS, ENCLU and ENCLV
+ * %SGX_NOT_TRACKED: Previous ETRACK's shootdown sequence has not
+ * been completed yet.
+ * %SGX_INVALID_EINITTOKEN: EINITTOKEN is invalid and enclave signer's
+ * public key does not match IA32_SGXLEPUBKEYHASH.
+ * %SGX_UNMASKED_EVENT: An unmasked event, e.g. INTR, was received
+ */
+enum sgx_return_code {
+ SGX_NOT_TRACKED = 11,
+ SGX_INVALID_EINITTOKEN = 16,
+ SGX_UNMASKED_EVENT = 128,
+};
+
+/* The modulus size for 3072-bit RSA keys. */
+#define SGX_MODULUS_SIZE 384
+
+/**
+ * enum sgx_miscselect - additional information to an SSA frame
+ * %SGX_MISC_EXINFO: Report #PF or #GP to the SSA frame.
+ *
+ * Save State Area (SSA) is a stack inside the enclave used to store processor
+ * state when an exception or interrupt occurs. This enum defines additional
+ * information stored to an SSA frame.
+ */
+enum sgx_miscselect {
+ SGX_MISC_EXINFO = BIT(0),
+};
+
+#define SGX_MISC_RESERVED_MASK GENMASK_ULL(63, 1)
+
+#define SGX_SSA_GPRS_SIZE 184
+#define SGX_SSA_MISC_EXINFO_SIZE 16
+
+/**
+ * enum sgx_attributes - the attributes field in &struct sgx_secs
+ * %SGX_ATTR_INIT: Enclave can be entered (is initialized).
+ * %SGX_ATTR_DEBUG: Allow ENCLS(EDBGRD) and ENCLS(EDBGWR).
+ * %SGX_ATTR_MODE64BIT:	Tells that this is a 64-bit enclave.
+ * %SGX_ATTR_PROVISIONKEY: Allow to use provisioning keys for remote
+ * attestation.
+ * %SGX_ATTR_KSS: Allow to use key separation and sharing (KSS).
+ * %SGX_ATTR_EINITTOKENKEY: Allow to use token signing key that is used to
+ * sign cryptographic tokens that can be passed to
+ * EINIT as an authorization to run an enclave.
+ */
+enum sgx_attribute {
+ SGX_ATTR_INIT = BIT(0),
+ SGX_ATTR_DEBUG = BIT(1),
+ SGX_ATTR_MODE64BIT = BIT(2),
+ SGX_ATTR_PROVISIONKEY = BIT(4),
+ SGX_ATTR_EINITTOKENKEY = BIT(5),
+ SGX_ATTR_KSS = BIT(7),
+};
+
+#define SGX_ATTR_RESERVED_MASK (BIT_ULL(3) | BIT_ULL(6) | GENMASK_ULL(63, 8))
+
+/**
+ * struct sgx_secs - SGX Enclave Control Structure (SECS)
+ * @size: size of the address space
+ * @base: base address of the address space
+ * @ssa_frame_size: size of an SSA frame
+ * @miscselect: additional information stored to an SSA frame
+ * @attributes: attributes for enclave
+ * @xfrm: XSave-Feature Request Mask (subset of XCR0)
+ * @mrenclave: SHA256-hash of the enclave contents
+ * @mrsigner: SHA256-hash of the public key used to sign the SIGSTRUCT
+ * @config_id: a user-defined value that is used in key derivation
+ * @isv_prod_id: a user-defined value that is used in key derivation
+ * @isv_svn: a user-defined value that is used in key derivation
+ * @config_svn: a user-defined value that is used in key derivation
+ *
+ * SGX Enclave Control Structure (SECS) is a special enclave page that is not
+ * visible in the address space. In fact, this structure defines the address
+ * range and other global attributes for the enclave and it is the first EPC
+ * page created for any enclave. It is moved from a temporary buffer to the EPC
+ * by means of the ENCLS[ECREATE] function.
+ */
+struct sgx_secs {
+ u64 size;
+ u64 base;
+ u32 ssa_frame_size;
+ u32 miscselect;
+ u8 reserved1[24];
+ u64 attributes;
+ u64 xfrm;
+ u32 mrenclave[8];
+ u8 reserved2[32];
+ u32 mrsigner[8];
+ u8 reserved3[32];
+ u32 config_id[16];
+ u16 isv_prod_id;
+ u16 isv_svn;
+ u16 config_svn;
+ u8 reserved4[3834];
+} __packed;
+
+/**
+ * enum sgx_tcs_flags - execution flags for TCS
+ * %SGX_TCS_DBGOPTIN: If enabled allows single-stepping and breakpoints
+ * inside an enclave. It is cleared by EADD but can
+ * be set later with EDBGWR.
+ */
+enum sgx_tcs_flags {
+ SGX_TCS_DBGOPTIN = 0x01,
+};
+
+#define SGX_TCS_RESERVED_MASK GENMASK_ULL(63, 1)
+#define SGX_TCS_RESERVED_SIZE 4024
+
+/**
+ * struct sgx_tcs - Thread Control Structure (TCS)
+ * @state: used to mark an entered TCS
+ * @flags: execution flags (cleared by EADD)
+ * @ssa_offset: SSA stack offset relative to the enclave base
+ * @ssa_index:		the current SSA frame index (cleared by EADD)
+ * @nr_ssa_frames:	the number of frames in the SSA stack
+ * @entry_offset: entry point offset relative to the enclave base
+ * @exit_addr: address outside the enclave to exit on an exception or
+ * interrupt
+ * @fs_offset: offset relative to the enclave base to become FS
+ * segment inside the enclave
+ * @gs_offset: offset relative to the enclave base to become GS
+ * segment inside the enclave
+ * @fs_limit: size to become a new FS-limit (only 32-bit enclaves)
+ * @gs_limit: size to become a new GS-limit (only 32-bit enclaves)
+ *
+ * Thread Control Structure (TCS) is an enclave page visible in its address
+ * space that defines an entry point inside the enclave. A thread enters an
+ * enclave by supplying the address of a TCS to ENCLU(EENTER). A TCS can be entered
+ * by only one thread at a time.
+ */
+struct sgx_tcs {
+ u64 state;
+ u64 flags;
+ u64 ssa_offset;
+ u32 ssa_index;
+ u32 nr_ssa_frames;
+ u64 entry_offset;
+ u64 exit_addr;
+ u64 fs_offset;
+ u64 gs_offset;
+ u32 fs_limit;
+ u32 gs_limit;
+ u8 reserved[SGX_TCS_RESERVED_SIZE];
+} __packed;
+
+/**
+ * struct sgx_pageinfo - an enclave page descriptor
+ * @addr: address of the enclave page
+ * @contents: pointer to the page contents
+ * @metadata: pointer either to a SECINFO or PCMD instance
+ * @secs: address of the SECS page
+ */
+struct sgx_pageinfo {
+ u64 addr;
+ u64 contents;
+ u64 metadata;
+ u64 secs;
+} __packed __aligned(32);
+
+/**
+ * enum sgx_page_type - bits in the SECINFO flags defining the page type
+ * %SGX_PAGE_TYPE_SECS: a SECS page
+ * %SGX_PAGE_TYPE_TCS: a TCS page
+ * %SGX_PAGE_TYPE_REG: a regular page
+ * %SGX_PAGE_TYPE_VA: a VA page
+ * %SGX_PAGE_TYPE_TRIM: a page in trimmed state
+ */
+enum sgx_page_type {
+ SGX_PAGE_TYPE_SECS,
+ SGX_PAGE_TYPE_TCS,
+ SGX_PAGE_TYPE_REG,
+ SGX_PAGE_TYPE_VA,
+ SGX_PAGE_TYPE_TRIM,
+};
+
+#define SGX_NR_PAGE_TYPES 5
+#define SGX_PAGE_TYPE_MASK GENMASK(7, 0)
+
+/**
+ * enum sgx_secinfo_flags - the flags field in &struct sgx_secinfo
+ * %SGX_SECINFO_R: allow read
+ * %SGX_SECINFO_W: allow write
+ * %SGX_SECINFO_X: allow execution
+ * %SGX_SECINFO_SECS: a SECS page
+ * %SGX_SECINFO_TCS: a TCS page
+ * %SGX_SECINFO_REG: a regular page
+ * %SGX_SECINFO_VA: a VA page
+ * %SGX_SECINFO_TRIM: a page in trimmed state
+ */
+enum sgx_secinfo_flags {
+ SGX_SECINFO_R = BIT(0),
+ SGX_SECINFO_W = BIT(1),
+ SGX_SECINFO_X = BIT(2),
+ SGX_SECINFO_SECS = (SGX_PAGE_TYPE_SECS << 8),
+ SGX_SECINFO_TCS = (SGX_PAGE_TYPE_TCS << 8),
+ SGX_SECINFO_REG = (SGX_PAGE_TYPE_REG << 8),
+ SGX_SECINFO_VA = (SGX_PAGE_TYPE_VA << 8),
+ SGX_SECINFO_TRIM = (SGX_PAGE_TYPE_TRIM << 8),
+};
+
+#define SGX_SECINFO_PERMISSION_MASK GENMASK_ULL(2, 0)
+#define SGX_SECINFO_PAGE_TYPE_MASK (SGX_PAGE_TYPE_MASK << 8)
+#define SGX_SECINFO_RESERVED_MASK ~(SGX_SECINFO_PERMISSION_MASK | \
+ SGX_SECINFO_PAGE_TYPE_MASK)
+
+/**
+ * struct sgx_secinfo - describes attributes of an EPC page
+ * @flags: permissions and type
+ *
+ * Used together with ENCLS leaves that add or modify an EPC page in an
+ * enclave, to define the page's permissions and type.
+ */
+struct sgx_secinfo {
+ u64 flags;
+ u8 reserved[56];
+} __packed __aligned(64);
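+
+/*
+ * Illustrative example: SECINFO flags for a regular enclave page that should
+ * be readable and writable would be built as
+ *
+ *	struct sgx_secinfo secinfo = {
+ *		.flags = SGX_SECINFO_REG | SGX_SECINFO_R | SGX_SECINFO_W,
+ *	};
+ *
+ * The driver's sgx_validate_secinfo() rejects combinations such as W without
+ * R, or a TCS page with non-zero permissions.
+ */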
+
+#define SGX_PCMD_RESERVED_SIZE 40
+
+/**
+ * struct sgx_pcmd - Paging Crypto Metadata (PCMD)
+ * @enclave_id: enclave identifier
+ * @mac: MAC over PCMD, page contents and isvsvn
+ *
+ * A PCMD is stored in regular memory for every swapped-out page. When ELDU
+ * loads the page back, it recalculates the MAC by using an ISVSVN number
+ * stored in a VA page. Together these two structures provide integrity and
+ * rollback protection.
+ */
+struct sgx_pcmd {
+ struct sgx_secinfo secinfo;
+ u64 enclave_id;
+ u8 reserved[SGX_PCMD_RESERVED_SIZE];
+ u8 mac[16];
+} __packed __aligned(128);
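+
+/*
+ * Note: sizeof(struct sgx_pcmd) is 64 + 8 + 40 + 16 = 128 bytes, so 32 PCMD
+ * entries fit in one 4K page. This is why the backing storage reserves
+ * (encl_size >> 5) bytes for PCMD and why sgx_encl_get_backing() shifts the
+ * page index right by 5.
+ */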
+
+#define SGX_SIGSTRUCT_RESERVED1_SIZE 84
+#define SGX_SIGSTRUCT_RESERVED2_SIZE 20
+#define SGX_SIGSTRUCT_RESERVED3_SIZE 32
+#define SGX_SIGSTRUCT_RESERVED4_SIZE 12
+
+/**
+ * struct sgx_sigstruct_header - defines author of the enclave
+ * @header1: constant byte string
+ * @vendor: must be either 0x0000 or 0x8086
+ * @date: YYYYMMDD in BCD
+ * @header2:		constant byte string
+ * @swdefined: software defined value
+ */
+struct sgx_sigstruct_header {
+ u64 header1[2];
+ u32 vendor;
+ u32 date;
+ u64 header2[2];
+ u32 swdefined;
+ u8 reserved1[84];
+} __packed;
+
+/**
+ * struct sgx_sigstruct_body - defines contents of the enclave
+ * @miscselect: additional information stored to an SSA frame
+ * @misc_mask: required miscselect in SECS
+ * @attributes: attributes for enclave
+ * @xfrm: XSave-Feature Request Mask (subset of XCR0)
+ * @attributes_mask: required attributes in SECS
+ * @xfrm_mask: required XFRM in SECS
+ * @mrenclave: SHA256-hash of the enclave contents
+ * @isvprodid: a user-defined value that is used in key derivation
+ * @isvsvn: a user-defined value that is used in key derivation
+ */
+struct sgx_sigstruct_body {
+ u32 miscselect;
+ u32 misc_mask;
+ u8 reserved2[20];
+ u64 attributes;
+ u64 xfrm;
+ u64 attributes_mask;
+ u64 xfrm_mask;
+ u8 mrenclave[32];
+ u8 reserved3[32];
+ u16 isvprodid;
+ u16 isvsvn;
+} __packed;
+
+/**
+ * struct sgx_sigstruct - an enclave signature
+ * @header: defines author of the enclave
+ * @modulus: the modulus of the public key
+ * @exponent: the exponent of the public key
+ * @signature:		the signature calculated over the fields except the modulus
+ * @body: defines contents of the enclave
+ * @q1: a value used in RSA signature verification
+ * @q2: a value used in RSA signature verification
+ *
+ * The header and body are the parts that are actually signed. The remaining
+ * fields define the signature of the enclave.
+ */
+struct sgx_sigstruct {
+ struct sgx_sigstruct_header header;
+ u8 modulus[SGX_MODULUS_SIZE];
+ u32 exponent;
+ u8 signature[SGX_MODULUS_SIZE];
+ struct sgx_sigstruct_body body;
+ u8 reserved4[12];
+ u8 q1[SGX_MODULUS_SIZE];
+ u8 q2[SGX_MODULUS_SIZE];
+} __packed;
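+
+/*
+ * Note: assuming SGX_MODULUS_SIZE is 384 bytes (a 3072-bit RSA key), the
+ * SIGSTRUCT layout above totals 1808 bytes: 128 (header) + 384 (modulus) +
+ * 4 (exponent) + 384 (signature) + 128 (body) + 12 (reserved) + 2 * 384
+ * (q1, q2).
+ */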
+
+#define SGX_LAUNCH_TOKEN_SIZE 304
+
+#endif /* _ASM_X86_SGX_ARCH_H */
diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c
new file mode 100644
index 000000000000..8ce6d8371cfb
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/driver.c
@@ -0,0 +1,197 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2016-20 Intel Corporation. */
+
+#include <linux/acpi.h>
+#include <linux/miscdevice.h>
+#include <linux/mman.h>
+#include <linux/security.h>
+#include <linux/suspend.h>
+#include <asm/traps.h>
+#include "driver.h"
+#include "encl.h"
+
+u64 sgx_attributes_reserved_mask;
+u64 sgx_xfrm_reserved_mask = ~0x3;
+u32 sgx_misc_reserved_mask;
+
+static int sgx_open(struct inode *inode, struct file *file)
+{
+ struct sgx_encl *encl;
+ int ret;
+
+ encl = kzalloc(sizeof(*encl), GFP_KERNEL);
+ if (!encl)
+ return -ENOMEM;
+
+ kref_init(&encl->refcount);
+ xa_init(&encl->page_array);
+ mutex_init(&encl->lock);
+ INIT_LIST_HEAD(&encl->va_pages);
+ INIT_LIST_HEAD(&encl->mm_list);
+ spin_lock_init(&encl->mm_lock);
+
+ ret = init_srcu_struct(&encl->srcu);
+ if (ret) {
+ kfree(encl);
+ return ret;
+ }
+
+ file->private_data = encl;
+
+ return 0;
+}
+
+static int sgx_release(struct inode *inode, struct file *file)
+{
+ struct sgx_encl *encl = file->private_data;
+ struct sgx_encl_mm *encl_mm;
+
+ /*
+ * Drain the remaining mm_list entries. At this point the list contains
+ * entries for processes, which have closed the enclave file but have
+ * not exited yet. The processes, which have exited, are gone from the
+ * list by sgx_mmu_notifier_release().
+ */
+ for ( ; ; ) {
+ spin_lock(&encl->mm_lock);
+
+ if (list_empty(&encl->mm_list)) {
+ encl_mm = NULL;
+ } else {
+ encl_mm = list_first_entry(&encl->mm_list,
+ struct sgx_encl_mm, list);
+ list_del_rcu(&encl_mm->list);
+ }
+
+ spin_unlock(&encl->mm_lock);
+
+ /* The enclave is no longer mapped by any mm. */
+ if (!encl_mm)
+ break;
+
+ synchronize_srcu(&encl->srcu);
+ mmu_notifier_unregister(&encl_mm->mmu_notifier, encl_mm->mm);
+ kfree(encl_mm);
+
+ /* 'encl_mm' is gone, put encl_mm->encl reference: */
+ kref_put(&encl->refcount, sgx_encl_release);
+ }
+
+ kref_put(&encl->refcount, sgx_encl_release);
+ return 0;
+}
+
+static int sgx_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct sgx_encl *encl = file->private_data;
+ int ret;
+
+ ret = sgx_encl_may_map(encl, vma->vm_start, vma->vm_end, vma->vm_flags);
+ if (ret)
+ return ret;
+
+ ret = sgx_encl_mm_add(encl, vma->vm_mm);
+ if (ret)
+ return ret;
+
+ vma->vm_ops = &sgx_vm_ops;
+ vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO;
+ vma->vm_private_data = encl;
+
+ return 0;
+}
+
+static unsigned long sgx_get_unmapped_area(struct file *file,
+ unsigned long addr,
+ unsigned long len,
+ unsigned long pgoff,
+ unsigned long flags)
+{
+ if ((flags & MAP_TYPE) == MAP_PRIVATE)
+ return -EINVAL;
+
+ if (flags & MAP_FIXED)
+ return addr;
+
+ return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
+}
+
+#ifdef CONFIG_COMPAT
+static long sgx_compat_ioctl(struct file *filep, unsigned int cmd,
+ unsigned long arg)
+{
+ return sgx_ioctl(filep, cmd, arg);
+}
+#endif
+
+static const struct file_operations sgx_encl_fops = {
+ .owner = THIS_MODULE,
+ .open = sgx_open,
+ .release = sgx_release,
+ .unlocked_ioctl = sgx_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = sgx_compat_ioctl,
+#endif
+ .mmap = sgx_mmap,
+ .get_unmapped_area = sgx_get_unmapped_area,
+};
+
+const struct file_operations sgx_provision_fops = {
+ .owner = THIS_MODULE,
+};
+
+static struct miscdevice sgx_dev_enclave = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "sgx_enclave",
+ .nodename = "sgx_enclave",
+ .fops = &sgx_encl_fops,
+};
+
+static struct miscdevice sgx_dev_provision = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "sgx_provision",
+ .nodename = "sgx_provision",
+ .fops = &sgx_provision_fops,
+};
+
+int __init sgx_drv_init(void)
+{
+ unsigned int eax, ebx, ecx, edx;
+ u64 attr_mask;
+ u64 xfrm_mask;
+ int ret;
+
+ if (!cpu_feature_enabled(X86_FEATURE_SGX_LC))
+ return -ENODEV;
+
+ cpuid_count(SGX_CPUID, 0, &eax, &ebx, &ecx, &edx);
+
+ if (!(eax & 1)) {
+ pr_err("SGX disabled: SGX1 instruction support not available.\n");
+ return -ENODEV;
+ }
+
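+	/*
+	 * CPUID.0x12.0:EBX enumerates the MISCSELECT bits supported by the
+	 * CPU; treat every bit that is not advertised as reserved.
+	 */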
+ sgx_misc_reserved_mask = ~ebx | SGX_MISC_RESERVED_MASK;
+
+ cpuid_count(SGX_CPUID, 1, &eax, &ebx, &ecx, &edx);
+
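+	/*
+	 * CPUID.0x12.1 reports the legal SECS attribute bits: EBX:EAX covers
+	 * ATTRIBUTES and EDX:ECX covers XFRM. Bits that are not advertised
+	 * are treated as reserved.
+	 */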
+ attr_mask = (((u64)ebx) << 32) + (u64)eax;
+ sgx_attributes_reserved_mask = ~attr_mask | SGX_ATTR_RESERVED_MASK;
+
+ if (cpu_feature_enabled(X86_FEATURE_OSXSAVE)) {
+ xfrm_mask = (((u64)edx) << 32) + (u64)ecx;
+ sgx_xfrm_reserved_mask = ~xfrm_mask;
+ }
+
+ ret = misc_register(&sgx_dev_enclave);
+ if (ret)
+ return ret;
+
+ ret = misc_register(&sgx_dev_provision);
+ if (ret) {
+ misc_deregister(&sgx_dev_enclave);
+ return ret;
+ }
+
+ return 0;
+}
diff --git a/arch/x86/kernel/cpu/sgx/driver.h b/arch/x86/kernel/cpu/sgx/driver.h
new file mode 100644
index 000000000000..4eddb4d571ef
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/driver.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ARCH_SGX_DRIVER_H__
+#define __ARCH_SGX_DRIVER_H__
+
+#include <crypto/hash.h>
+#include <linux/kref.h>
+#include <linux/mmu_notifier.h>
+#include <linux/radix-tree.h>
+#include <linux/rwsem.h>
+#include <linux/sched.h>
+#include <linux/workqueue.h>
+#include <uapi/asm/sgx.h>
+#include "sgx.h"
+
+#define SGX_EINIT_SPIN_COUNT 20
+#define SGX_EINIT_SLEEP_COUNT 50
+#define SGX_EINIT_SLEEP_TIME 20
+
+extern u64 sgx_attributes_reserved_mask;
+extern u64 sgx_xfrm_reserved_mask;
+extern u32 sgx_misc_reserved_mask;
+
+extern const struct file_operations sgx_provision_fops;
+
+long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
+
+int sgx_drv_init(void);
+
+#endif /* __ARCH_SGX_DRIVER_H__ */
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
new file mode 100644
index 000000000000..7449ef33f081
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -0,0 +1,737 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2016-20 Intel Corporation. */
+
+#include <linux/lockdep.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/shmem_fs.h>
+#include <linux/suspend.h>
+#include <linux/sched/mm.h>
+#include "arch.h"
+#include "encl.h"
+#include "encls.h"
+#include "sgx.h"
+
+/*
+ * ELDU: Load an EPC page as unblocked. For more info, see "OS Management of EPC
+ * Pages" in the SDM.
+ */
+static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
+ struct sgx_epc_page *epc_page,
+ struct sgx_epc_page *secs_page)
+{
+ unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK;
+ struct sgx_encl *encl = encl_page->encl;
+ struct sgx_pageinfo pginfo;
+ struct sgx_backing b;
+ pgoff_t page_index;
+ int ret;
+
+ if (secs_page)
+ page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base);
+ else
+ page_index = PFN_DOWN(encl->size);
+
+ ret = sgx_encl_get_backing(encl, page_index, &b);
+ if (ret)
+ return ret;
+
+ pginfo.addr = encl_page->desc & PAGE_MASK;
+ pginfo.contents = (unsigned long)kmap_atomic(b.contents);
+ pginfo.metadata = (unsigned long)kmap_atomic(b.pcmd) +
+ b.pcmd_offset;
+
+ if (secs_page)
+ pginfo.secs = (u64)sgx_get_epc_virt_addr(secs_page);
+ else
+ pginfo.secs = 0;
+
+ ret = __eldu(&pginfo, sgx_get_epc_virt_addr(epc_page),
+ sgx_get_epc_virt_addr(encl_page->va_page->epc_page) + va_offset);
+ if (ret) {
+ if (encls_failed(ret))
+ ENCLS_WARN(ret, "ELDU");
+
+ ret = -EFAULT;
+ }
+
+ kunmap_atomic((void *)(unsigned long)(pginfo.metadata - b.pcmd_offset));
+ kunmap_atomic((void *)(unsigned long)pginfo.contents);
+
+ sgx_encl_put_backing(&b, false);
+
+ return ret;
+}
+
+static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page,
+ struct sgx_epc_page *secs_page)
+{
+ unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK;
+ struct sgx_encl *encl = encl_page->encl;
+ struct sgx_epc_page *epc_page;
+ int ret;
+
+ epc_page = sgx_alloc_epc_page(encl_page, false);
+ if (IS_ERR(epc_page))
+ return epc_page;
+
+ ret = __sgx_encl_eldu(encl_page, epc_page, secs_page);
+ if (ret) {
+ sgx_free_epc_page(epc_page);
+ return ERR_PTR(ret);
+ }
+
+ sgx_free_va_slot(encl_page->va_page, va_offset);
+ list_move(&encl_page->va_page->list, &encl->va_pages);
+ encl_page->desc &= ~SGX_ENCL_PAGE_VA_OFFSET_MASK;
+ encl_page->epc_page = epc_page;
+
+ return epc_page;
+}
+
+static struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl,
+ unsigned long addr,
+ unsigned long vm_flags)
+{
+ unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC);
+ struct sgx_epc_page *epc_page;
+ struct sgx_encl_page *entry;
+
+ entry = xa_load(&encl->page_array, PFN_DOWN(addr));
+ if (!entry)
+ return ERR_PTR(-EFAULT);
+
+ /*
+	 * Verify that the faulted page has equal or higher build time
+	 * permissions than the VMA permissions (i.e. the subset of {VM_READ,
+	 * VM_WRITE, VM_EXEC} in vma->vm_flags).
+ */
+ if ((entry->vm_max_prot_bits & vm_prot_bits) != vm_prot_bits)
+ return ERR_PTR(-EFAULT);
+
+ /* Entry successfully located. */
+ if (entry->epc_page) {
+ if (entry->desc & SGX_ENCL_PAGE_BEING_RECLAIMED)
+ return ERR_PTR(-EBUSY);
+
+ return entry;
+ }
+
+ if (!(encl->secs.epc_page)) {
+ epc_page = sgx_encl_eldu(&encl->secs, NULL);
+ if (IS_ERR(epc_page))
+ return ERR_CAST(epc_page);
+ }
+
+ epc_page = sgx_encl_eldu(entry, encl->secs.epc_page);
+ if (IS_ERR(epc_page))
+ return ERR_CAST(epc_page);
+
+ encl->secs_child_cnt++;
+ sgx_mark_page_reclaimable(entry->epc_page);
+
+ return entry;
+}
+
+static vm_fault_t sgx_vma_fault(struct vm_fault *vmf)
+{
+ unsigned long addr = (unsigned long)vmf->address;
+ struct vm_area_struct *vma = vmf->vma;
+ struct sgx_encl_page *entry;
+ unsigned long phys_addr;
+ struct sgx_encl *encl;
+ vm_fault_t ret;
+
+ encl = vma->vm_private_data;
+
+ /*
+ * It's very unlikely but possible that allocating memory for the
+ * mm_list entry of a forked process failed in sgx_vma_open(). When
+ * this happens, vm_private_data is set to NULL.
+ */
+ if (unlikely(!encl))
+ return VM_FAULT_SIGBUS;
+
+ mutex_lock(&encl->lock);
+
+ entry = sgx_encl_load_page(encl, addr, vma->vm_flags);
+ if (IS_ERR(entry)) {
+ mutex_unlock(&encl->lock);
+
+ if (PTR_ERR(entry) == -EBUSY)
+ return VM_FAULT_NOPAGE;
+
+ return VM_FAULT_SIGBUS;
+ }
+
+ phys_addr = sgx_get_epc_phys_addr(entry->epc_page);
+
+ ret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr));
+ if (ret != VM_FAULT_NOPAGE) {
+ mutex_unlock(&encl->lock);
+
+ return VM_FAULT_SIGBUS;
+ }
+
+ sgx_encl_test_and_clear_young(vma->vm_mm, entry);
+ mutex_unlock(&encl->lock);
+
+ return VM_FAULT_NOPAGE;
+}
+
+static void sgx_vma_open(struct vm_area_struct *vma)
+{
+ struct sgx_encl *encl = vma->vm_private_data;
+
+ /*
+ * It's possible but unlikely that vm_private_data is NULL. This can
+ * happen in a grandchild of a process, when sgx_encl_mm_add() had
+ * failed to allocate memory in this callback.
+ */
+ if (unlikely(!encl))
+ return;
+
+ if (sgx_encl_mm_add(encl, vma->vm_mm))
+ vma->vm_private_data = NULL;
+}
+
+/**
+ * sgx_encl_may_map() - Check if a requested VMA mapping is allowed
+ * @encl: an enclave pointer
+ * @start: lower bound of the address range, inclusive
+ * @end: upper bound of the address range, exclusive
+ * @vm_flags: VMA flags
+ *
+ * Iterate through the enclave pages contained within [@start, @end) to verify
+ * that the requested permissions (the subset of {VM_READ, VM_WRITE, VM_EXEC}
+ * in @vm_flags) do not exceed the build time permissions of any enclave page
+ * within the given address range.
+ *
+ * An enclave creator must declare the strongest permissions that will be
+ * needed for each enclave page. This ensures that mappings have permissions
+ * identical to or weaker than the declared build time permissions.
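+ *
+ * For example, if every page in the range was added with only SGX_SECINFO_R,
+ * a PROT_READ mapping is allowed while a mapping that also requests PROT_WRITE
+ * or PROT_EXEC is rejected with -EACCES.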
+ *
+ * Return: 0 on success, -EACCES otherwise
+ */
+int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start,
+ unsigned long end, unsigned long vm_flags)
+{
+ unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC);
+ struct sgx_encl_page *page;
+ unsigned long count = 0;
+ int ret = 0;
+
+ XA_STATE(xas, &encl->page_array, PFN_DOWN(start));
+
+ /*
+ * Disallow READ_IMPLIES_EXEC tasks as their VMA permissions might
+ * conflict with the enclave page permissions.
+ */
+ if (current->personality & READ_IMPLIES_EXEC)
+ return -EACCES;
+
+ mutex_lock(&encl->lock);
+ xas_lock(&xas);
+ xas_for_each(&xas, page, PFN_DOWN(end - 1)) {
+ if (~page->vm_max_prot_bits & vm_prot_bits) {
+ ret = -EACCES;
+ break;
+ }
+
+ /* Reschedule on every XA_CHECK_SCHED iteration. */
+ if (!(++count % XA_CHECK_SCHED)) {
+ xas_pause(&xas);
+ xas_unlock(&xas);
+ mutex_unlock(&encl->lock);
+
+ cond_resched();
+
+ mutex_lock(&encl->lock);
+ xas_lock(&xas);
+ }
+ }
+ xas_unlock(&xas);
+ mutex_unlock(&encl->lock);
+
+ return ret;
+}
+
+static int sgx_vma_mprotect(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, unsigned long newflags)
+{
+ return sgx_encl_may_map(vma->vm_private_data, start, end, newflags);
+}
+
+static int sgx_encl_debug_read(struct sgx_encl *encl, struct sgx_encl_page *page,
+ unsigned long addr, void *data)
+{
+ unsigned long offset = addr & ~PAGE_MASK;
+ int ret;
+
+ ret = __edbgrd(sgx_get_epc_virt_addr(page->epc_page) + offset, data);
+ if (ret)
+ return -EIO;
+
+ return 0;
+}
+
+static int sgx_encl_debug_write(struct sgx_encl *encl, struct sgx_encl_page *page,
+ unsigned long addr, void *data)
+{
+ unsigned long offset = addr & ~PAGE_MASK;
+ int ret;
+
+ ret = __edbgwr(sgx_get_epc_virt_addr(page->epc_page) + offset, data);
+ if (ret)
+ return -EIO;
+
+ return 0;
+}
+
+/*
+ * Load an enclave page to EPC if required, and take encl->lock.
+ */
+static struct sgx_encl_page *sgx_encl_reserve_page(struct sgx_encl *encl,
+ unsigned long addr,
+ unsigned long vm_flags)
+{
+ struct sgx_encl_page *entry;
+
+ for ( ; ; ) {
+ mutex_lock(&encl->lock);
+
+ entry = sgx_encl_load_page(encl, addr, vm_flags);
+ if (PTR_ERR(entry) != -EBUSY)
+ break;
+
+ mutex_unlock(&encl->lock);
+ }
+
+ if (IS_ERR(entry))
+ mutex_unlock(&encl->lock);
+
+ return entry;
+}
+
+static int sgx_vma_access(struct vm_area_struct *vma, unsigned long addr,
+ void *buf, int len, int write)
+{
+ struct sgx_encl *encl = vma->vm_private_data;
+ struct sgx_encl_page *entry = NULL;
+ char data[sizeof(unsigned long)];
+ unsigned long align;
+ int offset;
+ int cnt;
+ int ret = 0;
+ int i;
+
+ /*
+ * If process was forked, VMA is still there but vm_private_data is set
+ * to NULL.
+ */
+ if (!encl)
+ return -EFAULT;
+
+ if (!test_bit(SGX_ENCL_DEBUG, &encl->flags))
+ return -EFAULT;
+
+ for (i = 0; i < len; i += cnt) {
+ entry = sgx_encl_reserve_page(encl, (addr + i) & PAGE_MASK,
+ vma->vm_flags);
+ if (IS_ERR(entry)) {
+ ret = PTR_ERR(entry);
+ break;
+ }
+
+ align = ALIGN_DOWN(addr + i, sizeof(unsigned long));
+ offset = (addr + i) & (sizeof(unsigned long) - 1);
+ cnt = sizeof(unsigned long) - offset;
+ cnt = min(cnt, len - i);
+
+ ret = sgx_encl_debug_read(encl, entry, align, data);
+ if (ret)
+ goto out;
+
+ if (write) {
+ memcpy(data + offset, buf + i, cnt);
+ ret = sgx_encl_debug_write(encl, entry, align, data);
+ if (ret)
+ goto out;
+ } else {
+ memcpy(buf + i, data + offset, cnt);
+ }
+
+out:
+ mutex_unlock(&encl->lock);
+
+ if (ret)
+ break;
+ }
+
+ return ret < 0 ? ret : i;
+}
+
+const struct vm_operations_struct sgx_vm_ops = {
+ .fault = sgx_vma_fault,
+ .mprotect = sgx_vma_mprotect,
+ .open = sgx_vma_open,
+ .access = sgx_vma_access,
+};
+
+/**
+ * sgx_encl_release() - Destroy an enclave instance
+ * @ref:	address of the kref inside &sgx_encl
+ *
+ * Used together with kref_put(). Frees all the resources associated with the
+ * enclave and the instance itself.
+ */
+void sgx_encl_release(struct kref *ref)
+{
+ struct sgx_encl *encl = container_of(ref, struct sgx_encl, refcount);
+ struct sgx_va_page *va_page;
+ struct sgx_encl_page *entry;
+ unsigned long index;
+
+ xa_for_each(&encl->page_array, index, entry) {
+ if (entry->epc_page) {
+ /*
+ * The page and its radix tree entry cannot be freed
+ * if the page is being held by the reclaimer.
+ */
+ if (sgx_unmark_page_reclaimable(entry->epc_page))
+ continue;
+
+ sgx_free_epc_page(entry->epc_page);
+ encl->secs_child_cnt--;
+ entry->epc_page = NULL;
+ }
+
+ kfree(entry);
+ }
+
+ xa_destroy(&encl->page_array);
+
+ if (!encl->secs_child_cnt && encl->secs.epc_page) {
+ sgx_free_epc_page(encl->secs.epc_page);
+ encl->secs.epc_page = NULL;
+ }
+
+ while (!list_empty(&encl->va_pages)) {
+ va_page = list_first_entry(&encl->va_pages, struct sgx_va_page,
+ list);
+ list_del(&va_page->list);
+ sgx_free_epc_page(va_page->epc_page);
+ kfree(va_page);
+ }
+
+ if (encl->backing)
+ fput(encl->backing);
+
+ cleanup_srcu_struct(&encl->srcu);
+
+ WARN_ON_ONCE(!list_empty(&encl->mm_list));
+
+	/* Detect EPC page leaks. */
+ WARN_ON_ONCE(encl->secs_child_cnt);
+ WARN_ON_ONCE(encl->secs.epc_page);
+
+ kfree(encl);
+}
+
+/*
+ * 'mm' is exiting and no longer needs mmu notifications.
+ */
+static void sgx_mmu_notifier_release(struct mmu_notifier *mn,
+ struct mm_struct *mm)
+{
+ struct sgx_encl_mm *encl_mm = container_of(mn, struct sgx_encl_mm, mmu_notifier);
+ struct sgx_encl_mm *tmp = NULL;
+
+ /*
+ * The enclave itself can remove encl_mm. Note, objects can't be moved
+ * off an RCU protected list, but deletion is ok.
+ */
+ spin_lock(&encl_mm->encl->mm_lock);
+ list_for_each_entry(tmp, &encl_mm->encl->mm_list, list) {
+ if (tmp == encl_mm) {
+ list_del_rcu(&encl_mm->list);
+ break;
+ }
+ }
+ spin_unlock(&encl_mm->encl->mm_lock);
+
+ if (tmp == encl_mm) {
+ synchronize_srcu(&encl_mm->encl->srcu);
+ mmu_notifier_put(mn);
+ }
+}
+
+static void sgx_mmu_notifier_free(struct mmu_notifier *mn)
+{
+ struct sgx_encl_mm *encl_mm = container_of(mn, struct sgx_encl_mm, mmu_notifier);
+
+ /* 'encl_mm' is going away, put encl_mm->encl reference: */
+ kref_put(&encl_mm->encl->refcount, sgx_encl_release);
+
+ kfree(encl_mm);
+}
+
+static const struct mmu_notifier_ops sgx_mmu_notifier_ops = {
+ .release = sgx_mmu_notifier_release,
+ .free_notifier = sgx_mmu_notifier_free,
+};
+
+static struct sgx_encl_mm *sgx_encl_find_mm(struct sgx_encl *encl,
+ struct mm_struct *mm)
+{
+ struct sgx_encl_mm *encl_mm = NULL;
+ struct sgx_encl_mm *tmp;
+ int idx;
+
+ idx = srcu_read_lock(&encl->srcu);
+
+ list_for_each_entry_rcu(tmp, &encl->mm_list, list) {
+ if (tmp->mm == mm) {
+ encl_mm = tmp;
+ break;
+ }
+ }
+
+ srcu_read_unlock(&encl->srcu, idx);
+
+ return encl_mm;
+}
+
+int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm)
+{
+ struct sgx_encl_mm *encl_mm;
+ int ret;
+
+ /*
+ * Even though a single enclave may be mapped into an mm more than once,
+ * each 'mm' only appears once on encl->mm_list. This is guaranteed by
+	 * holding the mm's mmap lock for write before an mm can be added to or
+	 * removed from encl->mm_list.
+ */
+ mmap_assert_write_locked(mm);
+
+ /*
+ * It's possible that an entry already exists in the mm_list, because it
+ * is removed only on VFS release or process exit.
+ */
+ if (sgx_encl_find_mm(encl, mm))
+ return 0;
+
+ encl_mm = kzalloc(sizeof(*encl_mm), GFP_KERNEL);
+ if (!encl_mm)
+ return -ENOMEM;
+
+ /* Grab a refcount for the encl_mm->encl reference: */
+ kref_get(&encl->refcount);
+ encl_mm->encl = encl;
+ encl_mm->mm = mm;
+ encl_mm->mmu_notifier.ops = &sgx_mmu_notifier_ops;
+
+ ret = __mmu_notifier_register(&encl_mm->mmu_notifier, mm);
+ if (ret) {
+ kfree(encl_mm);
+ return ret;
+ }
+
+ spin_lock(&encl->mm_lock);
+ list_add_rcu(&encl_mm->list, &encl->mm_list);
+ /* Pairs with smp_rmb() in sgx_reclaimer_block(). */
+ smp_wmb();
+ encl->mm_list_version++;
+ spin_unlock(&encl->mm_lock);
+
+ return 0;
+}
+
+static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl,
+ pgoff_t index)
+{
+ struct inode *inode = encl->backing->f_path.dentry->d_inode;
+ struct address_space *mapping = inode->i_mapping;
+ gfp_t gfpmask = mapping_gfp_mask(mapping);
+
+ return shmem_read_mapping_page_gfp(mapping, index, gfpmask);
+}
+
+/**
+ * sgx_encl_get_backing() - Pin the backing storage
+ * @encl: an enclave pointer
+ * @page_index: enclave page index
+ * @backing: data for accessing backing storage for the page
+ *
+ * Pin the backing storage pages for storing the encrypted contents and Paging
+ * Crypto MetaData (PCMD) of an enclave page.
+ *
+ * Return:
+ * 0 on success,
+ * -errno otherwise.
+ */
+int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
+ struct sgx_backing *backing)
+{
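+	/*
+	 * The backing shmem file stores the encrypted page contents first
+	 * (encl->size bytes plus one page for the SECS), followed by the PCMD
+	 * area. A PCMD entry is 128 bytes, so 32 entries share a page, hence
+	 * the "page_index >> 5" below.
+	 */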
+ pgoff_t pcmd_index = PFN_DOWN(encl->size) + 1 + (page_index >> 5);
+ struct page *contents;
+ struct page *pcmd;
+
+ contents = sgx_encl_get_backing_page(encl, page_index);
+ if (IS_ERR(contents))
+ return PTR_ERR(contents);
+
+ pcmd = sgx_encl_get_backing_page(encl, pcmd_index);
+ if (IS_ERR(pcmd)) {
+ put_page(contents);
+ return PTR_ERR(pcmd);
+ }
+
+ backing->page_index = page_index;
+ backing->contents = contents;
+ backing->pcmd = pcmd;
+ backing->pcmd_offset =
+ (page_index & (PAGE_SIZE / sizeof(struct sgx_pcmd) - 1)) *
+ sizeof(struct sgx_pcmd);
+
+ return 0;
+}
+
+/**
+ * sgx_encl_put_backing() - Unpin the backing storage
+ * @backing: data for accessing backing storage for the page
+ * @do_write: mark pages dirty
+ */
+void sgx_encl_put_backing(struct sgx_backing *backing, bool do_write)
+{
+ if (do_write) {
+ set_page_dirty(backing->pcmd);
+ set_page_dirty(backing->contents);
+ }
+
+ put_page(backing->pcmd);
+ put_page(backing->contents);
+}
+
+static int sgx_encl_test_and_clear_young_cb(pte_t *ptep, unsigned long addr,
+ void *data)
+{
+ pte_t pte;
+ int ret;
+
+ ret = pte_young(*ptep);
+ if (ret) {
+ pte = pte_mkold(*ptep);
+ set_pte_at((struct mm_struct *)data, addr, ptep, pte);
+ }
+
+ return ret;
+}
+
+/**
+ * sgx_encl_test_and_clear_young() - Test and reset the accessed bit
+ * @mm: mm_struct that is checked
+ * @page: enclave page to be tested for recent access
+ *
+ * Checks the Access (A) bit from the PTE corresponding to the enclave page and
+ * clears it.
+ *
+ * Return: 1 if the page has been recently accessed and 0 if not.
+ */
+int sgx_encl_test_and_clear_young(struct mm_struct *mm,
+ struct sgx_encl_page *page)
+{
+ unsigned long addr = page->desc & PAGE_MASK;
+ struct sgx_encl *encl = page->encl;
+ struct vm_area_struct *vma;
+ int ret;
+
+ ret = sgx_encl_find(mm, addr, &vma);
+ if (ret)
+ return 0;
+
+ if (encl != vma->vm_private_data)
+ return 0;
+
+ ret = apply_to_page_range(vma->vm_mm, addr, PAGE_SIZE,
+ sgx_encl_test_and_clear_young_cb, vma->vm_mm);
+ if (ret < 0)
+ return 0;
+
+ return ret;
+}
+
+/**
+ * sgx_alloc_va_page() - Allocate a Version Array (VA) page
+ *
+ * Allocate a free EPC page and convert it to a Version Array (VA) page.
+ *
+ * Return:
+ * a VA page,
+ * -errno otherwise
+ */
+struct sgx_epc_page *sgx_alloc_va_page(void)
+{
+ struct sgx_epc_page *epc_page;
+ int ret;
+
+ epc_page = sgx_alloc_epc_page(NULL, true);
+ if (IS_ERR(epc_page))
+ return ERR_CAST(epc_page);
+
+ ret = __epa(sgx_get_epc_virt_addr(epc_page));
+ if (ret) {
+ WARN_ONCE(1, "EPA returned %d (0x%x)", ret, ret);
+ sgx_free_epc_page(epc_page);
+ return ERR_PTR(-EFAULT);
+ }
+
+ return epc_page;
+}
+
+/**
+ * sgx_alloc_va_slot - allocate a VA slot
+ * @va_page: a &struct sgx_va_page instance
+ *
+ * Allocates a slot from a &struct sgx_va_page instance.
+ *
+ * Return: offset of the slot inside the VA page
+ */
+unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page)
+{
+ int slot = find_first_zero_bit(va_page->slots, SGX_VA_SLOT_COUNT);
+
+ if (slot < SGX_VA_SLOT_COUNT)
+ set_bit(slot, va_page->slots);
+
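+	/* Each VA slot is 8 bytes, so convert the slot index to a byte offset. */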
+ return slot << 3;
+}
+
+/**
+ * sgx_free_va_slot - free a VA slot
+ * @va_page: a &struct sgx_va_page instance
+ * @offset: offset of the slot inside the VA page
+ *
+ * Frees a slot from a &struct sgx_va_page instance.
+ */
+void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset)
+{
+ clear_bit(offset >> 3, va_page->slots);
+}
+
+/**
+ * sgx_va_page_full - is the VA page full?
+ * @va_page: a &struct sgx_va_page instance
+ *
+ * Return: true if all slots have been taken
+ */
+bool sgx_va_page_full(struct sgx_va_page *va_page)
+{
+ int slot = find_first_zero_bit(va_page->slots, SGX_VA_SLOT_COUNT);
+
+ return slot == SGX_VA_SLOT_COUNT;
+}
diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h
new file mode 100644
index 000000000000..d8d30ccbef4c
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/encl.h
@@ -0,0 +1,119 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/**
+ * Copyright(c) 2016-20 Intel Corporation.
+ *
+ * Contains the software defined data structures for enclaves.
+ */
+#ifndef _X86_ENCL_H
+#define _X86_ENCL_H
+
+#include <linux/cpumask.h>
+#include <linux/kref.h>
+#include <linux/list.h>
+#include <linux/mm_types.h>
+#include <linux/mmu_notifier.h>
+#include <linux/mutex.h>
+#include <linux/notifier.h>
+#include <linux/srcu.h>
+#include <linux/workqueue.h>
+#include <linux/xarray.h>
+#include "sgx.h"
+
+/* 'desc' bits holding the offset in the VA (version array) page. */
+#define SGX_ENCL_PAGE_VA_OFFSET_MASK GENMASK_ULL(11, 3)
+
+/* 'desc' bit marking that the page is being reclaimed. */
+#define SGX_ENCL_PAGE_BEING_RECLAIMED BIT(3)
+
+struct sgx_encl_page {
+ unsigned long desc;
+ unsigned long vm_max_prot_bits;
+ struct sgx_epc_page *epc_page;
+ struct sgx_encl *encl;
+ struct sgx_va_page *va_page;
+};
+
+enum sgx_encl_flags {
+ SGX_ENCL_IOCTL = BIT(0),
+ SGX_ENCL_DEBUG = BIT(1),
+ SGX_ENCL_CREATED = BIT(2),
+ SGX_ENCL_INITIALIZED = BIT(3),
+};
+
+struct sgx_encl_mm {
+ struct sgx_encl *encl;
+ struct mm_struct *mm;
+ struct list_head list;
+ struct mmu_notifier mmu_notifier;
+};
+
+struct sgx_encl {
+ unsigned long base;
+ unsigned long size;
+ unsigned long flags;
+ unsigned int page_cnt;
+ unsigned int secs_child_cnt;
+ struct mutex lock;
+ struct xarray page_array;
+ struct sgx_encl_page secs;
+ unsigned long attributes;
+ unsigned long attributes_mask;
+
+ cpumask_t cpumask;
+ struct file *backing;
+ struct kref refcount;
+ struct list_head va_pages;
+ unsigned long mm_list_version;
+ struct list_head mm_list;
+ spinlock_t mm_lock;
+ struct srcu_struct srcu;
+};
+
+#define SGX_VA_SLOT_COUNT 512
+
+struct sgx_va_page {
+ struct sgx_epc_page *epc_page;
+ DECLARE_BITMAP(slots, SGX_VA_SLOT_COUNT);
+ struct list_head list;
+};
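+
+/*
+ * A VA page holds SGX_VA_SLOT_COUNT (512) 8-byte version slots, i.e. one full
+ * 4K EPC page of entries, with one slot consumed per swapped-out page.
+ */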
+
+struct sgx_backing {
+ pgoff_t page_index;
+ struct page *contents;
+ struct page *pcmd;
+ unsigned long pcmd_offset;
+};
+
+extern const struct vm_operations_struct sgx_vm_ops;
+
+static inline int sgx_encl_find(struct mm_struct *mm, unsigned long addr,
+ struct vm_area_struct **vma)
+{
+ struct vm_area_struct *result;
+
+ result = find_vma(mm, addr);
+ if (!result || result->vm_ops != &sgx_vm_ops || addr < result->vm_start)
+ return -EINVAL;
+
+ *vma = result;
+
+ return 0;
+}
+
+int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start,
+ unsigned long end, unsigned long vm_flags);
+
+void sgx_encl_release(struct kref *ref);
+int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm);
+int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
+ struct sgx_backing *backing);
+void sgx_encl_put_backing(struct sgx_backing *backing, bool do_write);
+int sgx_encl_test_and_clear_young(struct mm_struct *mm,
+ struct sgx_encl_page *page);
+
+struct sgx_epc_page *sgx_alloc_va_page(void);
+unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page);
+void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset);
+bool sgx_va_page_full(struct sgx_va_page *va_page);
+
+#endif /* _X86_ENCL_H */
diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h
new file mode 100644
index 000000000000..443188fe7e70
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/encls.h
@@ -0,0 +1,231 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _X86_ENCLS_H
+#define _X86_ENCLS_H
+
+#include <linux/bitops.h>
+#include <linux/err.h>
+#include <linux/io.h>
+#include <linux/rwsem.h>
+#include <linux/types.h>
+#include <asm/asm.h>
+#include <asm/traps.h>
+#include "sgx.h"
+
+enum sgx_encls_function {
+ ECREATE = 0x00,
+ EADD = 0x01,
+ EINIT = 0x02,
+ EREMOVE = 0x03,
+ EDGBRD = 0x04,
+ EDGBWR = 0x05,
+ EEXTEND = 0x06,
+ ELDU = 0x08,
+ EBLOCK = 0x09,
+ EPA = 0x0A,
+ EWB = 0x0B,
+ ETRACK = 0x0C,
+};
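+
+/*
+ * The function number above is loaded into EAX; the raw "0x0f, 0x01, 0xcf"
+ * byte sequence emitted by the macros below is the encoding of the ENCLS
+ * instruction itself.
+ */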
+
+/**
+ * ENCLS_FAULT_FLAG - flag signifying an ENCLS return code is a trapnr
+ *
+ * ENCLS has its own (positive value) error codes and also generates
+ * ENCLS specific #GP and #PF faults. And the ENCLS values get munged
+ * with system error codes as everything percolates back up the stack.
+ * Unfortunately (for us), we need to precisely identify each unique
+ * error code, e.g. the action taken if EWB fails varies based on the
+ * type of fault and on the exact SGX error code, i.e. we can't simply
+ * convert all faults to -EFAULT.
+ *
+ * To make all three error types coexist, we set bit 30 to identify an
+ * ENCLS fault. Bit 31 (technically bits N:31) is used to differentiate
+ * between positive (faults and SGX error codes) and negative (system
+ * error codes) values.
+ */
+#define ENCLS_FAULT_FLAG 0x40000000
+
+/* Retrieve the encoded trapnr from the specified return code. */
+#define ENCLS_TRAPNR(r) ((r) & ~ENCLS_FAULT_FLAG)
+
+/* Issue a WARN() about an ENCLS function. */
+#define ENCLS_WARN(r, name) { \
+ do { \
+ int _r = (r); \
+ WARN_ONCE(_r, "%s returned %d (0x%x)\n", (name), _r, _r); \
+ } while (0); \
+}
+
+/**
+ * encls_failed() - Check if an ENCLS function failed
+ * @ret: the return value of an ENCLS function call
+ *
+ * Check if an ENCLS function failed. This happens when the function causes a
+ * fault that is not caused by an EPCM conflict or when the function returns a
+ * non-zero value.
+ */
+static inline bool encls_failed(int ret)
+{
+ if (ret & ENCLS_FAULT_FLAG)
+ return ENCLS_TRAPNR(ret) != X86_TRAP_PF;
+
+ return !!ret;
+}
+
+/**
+ * __encls_ret_N - encode an ENCLS function that returns an error code in EAX
+ * @rax: function number
+ * @inputs: asm inputs for the function
+ *
+ * Emit assembly for an ENCLS function that returns an error code, e.g. EREMOVE.
+ * And because SGX isn't complex enough as it is, functions that return an
+ * error code also modify flags.
+ *
+ * Return:
+ * 0 on success,
+ * SGX error code on failure
+ */
+#define __encls_ret_N(rax, inputs...) \
+ ({ \
+ int ret; \
+ asm volatile( \
+ "1: .byte 0x0f, 0x01, 0xcf;\n\t" \
+ "2:\n" \
+ ".section .fixup,\"ax\"\n" \
+ "3: orl $"__stringify(ENCLS_FAULT_FLAG)",%%eax\n" \
+ " jmp 2b\n" \
+ ".previous\n" \
+ _ASM_EXTABLE_FAULT(1b, 3b) \
+ : "=a"(ret) \
+ : "a"(rax), inputs \
+ : "memory", "cc"); \
+ ret; \
+ })
+
+#define __encls_ret_1(rax, rcx) \
+ ({ \
+ __encls_ret_N(rax, "c"(rcx)); \
+ })
+
+#define __encls_ret_2(rax, rbx, rcx) \
+ ({ \
+ __encls_ret_N(rax, "b"(rbx), "c"(rcx)); \
+ })
+
+#define __encls_ret_3(rax, rbx, rcx, rdx) \
+ ({ \
+ __encls_ret_N(rax, "b"(rbx), "c"(rcx), "d"(rdx)); \
+ })
+
+/**
+ * __encls_N - encode an ENCLS function that doesn't return an error code
+ * @rax: function number
+ * @rbx_out: optional output variable
+ * @inputs: asm inputs for the function
+ *
+ * Emit assembly for an ENCLS function that does not return an error code, e.g.
+ * ECREATE. Leaves without error codes either succeed or fault. @rbx_out is an
+ * optional parameter for use by EDGBRD, which returns the requested value in
+ * RBX.
+ *
+ * Return:
+ * 0 on success,
+ * trapnr with ENCLS_FAULT_FLAG set on fault
+ */
+#define __encls_N(rax, rbx_out, inputs...) \
+ ({ \
+ int ret; \
+ asm volatile( \
+ "1: .byte 0x0f, 0x01, 0xcf;\n\t" \
+ " xor %%eax,%%eax;\n" \
+ "2:\n" \
+ ".section .fixup,\"ax\"\n" \
+ "3: orl $"__stringify(ENCLS_FAULT_FLAG)",%%eax\n" \
+ " jmp 2b\n" \
+ ".previous\n" \
+ _ASM_EXTABLE_FAULT(1b, 3b) \
+ : "=a"(ret), "=b"(rbx_out) \
+ : "a"(rax), inputs \
+ : "memory"); \
+ ret; \
+ })
+
+#define __encls_2(rax, rbx, rcx) \
+ ({ \
+ unsigned long ign_rbx_out; \
+ __encls_N(rax, ign_rbx_out, "b"(rbx), "c"(rcx)); \
+ })
+
+#define __encls_1_1(rax, data, rcx) \
+ ({ \
+ unsigned long rbx_out; \
+ int ret = __encls_N(rax, rbx_out, "c"(rcx)); \
+ if (!ret) \
+ data = rbx_out; \
+ ret; \
+ })
+
+static inline int __ecreate(struct sgx_pageinfo *pginfo, void *secs)
+{
+ return __encls_2(ECREATE, pginfo, secs);
+}
+
+static inline int __eextend(void *secs, void *addr)
+{
+ return __encls_2(EEXTEND, secs, addr);
+}
+
+static inline int __eadd(struct sgx_pageinfo *pginfo, void *addr)
+{
+ return __encls_2(EADD, pginfo, addr);
+}
+
+static inline int __einit(void *sigstruct, void *token, void *secs)
+{
+ return __encls_ret_3(EINIT, sigstruct, secs, token);
+}
+
+static inline int __eremove(void *addr)
+{
+ return __encls_ret_1(EREMOVE, addr);
+}
+
+static inline int __edbgwr(void *addr, unsigned long *data)
+{
+ return __encls_2(EDGBWR, *data, addr);
+}
+
+static inline int __edbgrd(void *addr, unsigned long *data)
+{
+ return __encls_1_1(EDGBRD, *data, addr);
+}
+
+static inline int __etrack(void *addr)
+{
+ return __encls_ret_1(ETRACK, addr);
+}
+
+static inline int __eldu(struct sgx_pageinfo *pginfo, void *addr,
+ void *va)
+{
+ return __encls_ret_3(ELDU, pginfo, addr, va);
+}
+
+static inline int __eblock(void *addr)
+{
+ return __encls_ret_1(EBLOCK, addr);
+}
+
+static inline int __epa(void *addr)
+{
+ unsigned long rbx = SGX_PAGE_TYPE_VA;
+
+ return __encls_2(EPA, rbx, addr);
+}
+
+static inline int __ewb(struct sgx_pageinfo *pginfo, void *addr,
+ void *va)
+{
+ return __encls_ret_3(EWB, pginfo, addr, va);
+}
+
+#endif /* _X86_ENCLS_H */
diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c
new file mode 100644
index 000000000000..90a5caf76939
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/ioctl.c
@@ -0,0 +1,716 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2016-20 Intel Corporation. */
+
+#include <asm/mman.h>
+#include <linux/mman.h>
+#include <linux/delay.h>
+#include <linux/file.h>
+#include <linux/hashtable.h>
+#include <linux/highmem.h>
+#include <linux/ratelimit.h>
+#include <linux/sched/signal.h>
+#include <linux/shmem_fs.h>
+#include <linux/slab.h>
+#include <linux/suspend.h>
+#include "driver.h"
+#include "encl.h"
+#include "encls.h"
+
+static struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl)
+{
+ struct sgx_va_page *va_page = NULL;
+ void *err;
+
+ BUILD_BUG_ON(SGX_VA_SLOT_COUNT !=
+ (SGX_ENCL_PAGE_VA_OFFSET_MASK >> 3) + 1);
+
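+	/* A new VA page is needed once every SGX_VA_SLOT_COUNT (512) enclave pages. */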
+ if (!(encl->page_cnt % SGX_VA_SLOT_COUNT)) {
+ va_page = kzalloc(sizeof(*va_page), GFP_KERNEL);
+ if (!va_page)
+ return ERR_PTR(-ENOMEM);
+
+ va_page->epc_page = sgx_alloc_va_page();
+ if (IS_ERR(va_page->epc_page)) {
+ err = ERR_CAST(va_page->epc_page);
+ kfree(va_page);
+ return err;
+ }
+
+ WARN_ON_ONCE(encl->page_cnt % SGX_VA_SLOT_COUNT);
+ }
+ encl->page_cnt++;
+ return va_page;
+}
+
+static void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page)
+{
+ encl->page_cnt--;
+
+ if (va_page) {
+ sgx_free_epc_page(va_page->epc_page);
+ list_del(&va_page->list);
+ kfree(va_page);
+ }
+}
+
+static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs)
+{
+ struct sgx_epc_page *secs_epc;
+ struct sgx_va_page *va_page;
+ struct sgx_pageinfo pginfo;
+ struct sgx_secinfo secinfo;
+ unsigned long encl_size;
+ struct file *backing;
+ long ret;
+
+ va_page = sgx_encl_grow(encl);
+ if (IS_ERR(va_page))
+ return PTR_ERR(va_page);
+ else if (va_page)
+ list_add(&va_page->list, &encl->va_pages);
+ /* else the tail page of the VA page list had free slots. */
+
+ /* The extra page goes to SECS. */
+ encl_size = secs->size + PAGE_SIZE;
+
+ backing = shmem_file_setup("SGX backing", encl_size + (encl_size >> 5),
+ VM_NORESERVE);
+ if (IS_ERR(backing)) {
+ ret = PTR_ERR(backing);
+ goto err_out_shrink;
+ }
+
+ encl->backing = backing;
+
+ secs_epc = sgx_alloc_epc_page(&encl->secs, true);
+ if (IS_ERR(secs_epc)) {
+ ret = PTR_ERR(secs_epc);
+ goto err_out_backing;
+ }
+
+ encl->secs.epc_page = secs_epc;
+
+ pginfo.addr = 0;
+ pginfo.contents = (unsigned long)secs;
+ pginfo.metadata = (unsigned long)&secinfo;
+ pginfo.secs = 0;
+ memset(&secinfo, 0, sizeof(secinfo));
+
+ ret = __ecreate((void *)&pginfo, sgx_get_epc_virt_addr(secs_epc));
+ if (ret) {
+ ret = -EIO;
+ goto err_out;
+ }
+
+ if (secs->attributes & SGX_ATTR_DEBUG)
+ set_bit(SGX_ENCL_DEBUG, &encl->flags);
+
+ encl->secs.encl = encl;
+ encl->base = secs->base;
+ encl->size = secs->size;
+ encl->attributes = secs->attributes;
+ encl->attributes_mask = SGX_ATTR_DEBUG | SGX_ATTR_MODE64BIT | SGX_ATTR_KSS;
+
+ /* Set only after completion, as encl->lock has not been taken. */
+ set_bit(SGX_ENCL_CREATED, &encl->flags);
+
+ return 0;
+
+err_out:
+ sgx_free_epc_page(encl->secs.epc_page);
+ encl->secs.epc_page = NULL;
+
+err_out_backing:
+ fput(encl->backing);
+ encl->backing = NULL;
+
+err_out_shrink:
+ sgx_encl_shrink(encl, va_page);
+
+ return ret;
+}
+
+/**
+ * sgx_ioc_enclave_create() - handler for %SGX_IOC_ENCLAVE_CREATE
+ * @encl: An enclave pointer.
+ * @arg: The ioctl argument.
+ *
+ * Allocate kernel data structures for the enclave and invoke ECREATE.
+ *
+ * Return:
+ * - 0: Success.
+ * - -EIO: ECREATE failed.
+ * - -errno: POSIX error.
+ */
+static long sgx_ioc_enclave_create(struct sgx_encl *encl, void __user *arg)
+{
+ struct sgx_enclave_create create_arg;
+ void *secs;
+ int ret;
+
+ if (test_bit(SGX_ENCL_CREATED, &encl->flags))
+ return -EINVAL;
+
+ if (copy_from_user(&create_arg, arg, sizeof(create_arg)))
+ return -EFAULT;
+
+ secs = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!secs)
+ return -ENOMEM;
+
+ if (copy_from_user(secs, (void __user *)create_arg.src, PAGE_SIZE))
+ ret = -EFAULT;
+ else
+ ret = sgx_encl_create(encl, secs);
+
+ kfree(secs);
+ return ret;
+}
+
+static struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl,
+ unsigned long offset,
+ u64 secinfo_flags)
+{
+ struct sgx_encl_page *encl_page;
+ unsigned long prot;
+
+ encl_page = kzalloc(sizeof(*encl_page), GFP_KERNEL);
+ if (!encl_page)
+ return ERR_PTR(-ENOMEM);
+
+ encl_page->desc = encl->base + offset;
+ encl_page->encl = encl;
+
+ prot = _calc_vm_trans(secinfo_flags, SGX_SECINFO_R, PROT_READ) |
+ _calc_vm_trans(secinfo_flags, SGX_SECINFO_W, PROT_WRITE) |
+ _calc_vm_trans(secinfo_flags, SGX_SECINFO_X, PROT_EXEC);
+
+ /*
+	 * TCS pages must always have RW set for CPU access, while the SECINFO
+	 * permissions are *always* zero - the CPU ignores the user-provided
+	 * values and silently overwrites them with zero permissions.
+ */
+ if ((secinfo_flags & SGX_SECINFO_PAGE_TYPE_MASK) == SGX_SECINFO_TCS)
+ prot |= PROT_READ | PROT_WRITE;
+
+ /* Calculate maximum of the VM flags for the page. */
+ encl_page->vm_max_prot_bits = calc_vm_prot_bits(prot, 0);
+
+ return encl_page;
+}
+
+static int sgx_validate_secinfo(struct sgx_secinfo *secinfo)
+{
+ u64 perm = secinfo->flags & SGX_SECINFO_PERMISSION_MASK;
+ u64 pt = secinfo->flags & SGX_SECINFO_PAGE_TYPE_MASK;
+
+ if (pt != SGX_SECINFO_REG && pt != SGX_SECINFO_TCS)
+ return -EINVAL;
+
+ if ((perm & SGX_SECINFO_W) && !(perm & SGX_SECINFO_R))
+ return -EINVAL;
+
+ /*
+	 * The CPU silently overwrites TCS permissions with zero, which means
+	 * that we need to validate them ourselves.
+ */
+ if (pt == SGX_SECINFO_TCS && perm)
+ return -EINVAL;
+
+ if (secinfo->flags & SGX_SECINFO_RESERVED_MASK)
+ return -EINVAL;
+
+ if (memchr_inv(secinfo->reserved, 0, sizeof(secinfo->reserved)))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int __sgx_encl_add_page(struct sgx_encl *encl,
+ struct sgx_encl_page *encl_page,
+ struct sgx_epc_page *epc_page,
+ struct sgx_secinfo *secinfo, unsigned long src)
+{
+ struct sgx_pageinfo pginfo;
+ struct vm_area_struct *vma;
+ struct page *src_page;
+ int ret;
+
+ /* Deny noexec. */
+ vma = find_vma(current->mm, src);
+ if (!vma)
+ return -EFAULT;
+
+ if (!(vma->vm_flags & VM_MAYEXEC))
+ return -EACCES;
+
+ ret = get_user_pages(src, 1, 0, &src_page, NULL);
+ if (ret < 1)
+ return -EFAULT;
+
+ pginfo.secs = (unsigned long)sgx_get_epc_virt_addr(encl->secs.epc_page);
+ pginfo.addr = encl_page->desc & PAGE_MASK;
+ pginfo.metadata = (unsigned long)secinfo;
+ pginfo.contents = (unsigned long)kmap_atomic(src_page);
+
+ ret = __eadd(&pginfo, sgx_get_epc_virt_addr(epc_page));
+
+ kunmap_atomic((void *)pginfo.contents);
+ put_page(src_page);
+
+ return ret ? -EIO : 0;
+}
+
+/*
+ * If the caller requires measurement of the page as a proof for the content,
+ * use EEXTEND to add a measurement for 256 bytes of the page. Repeat this
+ * operation until the entire page is measured.
+ */
+static int __sgx_encl_extend(struct sgx_encl *encl,
+ struct sgx_epc_page *epc_page)
+{
+ unsigned long offset;
+ int ret;
+
+ for (offset = 0; offset < PAGE_SIZE; offset += SGX_EEXTEND_BLOCK_SIZE) {
+ ret = __eextend(sgx_get_epc_virt_addr(encl->secs.epc_page),
+ sgx_get_epc_virt_addr(epc_page) + offset);
+ if (ret) {
+ if (encls_failed(ret))
+ ENCLS_WARN(ret, "EEXTEND");
+
+ return -EIO;
+ }
+ }
+
+ return 0;
+}
+
+static int sgx_encl_add_page(struct sgx_encl *encl, unsigned long src,
+ unsigned long offset, struct sgx_secinfo *secinfo,
+ unsigned long flags)
+{
+ struct sgx_encl_page *encl_page;
+ struct sgx_epc_page *epc_page;
+ struct sgx_va_page *va_page;
+ int ret;
+
+ encl_page = sgx_encl_page_alloc(encl, offset, secinfo->flags);
+ if (IS_ERR(encl_page))
+ return PTR_ERR(encl_page);
+
+ epc_page = sgx_alloc_epc_page(encl_page, true);
+ if (IS_ERR(epc_page)) {
+ kfree(encl_page);
+ return PTR_ERR(epc_page);
+ }
+
+ va_page = sgx_encl_grow(encl);
+ if (IS_ERR(va_page)) {
+ ret = PTR_ERR(va_page);
+ goto err_out_free;
+ }
+
+ mmap_read_lock(current->mm);
+ mutex_lock(&encl->lock);
+
+ /*
+ * Adding to encl->va_pages must be done under encl->lock. Ditto for
+ * deleting (via sgx_encl_shrink()) in the error path.
+ */
+ if (va_page)
+ list_add(&va_page->list, &encl->va_pages);
+
+ /*
+ * Insert prior to EADD in case of OOM. EADD modifies MRENCLAVE, i.e.
+ * can't be gracefully unwound, while failure on EADD/EXTEND is limited
+ * to userspace errors (or kernel/hardware bugs).
+ */
+ ret = xa_insert(&encl->page_array, PFN_DOWN(encl_page->desc),
+ encl_page, GFP_KERNEL);
+ if (ret)
+ goto err_out_unlock;
+
+ ret = __sgx_encl_add_page(encl, encl_page, epc_page, secinfo,
+ src);
+ if (ret)
+ goto err_out;
+
+ /*
+ * Complete the "add" before doing the "extend" so that the "add"
+ * isn't in a half-baked state in the extremely unlikely scenario
+ * the enclave will be destroyed in response to EEXTEND failure.
+ */
+ encl_page->encl = encl;
+ encl_page->epc_page = epc_page;
+ encl->secs_child_cnt++;
+
+ if (flags & SGX_PAGE_MEASURE) {
+ ret = __sgx_encl_extend(encl, epc_page);
+ if (ret)
+ goto err_out;
+ }
+
+ sgx_mark_page_reclaimable(encl_page->epc_page);
+ mutex_unlock(&encl->lock);
+ mmap_read_unlock(current->mm);
+ return ret;
+
+err_out:
+ xa_erase(&encl->page_array, PFN_DOWN(encl_page->desc));
+
+err_out_unlock:
+ sgx_encl_shrink(encl, va_page);
+ mutex_unlock(&encl->lock);
+ mmap_read_unlock(current->mm);
+
+err_out_free:
+ sgx_free_epc_page(epc_page);
+ kfree(encl_page);
+
+ return ret;
+}
+
+/**
+ * sgx_ioc_enclave_add_pages() - The handler for %SGX_IOC_ENCLAVE_ADD_PAGES
+ * @encl: an enclave pointer
+ * @arg: a user pointer to a struct sgx_enclave_add_pages instance
+ *
+ * Add one or more pages to an uninitialized enclave, and optionally extend the
+ * measurement with the contents of the page. The SECINFO and measurement mask
+ * are applied to all pages.
+ *
+ * A SECINFO for a TCS is required to always contain zero permissions because
+ * the CPU silently zeros them. Allowing anything else would cause a mismatch
+ * in the measurement.
+ *
+ * mmap()'s protection bits are capped by the page permissions. For each page
+ * address, the maximum protection bits are computed with the following
+ * heuristics:
+ *
+ * 1. A regular page: PROT_R, PROT_W and PROT_X match the SECINFO permissions.
+ * 2. A TCS page: PROT_R | PROT_W.
+ *
+ * mmap() is not allowed to surpass the minimum of the maximum protection bits
+ * within the given address range.
+ *
+ * The function deinitializes kernel data structures for the enclave and returns
+ * -EIO in any of the following conditions:
+ *
+ * - Enclave Page Cache (EPC), the physical memory holding enclaves, has
+ * been invalidated. This will cause EADD and EEXTEND to fail.
+ * - If the source address is corrupted somehow when executing EADD.
+ *
+ * Return:
+ * - 0: Success.
+ * - -EACCES: The source page is located in a noexec partition.
+ * - -ENOMEM: Out of EPC pages.
+ * - -EINTR: The call was interrupted before data was processed.
+ * - -EIO:	Either EADD or EEXTEND failed because of an invalid source
+ *		address or a power cycle.
+ * - -errno: POSIX error.
+ */
+static long sgx_ioc_enclave_add_pages(struct sgx_encl *encl, void __user *arg)
+{
+ struct sgx_enclave_add_pages add_arg;
+ struct sgx_secinfo secinfo;
+ unsigned long c;
+ int ret;
+
+ if (!test_bit(SGX_ENCL_CREATED, &encl->flags) ||
+ test_bit(SGX_ENCL_INITIALIZED, &encl->flags))
+ return -EINVAL;
+
+ if (copy_from_user(&add_arg, arg, sizeof(add_arg)))
+ return -EFAULT;
+
+ if (!IS_ALIGNED(add_arg.offset, PAGE_SIZE) ||
+ !IS_ALIGNED(add_arg.src, PAGE_SIZE))
+ return -EINVAL;
+
+ if (!add_arg.length || add_arg.length & (PAGE_SIZE - 1))
+ return -EINVAL;
+
+ if (add_arg.offset + add_arg.length - PAGE_SIZE >= encl->size)
+ return -EINVAL;
+
+ if (copy_from_user(&secinfo, (void __user *)add_arg.secinfo,
+ sizeof(secinfo)))
+ return -EFAULT;
+
+ if (sgx_validate_secinfo(&secinfo))
+ return -EINVAL;
+
+ for (c = 0 ; c < add_arg.length; c += PAGE_SIZE) {
+ if (signal_pending(current)) {
+ if (!c)
+ ret = -ERESTARTSYS;
+
+ break;
+ }
+
+ if (need_resched())
+ cond_resched();
+
+ ret = sgx_encl_add_page(encl, add_arg.src + c, add_arg.offset + c,
+ &secinfo, add_arg.flags);
+ if (ret)
+ break;
+ }
+
+ add_arg.count = c;
+
+ if (copy_to_user(arg, &add_arg, sizeof(add_arg)))
+ return -EFAULT;
+
+ return ret;
+}
+
+static int __sgx_get_key_hash(struct crypto_shash *tfm, const void *modulus,
+ void *hash)
+{
+ SHASH_DESC_ON_STACK(shash, tfm);
+
+ shash->tfm = tfm;
+
+ return crypto_shash_digest(shash, modulus, SGX_MODULUS_SIZE, hash);
+}
+
+static int sgx_get_key_hash(const void *modulus, void *hash)
+{
+ struct crypto_shash *tfm;
+ int ret;
+
+ tfm = crypto_alloc_shash("sha256", 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(tfm))
+ return PTR_ERR(tfm);
+
+ ret = __sgx_get_key_hash(tfm, modulus, hash);
+
+ crypto_free_shash(tfm);
+ return ret;
+}
+
+static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct,
+ void *token)
+{
+ u64 mrsigner[4];
+ int i, j, k;
+ void *addr;
+ int ret;
+
+ /*
+ * Deny initializing enclaves with attributes (namely provisioning)
+ * that have not been explicitly allowed.
+ */
+ if (encl->attributes & ~encl->attributes_mask)
+ return -EACCES;
+
+ /*
+ * Attributes should not be enforced *only* against what's available on
+ * platform (done in sgx_encl_create) but checked and enforced against
+ * the mask for enforcement in sigstruct. For example an enclave could
+ * opt to sign with AVX bit in xfrm, but still be loadable on a platform
+ * without it if the sigstruct->body.attributes_mask does not turn that
+ * bit on.
+ */
+ if (sigstruct->body.attributes & sigstruct->body.attributes_mask &
+ sgx_attributes_reserved_mask)
+ return -EINVAL;
+
+ if (sigstruct->body.miscselect & sigstruct->body.misc_mask &
+ sgx_misc_reserved_mask)
+ return -EINVAL;
+
+ if (sigstruct->body.xfrm & sigstruct->body.xfrm_mask &
+ sgx_xfrm_reserved_mask)
+ return -EINVAL;
+
+ ret = sgx_get_key_hash(sigstruct->modulus, mrsigner);
+ if (ret)
+ return ret;
+
+ mutex_lock(&encl->lock);
+
+ /*
+ * ENCLS[EINIT] is interruptible because it has such a high latency,
+ * e.g. 50k+ cycles on success. If an IRQ/NMI/SMI becomes pending,
+ * EINIT may fail with SGX_UNMASKED_EVENT so that the event can be
+ * serviced.
+ */
+ for (i = 0; i < SGX_EINIT_SLEEP_COUNT; i++) {
+ for (j = 0; j < SGX_EINIT_SPIN_COUNT; j++) {
+ addr = sgx_get_epc_virt_addr(encl->secs.epc_page);
+
+ preempt_disable();
+
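+			/*
+			 * Program the launch-control MSRs with the SHA-256 of
+			 * the signer's modulus (MRSIGNER) so that EINIT will
+			 * accept this SIGSTRUCT.
+			 */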
+ for (k = 0; k < 4; k++)
+ wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + k, mrsigner[k]);
+
+ ret = __einit(sigstruct, token, addr);
+
+ preempt_enable();
+
+ if (ret == SGX_UNMASKED_EVENT)
+ continue;
+ else
+ break;
+ }
+
+ if (ret != SGX_UNMASKED_EVENT)
+ break;
+
+ msleep_interruptible(SGX_EINIT_SLEEP_TIME);
+
+ if (signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ goto err_out;
+ }
+ }
+
+ if (ret & ENCLS_FAULT_FLAG) {
+ if (encls_failed(ret))
+ ENCLS_WARN(ret, "EINIT");
+
+ ret = -EIO;
+ } else if (ret) {
+ pr_debug("EINIT returned %d\n", ret);
+ ret = -EPERM;
+ } else {
+ set_bit(SGX_ENCL_INITIALIZED, &encl->flags);
+ }
+
+err_out:
+ mutex_unlock(&encl->lock);
+ return ret;
+}
+
+/**
+ * sgx_ioc_enclave_init() - handler for %SGX_IOC_ENCLAVE_INIT
+ * @encl: an enclave pointer
+ * @arg: userspace pointer to a struct sgx_enclave_init instance
+ *
+ * Flush any outstanding enqueued EADD operations and perform EINIT. The
+ * Launch Enclave Public Key Hash MSRs are rewritten as necessary to match
+ * the enclave's MRSIGNER, which is calculated from the provided sigstruct.
+ *
+ * Return:
+ * - 0: Success.
+ * - -EPERM: Invalid SIGSTRUCT.
+ * - -EIO: EINIT failed because of a power cycle.
+ * - -errno: POSIX error.
+ */
+static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg)
+{
+ struct sgx_sigstruct *sigstruct;
+ struct sgx_enclave_init init_arg;
+ struct page *initp_page;
+ void *token;
+ int ret;
+
+ if (!test_bit(SGX_ENCL_CREATED, &encl->flags) ||
+ test_bit(SGX_ENCL_INITIALIZED, &encl->flags))
+ return -EINVAL;
+
+ if (copy_from_user(&init_arg, arg, sizeof(init_arg)))
+ return -EFAULT;
+
+ initp_page = alloc_page(GFP_KERNEL);
+ if (!initp_page)
+ return -ENOMEM;
+
+ sigstruct = kmap(initp_page);
+ token = (void *)((unsigned long)sigstruct + PAGE_SIZE / 2);
+ memset(token, 0, SGX_LAUNCH_TOKEN_SIZE);
+
+ if (copy_from_user(sigstruct, (void __user *)init_arg.sigstruct,
+ sizeof(*sigstruct))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ /*
+ * A legacy field used with Intel signed enclaves. These used to mean
+ * regular and architectural enclaves. The CPU only accepts these values
+ * but they do not have any other meaning.
+ *
+ * Thus, reject any other values.
+ */
+ if (sigstruct->header.vendor != 0x0000 &&
+ sigstruct->header.vendor != 0x8086) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = sgx_encl_init(encl, sigstruct, token);
+
+out:
+ kunmap(initp_page);
+ __free_page(initp_page);
+ return ret;
+}
+
+/**
+ * sgx_ioc_enclave_provision() - handler for %SGX_IOC_ENCLAVE_PROVISION
+ * @encl: an enclave pointer
+ * @arg: userspace pointer to a struct sgx_enclave_provision instance
+ *
+ * Allow ATTRIBUTE.PROVISION_KEY for an enclave by providing a file handle to
+ * /dev/sgx_provision.
+ *
+ * Return:
+ * - 0: Success.
+ * - -errno: Otherwise.
+ */
+static long sgx_ioc_enclave_provision(struct sgx_encl *encl, void __user *arg)
+{
+ struct sgx_enclave_provision params;
+ struct file *file;
+
+ if (copy_from_user(&params, arg, sizeof(params)))
+ return -EFAULT;
+
+ file = fget(params.fd);
+ if (!file)
+ return -EINVAL;
+
+ if (file->f_op != &sgx_provision_fops) {
+ fput(file);
+ return -EINVAL;
+ }
+
+ encl->attributes_mask |= SGX_ATTR_PROVISIONKEY;
+
+ fput(file);
+ return 0;
+}
+
+long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
+{
+ struct sgx_encl *encl = filep->private_data;
+ int ret;
+
+ if (test_and_set_bit(SGX_ENCL_IOCTL, &encl->flags))
+ return -EBUSY;
+
+ switch (cmd) {
+ case SGX_IOC_ENCLAVE_CREATE:
+ ret = sgx_ioc_enclave_create(encl, (void __user *)arg);
+ break;
+ case SGX_IOC_ENCLAVE_ADD_PAGES:
+ ret = sgx_ioc_enclave_add_pages(encl, (void __user *)arg);
+ break;
+ case SGX_IOC_ENCLAVE_INIT:
+ ret = sgx_ioc_enclave_init(encl, (void __user *)arg);
+ break;
+ case SGX_IOC_ENCLAVE_PROVISION:
+ ret = sgx_ioc_enclave_provision(encl, (void __user *)arg);
+ break;
+ default:
+ ret = -ENOIOCTLCMD;
+ break;
+ }
+
+ clear_bit(SGX_ENCL_IOCTL, &encl->flags);
+ return ret;
+}
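+
+/*
+ * Illustrative userspace flow (a sketch; the uapi structures live in
+ * uapi/asm/sgx.h and are not shown in this hunk): an enclave loader
+ * typically does something like
+ *
+ *	fd = open("/dev/sgx_enclave", O_RDWR);
+ *	ioctl(fd, SGX_IOC_ENCLAVE_CREATE, &create_arg);
+ *	ioctl(fd, SGX_IOC_ENCLAVE_ADD_PAGES, &add_arg);
+ *	ioctl(fd, SGX_IOC_ENCLAVE_INIT, &init_arg);
+ *	mmap(NULL, size, prot, MAP_SHARED, fd, 0);
+ *
+ * MAP_PRIVATE mappings are rejected by sgx_get_unmapped_area(), and mmap()
+ * permissions are capped by the per-page permissions declared at ADD_PAGES
+ * time (see sgx_encl_may_map()).
+ */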
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
new file mode 100644
index 000000000000..8df81a3ed945
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -0,0 +1,737 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2016-20 Intel Corporation. */
+
+#include <linux/freezer.h>
+#include <linux/highmem.h>
+#include <linux/kthread.h>
+#include <linux/pagemap.h>
+#include <linux/ratelimit.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/slab.h>
+#include "driver.h"
+#include "encl.h"
+#include "encls.h"
+
+struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
+static int sgx_nr_epc_sections;
+static struct task_struct *ksgxd_tsk;
+static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq);
+
+/*
+ * These variables are part of the state of the reclaimer, and must be accessed
+ * with sgx_reclaimer_lock acquired.
+ */
+static LIST_HEAD(sgx_active_page_list);
+
+static DEFINE_SPINLOCK(sgx_reclaimer_lock);
+
+/*
+ * Reset dirty EPC pages to uninitialized state. Laundry can be left with SECS
+ * pages whose child pages blocked EREMOVE.
+ */
+static void sgx_sanitize_section(struct sgx_epc_section *section)
+{
+ struct sgx_epc_page *page;
+ LIST_HEAD(dirty);
+ int ret;
+
+ /* init_laundry_list is thread-local, no need for a lock: */
+ while (!list_empty(&section->init_laundry_list)) {
+ if (kthread_should_stop())
+ return;
+
+ /* needed for access to ->page_list: */
+ spin_lock(&section->lock);
+
+ page = list_first_entry(&section->init_laundry_list,
+ struct sgx_epc_page, list);
+
+ ret = __eremove(sgx_get_epc_virt_addr(page));
+ if (!ret)
+ list_move(&page->list, &section->page_list);
+ else
+ list_move_tail(&page->list, &dirty);
+
+ spin_unlock(&section->lock);
+
+ cond_resched();
+ }
+
+ list_splice(&dirty, &section->init_laundry_list);
+}
+
+static bool sgx_reclaimer_age(struct sgx_epc_page *epc_page)
+{
+ struct sgx_encl_page *page = epc_page->owner;
+ struct sgx_encl *encl = page->encl;
+ struct sgx_encl_mm *encl_mm;
+ bool ret = true;
+ int idx;
+
+ idx = srcu_read_lock(&encl->srcu);
+
+ list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
+ if (!mmget_not_zero(encl_mm->mm))
+ continue;
+
+ mmap_read_lock(encl_mm->mm);
+ ret = !sgx_encl_test_and_clear_young(encl_mm->mm, page);
+ mmap_read_unlock(encl_mm->mm);
+
+ mmput_async(encl_mm->mm);
+
+ if (!ret)
+ break;
+ }
+
+ srcu_read_unlock(&encl->srcu, idx);
+
+ if (!ret)
+ return false;
+
+ return true;
+}
+
+static void sgx_reclaimer_block(struct sgx_epc_page *epc_page)
+{
+ struct sgx_encl_page *page = epc_page->owner;
+ unsigned long addr = page->desc & PAGE_MASK;
+ struct sgx_encl *encl = page->encl;
+ unsigned long mm_list_version;
+ struct sgx_encl_mm *encl_mm;
+ struct vm_area_struct *vma;
+ int idx, ret;
+
+ do {
+ mm_list_version = encl->mm_list_version;
+
+ /* Pairs with smp_rmb() in sgx_encl_mm_add(). */
+ smp_rmb();
+
+ idx = srcu_read_lock(&encl->srcu);
+
+ list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
+ if (!mmget_not_zero(encl_mm->mm))
+ continue;
+
+ mmap_read_lock(encl_mm->mm);
+
+ ret = sgx_encl_find(encl_mm->mm, addr, &vma);
+ if (!ret && encl == vma->vm_private_data)
+ zap_vma_ptes(vma, addr, PAGE_SIZE);
+
+ mmap_read_unlock(encl_mm->mm);
+
+ mmput_async(encl_mm->mm);
+ }
+
+ srcu_read_unlock(&encl->srcu, idx);
+ } while (unlikely(encl->mm_list_version != mm_list_version));
+
+ mutex_lock(&encl->lock);
+
+ ret = __eblock(sgx_get_epc_virt_addr(epc_page));
+ if (encls_failed(ret))
+ ENCLS_WARN(ret, "EBLOCK");
+
+ mutex_unlock(&encl->lock);
+}
+
+static int __sgx_encl_ewb(struct sgx_epc_page *epc_page, void *va_slot,
+ struct sgx_backing *backing)
+{
+ struct sgx_pageinfo pginfo;
+ int ret;
+
+ pginfo.addr = 0;
+ pginfo.secs = 0;
+
+ pginfo.contents = (unsigned long)kmap_atomic(backing->contents);
+ pginfo.metadata = (unsigned long)kmap_atomic(backing->pcmd) +
+ backing->pcmd_offset;
+
+ ret = __ewb(&pginfo, sgx_get_epc_virt_addr(epc_page), va_slot);
+
+ kunmap_atomic((void *)(unsigned long)(pginfo.metadata -
+ backing->pcmd_offset));
+ kunmap_atomic((void *)(unsigned long)pginfo.contents);
+
+ return ret;
+}
+
+static void sgx_ipi_cb(void *info)
+{
+}
+
+static const cpumask_t *sgx_encl_ewb_cpumask(struct sgx_encl *encl)
+{
+ cpumask_t *cpumask = &encl->cpumask;
+ struct sgx_encl_mm *encl_mm;
+ int idx;
+
+ /*
+ * Can race with sgx_encl_mm_add(), but ETRACK has already been
+ * executed, which means that the CPUs running in the new mm will enter
+ * into the enclave with a fresh epoch.
+ */
+ cpumask_clear(cpumask);
+
+ idx = srcu_read_lock(&encl->srcu);
+
+ list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
+ if (!mmget_not_zero(encl_mm->mm))
+ continue;
+
+ cpumask_or(cpumask, cpumask, mm_cpumask(encl_mm->mm));
+
+ mmput_async(encl_mm->mm);
+ }
+
+ srcu_read_unlock(&encl->srcu, idx);
+
+ return cpumask;
+}
+
+/*
+ * Swap the page out to regular memory after it has been transitioned to the
+ * blocked state with EBLOCK, which means that it can no longer be referenced
+ * (no new TLB entries).
+ *
+ * The first attempt just tries to write the page back, assuming that some
+ * other thread has already reset the count of threads inside the enclave with
+ * ETRACK and the previous thread count has drained to zero. The second attempt
+ * calls ETRACK before EWB. If that also fails, all HW threads are kicked out
+ * of the enclave with IPIs and EWB is retried, which is then guaranteed to
+ * succeed.
+ */
+static void sgx_encl_ewb(struct sgx_epc_page *epc_page,
+ struct sgx_backing *backing)
+{
+ struct sgx_encl_page *encl_page = epc_page->owner;
+ struct sgx_encl *encl = encl_page->encl;
+ struct sgx_va_page *va_page;
+ unsigned int va_offset;
+ void *va_slot;
+ int ret;
+
+ encl_page->desc &= ~SGX_ENCL_PAGE_BEING_RECLAIMED;
+
+ va_page = list_first_entry(&encl->va_pages, struct sgx_va_page,
+ list);
+ va_offset = sgx_alloc_va_slot(va_page);
+ va_slot = sgx_get_epc_virt_addr(va_page->epc_page) + va_offset;
+ if (sgx_va_page_full(va_page))
+ list_move_tail(&va_page->list, &encl->va_pages);
+
+ ret = __sgx_encl_ewb(epc_page, va_slot, backing);
+ if (ret == SGX_NOT_TRACKED) {
+ ret = __etrack(sgx_get_epc_virt_addr(encl->secs.epc_page));
+ if (ret) {
+ if (encls_failed(ret))
+ ENCLS_WARN(ret, "ETRACK");
+ }
+
+ ret = __sgx_encl_ewb(epc_page, va_slot, backing);
+ if (ret == SGX_NOT_TRACKED) {
+ /*
+ * Slow path, send IPIs to kick cpus out of the
+ * enclave. Note, it's imperative that the cpu
+ * mask is generated *after* ETRACK, else we'll
+ * miss cpus that entered the enclave between
+ * generating the mask and incrementing epoch.
+ */
+ on_each_cpu_mask(sgx_encl_ewb_cpumask(encl),
+ sgx_ipi_cb, NULL, 1);
+ ret = __sgx_encl_ewb(epc_page, va_slot, backing);
+ }
+ }
+
+ if (ret) {
+ if (encls_failed(ret))
+ ENCLS_WARN(ret, "EWB");
+
+ sgx_free_va_slot(va_page, va_offset);
+ } else {
+ encl_page->desc |= va_offset;
+ encl_page->va_page = va_page;
+ }
+}
+
+static void sgx_reclaimer_write(struct sgx_epc_page *epc_page,
+ struct sgx_backing *backing)
+{
+ struct sgx_encl_page *encl_page = epc_page->owner;
+ struct sgx_encl *encl = encl_page->encl;
+ struct sgx_backing secs_backing;
+ int ret;
+
+ mutex_lock(&encl->lock);
+
+ sgx_encl_ewb(epc_page, backing);
+ encl_page->epc_page = NULL;
+ encl->secs_child_cnt--;
+
+ if (!encl->secs_child_cnt && test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) {
+ ret = sgx_encl_get_backing(encl, PFN_DOWN(encl->size),
+ &secs_backing);
+ if (ret)
+ goto out;
+
+ sgx_encl_ewb(encl->secs.epc_page, &secs_backing);
+
+ sgx_free_epc_page(encl->secs.epc_page);
+ encl->secs.epc_page = NULL;
+
+ sgx_encl_put_backing(&secs_backing, true);
+ }
+
+out:
+ mutex_unlock(&encl->lock);
+}
+
+/*
+ * Take a fixed number of pages from the head of the active page pool and
+ * reclaim them to the enclave's private shmem files. Skip pages that have
+ * been accessed since the last scan and move them to the tail of the active
+ * page pool so that pages get scanned in an LRU-like fashion.
+ *
+ * Batch-process a chunk of pages (currently 16) in order to reduce the number
+ * of IPIs and ETRACKs potentially required. sgx_encl_ewb() amortizes the cost
+ * somewhat across HW threads with its three-stage EWB pipeline (EWB, ETRACK +
+ * EWB and IPI + EWB), but not sufficiently. Reclaiming one page at a time
+ * would also be problematic as it would increase lock contention too much,
+ * which would halt forward progress.
+ */
+static void sgx_reclaim_pages(void)
+{
+ struct sgx_epc_page *chunk[SGX_NR_TO_SCAN];
+ struct sgx_backing backing[SGX_NR_TO_SCAN];
+ struct sgx_epc_section *section;
+ struct sgx_encl_page *encl_page;
+ struct sgx_epc_page *epc_page;
+ pgoff_t page_index;
+ int cnt = 0;
+ int ret;
+ int i;
+
+ spin_lock(&sgx_reclaimer_lock);
+ for (i = 0; i < SGX_NR_TO_SCAN; i++) {
+ if (list_empty(&sgx_active_page_list))
+ break;
+
+ epc_page = list_first_entry(&sgx_active_page_list,
+ struct sgx_epc_page, list);
+ list_del_init(&epc_page->list);
+ encl_page = epc_page->owner;
+
+ if (kref_get_unless_zero(&encl_page->encl->refcount) != 0)
+ chunk[cnt++] = epc_page;
+ else
+ /* The owner is freeing the page. No need to add the
+ * page back to the list of reclaimable pages.
+ */
+ epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
+ }
+ spin_unlock(&sgx_reclaimer_lock);
+
+ for (i = 0; i < cnt; i++) {
+ epc_page = chunk[i];
+ encl_page = epc_page->owner;
+
+ if (!sgx_reclaimer_age(epc_page))
+ goto skip;
+
+ page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base);
+ ret = sgx_encl_get_backing(encl_page->encl, page_index, &backing[i]);
+ if (ret)
+ goto skip;
+
+ mutex_lock(&encl_page->encl->lock);
+ encl_page->desc |= SGX_ENCL_PAGE_BEING_RECLAIMED;
+ mutex_unlock(&encl_page->encl->lock);
+ continue;
+
+skip:
+ spin_lock(&sgx_reclaimer_lock);
+ list_add_tail(&epc_page->list, &sgx_active_page_list);
+ spin_unlock(&sgx_reclaimer_lock);
+
+ kref_put(&encl_page->encl->refcount, sgx_encl_release);
+
+ chunk[i] = NULL;
+ }
+
+ for (i = 0; i < cnt; i++) {
+ epc_page = chunk[i];
+ if (epc_page)
+ sgx_reclaimer_block(epc_page);
+ }
+
+ for (i = 0; i < cnt; i++) {
+ epc_page = chunk[i];
+ if (!epc_page)
+ continue;
+
+ encl_page = epc_page->owner;
+ sgx_reclaimer_write(epc_page, &backing[i]);
+ sgx_encl_put_backing(&backing[i], true);
+
+ kref_put(&encl_page->encl->refcount, sgx_encl_release);
+ epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
+
+ section = &sgx_epc_sections[epc_page->section];
+ spin_lock(&section->lock);
+ list_add_tail(&epc_page->list, &section->page_list);
+ section->free_cnt++;
+ spin_unlock(&section->lock);
+ }
+}
+
+static unsigned long sgx_nr_free_pages(void)
+{
+ unsigned long cnt = 0;
+ int i;
+
+ for (i = 0; i < sgx_nr_epc_sections; i++)
+ cnt += sgx_epc_sections[i].free_cnt;
+
+ return cnt;
+}
+
+static bool sgx_should_reclaim(unsigned long watermark)
+{
+ return sgx_nr_free_pages() < watermark &&
+ !list_empty(&sgx_active_page_list);
+}
+
+static int ksgxd(void *p)
+{
+ int i;
+
+ set_freezable();
+
+ /*
+ * Sanitize pages in order to recover from kexec(). The 2nd pass is
+ * required for SECS pages, whose child pages blocked EREMOVE.
+ */
+ for (i = 0; i < sgx_nr_epc_sections; i++)
+ sgx_sanitize_section(&sgx_epc_sections[i]);
+
+ for (i = 0; i < sgx_nr_epc_sections; i++) {
+ sgx_sanitize_section(&sgx_epc_sections[i]);
+
+ /* Should never happen. */
+ if (!list_empty(&sgx_epc_sections[i].init_laundry_list))
+ WARN(1, "EPC section %d has unsanitized pages.\n", i);
+ }
+
+ while (!kthread_should_stop()) {
+ if (try_to_freeze())
+ continue;
+
+ wait_event_freezable(ksgxd_waitq,
+ kthread_should_stop() ||
+ sgx_should_reclaim(SGX_NR_HIGH_PAGES));
+
+ if (sgx_should_reclaim(SGX_NR_HIGH_PAGES))
+ sgx_reclaim_pages();
+
+ cond_resched();
+ }
+
+ return 0;
+}
+
+static bool __init sgx_page_reclaimer_init(void)
+{
+ struct task_struct *tsk;
+
+ tsk = kthread_run(ksgxd, NULL, "ksgxd");
+ if (IS_ERR(tsk))
+ return false;
+
+ ksgxd_tsk = tsk;
+
+ return true;
+}
+
+static struct sgx_epc_page *__sgx_alloc_epc_page_from_section(struct sgx_epc_section *section)
+{
+ struct sgx_epc_page *page;
+
+ spin_lock(&section->lock);
+
+ if (list_empty(&section->page_list)) {
+ spin_unlock(&section->lock);
+ return NULL;
+ }
+
+ page = list_first_entry(&section->page_list, struct sgx_epc_page, list);
+ list_del_init(&page->list);
+ section->free_cnt--;
+
+ spin_unlock(&section->lock);
+ return page;
+}
+
+/**
+ * __sgx_alloc_epc_page() - Allocate an EPC page
+ *
+ * Iterate through the EPC sections and hand a free EPC page to the caller.
+ * When a page is no longer needed it must be released with sgx_free_epc_page().
+ *
+ * Return:
+ * an EPC page,
+ * -errno on error
+ */
+struct sgx_epc_page *__sgx_alloc_epc_page(void)
+{
+ struct sgx_epc_section *section;
+ struct sgx_epc_page *page;
+ int i;
+
+ for (i = 0; i < sgx_nr_epc_sections; i++) {
+ section = &sgx_epc_sections[i];
+
+ page = __sgx_alloc_epc_page_from_section(section);
+ if (page)
+ return page;
+ }
+
+ return ERR_PTR(-ENOMEM);
+}
+
+/**
+ * sgx_mark_page_reclaimable() - Mark a page as reclaimable
+ * @page: EPC page
+ *
+ * Mark a page as reclaimable and add it to the active page list. Pages
+ * are automatically removed from the active list when freed.
+ */
+void sgx_mark_page_reclaimable(struct sgx_epc_page *page)
+{
+ spin_lock(&sgx_reclaimer_lock);
+ page->flags |= SGX_EPC_PAGE_RECLAIMER_TRACKED;
+ list_add_tail(&page->list, &sgx_active_page_list);
+ spin_unlock(&sgx_reclaimer_lock);
+}
+
+/**
+ * sgx_unmark_page_reclaimable() - Remove a page from the reclaim list
+ * @page: EPC page
+ *
+ * Clear the reclaimable flag and remove the page from the active page list.
+ *
+ * Return:
+ * 0 on success,
+ * -EBUSY if the page is in the process of being reclaimed
+ */
+int sgx_unmark_page_reclaimable(struct sgx_epc_page *page)
+{
+ spin_lock(&sgx_reclaimer_lock);
+ if (page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED) {
+ /* The page is being reclaimed. */
+ if (list_empty(&page->list)) {
+ spin_unlock(&sgx_reclaimer_lock);
+ return -EBUSY;
+ }
+
+ list_del(&page->list);
+ page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
+ }
+ spin_unlock(&sgx_reclaimer_lock);
+
+ return 0;
+}
+
+/**
+ * sgx_alloc_epc_page() - Allocate an EPC page
+ * @owner: the owner of the EPC page
+ * @reclaim: reclaim pages if necessary
+ *
+ * Iterate through the EPC sections and hand a free EPC page to the caller.
+ * When a page is no longer needed it must be released with sgx_free_epc_page().
+ * If @reclaim is set to true, reclaim pages directly when no free pages are
+ * left. No mm locks may be held when @reclaim is set to true.
+ *
+ * Finally, before returning to the caller, wake up ksgxd if the number of free
+ * pages has fallen below the low watermark.
+ *
+ * Return:
+ * an EPC page,
+ * -errno on error
+ */
+struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim)
+{
+ struct sgx_epc_page *page;
+
+ for ( ; ; ) {
+ page = __sgx_alloc_epc_page();
+ if (!IS_ERR(page)) {
+ page->owner = owner;
+ break;
+ }
+
+ if (list_empty(&sgx_active_page_list))
+ return ERR_PTR(-ENOMEM);
+
+ if (!reclaim) {
+ page = ERR_PTR(-EBUSY);
+ break;
+ }
+
+ if (signal_pending(current)) {
+ page = ERR_PTR(-ERESTARTSYS);
+ break;
+ }
+
+ sgx_reclaim_pages();
+ cond_resched();
+ }
+
+ if (sgx_should_reclaim(SGX_NR_LOW_PAGES))
+ wake_up(&ksgxd_waitq);
+
+ return page;
+}
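To make the intended calling convention concrete, here is a hedged in-kernel sketch of how a consumer (e.g. the driver) pairs the allocator with the reclaimer hooks above; the example_* helper names are illustrative and not part of the patch:

/* Illustrative only: typical pairing of the allocator and reclaimer hooks. */
static struct sgx_epc_page *example_grab_page(struct sgx_encl_page *owner)
{
	struct sgx_epc_page *epc_page;

	/* May invoke sgx_reclaim_pages() and sleep; no mm locks may be held. */
	epc_page = sgx_alloc_epc_page(owner, true);
	if (IS_ERR(epc_page))
		return epc_page;

	/* Hand the page over to ksgxd so it can be swapped out under pressure. */
	sgx_mark_page_reclaimable(epc_page);

	return epc_page;
}

static int example_drop_page(struct sgx_epc_page *epc_page)
{
	/* -EBUSY means the reclaimer holds the page right now; retry later. */
	if (sgx_unmark_page_reclaimable(epc_page))
		return -EBUSY;

	sgx_free_epc_page(epc_page);
	return 0;
}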
+
+/**
+ * sgx_free_epc_page() - Free an EPC page
+ * @page: an EPC page
+ *
+ * Call EREMOVE for an EPC page and insert it back to the list of free pages.
+ */
+void sgx_free_epc_page(struct sgx_epc_page *page)
+{
+ struct sgx_epc_section *section = &sgx_epc_sections[page->section];
+ int ret;
+
+ WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED);
+
+ ret = __eremove(sgx_get_epc_virt_addr(page));
+ if (WARN_ONCE(ret, "EREMOVE returned %d (0x%x)", ret, ret))
+ return;
+
+ spin_lock(&section->lock);
+ list_add_tail(&page->list, &section->page_list);
+ section->free_cnt++;
+ spin_unlock(&section->lock);
+}
+
+static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
+ unsigned long index,
+ struct sgx_epc_section *section)
+{
+ unsigned long nr_pages = size >> PAGE_SHIFT;
+ unsigned long i;
+
+ section->virt_addr = memremap(phys_addr, size, MEMREMAP_WB);
+ if (!section->virt_addr)
+ return false;
+
+ section->pages = vmalloc(nr_pages * sizeof(struct sgx_epc_page));
+ if (!section->pages) {
+ memunmap(section->virt_addr);
+ return false;
+ }
+
+ section->phys_addr = phys_addr;
+ spin_lock_init(&section->lock);
+ INIT_LIST_HEAD(&section->page_list);
+ INIT_LIST_HEAD(&section->init_laundry_list);
+
+ for (i = 0; i < nr_pages; i++) {
+ section->pages[i].section = index;
+ section->pages[i].flags = 0;
+ section->pages[i].owner = NULL;
+ list_add_tail(&section->pages[i].list, &section->init_laundry_list);
+ }
+
+ section->free_cnt = nr_pages;
+ return true;
+}
+
+/**
+ * A section metric is concatenated from two CPUID registers: bits 12-31 of
+ * @low define bits 12-31 of the metric and bits 0-19 of @high define bits
+ * 32-51 of the metric.
+ */
+static inline u64 __init sgx_calc_section_metric(u64 low, u64 high)
+{
+ return (low & GENMASK_ULL(31, 12)) +
+ ((high & GENMASK_ULL(19, 0)) << 32);
+}
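A short worked example of the concatenation above, using made-up CPUID register values:

/*
 * Illustrative values only:
 *
 *   low  = 0x40000abc:   low  & GENMASK_ULL(31, 12)        = 0x0000000040000000
 *   high = 0x00000002:  (high & GENMASK_ULL(19, 0)) << 32  = 0x0000000200000000
 *
 *   sgx_calc_section_metric(low, high)                     = 0x0000000240000000
 */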
+
+static bool __init sgx_page_cache_init(void)
+{
+ u32 eax, ebx, ecx, edx, type;
+ u64 pa, size;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(sgx_epc_sections); i++) {
+ cpuid_count(SGX_CPUID, i + SGX_CPUID_EPC, &eax, &ebx, &ecx, &edx);
+
+ type = eax & SGX_CPUID_EPC_MASK;
+ if (type == SGX_CPUID_EPC_INVALID)
+ break;
+
+ if (type != SGX_CPUID_EPC_SECTION) {
+ pr_err_once("Unknown EPC section type: %u\n", type);
+ break;
+ }
+
+ pa = sgx_calc_section_metric(eax, ebx);
+ size = sgx_calc_section_metric(ecx, edx);
+
+ pr_info("EPC section 0x%llx-0x%llx\n", pa, pa + size - 1);
+
+ if (!sgx_setup_epc_section(pa, size, i, &sgx_epc_sections[i])) {
+ pr_err("No free memory for an EPC section\n");
+ break;
+ }
+
+ sgx_nr_epc_sections++;
+ }
+
+ if (!sgx_nr_epc_sections) {
+ pr_err("There are zero EPC sections.\n");
+ return false;
+ }
+
+ return true;
+}
+
+static int __init sgx_init(void)
+{
+ int ret;
+ int i;
+
+ if (!cpu_feature_enabled(X86_FEATURE_SGX))
+ return -ENODEV;
+
+ if (!sgx_page_cache_init())
+ return -ENOMEM;
+
+ if (!sgx_page_reclaimer_init()) {
+ ret = -ENOMEM;
+ goto err_page_cache;
+ }
+
+ ret = sgx_drv_init();
+ if (ret)
+ goto err_kthread;
+
+ return 0;
+
+err_kthread:
+ kthread_stop(ksgxd_tsk);
+
+err_page_cache:
+ for (i = 0; i < sgx_nr_epc_sections; i++) {
+ vfree(sgx_epc_sections[i].pages);
+ memunmap(sgx_epc_sections[i].virt_addr);
+ }
+
+ return ret;
+}
+
+device_initcall(sgx_init);
diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h
new file mode 100644
index 000000000000..5fa42d143feb
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/sgx.h
@@ -0,0 +1,86 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _X86_SGX_H
+#define _X86_SGX_H
+
+#include <linux/bitops.h>
+#include <linux/err.h>
+#include <linux/io.h>
+#include <linux/rwsem.h>
+#include <linux/types.h>
+#include <asm/asm.h>
+#include "arch.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt) "sgx: " fmt
+
+#define SGX_MAX_EPC_SECTIONS 8
+#define SGX_EEXTEND_BLOCK_SIZE 256
+#define SGX_NR_TO_SCAN 16
+#define SGX_NR_LOW_PAGES 32
+#define SGX_NR_HIGH_PAGES 64
+
+/* Pages that are being tracked by the page reclaimer. */
+#define SGX_EPC_PAGE_RECLAIMER_TRACKED BIT(0)
+
+struct sgx_epc_page {
+ unsigned int section;
+ unsigned int flags;
+ struct sgx_encl_page *owner;
+ struct list_head list;
+};
+
+/*
+ * The firmware can define multiple chunks of EPC in different areas of
+ * physical memory, e.g. one for each NUMA node. This structure stores the EPC
+ * pages of one EPC section and the virtual memory area where the pages have
+ * been mapped.
+ *
+ * 'lock' must be held before accessing 'page_list' or 'free_cnt'.
+ */
+struct sgx_epc_section {
+ unsigned long phys_addr;
+ void *virt_addr;
+ struct sgx_epc_page *pages;
+
+ spinlock_t lock;
+ struct list_head page_list;
+ unsigned long free_cnt;
+
+ /*
+ * Pages which need EREMOVE run on them before they can be
+ * used. Only safe to be accessed in ksgxd and init code.
+ * Not protected by locks.
+ */
+ struct list_head init_laundry_list;
+};
+
+extern struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
+
+static inline unsigned long sgx_get_epc_phys_addr(struct sgx_epc_page *page)
+{
+ struct sgx_epc_section *section = &sgx_epc_sections[page->section];
+ unsigned long index;
+
+ index = ((unsigned long)page - (unsigned long)section->pages) / sizeof(*page);
+
+ return section->phys_addr + index * PAGE_SIZE;
+}
+
+static inline void *sgx_get_epc_virt_addr(struct sgx_epc_page *page)
+{
+ struct sgx_epc_section *section = &sgx_epc_sections[page->section];
+ unsigned long index;
+
+ index = ((unsigned long)page - (unsigned long)section->pages) / sizeof(*page);
+
+ return section->virt_addr + index * PAGE_SIZE;
+}
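A brief illustration of the accessor arithmetic, using hypothetical numbers rather than anything defined by the patch:

/*
 * Illustration only: if a section was set up with phys_addr 0x240000000 and
 * 'page' is the fifth entry of section->pages, then index = 4 and:
 *
 *   sgx_get_epc_phys_addr(page) = 0x240000000 + 4 * PAGE_SIZE = 0x240004000
 *   sgx_get_epc_virt_addr(page) = section->virt_addr + 4 * PAGE_SIZE
 */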
+
+struct sgx_epc_page *__sgx_alloc_epc_page(void);
+void sgx_free_epc_page(struct sgx_epc_page *page);
+
+void sgx_mark_page_reclaimable(struct sgx_epc_page *page);
+int sgx_unmark_page_reclaimable(struct sgx_epc_page *page);
+struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim);
+
+#endif /* _X86_SGX_H */
diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c
index d3a0791bc052..8678864ce712 100644
--- a/arch/x86/kernel/cpu/topology.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -25,10 +25,10 @@
#define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f)
#define LEVEL_MAX_SIBLINGS(ebx) ((ebx) & 0xffff)
-#ifdef CONFIG_SMP
unsigned int __max_die_per_package __read_mostly = 1;
EXPORT_SYMBOL(__max_die_per_package);
+#ifdef CONFIG_SMP
/*
* Check if given CPUID extended topology "leaf" is implemented
*/
@@ -96,6 +96,7 @@ int detect_extended_topology(struct cpuinfo_x86 *c)
unsigned int ht_mask_width, core_plus_mask_width, die_plus_mask_width;
unsigned int core_select_mask, core_level_siblings;
unsigned int die_select_mask, die_level_siblings;
+ bool die_level_present = false;
int leaf;
leaf = detect_extended_topology_leaf(c);
@@ -126,6 +127,7 @@ int detect_extended_topology(struct cpuinfo_x86 *c)
die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
}
if (LEAFB_SUBTYPE(ecx) == DIE_TYPE) {
+ die_level_present = true;
die_level_siblings = LEVEL_MAX_SIBLINGS(ebx);
die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
}
@@ -139,8 +141,12 @@ int detect_extended_topology(struct cpuinfo_x86 *c)
c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid,
ht_mask_width) & core_select_mask;
- c->cpu_die_id = apic->phys_pkg_id(c->initial_apicid,
- core_plus_mask_width) & die_select_mask;
+
+ if (die_level_present) {
+ c->cpu_die_id = apic->phys_pkg_id(c->initial_apicid,
+ core_plus_mask_width) & die_select_mask;
+ }
+
c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid,
die_plus_mask_width);
/*
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 924571fe5864..c6ede3b3d302 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -501,12 +501,12 @@ static bool vmware_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs)
ghcb_rbp_is_valid(ghcb)))
return false;
- regs->bx = ghcb->save.rbx;
- regs->cx = ghcb->save.rcx;
- regs->dx = ghcb->save.rdx;
- regs->si = ghcb->save.rsi;
- regs->di = ghcb->save.rdi;
- regs->bp = ghcb->save.rbp;
+ regs->bx = ghcb_get_rbx(ghcb);
+ regs->cx = ghcb_get_rcx(ghcb);
+ regs->dx = ghcb_get_rdx(ghcb);
+ regs->si = ghcb_get_rsi(ghcb);
+ regs->di = ghcb_get_rdi(ghcb);
+ regs->bp = ghcb_get_rbp(ghcb);
return true;
}
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 3492aa36bf09..6f7b8cc1bc9f 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -74,10 +74,9 @@ static ssize_t cpuid_read(struct file *file, char __user *buf,
init_completion(&cmd.done);
for (; count; count -= 16) {
- call_single_data_t csd = {
- .func = cpuid_smp_cpuid,
- .info = &cmd,
- };
+ call_single_data_t csd;
+
+ INIT_CSD(&csd, cpuid_smp_cpuid, &cmd);
cmd.regs.eax = pos;
cmd.regs.ecx = pos >> 32;
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c
index 33ee47670b99..5fcac46aaf6b 100644
--- a/arch/x86/kernel/crash_dump_32.c
+++ b/arch/x86/kernel/crash_dump_32.c
@@ -13,8 +13,6 @@
#include <linux/uaccess.h>
-static void *kdump_buf_page;
-
static inline bool is_crashed_pfn_valid(unsigned long pfn)
{
#ifndef CONFIG_X86_PAE
@@ -41,15 +39,11 @@ static inline bool is_crashed_pfn_valid(unsigned long pfn)
* @userbuf: if set, @buf is in user address space, use copy_to_user(),
* otherwise @buf is in kernel address space, use memcpy().
*
- * Copy a page from "oldmem". For this page, there is no pte mapped
- * in the current kernel. We stitch up a pte, similar to kmap_atomic.
- *
- * Calling copy_to_user() in atomic context is not desirable. Hence first
- * copying the data to a pre-allocated kernel page and then copying to user
- * space in non-atomic context.
+ * Copy a page from "oldmem". For this page, there might be no pte mapped
+ * in the current kernel.
*/
-ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
- size_t csize, unsigned long offset, int userbuf)
+ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize,
+ unsigned long offset, int userbuf)
{
void *vaddr;
@@ -59,38 +53,16 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
if (!is_crashed_pfn_valid(pfn))
return -EFAULT;
- vaddr = kmap_atomic_pfn(pfn);
+ vaddr = kmap_local_pfn(pfn);
if (!userbuf) {
- memcpy(buf, (vaddr + offset), csize);
- kunmap_atomic(vaddr);
+ memcpy(buf, vaddr + offset, csize);
} else {
- if (!kdump_buf_page) {
- printk(KERN_WARNING "Kdump: Kdump buffer page not"
- " allocated\n");
- kunmap_atomic(vaddr);
- return -EFAULT;
- }
- copy_page(kdump_buf_page, vaddr);
- kunmap_atomic(vaddr);
- if (copy_to_user(buf, (kdump_buf_page + offset), csize))
- return -EFAULT;
+ if (copy_to_user(buf, vaddr + offset, csize))
+ csize = -EFAULT;
}
- return csize;
-}
+ kunmap_local(vaddr);
-static int __init kdump_buf_page_init(void)
-{
- int ret = 0;
-
- kdump_buf_page = kmalloc(PAGE_SIZE, GFP_KERNEL);
- if (!kdump_buf_page) {
- printk(KERN_WARNING "Kdump: Failed to allocate kdump buffer"
- " page\n");
- ret = -ENOMEM;
- }
-
- return ret;
+ return csize;
}
-arch_initcall(kdump_buf_page_init);
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index ddffd80f5c52..6a4cb71c2498 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -184,31 +184,31 @@ static unsigned int ioapic_id;
struct of_ioapic_type {
u32 out_type;
- u32 trigger;
- u32 polarity;
+ u32 is_level;
+ u32 active_low;
};
static struct of_ioapic_type of_ioapic_type[] =
{
{
- .out_type = IRQ_TYPE_EDGE_RISING,
- .trigger = IOAPIC_EDGE,
- .polarity = 1,
+ .out_type = IRQ_TYPE_EDGE_FALLING,
+ .is_level = 0,
+ .active_low = 1,
},
{
- .out_type = IRQ_TYPE_LEVEL_LOW,
- .trigger = IOAPIC_LEVEL,
- .polarity = 0,
+ .out_type = IRQ_TYPE_LEVEL_HIGH,
+ .is_level = 1,
+ .active_low = 0,
},
{
- .out_type = IRQ_TYPE_LEVEL_HIGH,
- .trigger = IOAPIC_LEVEL,
- .polarity = 1,
+ .out_type = IRQ_TYPE_LEVEL_LOW,
+ .is_level = 1,
+ .active_low = 1,
},
{
- .out_type = IRQ_TYPE_EDGE_FALLING,
- .trigger = IOAPIC_EDGE,
- .polarity = 0,
+ .out_type = IRQ_TYPE_EDGE_RISING,
+ .is_level = 0,
+ .active_low = 0,
},
};
@@ -228,7 +228,7 @@ static int dt_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
return -EINVAL;
it = &of_ioapic_type[type_index];
- ioapic_set_alloc_attr(&tmp, NUMA_NO_NODE, it->trigger, it->polarity);
+ ioapic_set_alloc_attr(&tmp, NUMA_NO_NODE, it->is_level, it->active_low);
tmp.devid = mpc_ioapic_id(mp_irqdomain_ioapic_idx(domain));
tmp.ioapic.pin = fwspec->param[0];
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 25c06b67e7e0..299c20f0a38b 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -78,6 +78,9 @@ static int copy_code(struct pt_regs *regs, u8 *buf, unsigned long src,
if (!user_mode(regs))
return copy_from_kernel_nofault(buf, (u8 *)src, nbytes);
+ /* The user space code from other tasks cannot be accessed. */
+ if (regs != task_pt_regs(current))
+ return -EPERM;
/*
* Make sure userspace isn't trying to trick us into dumping kernel
* memory by pointing the userspace instruction pointer at it.
@@ -85,6 +88,12 @@ static int copy_code(struct pt_regs *regs, u8 *buf, unsigned long src,
if (__chk_range_not_ok(src, nbytes, TASK_SIZE_MAX))
return -EINVAL;
+ /*
+ * Even if named copy_from_user_nmi() this can be invoked from
+ * other contexts and will not try to resolve a pagefault, which is
+ * the correct thing to do here as this code can be called from any
+ * context.
+ */
return copy_from_user_nmi(buf, (void __user *)src, nbytes);
}
@@ -115,13 +124,19 @@ void show_opcodes(struct pt_regs *regs, const char *loglvl)
u8 opcodes[OPCODE_BUFSIZE];
unsigned long prologue = regs->ip - PROLOGUE_SIZE;
- if (copy_code(regs, opcodes, prologue, sizeof(opcodes))) {
- printk("%sCode: Unable to access opcode bytes at RIP 0x%lx.\n",
- loglvl, prologue);
- } else {
+ switch (copy_code(regs, opcodes, prologue, sizeof(opcodes))) {
+ case 0:
printk("%sCode: %" __stringify(PROLOGUE_SIZE) "ph <%02x> %"
__stringify(EPILOGUE_SIZE) "ph\n", loglvl, opcodes,
opcodes[PROLOGUE_SIZE], opcodes + PROLOGUE_SIZE + 1);
+ break;
+ case -EPERM:
+ /* No access to the user space stack of other tasks. Ignore. */
+ break;
+ default:
+ printk("%sCode: Unable to access opcode bytes at RIP 0x%lx.\n",
+ loglvl, prologue);
+ break;
}
}
@@ -168,7 +183,7 @@ static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs,
}
}
-void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack, const char *log_lvl)
{
struct unwind_state state;
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index eb86a2b831b1..571220ac8bea 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -121,7 +121,7 @@ int copy_fpregs_to_fpstate(struct fpu *fpu)
}
EXPORT_SYMBOL(copy_fpregs_to_fpstate);
-void kernel_fpu_begin(void)
+void kernel_fpu_begin_mask(unsigned int kfpu_mask)
{
preempt_disable();
@@ -141,13 +141,14 @@ void kernel_fpu_begin(void)
}
__cpu_invalidate_fpregs_state();
- if (boot_cpu_has(X86_FEATURE_XMM))
+ /* Put sane initial values into the control registers. */
+ if (likely(kfpu_mask & KFPU_MXCSR) && boot_cpu_has(X86_FEATURE_XMM))
ldmxcsr(MXCSR_DEFAULT);
- if (boot_cpu_has(X86_FEATURE_FPU))
+ if (unlikely(kfpu_mask & KFPU_387) && boot_cpu_has(X86_FEATURE_FPU))
asm volatile ("fninit");
}
-EXPORT_SYMBOL_GPL(kernel_fpu_begin);
+EXPORT_SYMBOL_GPL(kernel_fpu_begin_mask);
void kernel_fpu_end(void)
{
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 5d8047441a0a..683749b80ae2 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -167,14 +167,14 @@ void fpstate_sanitize_xstate(struct fpu *fpu)
fx->fop = 0;
fx->rip = 0;
fx->rdp = 0;
- memset(&fx->st_space[0], 0, 128);
+ memset(fx->st_space, 0, sizeof(fx->st_space));
}
/*
* SSE is in init state
*/
if (!(xfeatures & XFEATURE_MASK_SSE))
- memset(&fx->xmm_space[0], 0, 256);
+ memset(fx->xmm_space, 0, sizeof(fx->xmm_space));
/*
* First two features are FPU and SSE, which above we handled
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
index ac3d5f22fe64..0d54099c2a3a 100644
--- a/arch/x86/kernel/ftrace_64.S
+++ b/arch/x86/kernel/ftrace_64.S
@@ -140,16 +140,27 @@ SYM_FUNC_START(ftrace_caller)
/* save_mcount_regs fills in first two parameters */
save_mcount_regs
+ /* Stack - skipping return address of ftrace_caller */
+ leaq MCOUNT_REG_SIZE+8(%rsp), %rcx
+ movq %rcx, RSP(%rsp)
+
SYM_INNER_LABEL(ftrace_caller_op_ptr, SYM_L_GLOBAL)
/* Load the ftrace_ops into the 3rd parameter */
movq function_trace_op(%rip), %rdx
- /* regs go into 4th parameter (but make it NULL) */
- movq $0, %rcx
+ /* regs go into 4th parameter */
+ leaq (%rsp), %rcx
+
+ /* Only ops with REGS flag set should have CS register set */
+ movq $0, CS(%rsp)
SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL)
call ftrace_stub
+ /* Handlers can change the RIP */
+ movq RIP(%rsp), %rax
+ movq %rax, MCOUNT_REG_SIZE(%rsp)
+
restore_mcount_regs
/*
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 05e117137b45..5e9beb77cafd 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -37,7 +37,6 @@
#include <asm/kasan.h>
#include <asm/fixmap.h>
#include <asm/realmode.h>
-#include <asm/desc.h>
#include <asm/extable.h>
#include <asm/trapnr.h>
#include <asm/sev-es.h>
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 7eb2a1c87969..04bddaaba8e2 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -26,15 +26,6 @@
#include <asm/nospec-branch.h>
#include <asm/fixmap.h>
-#ifdef CONFIG_PARAVIRT_XXL
-#include <asm/asm-offsets.h>
-#include <asm/paravirt.h>
-#define GET_CR2_INTO(reg) GET_CR2_INTO_AX ; _ASM_MOV %_ASM_AX, reg
-#else
-#define INTERRUPT_RETURN iretq
-#define GET_CR2_INTO(reg) _ASM_MOV %cr2, reg
-#endif
-
/*
* We are not able to switch in one step to the final KERNEL ADDRESS SPACE
* because we need identity-mapped pages.
@@ -161,6 +152,21 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
/* Setup early boot stage 4-/5-level pagetables. */
addq phys_base(%rip), %rax
+
+ /*
+ * For SEV guests: Verify that the C-bit is correct. A malicious
+ * hypervisor could lie about the C-bit position to perform a ROP
+ * attack on the guest by writing to the unencrypted stack and wait for
+ * the next RET instruction.
+ * %rsi carries pointer to realmode data and is callee-clobbered. Save
+ * and restore it.
+ */
+ pushq %rsi
+ movq %rax, %rdi
+ call sev_verify_cbit
+ popq %rsi
+
+ /* Switch to new page-table */
movq %rax, %cr3
/* Ensure I am executing from virtual addresses */
@@ -279,6 +285,7 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
SYM_CODE_END(secondary_startup_64)
#include "verify_cpu.S"
+#include "sev_verify_cbit.S"
#ifdef CONFIG_HOTPLUG_CPU
/*
@@ -524,21 +531,19 @@ SYM_DATA_END(level3_kernel_pgt)
SYM_DATA_START_PAGE_ALIGNED(level2_kernel_pgt)
/*
- * 512 MB kernel mapping. We spend a full page on this pagetable
- * anyway.
+ * Kernel high mapping.
*
- * The kernel code+data+bss must not be bigger than that.
+ * The kernel code+data+bss must be located below KERNEL_IMAGE_SIZE in
+ * virtual address space, which is 1 GiB if RANDOMIZE_BASE is enabled,
+ * 512 MiB otherwise.
*
- * (NOTE: at +512MB starts the module area, see MODULES_VADDR.
- * If you want to increase this then increase MODULES_VADDR
- * too.)
+ * (NOTE: after that starts the module area, see MODULES_VADDR.)
*
- * This table is eventually used by the kernel during normal
- * runtime. Care must be taken to clear out undesired bits
- * later, like _PAGE_RW or _PAGE_GLOBAL in some cases.
+ * This table is eventually used by the kernel during normal runtime.
+ * Care must be taken to clear out undesired bits later, like _PAGE_RW
+ * or _PAGE_GLOBAL in some cases.
*/
- PMDS(0, __PAGE_KERNEL_LARGE_EXEC,
- KERNEL_IMAGE_SIZE/PMD_SIZE)
+ PMDS(0, __PAGE_KERNEL_LARGE_EXEC, KERNEL_IMAGE_SIZE/PMD_SIZE)
SYM_DATA_END(level2_kernel_pgt)
SYM_DATA_START_PAGE_ALIGNED(level2_fixmap_pgt)
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 7a50f0b62a70..08651a4e6aa0 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -7,6 +7,7 @@
#include <linux/cpu.h>
#include <linux/irq.h>
+#include <asm/irq_remapping.h>
#include <asm/hpet.h>
#include <asm/time.h>
@@ -50,7 +51,7 @@ unsigned long hpet_address;
u8 hpet_blockid; /* OS timer block num */
bool hpet_msi_disable;
-#ifdef CONFIG_PCI_MSI
+#ifdef CONFIG_GENERIC_MSI_IRQ
static DEFINE_PER_CPU(struct hpet_channel *, cpu_hpet_channel);
static struct irq_domain *hpet_domain;
#endif
@@ -467,9 +468,8 @@ static void __init hpet_legacy_clockevent_register(struct hpet_channel *hc)
/*
* HPET MSI Support
*/
-#ifdef CONFIG_PCI_MSI
-
-void hpet_msi_unmask(struct irq_data *data)
+#ifdef CONFIG_GENERIC_MSI_IRQ
+static void hpet_msi_unmask(struct irq_data *data)
{
struct hpet_channel *hc = irq_data_get_irq_handler_data(data);
unsigned int cfg;
@@ -479,7 +479,7 @@ void hpet_msi_unmask(struct irq_data *data)
hpet_writel(cfg, HPET_Tn_CFG(hc->num));
}
-void hpet_msi_mask(struct irq_data *data)
+static void hpet_msi_mask(struct irq_data *data)
{
struct hpet_channel *hc = irq_data_get_irq_handler_data(data);
unsigned int cfg;
@@ -489,12 +489,122 @@ void hpet_msi_mask(struct irq_data *data)
hpet_writel(cfg, HPET_Tn_CFG(hc->num));
}
-void hpet_msi_write(struct hpet_channel *hc, struct msi_msg *msg)
+static void hpet_msi_write(struct hpet_channel *hc, struct msi_msg *msg)
{
hpet_writel(msg->data, HPET_Tn_ROUTE(hc->num));
hpet_writel(msg->address_lo, HPET_Tn_ROUTE(hc->num) + 4);
}
+static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg)
+{
+ hpet_msi_write(irq_data_get_irq_handler_data(data), msg);
+}
+
+static struct irq_chip hpet_msi_controller __ro_after_init = {
+ .name = "HPET-MSI",
+ .irq_unmask = hpet_msi_unmask,
+ .irq_mask = hpet_msi_mask,
+ .irq_ack = irq_chip_ack_parent,
+ .irq_set_affinity = msi_domain_set_affinity,
+ .irq_retrigger = irq_chip_retrigger_hierarchy,
+ .irq_write_msi_msg = hpet_msi_write_msg,
+ .flags = IRQCHIP_SKIP_SET_WAKE,
+};
+
+static int hpet_msi_init(struct irq_domain *domain,
+ struct msi_domain_info *info, unsigned int virq,
+ irq_hw_number_t hwirq, msi_alloc_info_t *arg)
+{
+ irq_set_status_flags(virq, IRQ_MOVE_PCNTXT);
+ irq_domain_set_info(domain, virq, arg->hwirq, info->chip, NULL,
+ handle_edge_irq, arg->data, "edge");
+
+ return 0;
+}
+
+static void hpet_msi_free(struct irq_domain *domain,
+ struct msi_domain_info *info, unsigned int virq)
+{
+ irq_clear_status_flags(virq, IRQ_MOVE_PCNTXT);
+}
+
+static struct msi_domain_ops hpet_msi_domain_ops = {
+ .msi_init = hpet_msi_init,
+ .msi_free = hpet_msi_free,
+};
+
+static struct msi_domain_info hpet_msi_domain_info = {
+ .ops = &hpet_msi_domain_ops,
+ .chip = &hpet_msi_controller,
+ .flags = MSI_FLAG_USE_DEF_DOM_OPS,
+};
+
+static struct irq_domain *hpet_create_irq_domain(int hpet_id)
+{
+ struct msi_domain_info *domain_info;
+ struct irq_domain *parent, *d;
+ struct fwnode_handle *fn;
+ struct irq_fwspec fwspec;
+
+ if (x86_vector_domain == NULL)
+ return NULL;
+
+ domain_info = kzalloc(sizeof(*domain_info), GFP_KERNEL);
+ if (!domain_info)
+ return NULL;
+
+ *domain_info = hpet_msi_domain_info;
+ domain_info->data = (void *)(long)hpet_id;
+
+ fn = irq_domain_alloc_named_id_fwnode(hpet_msi_controller.name,
+ hpet_id);
+ if (!fn) {
+ kfree(domain_info);
+ return NULL;
+ }
+
+ fwspec.fwnode = fn;
+ fwspec.param_count = 1;
+ fwspec.param[0] = hpet_id;
+
+ parent = irq_find_matching_fwspec(&fwspec, DOMAIN_BUS_ANY);
+ if (!parent) {
+ irq_domain_free_fwnode(fn);
+ kfree(domain_info);
+ return NULL;
+ }
+ if (parent != x86_vector_domain)
+ hpet_msi_controller.name = "IR-HPET-MSI";
+
+ d = msi_create_irq_domain(fn, domain_info, parent);
+ if (!d) {
+ irq_domain_free_fwnode(fn);
+ kfree(domain_info);
+ }
+ return d;
+}
+
+static inline int hpet_dev_id(struct irq_domain *domain)
+{
+ struct msi_domain_info *info = msi_get_domain_info(domain);
+
+ return (int)(long)info->data;
+}
+
+static int hpet_assign_irq(struct irq_domain *domain, struct hpet_channel *hc,
+ int dev_num)
+{
+ struct irq_alloc_info info;
+
+ init_irq_alloc_info(&info, NULL);
+ info.type = X86_IRQ_ALLOC_TYPE_HPET;
+ info.data = hc;
+ info.devid = hpet_dev_id(domain);
+ info.hwirq = dev_num;
+
+ return irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, &info);
+}
+
static int hpet_clkevt_msi_resume(struct clock_event_device *evt)
{
struct hpet_channel *hc = clockevent_to_channel(evt);
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index 03aa33b58165..668a4a6533d9 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -269,6 +269,20 @@ static inline bool within_cpu_entry(unsigned long addr, unsigned long end)
CPU_ENTRY_AREA_TOTAL_SIZE))
return true;
+ /*
+ * When FSGSBASE is enabled, paranoid_entry() fetches the per-CPU
+ * GSBASE value via __per_cpu_offset or pcpu_unit_offsets.
+ */
+#ifdef CONFIG_SMP
+ if (within_area(addr, end, (unsigned long)__per_cpu_offset,
+ sizeof(unsigned long) * nr_cpu_ids))
+ return true;
+#else
+ if (within_area(addr, end, (unsigned long)&pcpu_unit_offsets,
+ sizeof(pcpu_unit_offsets)))
+ return true;
+#endif
+
for_each_possible_cpu(cpu) {
/* The original rw GDT is being used after load_direct_gdt() */
if (within_area(addr, end, (unsigned long)get_cpu_gdt_rw(cpu),
@@ -293,6 +307,14 @@ static inline bool within_cpu_entry(unsigned long addr, unsigned long end)
(unsigned long)&per_cpu(cpu_tlbstate, cpu),
sizeof(struct tlb_state)))
return true;
+
+ /*
+ * When in guest (X86_FEATURE_HYPERVISOR), local_db_save()
+ * will read the per-CPU cpu_dr7 before clearing the DR7 register.
+ */
+ if (within_area(addr, end, (unsigned long)&per_cpu(cpu_dr7, cpu),
+ sizeof(cpu_dr7)))
+ return true;
}
return false;
@@ -491,15 +513,12 @@ static int hw_breakpoint_handler(struct die_args *args)
struct perf_event *bp;
unsigned long *dr6_p;
unsigned long dr6;
+ bool bpx;
/* The DR6 value is pointed by args->err */
dr6_p = (unsigned long *)ERR_PTR(args->err);
dr6 = *dr6_p;
- /* If it's a single step, TRAP bits are random */
- if (dr6 & DR_STEP)
- return NOTIFY_DONE;
-
/* Do an early return if no trap bits are set in DR6 */
if ((dr6 & DR_TRAP_BITS) == 0)
return NOTIFY_DONE;
@@ -509,28 +528,29 @@ static int hw_breakpoint_handler(struct die_args *args)
if (likely(!(dr6 & (DR_TRAP0 << i))))
continue;
+ bp = this_cpu_read(bp_per_reg[i]);
+ if (!bp)
+ continue;
+
+ bpx = bp->hw.info.type == X86_BREAKPOINT_EXECUTE;
+
/*
- * The counter may be concurrently released but that can only
- * occur from a call_rcu() path. We can then safely fetch
- * the breakpoint, use its callback, touch its counter
- * while we are in an rcu_read_lock() path.
+ * TF and data breakpoints are traps and can be merged, however
+ * instruction breakpoints are faults and will be raised
+ * separately.
+ *
+ * However DR6 can indicate both TF and instruction
+ * breakpoints. In that case take TF as that has precedence and
+ * delay the instruction breakpoint for the next exception.
*/
- rcu_read_lock();
+ if (bpx && (dr6 & DR_STEP))
+ continue;
- bp = this_cpu_read(bp_per_reg[i]);
/*
* Reset the 'i'th TRAP bit in dr6 to denote completion of
* exception handling
*/
(*dr6_p) &= ~(DR_TRAP0 << i);
- /*
- * bp can be NULL due to lazy debug register switching
- * or due to concurrent perf counter removing.
- */
- if (!bp) {
- rcu_read_unlock();
- break;
- }
perf_bp_event(bp, args->regs);
@@ -538,11 +558,10 @@ static int hw_breakpoint_handler(struct die_args *args)
* Set up resume flag to avoid breakpoint recursion when
* returning back to origin.
*/
- if (bp->hw.info.type == X86_BREAKPOINT_EXECUTE)
+ if (bpx)
args->regs->flags |= X86_EFLAGS_RF;
-
- rcu_read_unlock();
}
+
/*
* Further processing in do_debug() is needed for a) user-space
* breakpoints (to generate signals) and b) when the system has
diff --git a/arch/x86/kernel/ima_arch.c b/arch/x86/kernel/ima_arch.c
deleted file mode 100644
index 7dfb1e808928..000000000000
--- a/arch/x86/kernel/ima_arch.c
+++ /dev/null
@@ -1,94 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0+ */
-/*
- * Copyright (C) 2018 IBM Corporation
- */
-#include <linux/efi.h>
-#include <linux/module.h>
-#include <linux/ima.h>
-
-extern struct boot_params boot_params;
-
-static enum efi_secureboot_mode get_sb_mode(void)
-{
- efi_guid_t efi_variable_guid = EFI_GLOBAL_VARIABLE_GUID;
- efi_status_t status;
- unsigned long size;
- u8 secboot, setupmode;
-
- size = sizeof(secboot);
-
- if (!efi_rt_services_supported(EFI_RT_SUPPORTED_GET_VARIABLE)) {
- pr_info("ima: secureboot mode unknown, no efi\n");
- return efi_secureboot_mode_unknown;
- }
-
- /* Get variable contents into buffer */
- status = efi.get_variable(L"SecureBoot", &efi_variable_guid,
- NULL, &size, &secboot);
- if (status == EFI_NOT_FOUND) {
- pr_info("ima: secureboot mode disabled\n");
- return efi_secureboot_mode_disabled;
- }
-
- if (status != EFI_SUCCESS) {
- pr_info("ima: secureboot mode unknown\n");
- return efi_secureboot_mode_unknown;
- }
-
- size = sizeof(setupmode);
- status = efi.get_variable(L"SetupMode", &efi_variable_guid,
- NULL, &size, &setupmode);
-
- if (status != EFI_SUCCESS) /* ignore unknown SetupMode */
- setupmode = 0;
-
- if (secboot == 0 || setupmode == 1) {
- pr_info("ima: secureboot mode disabled\n");
- return efi_secureboot_mode_disabled;
- }
-
- pr_info("ima: secureboot mode enabled\n");
- return efi_secureboot_mode_enabled;
-}
-
-bool arch_ima_get_secureboot(void)
-{
- static enum efi_secureboot_mode sb_mode;
- static bool initialized;
-
- if (!initialized && efi_enabled(EFI_BOOT)) {
- sb_mode = boot_params.secure_boot;
-
- if (sb_mode == efi_secureboot_mode_unset)
- sb_mode = get_sb_mode();
- initialized = true;
- }
-
- if (sb_mode == efi_secureboot_mode_enabled)
- return true;
- else
- return false;
-}
-
-/* secureboot arch rules */
-static const char * const sb_arch_rules[] = {
-#if !IS_ENABLED(CONFIG_KEXEC_SIG)
- "appraise func=KEXEC_KERNEL_CHECK appraise_type=imasig",
-#endif /* CONFIG_KEXEC_SIG */
- "measure func=KEXEC_KERNEL_CHECK",
-#if !IS_ENABLED(CONFIG_MODULE_SIG)
- "appraise func=MODULE_CHECK appraise_type=imasig",
-#endif
- "measure func=MODULE_CHECK",
- NULL
-};
-
-const char * const *arch_get_ima_policy(void)
-{
- if (IS_ENABLED(CONFIG_IMA_ARCH_POLICY) && arch_ima_get_secureboot()) {
- if (IS_ENABLED(CONFIG_MODULE_SIG))
- set_module_sig_enforced();
- return sb_arch_rules;
- }
- return NULL;
-}
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index c5dd50369e2f..d4ad344e80bf 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -21,6 +21,7 @@
#include <asm/hw_irq.h>
#include <asm/desc.h>
#include <asm/traps.h>
+#include <asm/thermal.h>
#define CREATE_TRACE_POINTS
#include <asm/trace/irq_vectors.h>
@@ -374,3 +375,23 @@ void fixup_irqs(void)
}
}
#endif
+
+#ifdef CONFIG_X86_THERMAL_VECTOR
+static void smp_thermal_vector(void)
+{
+ if (x86_thermal_enabled())
+ intel_thermal_interrupt();
+ else
+ pr_err("CPU%d: Unexpected LVT thermal interrupt!\n",
+ smp_processor_id());
+}
+
+DEFINE_IDTENTRY_SYSVEC(sysvec_thermal)
+{
+ trace_thermal_apic_entry(THERMAL_APIC_VECTOR);
+ inc_irq_stat(irq_thermal_count);
+ smp_thermal_vector();
+ trace_thermal_apic_exit(THERMAL_APIC_VECTOR);
+ ack_APIC_irq();
+}
+#endif
diff --git a/arch/x86/kernel/irqflags.S b/arch/x86/kernel/irqflags.S
index 0db0375235b4..8ef35063964b 100644
--- a/arch/x86/kernel/irqflags.S
+++ b/arch/x86/kernel/irqflags.S
@@ -13,14 +13,3 @@ SYM_FUNC_START(native_save_fl)
ret
SYM_FUNC_END(native_save_fl)
EXPORT_SYMBOL(native_save_fl)
-
-/*
- * void native_restore_fl(unsigned long flags)
- * %eax/%rdi: flags
- */
-SYM_FUNC_START(native_restore_fl)
- push %_ASM_ARG1
- popf
- ret
-SYM_FUNC_END(native_restore_fl)
-EXPORT_SYMBOL(native_restore_fl)
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 547c7abb39f5..df776cdca327 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -133,26 +133,6 @@ void synthesize_relcall(void *dest, void *from, void *to)
NOKPROBE_SYMBOL(synthesize_relcall);
/*
- * Skip the prefixes of the instruction.
- */
-static kprobe_opcode_t *skip_prefixes(kprobe_opcode_t *insn)
-{
- insn_attr_t attr;
-
- attr = inat_get_opcode_attribute((insn_byte_t)*insn);
- while (inat_is_legacy_prefix(attr)) {
- insn++;
- attr = inat_get_opcode_attribute((insn_byte_t)*insn);
- }
-#ifdef CONFIG_X86_64
- if (inat_is_rex_prefix(attr))
- insn++;
-#endif
- return insn;
-}
-NOKPROBE_SYMBOL(skip_prefixes);
-
-/*
* Returns non-zero if INSN is boostable.
* RIP relative instructions are adjusted at copying time in 64 bits mode
*/
@@ -312,25 +292,6 @@ static int can_probe(unsigned long paddr)
}
/*
- * Returns non-zero if opcode modifies the interrupt flag.
- */
-static int is_IF_modifier(kprobe_opcode_t *insn)
-{
- /* Skip prefixes */
- insn = skip_prefixes(insn);
-
- switch (*insn) {
- case 0xfa: /* cli */
- case 0xfb: /* sti */
- case 0xcf: /* iret/iretd */
- case 0x9d: /* popf/popfd */
- return 1;
- }
-
- return 0;
-}
-
-/*
* Copy an instruction with recovering modified instruction by kprobes
* and adjust the displacement if the instruction uses the %rip-relative
* addressing mode. Note that since @real will be the final place of copied
@@ -411,9 +372,9 @@ static int prepare_boost(kprobe_opcode_t *buf, struct kprobe *p,
synthesize_reljump(buf + len, p->ainsn.insn + len,
p->addr + insn->length);
len += JMP32_INSN_SIZE;
- p->ainsn.boostable = true;
+ p->ainsn.boostable = 1;
} else {
- p->ainsn.boostable = false;
+ p->ainsn.boostable = 0;
}
return len;
@@ -450,6 +411,67 @@ void free_insn_page(void *page)
module_memfree(page);
}
+static void set_resume_flags(struct kprobe *p, struct insn *insn)
+{
+ insn_byte_t opcode = insn->opcode.bytes[0];
+
+ switch (opcode) {
+ case 0xfa: /* cli */
+ case 0xfb: /* sti */
+ case 0x9d: /* popf/popfd */
+ /* Check whether the instruction modifies Interrupt Flag or not */
+ p->ainsn.if_modifier = 1;
+ break;
+ case 0x9c: /* pushfl */
+ p->ainsn.is_pushf = 1;
+ break;
+ case 0xcf: /* iret */
+ p->ainsn.if_modifier = 1;
+ fallthrough;
+ case 0xc2: /* ret/lret */
+ case 0xc3:
+ case 0xca:
+ case 0xcb:
+ case 0xea: /* jmp absolute -- ip is correct */
+ /* ip is already adjusted, no more changes required */
+ p->ainsn.is_abs_ip = 1;
+ /* Without resume jump, this is boostable */
+ p->ainsn.boostable = 1;
+ break;
+ case 0xe8: /* call relative - Fix return addr */
+ p->ainsn.is_call = 1;
+ break;
+#ifdef CONFIG_X86_32
+ case 0x9a: /* call absolute -- same as call absolute, indirect */
+ p->ainsn.is_call = 1;
+ p->ainsn.is_abs_ip = 1;
+ break;
+#endif
+ case 0xff:
+ opcode = insn->opcode.bytes[1];
+ if ((opcode & 0x30) == 0x10) {
+ /*
+ * call absolute, indirect
+ * Fix return addr; ip is correct.
+ * But this is not boostable
+ */
+ p->ainsn.is_call = 1;
+ p->ainsn.is_abs_ip = 1;
+ break;
+ } else if (((opcode & 0x31) == 0x20) ||
+ ((opcode & 0x31) == 0x21)) {
+ /*
+ * jmp near and far, absolute indirect
+ * ip is correct.
+ */
+ p->ainsn.is_abs_ip = 1;
+ /* Without resume jump, this is boostable */
+ p->ainsn.boostable = 1;
+ }
+ break;
+ }
+}
+
static int arch_copy_kprobe(struct kprobe *p)
{
struct insn insn;
@@ -467,8 +489,8 @@ static int arch_copy_kprobe(struct kprobe *p)
*/
len = prepare_boost(buf, p, &insn);
- /* Check whether the instruction modifies Interrupt Flag or not */
- p->ainsn.if_modifier = is_IF_modifier(buf);
+ /* Analyze the opcode and set resume flags */
+ set_resume_flags(p, &insn);
/* Also, displacement change doesn't affect the first byte */
p->opcode = buf[0];
@@ -491,6 +513,9 @@ int arch_prepare_kprobe(struct kprobe *p)
if (!can_probe((unsigned long)p->addr))
return -EILSEQ;
+
+ memset(&p->ainsn, 0, sizeof(p->ainsn));
+
/* insn: must be on special executable page on x86. */
p->ainsn.insn = get_insn_slot();
if (!p->ainsn.insn)
@@ -806,11 +831,6 @@ NOKPROBE_SYMBOL(trampoline_handler);
* 2) If the single-stepped instruction was a call, the return address
* that is atop the stack is the address following the copied instruction.
* We need to make it the address following the original instruction.
- *
- * If this is the first time we've single-stepped the instruction at
- * this probepoint, and the instruction is boostable, boost it: add a
- * jump instruction after the copied instruction, that jumps to the next
- * instruction after the probepoint.
*/
static void resume_execution(struct kprobe *p, struct pt_regs *regs,
struct kprobe_ctlblk *kcb)
@@ -818,59 +838,20 @@ static void resume_execution(struct kprobe *p, struct pt_regs *regs,
unsigned long *tos = stack_addr(regs);
unsigned long copy_ip = (unsigned long)p->ainsn.insn;
unsigned long orig_ip = (unsigned long)p->addr;
- kprobe_opcode_t *insn = p->ainsn.insn;
-
- /* Skip prefixes */
- insn = skip_prefixes(insn);
regs->flags &= ~X86_EFLAGS_TF;
- switch (*insn) {
- case 0x9c: /* pushfl */
+
+ /* Fixup the contents of top of stack */
+ if (p->ainsn.is_pushf) {
*tos &= ~(X86_EFLAGS_TF | X86_EFLAGS_IF);
*tos |= kcb->kprobe_old_flags;
- break;
- case 0xc2: /* iret/ret/lret */
- case 0xc3:
- case 0xca:
- case 0xcb:
- case 0xcf:
- case 0xea: /* jmp absolute -- ip is correct */
- /* ip is already adjusted, no more changes required */
- p->ainsn.boostable = true;
- goto no_change;
- case 0xe8: /* call relative - Fix return addr */
+ } else if (p->ainsn.is_call) {
*tos = orig_ip + (*tos - copy_ip);
- break;
-#ifdef CONFIG_X86_32
- case 0x9a: /* call absolute -- same as call absolute, indirect */
- *tos = orig_ip + (*tos - copy_ip);
- goto no_change;
-#endif
- case 0xff:
- if ((insn[1] & 0x30) == 0x10) {
- /*
- * call absolute, indirect
- * Fix return addr; ip is correct.
- * But this is not boostable
- */
- *tos = orig_ip + (*tos - copy_ip);
- goto no_change;
- } else if (((insn[1] & 0x31) == 0x20) ||
- ((insn[1] & 0x31) == 0x21)) {
- /*
- * jmp near and far, absolute indirect
- * ip is correct. And this is boostable
- */
- p->ainsn.boostable = true;
- goto no_change;
- }
- default:
- break;
}
- regs->ip += orig_ip - copy_ip;
+ if (!p->ainsn.is_abs_ip)
+ regs->ip += orig_ip - copy_ip;
-no_change:
restore_btf();
}
NOKPROBE_SYMBOL(resume_execution);
@@ -937,6 +918,11 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
* So clear it by resetting the current kprobe:
*/
regs->flags &= ~X86_EFLAGS_TF;
+ /*
+ * Since the single step (trap) has been cancelled,
+ * we need to restore BTF here.
+ */
+ restore_btf();
/*
* If the TF flag was set before the kprobe hit,
diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c
index 681a4b36e9bb..373e5fa3ce1f 100644
--- a/arch/x86/kernel/kprobes/ftrace.c
+++ b/arch/x86/kernel/kprobes/ftrace.c
@@ -14,15 +14,21 @@
/* Ftrace callback handler for kprobes -- called with preemption disabled */
void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *ops, struct pt_regs *regs)
+ struct ftrace_ops *ops, struct ftrace_regs *fregs)
{
+ struct pt_regs *regs = ftrace_get_regs(fregs);
struct kprobe *p;
struct kprobe_ctlblk *kcb;
+ int bit;
- /* Preempt is disabled by ftrace */
+ bit = ftrace_test_recursion_trylock(ip, parent_ip);
+ if (bit < 0)
+ return;
+
+ preempt_disable_notrace();
p = get_kprobe((kprobe_opcode_t *)ip);
if (unlikely(!p) || kprobe_disabled(p))
- return;
+ goto out;
kcb = get_kprobe_ctlblk();
if (kprobe_running()) {
@@ -52,6 +58,9 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
*/
__this_cpu_write(current_kprobe, NULL);
}
+out:
+ preempt_enable_notrace();
+ ftrace_test_recursion_unlock(bit);
}
NOKPROBE_SYMBOL(kprobe_ftrace_handler);
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 041f0b50bc27..08eb23074f92 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -272,6 +272,19 @@ static int insn_is_indirect_jump(struct insn *insn)
return ret;
}
+static bool is_padding_int3(unsigned long addr, unsigned long eaddr)
+{
+ unsigned char ops;
+
+ for (; addr < eaddr; addr++) {
+ if (get_kernel_nofault(ops, (void *)addr) < 0 ||
+ ops != INT3_INSN_OPCODE)
+ return false;
+ }
+
+ return true;
+}
+
/* Decode whole function to ensure any instructions don't jump into target */
static int can_optimize(unsigned long paddr)
{
@@ -310,9 +323,14 @@ static int can_optimize(unsigned long paddr)
return 0;
kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE);
insn_get_length(&insn);
- /* Another subsystem puts a breakpoint */
+ /*
+ * In the case of detecting an unknown breakpoint, this could be
+ * a padding INT3 between functions. Let's check that all the
+ * rest of the bytes are also INT3.
+ */
if (insn.opcode.bytes[0] == INT3_INSN_OPCODE)
- return 0;
+ return is_padding_int3(addr, paddr - offset + size) ? 1 : 0;
+
/* Recover address */
insn.kaddr = (void *)addr;
insn.next_byte = (void *)(addr + insn.length);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 7f57ede3cb8e..5e78e01ca3b4 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -740,6 +740,11 @@ static void __init kvm_apic_init(void)
#endif
}
+static bool __init kvm_msi_ext_dest_id(void)
+{
+ return kvm_para_has_feature(KVM_FEATURE_MSI_EXT_DEST_ID);
+}
+
static void __init kvm_init_platform(void)
{
kvmclock_init();
@@ -769,6 +774,7 @@ const __initconst struct hypervisor_x86 x86_hyper_kvm = {
.type = X86_HYPER_KVM,
.init.guest_late_init = kvm_guest_init,
.init.x2apic_available = kvm_para_available,
+ .init.msi_ext_dest_id = kvm_msi_ext_dest_id,
.init.init_platform = kvm_init_platform,
#if defined(CONFIG_AMD_MEM_ENCRYPT)
.runtime.sev_es_hcall_prepare = kvm_sev_es_hcall_prepare,
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 34b18f6eeb2c..aa593743acf6 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -44,7 +44,6 @@ static int __init parse_no_kvmclock_vsyscall(char *arg)
early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);
/* Aligned to page sizes to match whats mapped via vsyscalls to userspace */
-#define HV_CLOCK_SIZE (sizeof(struct pvclock_vsyscall_time_info) * NR_CPUS)
#define HVC_BOOT_ARRAY_SIZE \
(PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info))
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index b8aee71840ae..aa15132228da 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -398,9 +398,15 @@ static void free_ldt_pgtables(struct mm_struct *mm)
if (!boot_cpu_has(X86_FEATURE_PTI))
return;
- tlb_gather_mmu(&tlb, mm, start, end);
+ /*
+ * Although free_pgd_range() is intended for freeing user
+ * page-tables, it also works out for kernel mappings on x86.
+ * We use tlb_gather_mmu_fullmm() to avoid confusing the
+ * range-tracking logic in __tlb_adjust_range().
+ */
+ tlb_gather_mmu_fullmm(&tlb, mm);
free_pgd_range(&tlb, start, end, start, end);
- tlb_finish_mmu(&tlb, start, end);
+ tlb_finish_mmu(&tlb);
#endif
}
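The ldt.c change above boils down to the generic full-mm mmu_gather pattern; a hedged sketch, assuming mm, start and end are set up by the caller:

	struct mmu_gather tlb;

	/* Full-mm gather: skips the per-range tracking in __tlb_adjust_range(). */
	tlb_gather_mmu_fullmm(&tlb, mm);
	/* Floor and ceiling equal the range here, as in free_ldt_pgtables(). */
	free_pgd_range(&tlb, start, end, start, end);
	/* Flush the TLB and free the gathered page-table pages. */
	tlb_finish_mmu(&tlb);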
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 34b153cbd4ac..5e9a34b5bd74 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -114,6 +114,7 @@ int apply_relocate(Elf32_Shdr *sechdrs,
*location += sym->st_value;
break;
case R_386_PC32:
+ case R_386_PLT32:
/* Add the value, subtract its position */
*location += sym->st_value - (uint32_t)location;
break;
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index c0d409810658..ed8ac6bcbafb 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -99,11 +99,9 @@ static int filter_write(u32 reg)
if (!__ratelimit(&fw_rs))
return 0;
- if (reg == MSR_IA32_ENERGY_PERF_BIAS)
- return 0;
-
- pr_err("Write to unrecognized MSR 0x%x by %s (pid: %d). Please report to x86@kernel.org.\n",
- reg, current->comm, current->pid);
+ pr_warn("Write to unrecognized MSR 0x%x by %s (pid: %d).\n",
+ reg, current->comm, current->pid);
+ pr_warn("See https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/about for details.\n");
return 0;
}
@@ -184,6 +182,13 @@ static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg)
err = security_locked_down(LOCKDOWN_MSR);
if (err)
break;
+
+ err = filter_write(regs[1]);
+ if (err)
+ return err;
+
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
+
err = wrmsr_safe_regs_on_cpu(cpu, regs);
if (err)
break;
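The filter_write() path above relies on the standard printk rate-limit helper; a minimal sketch of that pattern (the state name and the limits below are made up for illustration):

	#include <linux/jiffies.h>
	#include <linux/printk.h>
	#include <linux/ratelimit.h>
	#include <linux/types.h>

	/* Allow at most 5 messages every 30 seconds (illustrative values). */
	static DEFINE_RATELIMIT_STATE(example_rs, 30 * HZ, 5);

	static void warn_unrecognized_msr(u32 reg)
	{
		if (!__ratelimit(&example_rs))
			return;
		pr_warn("Write to unrecognized MSR 0x%x\n", reg);
	}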
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 4bc77aaf1303..bf250a339655 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -475,7 +475,7 @@ static DEFINE_PER_CPU(unsigned long, nmi_dr7);
DEFINE_IDTENTRY_RAW(exc_nmi)
{
- bool irq_state;
+ irqentry_state_t irq_state;
/*
* Re-enable NMIs right here when running as an SEV-ES guest. This might
@@ -502,14 +502,14 @@ nmi_restart:
this_cpu_write(nmi_dr7, local_db_save());
- irq_state = idtentry_enter_nmi(regs);
+ irq_state = irqentry_nmi_enter(regs);
inc_irq_stat(__nmi_count);
if (!ignore_nmis)
default_do_nmi(regs);
- idtentry_exit_nmi(regs, irq_state);
+ irqentry_nmi_exit(regs, irq_state);
local_db_restore(this_cpu_read(nmi_dr7));
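The idtentry_enter_nmi()/idtentry_exit_nmi() to irqentry_nmi_enter()/irqentry_nmi_exit() conversion above always follows the same bracket; a sketch with a hypothetical handler (exc_example is not a real IDT entry):

	DEFINE_IDTENTRY_RAW(exc_example)
	{
		irqentry_state_t irq_state;

		/* Enter NMI-like context: RCU, lockdep and tracing state are saved. */
		irq_state = irqentry_nmi_enter(regs);
		instrumentation_begin();

		/* ... handler body; instrumentable code is allowed only inside this bracket ... */

		instrumentation_end();
		irqentry_nmi_exit(regs, irq_state);
	}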
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 6c3407ba6ee9..c60222ab8ab9 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -135,8 +135,7 @@ unsigned paravirt_patch_default(u8 type, void *insn_buff,
else if (opfunc == _paravirt_ident_64)
ret = paravirt_patch_ident_64(insn_buff, len);
- else if (type == PARAVIRT_PATCH(cpu.iret) ||
- type == PARAVIRT_PATCH(cpu.usergs_sysret64))
+ else if (type == PARAVIRT_PATCH(cpu.iret))
/* If operation requires a jmp, then jmp */
ret = paravirt_patch_jmp(insn_buff, opfunc, addr, len);
#endif
@@ -170,7 +169,6 @@ static u64 native_steal_clock(int cpu)
/* These are in entry.S */
extern void native_iret(void);
-extern void native_usergs_sysret64(void);
static struct resource reserve_ioports = {
.start = 0,
@@ -310,9 +308,7 @@ struct paravirt_patch_template pv_ops = {
.cpu.load_sp0 = native_load_sp0,
- .cpu.usergs_sysret64 = native_usergs_sysret64,
.cpu.iret = native_iret,
- .cpu.swapgs = native_swapgs,
#ifdef CONFIG_X86_IOPL_IOPERM
.cpu.invalidate_io_bitmap = native_tss_invalidate_io_bitmap,
@@ -324,7 +320,6 @@ struct paravirt_patch_template pv_ops = {
/* Irq ops. */
.irq.save_fl = __PV_IS_CALLEE_SAVE(native_save_fl),
- .irq.restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl),
.irq.irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable),
.irq.irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable),
.irq.safe_halt = native_safe_halt,
diff --git a/arch/x86/kernel/paravirt_patch.c b/arch/x86/kernel/paravirt_patch.c
index ace6e334cb39..abd27ec67397 100644
--- a/arch/x86/kernel/paravirt_patch.c
+++ b/arch/x86/kernel/paravirt_patch.c
@@ -25,10 +25,7 @@ struct patch_xxl {
const unsigned char mmu_read_cr2[3];
const unsigned char mmu_read_cr3[3];
const unsigned char mmu_write_cr3[3];
- const unsigned char irq_restore_fl[2];
const unsigned char cpu_wbinvd[2];
- const unsigned char cpu_usergs_sysret64[6];
- const unsigned char cpu_swapgs[3];
const unsigned char mov64[3];
};
@@ -39,11 +36,7 @@ static const struct patch_xxl patch_data_xxl = {
.mmu_read_cr2 = { 0x0f, 0x20, 0xd0 }, // mov %cr2, %[re]ax
.mmu_read_cr3 = { 0x0f, 0x20, 0xd8 }, // mov %cr3, %[re]ax
.mmu_write_cr3 = { 0x0f, 0x22, 0xdf }, // mov %rdi, %cr3
- .irq_restore_fl = { 0x57, 0x9d }, // push %rdi; popfq
.cpu_wbinvd = { 0x0f, 0x09 }, // wbinvd
- .cpu_usergs_sysret64 = { 0x0f, 0x01, 0xf8,
- 0x48, 0x0f, 0x07 }, // swapgs; sysretq
- .cpu_swapgs = { 0x0f, 0x01, 0xf8 }, // swapgs
.mov64 = { 0x48, 0x89, 0xf8 }, // mov %rdi, %rax
};
@@ -76,7 +69,6 @@ unsigned int native_patch(u8 type, void *insn_buff, unsigned long addr,
switch (type) {
#ifdef CONFIG_PARAVIRT_XXL
- PATCH_CASE(irq, restore_fl, xxl, insn_buff, len);
PATCH_CASE(irq, save_fl, xxl, insn_buff, len);
PATCH_CASE(irq, irq_enable, xxl, insn_buff, len);
PATCH_CASE(irq, irq_disable, xxl, insn_buff, len);
@@ -85,8 +77,6 @@ unsigned int native_patch(u8 type, void *insn_buff, unsigned long addr,
PATCH_CASE(mmu, read_cr3, xxl, insn_buff, len);
PATCH_CASE(mmu, write_cr3, xxl, insn_buff, len);
- PATCH_CASE(cpu, usergs_sysret64, xxl, insn_buff, len);
- PATCH_CASE(cpu, swapgs, xxl, insn_buff, len);
PATCH_CASE(cpu, wbinvd, xxl, insn_buff, len);
#endif
diff --git a/arch/x86/kernel/pci-iommu_table.c b/arch/x86/kernel/pci-iommu_table.c
index 2e9006c1e240..42e92ec62973 100644
--- a/arch/x86/kernel/pci-iommu_table.c
+++ b/arch/x86/kernel/pci-iommu_table.c
@@ -4,9 +4,6 @@
#include <linux/string.h>
#include <linux/kallsyms.h>
-
-#define DEBUG 1
-
static struct iommu_table_entry * __init
find_dependents_of(struct iommu_table_entry *start,
struct iommu_table_entry *finish,
diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c
index bb7e1132290b..624703af80a1 100644
--- a/arch/x86/kernel/perf_regs.c
+++ b/arch/x86/kernel/perf_regs.c
@@ -101,8 +101,7 @@ u64 perf_reg_abi(struct task_struct *task)
}
void perf_get_regs_user(struct perf_regs *regs_user,
- struct pt_regs *regs,
- struct pt_regs *regs_user_copy)
+ struct pt_regs *regs)
{
regs_user->regs = task_pt_regs(current);
regs_user->abi = perf_reg_abi(current);
@@ -123,18 +122,26 @@ int perf_reg_validate(u64 mask)
u64 perf_reg_abi(struct task_struct *task)
{
- if (test_tsk_thread_flag(task, TIF_IA32))
+ if (!user_64bit_mode(task_pt_regs(task)))
return PERF_SAMPLE_REGS_ABI_32;
else
return PERF_SAMPLE_REGS_ABI_64;
}
+static DEFINE_PER_CPU(struct pt_regs, nmi_user_regs);
+
void perf_get_regs_user(struct perf_regs *regs_user,
- struct pt_regs *regs,
- struct pt_regs *regs_user_copy)
+ struct pt_regs *regs)
{
+ struct pt_regs *regs_user_copy = this_cpu_ptr(&nmi_user_regs);
struct pt_regs *user_regs = task_pt_regs(current);
+ if (!in_nmi()) {
+ regs_user->regs = user_regs;
+ regs_user->abi = perf_reg_abi(current);
+ return;
+ }
+
/*
* If we're in an NMI that interrupted task_pt_regs setup, then
* we can't sample user regs at all. This check isn't really
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ba4593a913fa..145a7ac0c19a 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -685,7 +685,7 @@ void arch_cpu_idle(void)
*/
void __cpuidle default_idle(void)
{
- safe_halt();
+ raw_safe_halt();
}
#if defined(CONFIG_APM_MODULE) || defined(CONFIG_HALTPOLL_CPUIDLE_MODULE)
EXPORT_SYMBOL(default_idle);
@@ -736,6 +736,8 @@ void stop_this_cpu(void *dummy)
/*
* AMD Erratum 400 aware idle routine. We handle it the same way as C3 power
* states (local apic timer and TSC stop).
+ *
+ * XXX this function is completely buggered vs RCU and tracing.
*/
static void amd_e400_idle(void)
{
@@ -757,9 +759,9 @@ static void amd_e400_idle(void)
* The switch back from broadcast mode needs to be called with
* interrupts disabled.
*/
- local_irq_disable();
+ raw_local_irq_disable();
tick_broadcast_exit();
- local_irq_enable();
+ raw_local_irq_enable();
}
/*
@@ -801,9 +803,9 @@ static __cpuidle void mwait_idle(void)
if (!need_resched())
__sti_mwait(0, 0);
else
- local_irq_enable();
+ raw_local_irq_enable();
} else {
- local_irq_enable();
+ raw_local_irq_enable();
}
__current_clr_polling();
}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index df342bedea88..ad582f9ac5a6 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -511,11 +511,10 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
EXPORT_SYMBOL_GPL(start_thread);
#ifdef CONFIG_COMPAT
-void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp)
+void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp, bool x32)
{
start_thread_common(regs, new_ip, new_sp,
- test_thread_flag(TIF_X32)
- ? __USER_CS : __USER32_CS,
+ x32 ? __USER_CS : __USER32_CS,
__USER_DS, __USER_DS);
}
#endif
@@ -641,16 +640,12 @@ void set_personality_64bit(void)
/* inherit personality from parent */
/* Make sure to be in 64bit mode */
- clear_thread_flag(TIF_IA32);
clear_thread_flag(TIF_ADDR32);
- clear_thread_flag(TIF_X32);
/* Pretend that this comes from a 64bit execve */
task_pt_regs(current)->orig_ax = __NR_execve;
current_thread_info()->status &= ~TS_COMPAT;
-
- /* Ensure the corresponding mm is not marked. */
if (current->mm)
- current->mm->context.ia32_compat = 0;
+ current->mm->context.flags = MM_CONTEXT_HAS_VSYSCALL;
/* TBD: overwrites user setup. Should have two bits.
But 64bit processes have always behaved this way,
@@ -662,10 +657,9 @@ void set_personality_64bit(void)
static void __set_personality_x32(void)
{
#ifdef CONFIG_X86_X32
- clear_thread_flag(TIF_IA32);
- set_thread_flag(TIF_X32);
if (current->mm)
- current->mm->context.ia32_compat = TIF_X32;
+ current->mm->context.flags = 0;
+
current->personality &= ~READ_IMPLIES_EXEC;
/*
* in_32bit_syscall() uses the presence of the x32 syscall bit
@@ -683,10 +677,14 @@ static void __set_personality_x32(void)
static void __set_personality_ia32(void)
{
#ifdef CONFIG_IA32_EMULATION
- set_thread_flag(TIF_IA32);
- clear_thread_flag(TIF_X32);
- if (current->mm)
- current->mm->context.ia32_compat = TIF_IA32;
+ if (current->mm) {
+ /*
+ * uprobes applied to this MM need to know this and
+ * cannot use user_64bit_mode() at that time.
+ */
+ current->mm->context.flags = MM_CONTEXT_UPROBE_IA32;
+ }
+
current->personality |= force_personality32;
/* Prepare the first "return" to user space */
task_pt_regs(current)->orig_ax = __NR_ia32_execve;
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index bedca011459c..87a4143aa7d7 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -704,6 +704,9 @@ void ptrace_disable(struct task_struct *child)
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
static const struct user_regset_view user_x86_32_view; /* Initialized below. */
#endif
+#ifdef CONFIG_X86_64
+static const struct user_regset_view user_x86_64_view; /* Initialized below. */
+#endif
long arch_ptrace(struct task_struct *child, long request,
unsigned long addr, unsigned long data)
@@ -711,6 +714,14 @@ long arch_ptrace(struct task_struct *child, long request,
int ret;
unsigned long __user *datap = (unsigned long __user *)data;
+#ifdef CONFIG_X86_64
+ /* This is native 64-bit ptrace() */
+ const struct user_regset_view *regset_view = &user_x86_64_view;
+#else
+ /* This is native 32-bit ptrace() */
+ const struct user_regset_view *regset_view = &user_x86_32_view;
+#endif
+
switch (request) {
/* read the word at location addr in the USER area. */
case PTRACE_PEEKUSR: {
@@ -749,28 +760,28 @@ long arch_ptrace(struct task_struct *child, long request,
case PTRACE_GETREGS: /* Get all gp regs from the child. */
return copy_regset_to_user(child,
- task_user_regset_view(current),
+ regset_view,
REGSET_GENERAL,
0, sizeof(struct user_regs_struct),
datap);
case PTRACE_SETREGS: /* Set all gp regs in the child. */
return copy_regset_from_user(child,
- task_user_regset_view(current),
+ regset_view,
REGSET_GENERAL,
0, sizeof(struct user_regs_struct),
datap);
case PTRACE_GETFPREGS: /* Get the child FPU state. */
return copy_regset_to_user(child,
- task_user_regset_view(current),
+ regset_view,
REGSET_FP,
0, sizeof(struct user_i387_struct),
datap);
case PTRACE_SETFPREGS: /* Set the child FPU state. */
return copy_regset_from_user(child,
- task_user_regset_view(current),
+ regset_view,
REGSET_FP,
0, sizeof(struct user_i387_struct),
datap);
@@ -1152,28 +1163,28 @@ static long x32_arch_ptrace(struct task_struct *child,
case PTRACE_GETREGS: /* Get all gp regs from the child. */
return copy_regset_to_user(child,
- task_user_regset_view(current),
+ &user_x86_64_view,
REGSET_GENERAL,
0, sizeof(struct user_regs_struct),
datap);
case PTRACE_SETREGS: /* Set all gp regs in the child. */
return copy_regset_from_user(child,
- task_user_regset_view(current),
+ &user_x86_64_view,
REGSET_GENERAL,
0, sizeof(struct user_regs_struct),
datap);
case PTRACE_GETFPREGS: /* Get the child FPU state. */
return copy_regset_to_user(child,
- task_user_regset_view(current),
+ &user_x86_64_view,
REGSET_FP,
0, sizeof(struct user_i387_struct),
datap);
case PTRACE_SETFPREGS: /* Set the child FPU state. */
return copy_regset_from_user(child,
- task_user_regset_view(current),
+ &user_x86_64_view,
REGSET_FP,
0, sizeof(struct user_i387_struct),
datap);
@@ -1309,6 +1320,25 @@ void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask)
xstate_fx_sw_bytes[USER_XSTATE_XCR0_WORD] = xstate_mask;
}
+/*
+ * This is used by the core dump code to decide which regset to dump. The
+ * core dump code writes out the resulting .e_machine and the corresponding
+ * regsets. This is suboptimal if the task is messing around with its CS.L
+ * field, but at worst the core dump will end up missing some information.
+ *
+ * Unfortunately, it is also used by the broken PTRACE_GETREGSET and
+ * PTRACE_SETREGSET APIs. These APIs look at the .regsets field but have
+ * no way to make sure that the e_machine they use matches the caller's
+ * expectations. The result is that the data format returned by
+ * PTRACE_GETREGSET depends on the returned CS field (and even the offset
+ * of the returned CS field depends on its value!) and the data format
+ * accepted by PTRACE_SETREGSET is determined by the old CS value. The
+ * upshot is that it is basically impossible to use these APIs correctly.
+ *
+ * The best way to fix it in the long run would probably be to add new
+ * improved ptrace() APIs to read and write registers reliably, possibly by
+ * allowing userspace to select the ELF e_machine variant that they expect.
+ */
const struct user_regset_view *task_user_regset_view(struct task_struct *task)
{
#ifdef CONFIG_IA32_EMULATION
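The comment above concerns the regset view used by PTRACE_GETREGSET/PTRACE_SETREGSET; for context, a user-space caller typically looks like the hedged sketch below (error handling trimmed; pid is assumed to be an attached, stopped tracee):

	#include <elf.h>		/* NT_PRSTATUS */
	#include <sys/ptrace.h>
	#include <sys/types.h>
	#include <sys/uio.h>
	#include <sys/user.h>		/* struct user_regs_struct */

	/* Read the tracee's general-purpose registers. Which layout the kernel
	 * returns depends on the regset view it picks for the tracee, which is
	 * exactly the ambiguity the comment above describes. */
	static long read_gp_regs(pid_t pid, struct user_regs_struct *regs)
	{
		struct iovec iov = {
			.iov_base = regs,
			.iov_len  = sizeof(*regs),
		};

		return ptrace(PTRACE_GETREGSET, pid, (void *)NT_PRSTATUS, &iov);
	}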
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index db115943e8bd..9991c5920aac 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -477,6 +477,15 @@ static const struct dmi_system_id reboot_dmi_table[] __initconst = {
},
},
+ { /* PCIe Wifi card isn't detected after reboot otherwise */
+ .callback = set_pci_reboot,
+ .ident = "Zotac ZBOX CI327 nano",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "NA"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "ZBOX-CI327NANO-GS-01"),
+ },
+ },
+
/* Sony */
{ /* Handle problems with rebooting on Sony VGN-Z540N */
.callback = set_bios_reboot,
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 84f581c91db4..740f3bdb3f61 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -119,11 +119,6 @@ EXPORT_SYMBOL(boot_cpu_data);
unsigned int def_to_bigsmp;
-/* For MCA, but anyone else can use it if they want */
-unsigned int machine_id;
-unsigned int machine_submodel_id;
-unsigned int BIOS_revision;
-
struct apm_info apm_info;
EXPORT_SYMBOL(apm_info);
@@ -1054,6 +1049,12 @@ void __init setup_arch(char **cmdline_p)
memblock_set_current_limit(ISA_END_ADDRESS);
e820__memblock_setup();
+ /*
+ * Needs to run after memblock setup because it needs the physical
+ * memory size.
+ */
+ sev_setup_arch();
+
reserve_bios_regions();
efi_fake_memmap();
diff --git a/arch/x86/kernel/sev-es-shared.c b/arch/x86/kernel/sev-es-shared.c
index 5f83ccaab877..cdc04d091242 100644
--- a/arch/x86/kernel/sev-es-shared.c
+++ b/arch/x86/kernel/sev-es-shared.c
@@ -178,6 +178,32 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code)
goto fail;
regs->dx = val >> 32;
+ /*
+ * This is a VC handler and the #VC is only raised when SEV-ES is
+ * active, which means SEV must be active too. Do sanity checks on the
+ * CPUID results to make sure the hypervisor does not trick the kernel
+ * into the no-sev path. This could map sensitive data unencrypted and
+ * make it accessible to the hypervisor.
+ *
+ * In particular, check for:
+ * - Hypervisor CPUID bit
+ * - Availability of CPUID leaf 0x8000001f
+ * - SEV CPUID bit.
+ *
+ * The hypervisor might still report the wrong C-bit position, but this
+ * can't be checked here.
+ */
+
+ if ((fn == 1 && !(regs->cx & BIT(31))))
+ /* Hypervisor bit */
+ goto fail;
+ else if (fn == 0x80000000 && (regs->ax < 0x8000001f))
+ /* SEV leaf check */
+ goto fail;
+ else if ((fn == 0x8000001f && !(regs->ax & BIT(1))))
+ /* SEV bit */
+ goto fail;
+
/* Skip over the CPUID two-byte opcode */
regs->ip += 2;
@@ -279,14 +305,14 @@ static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo)
case 0xe4:
case 0xe5:
*exitinfo |= IOIO_TYPE_IN;
- *exitinfo |= (u64)insn->immediate.value << 16;
+ *exitinfo |= (u8)insn->immediate.value << 16;
break;
/* OUT immediate opcodes */
case 0xe6:
case 0xe7:
*exitinfo |= IOIO_TYPE_OUT;
- *exitinfo |= (u64)insn->immediate.value << 16;
+ *exitinfo |= (u8)insn->immediate.value << 16;
break;
/* IN register opcodes */
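The (u64) to (u8) cast change above matters if the instruction decoder stores the one-byte immediate sign-extended, which is what makes the truncation necessary. A stand-alone illustration in user-space C (the values are made up):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		/* A sign-extended imm8, e.g. for "out %al, $0xf1". */
		int32_t imm = (int8_t)0xf1;			/* == 0xfffffff1 */

		uint64_t wrong = (uint64_t)imm << 16;		/* 0xfffffffffff10000 */
		uint64_t right = (uint64_t)(uint8_t)imm << 16;	/* 0x0000000000f10000 */

		printf("wrong=%#llx right=%#llx\n",
		       (unsigned long long)wrong, (unsigned long long)right);
		return 0;
	}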
diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev-es.c
index 4a96726fbaf8..84c1821819af 100644
--- a/arch/x86/kernel/sev-es.c
+++ b/arch/x86/kernel/sev-es.c
@@ -225,7 +225,7 @@ static inline u64 sev_es_rd_ghcb_msr(void)
return __rdmsr(MSR_AMD64_SEV_ES_GHCB);
}
-static inline void sev_es_wr_ghcb_msr(u64 val)
+static __always_inline void sev_es_wr_ghcb_msr(u64 val)
{
u32 low, high;
@@ -286,6 +286,12 @@ static enum es_result vc_write_mem(struct es_em_ctxt *ctxt,
u16 d2;
u8 d1;
+ /* If instruction ran in kernel mode and the I/O buffer is in kernel space */
+ if (!user_mode(ctxt->regs) && !access_ok(target, size)) {
+ memcpy(dst, buf, size);
+ return ES_OK;
+ }
+
switch (size) {
case 1:
memcpy(&d1, buf, 1);
@@ -335,6 +341,12 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
u16 d2;
u8 d1;
+ /* If instruction ran in kernel mode and the I/O buffer is in kernel space */
+ if (!user_mode(ctxt->regs) && !access_ok(s, size)) {
+ memcpy(buf, src, size);
+ return ES_OK;
+ }
+
switch (size) {
case 1:
if (get_user(d1, s))
@@ -374,8 +386,8 @@ fault:
return ES_EXCEPTION;
}
-static bool vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
- unsigned long vaddr, phys_addr_t *paddr)
+static enum es_result vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
+ unsigned long vaddr, phys_addr_t *paddr)
{
unsigned long va = (unsigned long)vaddr;
unsigned int level;
@@ -394,15 +406,19 @@ static bool vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
if (user_mode(ctxt->regs))
ctxt->fi.error_code |= X86_PF_USER;
- return false;
+ return ES_EXCEPTION;
}
+ if (WARN_ON_ONCE(pte_val(*pte) & _PAGE_ENC))
+ /* Emulated MMIO to/from encrypted memory not supported */
+ return ES_UNSUPPORTED;
+
pa = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT;
pa |= va & ~page_level_mask(level);
*paddr = pa;
- return true;
+ return ES_OK;
}
/* Include code shared with pre-decompression boot stage */
@@ -731,6 +747,7 @@ static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
{
u64 exit_code, exit_info_1, exit_info_2;
unsigned long ghcb_pa = __pa(ghcb);
+ enum es_result res;
phys_addr_t paddr;
void __user *ref;
@@ -740,11 +757,12 @@ static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
exit_code = read ? SVM_VMGEXIT_MMIO_READ : SVM_VMGEXIT_MMIO_WRITE;
- if (!vc_slow_virt_to_phys(ghcb, ctxt, (unsigned long)ref, &paddr)) {
- if (!read)
+ res = vc_slow_virt_to_phys(ghcb, ctxt, (unsigned long)ref, &paddr);
+ if (res != ES_OK) {
+ if (res == ES_EXCEPTION && !read)
ctxt->fi.error_code |= X86_PF_WRITE;
- return ES_EXCEPTION;
+ return res;
}
exit_info_1 = paddr;
diff --git a/arch/x86/kernel/sev_verify_cbit.S b/arch/x86/kernel/sev_verify_cbit.S
new file mode 100644
index 000000000000..ee04941a6546
--- /dev/null
+++ b/arch/x86/kernel/sev_verify_cbit.S
@@ -0,0 +1,89 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * sev_verify_cbit.S - Code for verification of the C-bit position reported
+ * by the Hypervisor when running with SEV enabled.
+ *
+ * Copyright (c) 2020 Joerg Roedel (jroedel@suse.de)
+ *
+ * sev_verify_cbit() is called before switching to a new long-mode page-table
+ * at boot.
+ *
+ * It verifies that the C-bit position is correct by writing a random value to
+ * an encrypted memory location while still on the current page-table. It then
+ * switches to the new page-table and verifies that the memory content is still
+ * the same. After that it switches back to the current page-table and, if the
+ * check succeeded, returns. If the check failed, the code invalidates the
+ * stack pointer and goes into a hlt loop. The stack pointer is invalidated to
+ * make sure no interrupt or exception can get the CPU out of the hlt loop.
+ *
+ * New page-table pointer is expected in %rdi (first parameter)
+ *
+ */
+SYM_FUNC_START(sev_verify_cbit)
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ /* First check if a C-bit was detected */
+ movq sme_me_mask(%rip), %rsi
+ testq %rsi, %rsi
+ jz 3f
+
+ /* sme_me_mask != 0 could mean SME or SEV - Check also for SEV */
+ movq sev_status(%rip), %rsi
+ testq %rsi, %rsi
+ jz 3f
+
+ /* Save CR4 in %rsi */
+ movq %cr4, %rsi
+
+ /* Disable Global Pages */
+ movq %rsi, %rdx
+ andq $(~X86_CR4_PGE), %rdx
+ movq %rdx, %cr4
+
+ /*
+ * Now that we have verified we are running under SEV, get a random value using
+ * RDRAND. This instruction is mandatory when running as an SEV guest.
+ *
+ * Don't bail out of the loop if RDRAND returns errors. It is better to
+ * prevent forward progress than to work with a non-random value here.
+ */
+1: rdrand %rdx
+ jnc 1b
+
+ /* Store value to memory and keep it in %rdx */
+ movq %rdx, sev_check_data(%rip)
+
+ /* Backup current %cr3 value to restore it later */
+ movq %cr3, %rcx
+
+ /* Switch to new %cr3 - This might unmap the stack */
+ movq %rdi, %cr3
+
+ /*
+ * Compare value in %rdx with memory location. If C-bit is incorrect
+ * this would read the encrypted data and make the check fail.
+ */
+ cmpq %rdx, sev_check_data(%rip)
+
+ /* Restore old %cr3 */
+ movq %rcx, %cr3
+
+ /* Restore previous CR4 */
+ movq %rsi, %cr4
+
+ /* Check CMPQ result */
+ je 3f
+
+ /*
+ * The check failed. To prevent any forward progress and to guard
+ * against ROP attacks, invalidate the stack and go into a hlt loop.
+ */
+ xorq %rsp, %rsp
+ subq $0x1000, %rsp
+2: hlt
+ jmp 2b
+3:
+#endif
+ /* Return page-table pointer */
+ movq %rdi, %rax
+ ret
+SYM_FUNC_END(sev_verify_cbit)
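For readability, the check implemented in the assembly above can be paraphrased in C roughly as follows. This is an illustrative sketch only: the _sketch() accessors are made-up names, and the real code deliberately stays in assembly because the stack may be unmapped under the new page-table.

	/* Illustrative C paraphrase of sev_verify_cbit() -- not real kernel code. */
	static unsigned long sev_verify_cbit_sketch(unsigned long new_cr3)
	{
		unsigned long old_cr3, cr4;
		u64 random;

		/* Nothing to do unless this is an SEV guest. */
		if (!sme_me_mask || !sev_status)
			return new_cr3;

		cr4 = read_cr4_sketch();
		write_cr4_sketch(cr4 & ~X86_CR4_PGE);	/* drop global pages */

		/* RDRAND is mandatory for SEV guests; spin rather than give up. */
		while (!rdrand_sketch(&random))
			;

		sev_check_data = random;		/* store through the trusted mapping */

		old_cr3 = read_cr3_sketch();
		write_cr3_sketch(new_cr3);		/* re-read through the new mapping */
		if (sev_check_data != random)
			hlt_forever_sketch();		/* wrong C-bit: never return */
		write_cr3_sketch(old_cr3);
		write_cr4_sketch(cr4);

		return new_cr3;
	}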
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index be0d7d4152ec..ea794a083c44 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -804,11 +804,11 @@ static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs)
* want to handle. Thus you cannot kill init even with a SIGKILL even by
* mistake.
*/
-void arch_do_signal(struct pt_regs *regs)
+void arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal)
{
struct ksignal ksig;
- if (get_signal(&ksig)) {
+ if (has_signal && get_signal(&ksig)) {
/* Whee! Actually deliver the signal. */
handle_signal(&ksig, regs);
return;
diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c
index a7f3e12cfbdb..a5330ff498f0 100644
--- a/arch/x86/kernel/signal_compat.c
+++ b/arch/x86/kernel/signal_compat.c
@@ -31,7 +31,7 @@ static inline void signal_compat_build_tests(void)
BUILD_BUG_ON(NSIGBUS != 5);
BUILD_BUG_ON(NSIGTRAP != 5);
BUILD_BUG_ON(NSIGCHLD != 6);
- BUILD_BUG_ON(NSIGSYS != 1);
+ BUILD_BUG_ON(NSIGSYS != 2);
/* This is part of the ABI and can never change in size: */
BUILD_BUG_ON(sizeof(compat_siginfo_t) != 128);
@@ -165,16 +165,9 @@ void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact)
{
signal_compat_build_tests();
- /* Don't leak in-kernel non-uapi flags to user-space */
- if (oact)
- oact->sa.sa_flags &= ~(SA_IA32_ABI | SA_X32_ABI);
-
if (!act)
return;
- /* Don't let flags to be set from userspace */
- act->sa.sa_flags &= ~(SA_IA32_ABI | SA_X32_ABI);
-
if (in_ia32_syscall())
act->sa.sa_flags |= SA_IA32_ABI;
if (in_x32_syscall())
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index de776b2e6046..02813a7f3a7c 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -56,6 +56,7 @@
#include <linux/numa.h>
#include <linux/pgtable.h>
#include <linux/overflow.h>
+#include <linux/syscore_ops.h>
#include <asm/acpi.h>
#include <asm/desc.h>
@@ -82,6 +83,10 @@
#include <asm/hw_irq.h>
#include <asm/stackprotector.h>
+#ifdef CONFIG_ACPI_CPPC_LIB
+#include <acpi/cppc_acpi.h>
+#endif
+
/* representing HT siblings of each logical CPU */
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
@@ -148,7 +153,7 @@ static inline void smpboot_restore_warm_reset_vector(void)
*((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0;
}
-static void init_freq_invariance(bool secondary);
+static void init_freq_invariance(bool secondary, bool cppc_ready);
/*
* Report back to the Boot Processor during boot time or to the caller processor
@@ -186,7 +191,7 @@ static void smp_callin(void)
*/
set_cpu_sibling_map(raw_smp_processor_id());
- init_freq_invariance(true);
+ init_freq_invariance(true, false);
/*
* Get our bogomips.
@@ -229,6 +234,7 @@ static void notrace start_secondary(void *unused)
#endif
cpu_init_exception_handling();
cpu_init();
+ rcu_cpu_starting(raw_smp_processor_id());
x86_cpuinit.early_percpu_clock_init();
preempt_disable();
smp_callin();
@@ -747,13 +753,14 @@ static void __init smp_quirk_init_udelay(void)
int
wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip)
{
+ u32 dm = apic->dest_mode_logical ? APIC_DEST_LOGICAL : APIC_DEST_PHYSICAL;
unsigned long send_status, accept_status = 0;
int maxlvt;
/* Target chip */
/* Boot on the stack */
/* Kick the second */
- apic_icr_write(APIC_DM_NMI | apic->dest_logical, apicid);
+ apic_icr_write(APIC_DM_NMI | dm, apicid);
pr_debug("Waiting for send to finish...\n");
send_status = safe_apic_wait_icr_idle();
@@ -980,10 +987,7 @@ wakeup_cpu_via_init_nmi(int cpu, unsigned long start_ip, int apicid,
if (!boot_error) {
enable_start_cpu0 = 1;
*cpu0_nmi_registered = 1;
- if (apic->dest_logical == APIC_DEST_LOGICAL)
- id = cpu0_logical_apicid;
- else
- id = apicid;
+ id = apic->dest_mode_logical ? cpu0_logical_apicid : apicid;
boot_error = wakeup_secondary_cpu_via_nmi(id, start_ip);
}
@@ -1340,7 +1344,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
set_sched_topology(x86_topology);
set_cpu_sibling_map(0);
- init_freq_invariance(false);
+ init_freq_invariance(false, false);
smp_sanity_check();
switch (apic_intr_mode) {
@@ -1829,6 +1833,7 @@ void arch_set_max_freq_ratio(bool turbo_disabled)
arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE :
arch_turbo_freq_ratio;
}
+EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio);
static bool turbo_disabled(void)
{
@@ -2027,6 +2032,48 @@ out:
return true;
}
+#ifdef CONFIG_ACPI_CPPC_LIB
+static bool amd_set_max_freq_ratio(void)
+{
+ struct cppc_perf_caps perf_caps;
+ u64 highest_perf, nominal_perf;
+ u64 perf_ratio;
+ int rc;
+
+ rc = cppc_get_perf_caps(0, &perf_caps);
+ if (rc) {
+ pr_debug("Could not retrieve perf counters (%d)\n", rc);
+ return false;
+ }
+
+ highest_perf = perf_caps.highest_perf;
+ nominal_perf = perf_caps.nominal_perf;
+
+ if (!highest_perf || !nominal_perf) {
+ pr_debug("Could not retrieve highest or nominal performance\n");
+ return false;
+ }
+
+ perf_ratio = div_u64(highest_perf * SCHED_CAPACITY_SCALE, nominal_perf);
+ /* midpoint between max_boost and max_P */
+ perf_ratio = (perf_ratio + SCHED_CAPACITY_SCALE) >> 1;
+ if (!perf_ratio) {
+ pr_debug("Non-zero highest/nominal perf values led to a 0 ratio\n");
+ return false;
+ }
+
+ arch_turbo_freq_ratio = perf_ratio;
+ arch_set_max_freq_ratio(false);
+
+ return true;
+}
+#else
+static bool amd_set_max_freq_ratio(void)
+{
+ return false;
+}
+#endif
+
static void init_counter_refs(void)
{
u64 aperf, mperf;
@@ -2038,7 +2085,24 @@ static void init_counter_refs(void)
this_cpu_write(arch_prev_mperf, mperf);
}
-static void init_freq_invariance(bool secondary)
+#ifdef CONFIG_PM_SLEEP
+static struct syscore_ops freq_invariance_syscore_ops = {
+ .resume = init_counter_refs,
+};
+
+static void register_freq_invariance_syscore_ops(void)
+{
+ /* Bail out if registered already. */
+ if (freq_invariance_syscore_ops.node.prev)
+ return;
+
+ register_syscore_ops(&freq_invariance_syscore_ops);
+}
+#else
+static inline void register_freq_invariance_syscore_ops(void) {}
+#endif
+
+static void init_freq_invariance(bool secondary, bool cppc_ready)
{
bool ret = false;
@@ -2054,15 +2118,39 @@ static void init_freq_invariance(bool secondary)
if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
ret = intel_set_max_freq_ratio();
+ else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
+ if (!cppc_ready) {
+ return;
+ }
+ ret = amd_set_max_freq_ratio();
+ }
if (ret) {
init_counter_refs();
static_branch_enable(&arch_scale_freq_key);
+ register_freq_invariance_syscore_ops();
+ pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio);
} else {
pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n");
}
}
+#ifdef CONFIG_ACPI_CPPC_LIB
+static DEFINE_MUTEX(freq_invariance_lock);
+
+void init_freq_invariance_cppc(void)
+{
+ static bool secondary;
+
+ mutex_lock(&freq_invariance_lock);
+
+ init_freq_invariance(secondary, true);
+ secondary = true;
+
+ mutex_unlock(&freq_invariance_lock);
+}
+#endif
+
static void disable_freq_invariance_workfn(struct work_struct *work)
{
static_branch_disable(&arch_scale_freq_key);
@@ -2112,7 +2200,7 @@ error:
schedule_work(&disable_freq_invariance_work);
}
#else
-static inline void init_freq_invariance(bool secondary)
+static inline void init_freq_invariance(bool secondary, bool cppc_ready)
{
}
#endif /* CONFIG_X86_64 */
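As a worked example of the ratio computed in amd_set_max_freq_ratio() above (the CPPC numbers are invented purely for illustration):

	/* Hypothetical CPPC capabilities: highest_perf = 400, nominal_perf = 280. */
	u64 perf_ratio = div_u64(400 * SCHED_CAPACITY_SCALE, 280);	/* 409600 / 280 = 1462 */

	/* Midpoint between max boost and max P-state, as in the patch above. */
	perf_ratio = (perf_ratio + SCHED_CAPACITY_SCALE) >> 1;		/* (1462 + 1024) / 2 = 1243 */

	/* arch_turbo_freq_ratio = 1243, i.e. boost is treated as roughly 1.21x of base. */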
diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c
index ca9a380d9c0b..9442c4136c38 100644
--- a/arch/x86/kernel/static_call.c
+++ b/arch/x86/kernel/static_call.c
@@ -11,14 +11,26 @@ enum insn_type {
RET = 3, /* tramp / site cond-tail-call */
};
+/*
+ * data16 data16 xorq %rax, %rax - a single 5-byte instruction that clears %rax.
+ * The REX.W cancels the effect of any data16.
+ */
+static const u8 xor5rax[] = { 0x66, 0x66, 0x48, 0x31, 0xc0 };
+
static void __ref __static_call_transform(void *insn, enum insn_type type, void *func)
{
+ const void *emulate = NULL;
int size = CALL_INSN_SIZE;
const void *code;
switch (type) {
case CALL:
code = text_gen_insn(CALL_INSN_OPCODE, insn, func);
+ if (func == &__static_call_return0) {
+ emulate = code;
+ code = &xor5rax;
+ }
+
break;
case NOP:
@@ -41,7 +53,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, void
if (unlikely(system_state == SYSTEM_BOOTING))
return text_poke_early(insn, code, size);
- text_poke_bp(insn, code, size, NULL);
+ text_poke_bp(insn, code, size, emulate);
}
static void __static_call_validate(void *insn, bool tail)
@@ -54,7 +66,8 @@ static void __static_call_validate(void *insn, bool tail)
return;
} else {
if (opcode == CALL_INSN_OPCODE ||
- !memcmp(insn, ideal_nops[NOP_ATOMIC5], 5))
+ !memcmp(insn, ideal_nops[NOP_ATOMIC5], 5) ||
+ !memcmp(insn, xor5rax, 5))
return;
}
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 60d2c3798ba2..0f3c307b37b3 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -127,12 +127,17 @@ static int enable_single_step(struct task_struct *child)
regs->flags |= X86_EFLAGS_TF;
/*
- * Always set TIF_SINGLESTEP - this guarantees that
- * we single-step system calls etc.. This will also
+ * Always set TIF_SINGLESTEP. This will also
* cause us to set TF when returning to user mode.
*/
set_tsk_thread_flag(child, TIF_SINGLESTEP);
+ /*
+ * Ensure that a trap is triggered once stepping out of a system
+ * call prior to executing any user instruction.
+ */
+ set_task_syscall_work(child, SYSCALL_EXIT_TRAP);
+
oflags = regs->flags;
/* Set TF on the kernel stack.. */
@@ -230,6 +235,7 @@ void user_disable_single_step(struct task_struct *child)
/* Always clear TIF_SINGLESTEP... */
clear_tsk_thread_flag(child, TIF_SINGLESTEP);
+ clear_task_syscall_work(child, SYSCALL_EXIT_TRAP);
/* But touch TF only if it was set by us.. */
if (test_and_clear_tsk_thread_flag(child, TIF_FORCED_TF))
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 504fa5425bce..660b78827638 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -90,14 +90,10 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
unsigned long, prot, unsigned long, flags,
unsigned long, fd, unsigned long, off)
{
- long error;
- error = -EINVAL;
if (off & ~PAGE_MASK)
- goto out;
+ return -EINVAL;
- error = ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
-out:
- return error;
+ return ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
}
static void find_start_end(unsigned long addr, unsigned long flags,
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 992fb1415c0f..4c09ba110204 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -93,6 +93,7 @@ static struct mm_struct tboot_mm = {
.pgd = swapper_pg_dir,
.mm_users = ATOMIC_INIT(2),
.mm_count = ATOMIC_INIT(1),
+ .write_protect_seq = SEQCNT_ZERO(tboot_mm.write_protect_seq),
MMAP_LOCK_INITIALIZER(init_mm)
.page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
@@ -514,16 +515,10 @@ int tboot_force_iommu(void)
if (!tboot_enabled())
return 0;
- if (intel_iommu_tboot_noforce)
- return 1;
-
- if (no_iommu || swiotlb || dmar_disabled)
+ if (no_iommu || dmar_disabled)
pr_warn("Forcing Intel-IOMMU to enabled\n");
dmar_disabled = 0;
-#ifdef CONFIG_SWIOTLB
- swiotlb = 0;
-#endif
no_iommu = 0;
return 1;
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
index 0a2ec801b63f..f5477eab5692 100644
--- a/arch/x86/kernel/topology.c
+++ b/arch/x86/kernel/topology.c
@@ -25,6 +25,7 @@
*
* Send feedback to <colpatch@us.ibm.com>
*/
+#include <linux/interrupt.h>
#include <linux/nodemask.h>
#include <linux/export.h>
#include <linux/mmzone.h>
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 3c70fb34028b..7f5aec758f0e 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -60,6 +60,7 @@
#include <asm/umip.h>
#include <asm/insn.h>
#include <asm/insn-eval.h>
+#include <asm/vdso.h>
#ifdef CONFIG_X86_64
#include <asm/x86_init.h>
@@ -117,6 +118,9 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, const char *str,
tsk->thread.error_code = error_code;
tsk->thread.trap_nr = trapnr;
die(str, regs, error_code);
+ } else {
+ if (fixup_vdso_exception(regs, trapnr, error_code, 0))
+ return 0;
}
/*
@@ -299,11 +303,12 @@ DEFINE_IDTENTRY_ERRORCODE(exc_alignment_check)
local_irq_enable();
if (handle_user_split_lock(regs, error_code))
- return;
+ goto out;
do_trap(X86_TRAP_AC, SIGBUS, "alignment check", regs,
error_code, BUS_ADRALN, NULL);
+out:
local_irq_disable();
}
@@ -405,7 +410,7 @@ DEFINE_IDTENTRY_DF(exc_double_fault)
}
#endif
- idtentry_enter_nmi(regs);
+ irqentry_nmi_enter(regs);
instrumentation_begin();
notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);
@@ -550,6 +555,9 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_protection)
tsk->thread.error_code = error_code;
tsk->thread.trap_nr = X86_TRAP_GP;
+ if (fixup_vdso_exception(regs, X86_TRAP_GP, error_code, 0))
+ return;
+
show_signal(tsk, SIGSEGV, "", desc, regs, error_code);
force_sig(SIGSEGV);
goto exit;
@@ -651,12 +659,13 @@ DEFINE_IDTENTRY_RAW(exc_int3)
instrumentation_end();
irqentry_exit_to_user_mode(regs);
} else {
- bool irq_state = idtentry_enter_nmi(regs);
+ irqentry_state_t irq_state = irqentry_nmi_enter(regs);
+
instrumentation_begin();
if (!do_int3(regs))
die("int3", regs, 0);
instrumentation_end();
- idtentry_exit_nmi(regs, irq_state);
+ irqentry_nmi_exit(regs, irq_state);
}
}
@@ -793,19 +802,6 @@ static __always_inline unsigned long debug_read_clear_dr6(void)
set_debugreg(DR6_RESERVED, 6);
dr6 ^= DR6_RESERVED; /* Flip to positive polarity */
- /*
- * Clear the virtual DR6 value, ptrace routines will set bits here for
- * things we want signals for.
- */
- current->thread.virtual_dr6 = 0;
-
- /*
- * The SDM says "The processor clears the BTF flag when it
- * generates a debug exception." Clear TIF_BLOCKSTEP to keep
- * TIF_BLOCKSTEP in sync with the hardware BTF flag.
- */
- clear_thread_flag(TIF_BLOCKSTEP);
-
return dr6;
}
@@ -864,7 +860,7 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs,
* includes the entry stack is excluded for everything.
*/
unsigned long dr7 = local_db_save();
- bool irq_state = idtentry_enter_nmi(regs);
+ irqentry_state_t irq_state = irqentry_nmi_enter(regs);
instrumentation_begin();
/*
@@ -873,6 +869,20 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs,
*/
WARN_ON_ONCE(user_mode(regs));
+ if (test_thread_flag(TIF_BLOCKSTEP)) {
+ /*
+ * The SDM says "The processor clears the BTF flag when it
+ * generates a debug exception.", but PTRACE_BLOCKSTEP requested
+ * it for userspace and we just took a kernel #DB, so re-set
+ * BTF.
+ */
+ unsigned long debugctl;
+
+ rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+ debugctl |= DEBUGCTLMSR_BTF;
+ wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+ }
+
/*
* Catch SYSENTER with TF set and clear DR_STEP. If this hit a
* watchpoint at the same time then that will still be handled.
@@ -907,7 +917,7 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs,
regs->flags &= ~X86_EFLAGS_TF;
out:
instrumentation_end();
- idtentry_exit_nmi(regs, irq_state);
+ irqentry_nmi_exit(regs, irq_state);
local_db_restore(dr7);
}
@@ -925,7 +935,7 @@ static __always_inline void exc_debug_user(struct pt_regs *regs,
/*
* NB: We can't easily clear DR7 here because
- * idtentry_exit_to_usermode() can invoke ptrace, schedule, access
+ * irqentry_exit_to_usermode() can invoke ptrace, schedule, access
* user memory, etc. This means that a recursive #DB is possible. If
* this happens, that #DB will hit exc_debug_kernel() and clear DR7.
* Since we're not on the IST stack right now, everything will be
@@ -936,6 +946,22 @@ static __always_inline void exc_debug_user(struct pt_regs *regs,
instrumentation_begin();
/*
+ * Start the virtual/ptrace DR6 value with just the DR_STEP mask
+ * of the real DR6. ptrace_triggered() will set the DR_TRAPn bits.
+ *
+ * Userspace expects DR_STEP to be visible in ptrace_get_debugreg(6)
+ * even if it is not the result of PTRACE_SINGLESTEP.
+ */
+ current->thread.virtual_dr6 = (dr6 & DR_STEP);
+
+ /*
+ * The SDM says "The processor clears the BTF flag when it
+ * generates a debug exception." Clear TIF_BLOCKSTEP to keep
+ * TIF_BLOCKSTEP in sync with the hardware BTF flag.
+ */
+ clear_thread_flag(TIF_BLOCKSTEP);
+
+ /*
* If dr6 has no reason to give us about the origin of this trap,
* then it's very likely the result of an icebp/int01 trap.
* User wants a sigtrap for that.
@@ -1031,6 +1057,9 @@ static void math_error(struct pt_regs *regs, int trapnr)
if (!si_code)
goto exit;
+ if (fixup_vdso_exception(regs, trapnr, 0, 0))
+ return;
+
force_sig_fault(SIGFPE, si_code,
(void __user *)uprobe_get_trap_addr(regs));
exit:
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 3fdaa042823d..a2b413394917 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -255,12 +255,13 @@ static volatile u32 good_2byte_insns[256 / 32] = {
static bool is_prefix_bad(struct insn *insn)
{
+ insn_byte_t p;
int i;
- for (i = 0; i < insn->prefixes.nbytes; i++) {
+ for_each_insn_prefix(insn, i, p) {
insn_attr_t attr;
- attr = inat_get_opcode_attribute(insn->prefixes.bytes[i]);
+ attr = inat_get_opcode_attribute(p);
switch (attr) {
case INAT_MAKE_PREFIX(INAT_PFX_ES):
case INAT_MAKE_PREFIX(INAT_PFX_CS):
@@ -715,6 +716,7 @@ static const struct uprobe_xol_ops push_xol_ops = {
static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
{
u8 opc1 = OPCODE1(insn);
+ insn_byte_t p;
int i;
switch (opc1) {
@@ -746,8 +748,8 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
* Intel and AMD behavior differ in 64-bit mode: Intel ignores 66 prefix.
* No one uses these insns, reject any branch insns with such prefix.
*/
- for (i = 0; i < insn->prefixes.nbytes; i++) {
- if (insn->prefixes.bytes[i] == 0x66)
+ for_each_insn_prefix(insn, i, p) {
+ if (p == 0x66)
return -ENOTSUPP;
}
@@ -1015,6 +1017,8 @@ int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val,
if (uprobe_post_sstep_notifier(regs))
ret = NOTIFY_STOP;
+ break;
+
default:
break;
}
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 764573de3996..e5a7a10a0164 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -134,7 +134,11 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval)
unsafe_put_user(regs->ds, &user->regs.ds, Efault_end);
unsafe_put_user(regs->fs, &user->regs.fs, Efault_end);
unsafe_put_user(regs->gs, &user->regs.gs, Efault_end);
- unsafe_put_user(vm86->screen_bitmap, &user->screen_bitmap, Efault_end);
+
+ /*
+ * Don't write screen_bitmap in case some user had a value there
+ * and expected it to remain unchanged.
+ */
user_access_end();
@@ -160,49 +164,6 @@ Efault:
do_exit(SIGSEGV);
}
-static void mark_screen_rdonly(struct mm_struct *mm)
-{
- struct vm_area_struct *vma;
- spinlock_t *ptl;
- pgd_t *pgd;
- p4d_t *p4d;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
- int i;
-
- mmap_write_lock(mm);
- pgd = pgd_offset(mm, 0xA0000);
- if (pgd_none_or_clear_bad(pgd))
- goto out;
- p4d = p4d_offset(pgd, 0xA0000);
- if (p4d_none_or_clear_bad(p4d))
- goto out;
- pud = pud_offset(p4d, 0xA0000);
- if (pud_none_or_clear_bad(pud))
- goto out;
- pmd = pmd_offset(pud, 0xA0000);
-
- if (pmd_trans_huge(*pmd)) {
- vma = find_vma(mm, 0xA0000);
- split_huge_pmd(vma, pmd, 0xA0000);
- }
- if (pmd_none_or_clear_bad(pmd))
- goto out;
- pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
- for (i = 0; i < 32; i++) {
- if (pte_present(*pte))
- set_pte(pte, pte_wrprotect(*pte));
- pte++;
- }
- pte_unmap_unlock(pte, ptl);
-out:
- mmap_write_unlock(mm);
- flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, PAGE_SHIFT, false);
-}
-
-
-
static int do_vm86_irq_handling(int subfunction, int irqnumber);
static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus);
@@ -282,6 +243,15 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
offsetof(struct vm86_struct, int_revectored)))
return -EFAULT;
+
+ /* VM86_SCREEN_BITMAP had numerous bugs and appears to have no users. */
+ if (v.flags & VM86_SCREEN_BITMAP) {
+ char comm[TASK_COMM_LEN];
+
+ pr_info_once("vm86: '%s' uses VM86_SCREEN_BITMAP, which is no longer supported\n", get_task_comm(comm, current));
+ return -EINVAL;
+ }
+
memset(&vm86regs, 0, sizeof(vm86regs));
vm86regs.pt.bx = v.regs.ebx;
@@ -302,7 +272,6 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
vm86regs.gs = v.regs.gs;
vm86->flags = v.flags;
- vm86->screen_bitmap = v.screen_bitmap;
vm86->cpu_type = v.cpu_type;
if (copy_from_user(&vm86->int_revectored,
@@ -370,9 +339,6 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
update_task_stack(tsk);
preempt_enable();
- if (vm86->flags & VM86_SCREEN_BITMAP)
- mark_screen_rdonly(tsk->mm);
-
memcpy((struct kernel_vm86_regs *)regs, &vm86regs, sizeof(vm86regs));
return regs->ax;
}
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index bf9e0adb5b7e..efd9e9ea17f2 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -454,13 +454,13 @@ SECTIONS
ASSERT(SIZEOF(.rela.dyn) == 0, "Unexpected run-time relocations (.rela) detected!")
}
-#ifdef CONFIG_X86_32
/*
* The ASSERT() sink to . is intentional, for binutils 2.14 compatibility:
*/
. = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
"kernel image bigger than KERNEL_IMAGE_SIZE");
-#else
+
+#ifdef CONFIG_X86_64
/*
* Per-cpu symbols which need to be offset from __per_cpu_load
* for the boot processor.
@@ -470,18 +470,12 @@ INIT_PER_CPU(gdt_page);
INIT_PER_CPU(fixed_percpu_data);
INIT_PER_CPU(irq_stack_backing_store);
-/*
- * Build-time check on the image size:
- */
-. = ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
- "kernel image bigger than KERNEL_IMAGE_SIZE");
-
#ifdef CONFIG_SMP
. = ASSERT((fixed_percpu_data == 0),
"fixed_percpu_data is not at start of per-cpu area");
#endif
-#endif /* CONFIG_X86_32 */
+#endif /* CONFIG_X86_64 */
#ifdef CONFIG_KEXEC_CORE
#include <asm/kexec.h>
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index a3038d8deb6a..8b395821cb8d 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -110,6 +110,7 @@ struct x86_init_ops x86_init __initdata = {
.init_platform = x86_init_noop,
.guest_late_init = x86_init_noop,
.x2apic_available = bool_x86_init_noop,
+ .msi_ext_dest_id = bool_x86_init_noop,
.init_mem_mapping = x86_init_noop,
.init_after_bootmem = x86_init_noop,
},
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index f92dfd8ef10d..7ac592664c52 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -100,7 +100,8 @@ config KVM_AMD_SEV
depends on KVM_AMD && X86_64
depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m)
help
- Provides support for launching Encrypted VMs on AMD processors.
+ Provides support for launching Encrypted VMs (SEV) and Encrypted VMs
+ with Encrypted State (SEV-ES) on AMD processors.
config KVM_MMU_AUDIT
bool "Audit KVM MMU"
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index b804444e16d4..4bd14ab01323 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -10,7 +10,8 @@ endif
KVM := ../../../virt/kvm
kvm-y += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
- $(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o
+ $(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o \
+ $(KVM)/dirty_ring.o
kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 06a278b3701d..38172ca627d3 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -90,6 +90,20 @@ static int kvm_check_cpuid(struct kvm_cpuid_entry2 *entries, int nent)
return 0;
}
+void kvm_update_pv_runtime(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, KVM_CPUID_FEATURES, 0);
+
+ /*
+ * save the feature bitmap to avoid cpuid lookup for every PV
+ * operation
+ */
+ if (best)
+ vcpu->arch.pv_cpuid.features = best->eax;
+}
+
void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
@@ -124,13 +138,6 @@ void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
(best->eax & (1 << KVM_FEATURE_PV_UNHALT)))
best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT);
- /*
- * save the feature bitmap to avoid cpuid lookup for every PV
- * operation
- */
- if (best)
- vcpu->arch.pv_cpuid.features = best->eax;
-
if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) {
best = kvm_find_cpuid_entry(vcpu, 0x1, 0);
if (best)
@@ -139,6 +146,7 @@ void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
MSR_IA32_MISC_ENABLE_MWAIT);
}
}
+EXPORT_SYMBOL_GPL(kvm_update_cpuid_runtime);
static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
@@ -162,6 +170,8 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
vcpu->arch.guest_supported_xcr0 =
(best->eax | ((u64)best->edx << 32)) & supported_xcr0;
+ kvm_update_pv_runtime(vcpu);
+
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
kvm_mmu_reset_context(vcpu);
@@ -169,6 +179,8 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
vcpu->arch.cr4_guest_rsvd_bits =
__cr4_reserved_bits(guest_cpuid_has, vcpu);
+ vcpu->arch.cr3_lm_rsvd_bits = rsvd_bits(cpuid_maxphyaddr(vcpu), 63);
+
/* Invoke the vendor callback only after the above state is updated. */
kvm_x86_ops.vcpu_after_set_cpuid(vcpu);
}
@@ -309,7 +321,7 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
if (cpuid->nent < vcpu->arch.cpuid_nent)
goto out;
r = -EFAULT;
- if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
+ if (copy_to_user(entries, vcpu->arch.cpuid_entries,
vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
goto out;
return 0;
@@ -407,7 +419,7 @@ void kvm_set_cpu_caps(void)
F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) |
F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM) |
- F(SERIALIZE) | F(TSXLDTRK)
+ F(SERIALIZE) | F(TSXLDTRK) | F(AVX512_FP16)
);
/* TSC_ADJUST and ARCH_CAPABILITIES are emulated in software. */
@@ -672,7 +684,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
edx.split.num_counters_fixed = min(cap.num_counters_fixed, MAX_FIXED_COUNTERS);
edx.split.bit_width_fixed = cap.bit_width_fixed;
- edx.split.reserved = 0;
+ edx.split.anythread_deprecated = 1;
+ edx.split.reserved1 = 0;
+ edx.split.reserved2 = 0;
entry->eax = eax.full;
entry->ebx = cap.events_mask;
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index bf8577947ed2..dc921d76e42e 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -11,6 +11,7 @@ extern u32 kvm_cpu_caps[NCAPINTS] __read_mostly;
void kvm_set_cpu_caps(void);
void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu);
+void kvm_update_pv_runtime(struct kvm_vcpu *vcpu);
struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
u32 function, u32 index);
int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
@@ -263,6 +264,20 @@ static inline int guest_cpuid_stepping(struct kvm_vcpu *vcpu)
return x86_stepping(best->eax);
}
+static inline bool guest_has_spec_ctrl_msr(struct kvm_vcpu *vcpu)
+{
+ return (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) ||
+ guest_cpuid_has(vcpu, X86_FEATURE_AMD_STIBP) ||
+ guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) ||
+ guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD));
+}
+
+static inline bool guest_has_pred_cmd_msr(struct kvm_vcpu *vcpu)
+{
+ return (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) ||
+ guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB));
+}
+
static inline bool supports_cpuid_fault(struct kvm_vcpu *vcpu)
{
return vcpu->arch.msr_platform_info & MSR_PLATFORM_INFO_CPUID_FAULT;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 0d917eb70319..66a08322988f 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2879,6 +2879,8 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt)
ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data);
*reg_write(ctxt, VCPU_REGS_RSP) = (efer & EFER_LMA) ? msr_data :
(u32)msr_data;
+ if (efer & EFER_LMA)
+ ctxt->mode = X86EMUL_MODE_PROT64;
return X86EMUL_CONTINUE;
}
@@ -4046,6 +4048,12 @@ static int em_clflush(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
}
+static int em_clflushopt(struct x86_emulate_ctxt *ctxt)
+{
+ /* emulating clflushopt regardless of cpuid */
+ return X86EMUL_CONTINUE;
+}
+
static int em_movsxd(struct x86_emulate_ctxt *ctxt)
{
ctxt->dst.val = (s32) ctxt->src.val;
@@ -4585,7 +4593,7 @@ static const struct opcode group11[] = {
};
static const struct gprefix pfx_0f_ae_7 = {
- I(SrcMem | ByteOp, em_clflush), N, N, N,
+ I(SrcMem | ByteOp, em_clflush), I(SrcMem | ByteOp, em_clflushopt), N, N,
};
static const struct group_dual group15 = { {
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 5c7c4060b45c..922c69dcca4d 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1951,8 +1951,8 @@ int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args)
return kvm_hv_eventfd_assign(kvm, args->conn_id, args->fd);
}
-int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
- struct kvm_cpuid_entry2 __user *entries)
+int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries)
{
uint16_t evmcs_ver = 0;
struct kvm_cpuid_entry2 cpuid_entries[] = {
@@ -2037,7 +2037,7 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
* Direct Synthetic timers only make sense with in-kernel
* LAPIC
*/
- if (lapic_in_kernel(vcpu))
+ if (!vcpu || lapic_in_kernel(vcpu))
ent->edx |= HV_STIMER_DIRECT_MODE_AVAILABLE;
break;
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index e68c6c2e9649..6d7def2b0aad 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -126,7 +126,7 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm,
void kvm_hv_init_vm(struct kvm *kvm);
void kvm_hv_destroy_vm(struct kvm *kvm);
int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args);
-int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
- struct kvm_cpuid_entry2 __user *entries);
+int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries);
#endif
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 99d118ffc67d..814698e5b152 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -40,29 +40,10 @@ static int pending_userspace_extint(struct kvm_vcpu *v)
* check if there is pending interrupt from
* non-APIC source without intack.
*/
-static int kvm_cpu_has_extint(struct kvm_vcpu *v)
-{
- u8 accept = kvm_apic_accept_pic_intr(v);
-
- if (accept) {
- if (irqchip_split(v->kvm))
- return pending_userspace_extint(v);
- else
- return v->kvm->arch.vpic->output;
- } else
- return 0;
-}
-
-/*
- * check if there is injectable interrupt:
- * when virtual interrupt delivery enabled,
- * interrupt from apic will handled by hardware,
- * we don't need to check it here.
- */
-int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
+int kvm_cpu_has_extint(struct kvm_vcpu *v)
{
/*
- * FIXME: interrupt.injected represents an interrupt that it's
+ * FIXME: interrupt.injected represents an interrupt whose
* side-effects have already been applied (e.g. bit from IRR
* already moved to ISR). Therefore, it is incorrect to rely
* on interrupt.injected to know if there is a pending
@@ -75,6 +56,23 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
if (!lapic_in_kernel(v))
return v->arch.interrupt.injected;
+ if (!kvm_apic_accept_pic_intr(v))
+ return 0;
+
+ if (irqchip_split(v->kvm))
+ return pending_userspace_extint(v);
+ else
+ return v->kvm->arch.vpic->output;
+}
+
+/*
+ * check if there is an injectable interrupt:
+ * when virtual interrupt delivery is enabled,
+ * interrupts from the APIC are handled by hardware;
+ * we don't need to check them here.
+ */
+int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
+{
if (kvm_cpu_has_extint(v))
return 1;
@@ -91,20 +89,6 @@ EXPORT_SYMBOL_GPL(kvm_cpu_has_injectable_intr);
*/
int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
{
- /*
- * FIXME: interrupt.injected represents an interrupt that it's
- * side-effects have already been applied (e.g. bit from IRR
- * already moved to ISR). Therefore, it is incorrect to rely
- * on interrupt.injected to know if there is a pending
- * interrupt in the user-mode LAPIC.
- * This leads to nVMX/nSVM not be able to distinguish
- * if it should exit from L2 to L1 on EXTERNAL_INTERRUPT on
- * pending interrupt or should re-inject an injected
- * interrupt.
- */
- if (!lapic_in_kernel(v))
- return v->arch.interrupt.injected;
-
if (kvm_cpu_has_extint(v))
return 1;
@@ -118,16 +102,21 @@ EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt);
*/
static int kvm_cpu_get_extint(struct kvm_vcpu *v)
{
- if (kvm_cpu_has_extint(v)) {
- if (irqchip_split(v->kvm)) {
- int vector = v->arch.pending_external_vector;
-
- v->arch.pending_external_vector = -1;
- return vector;
- } else
- return kvm_pic_read_irq(v->kvm); /* PIC */
- } else
+ if (!kvm_cpu_has_extint(v)) {
+ WARN_ON(!lapic_in_kernel(v));
return -1;
+ }
+
+ if (!lapic_in_kernel(v))
+ return v->arch.interrupt.nr;
+
+ if (irqchip_split(v->kvm)) {
+ int vector = v->arch.pending_external_vector;
+
+ v->arch.pending_external_vector = -1;
+ return vector;
+ } else
+ return kvm_pic_read_irq(v->kvm); /* PIC */
}
/*
@@ -135,13 +124,7 @@ static int kvm_cpu_get_extint(struct kvm_vcpu *v)
*/
int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
{
- int vector;
-
- if (!lapic_in_kernel(v))
- return v->arch.interrupt.nr;
-
- vector = kvm_cpu_get_extint(v);
-
+ int vector = kvm_cpu_get_extint(v);
if (vector != -1)
return vector; /* PIC */
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 4aa1c2e00e2a..8a4de3f12820 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -16,8 +16,6 @@
#include <trace/events/kvm.h>
-#include <asm/msidef.h>
-
#include "irq.h"
#include "ioapic.h"
@@ -104,22 +102,19 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
struct kvm_lapic_irq *irq)
{
- trace_kvm_msi_set_irq(e->msi.address_lo | (kvm->arch.x2apic_format ?
- (u64)e->msi.address_hi << 32 : 0),
- e->msi.data);
-
- irq->dest_id = (e->msi.address_lo &
- MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT;
- if (kvm->arch.x2apic_format)
- irq->dest_id |= MSI_ADDR_EXT_DEST_ID(e->msi.address_hi);
- irq->vector = (e->msi.data &
- MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT;
- irq->dest_mode = kvm_lapic_irq_dest_mode(
- !!((1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo));
- irq->trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data;
- irq->delivery_mode = e->msi.data & 0x700;
- irq->msi_redir_hint = ((e->msi.address_lo
- & MSI_ADDR_REDIRECTION_LOWPRI) > 0);
+ struct msi_msg msg = { .address_lo = e->msi.address_lo,
+ .address_hi = e->msi.address_hi,
+ .data = e->msi.data };
+
+ trace_kvm_msi_set_irq(msg.address_lo | (kvm->arch.x2apic_format ?
+ (u64)msg.address_hi << 32 : 0), msg.data);
+
+ irq->dest_id = x86_msi_msg_get_destid(&msg, kvm->arch.x2apic_format);
+ irq->vector = msg.arch_data.vector;
+ irq->dest_mode = kvm_lapic_irq_dest_mode(msg.arch_addr_lo.dest_mode_logical);
+ irq->trig_mode = msg.arch_data.is_level;
+ irq->delivery_mode = msg.arch_data.delivery_mode << 8;
+ irq->msi_redir_hint = msg.arch_addr_lo.redirect_hint;
irq->level = 1;
irq->shorthand = APIC_DEST_NOSHORT;
}
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 105e7859d1f2..43cceadd073e 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -674,7 +674,7 @@ static bool pv_eoi_get_pending(struct kvm_vcpu *vcpu)
(unsigned long long)vcpu->arch.pv_eoi.msr_val);
return false;
}
- return val & 0x1;
+ return val & KVM_PV_EOI_ENABLED;
}
static void pv_eoi_set_pending(struct kvm_vcpu *vcpu)
@@ -2465,7 +2465,7 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
struct kvm_lapic *apic = vcpu->arch.apic;
u32 ppr;
- if (!kvm_apic_hw_enabled(apic))
+ if (!kvm_apic_present(vcpu))
return -1;
__apic_update_ppr(apic, &ppr);
@@ -2843,14 +2843,35 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
u8 sipi_vector;
+ int r;
unsigned long pe;
- if (!lapic_in_kernel(vcpu) || !apic->pending_events)
+ if (!lapic_in_kernel(vcpu))
return;
/*
+ * Read pending events before calling the check_events
+ * callback.
+ */
+ pe = smp_load_acquire(&apic->pending_events);
+ if (!pe)
+ return;
+
+ if (is_guest_mode(vcpu)) {
+ r = kvm_x86_ops.nested_ops->check_events(vcpu);
+ if (r < 0)
+ return;
+ /*
+ * If an event has happened and caused a vmexit,
+ * we know INITs are latched and therefore
+ * we will not incorrectly deliver an APIC
+ * event instead of a vmexit.
+ */
+ }
+
+ /*
* INITs are latched while CPU is in specific states
- * (SMM, VMX non-root mode, SVM with GIF=0).
+ * (SMM, VMX root mode, SVM with GIF=0).
* Because a CPU cannot be in these states immediately
* after it has processed an INIT signal (and thus in
* KVM_MP_STATE_INIT_RECEIVED state), just eat SIPIs
@@ -2858,26 +2879,28 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
*/
if (kvm_vcpu_latch_init(vcpu)) {
WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED);
- if (test_bit(KVM_APIC_SIPI, &apic->pending_events))
+ if (test_bit(KVM_APIC_SIPI, &pe))
clear_bit(KVM_APIC_SIPI, &apic->pending_events);
return;
}
- pe = xchg(&apic->pending_events, 0);
if (test_bit(KVM_APIC_INIT, &pe)) {
+ clear_bit(KVM_APIC_INIT, &apic->pending_events);
kvm_vcpu_reset(vcpu, true);
if (kvm_vcpu_is_bsp(apic->vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
else
vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
}
- if (test_bit(KVM_APIC_SIPI, &pe) &&
- vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
- /* evaluate pending_events before reading the vector */
- smp_rmb();
- sipi_vector = apic->sipi_vector;
- kvm_vcpu_deliver_sipi_vector(vcpu, sipi_vector);
- vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+ if (test_bit(KVM_APIC_SIPI, &pe)) {
+ clear_bit(KVM_APIC_SIPI, &apic->pending_events);
+ if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
+ /* evaluate pending_events before reading the vector */
+ smp_rmb();
+ sipi_vector = apic->sipi_vector;
+ kvm_x86_ops.vcpu_deliver_sipi_vector(vcpu, sipi_vector);
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+ }
}
}
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 9c4a9c8e43d9..261be1d2032b 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -44,12 +44,19 @@
#define PT32_ROOT_LEVEL 2
#define PT32E_ROOT_LEVEL 3
-static inline u64 rsvd_bits(int s, int e)
+static __always_inline u64 rsvd_bits(int s, int e)
{
+ BUILD_BUG_ON(__builtin_constant_p(e) && __builtin_constant_p(s) && e < s);
+
+ if (__builtin_constant_p(e))
+ BUILD_BUG_ON(e > 63);
+ else
+ e &= 63;
+
if (e < s)
return 0;
- return ((1ULL << (e - s + 1)) - 1) << s;
+ return ((2ULL << (e - s)) - 1) << s;
}
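
For reference, a minimal user-space sketch (not part of this patch) of why the mask is now built with 2ULL << (e - s): in the full-width case s = 0, e = 63 the old 1ULL << (e - s + 1) shifts a 64-bit value by 64, which is undefined behaviour in C, while the new form never shifts by more than 63.

/* Stand-alone illustration of the rsvd_bits() mask construction above. */
#include <stdio.h>
#include <stdint.h>

static uint64_t rsvd_bits(int s, int e)
{
	if (e < s)
		return 0;
	/* Sets bits [s, e] without ever shifting a 64-bit value by 64. */
	return ((2ULL << (e - s)) - 1) << s;
}

int main(void)
{
	printf("rsvd_bits(12, 51) = %#llx\n",
	       (unsigned long long)rsvd_bits(12, 51));
	printf("rsvd_bits(0, 63)  = %#llx\n",
	       (unsigned long long)rsvd_bits(0, 63));
	return 0;
}
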
void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 17587f496ec7..6d16481aa29d 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -225,7 +225,7 @@ static gfn_t get_mmio_spte_gfn(u64 spte)
{
u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask;
- gpa |= (spte >> shadow_nonpresent_or_rsvd_mask_len)
+ gpa |= (spte >> SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)
& shadow_nonpresent_or_rsvd_mask;
return gpa >> PAGE_SHIFT;
@@ -591,15 +591,15 @@ static u64 mmu_spte_get_lockless(u64 *sptep)
static u64 restore_acc_track_spte(u64 spte)
{
u64 new_spte = spte;
- u64 saved_bits = (spte >> shadow_acc_track_saved_bits_shift)
- & shadow_acc_track_saved_bits_mask;
+ u64 saved_bits = (spte >> SHADOW_ACC_TRACK_SAVED_BITS_SHIFT)
+ & SHADOW_ACC_TRACK_SAVED_BITS_MASK;
WARN_ON_ONCE(spte_ad_enabled(spte));
WARN_ON_ONCE(!is_access_track_spte(spte));
new_spte &= ~shadow_acc_track_mask;
- new_spte &= ~(shadow_acc_track_saved_bits_mask <<
- shadow_acc_track_saved_bits_shift);
+ new_spte &= ~(SHADOW_ACC_TRACK_SAVED_BITS_MASK <<
+ SHADOW_ACC_TRACK_SAVED_BITS_SHIFT);
new_spte |= saved_bits;
return new_spte;
@@ -820,7 +820,7 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
return NULL;
- if (no_dirty_log && slot->dirty_bitmap)
+ if (no_dirty_log && kvm_slot_dirty_track_enabled(slot))
return NULL;
return slot;
@@ -856,12 +856,14 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
} else {
rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte);
desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
- while (desc->sptes[PTE_LIST_EXT-1] && desc->more) {
- desc = desc->more;
+ while (desc->sptes[PTE_LIST_EXT-1]) {
count += PTE_LIST_EXT;
- }
- if (desc->sptes[PTE_LIST_EXT-1]) {
- desc->more = mmu_alloc_pte_list_desc(vcpu);
+
+ if (!desc->more) {
+ desc->more = mmu_alloc_pte_list_desc(vcpu);
+ desc = desc->more;
+ break;
+ }
desc = desc->more;
}
for (i = 0; desc->sptes[i]; ++i)
@@ -1287,6 +1289,14 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
}
+int kvm_cpu_dirty_log_size(void)
+{
+ if (kvm_x86_ops.cpu_dirty_log_size)
+ return kvm_x86_ops.cpu_dirty_log_size();
+
+ return 0;
+}
+
bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
struct kvm_memory_slot *slot, u64 gfn)
{
@@ -3483,26 +3493,25 @@ static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
* Return the level of the lowest level SPTE added to sptes.
* That SPTE may be non-present.
*/
-static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes)
+static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level)
{
struct kvm_shadow_walk_iterator iterator;
- int leaf = vcpu->arch.mmu->root_level;
+ int leaf = -1;
u64 spte;
-
walk_shadow_page_lockless_begin(vcpu);
- for (shadow_walk_init(&iterator, vcpu, addr);
+ for (shadow_walk_init(&iterator, vcpu, addr),
+ *root_level = iterator.level;
shadow_walk_okay(&iterator);
__shadow_walk_next(&iterator, spte)) {
leaf = iterator.level;
spte = mmu_spte_get_lockless(iterator.sptep);
- sptes[leaf - 1] = spte;
+ sptes[leaf] = spte;
if (!is_shadow_present_pte(spte))
break;
-
}
walk_shadow_page_lockless_end(vcpu);
@@ -3510,14 +3519,12 @@ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes)
return leaf;
}
-/* return true if reserved bit is detected on spte. */
+/* return true if reserved bit(s) are detected on a valid, non-MMIO SPTE. */
static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
{
- u64 sptes[PT64_ROOT_MAX_LEVEL];
+ u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
struct rsvd_bits_validate *rsvd_check;
- int root = vcpu->arch.mmu->root_level;
- int leaf;
- int level;
+ int root, leaf, level;
bool reserved = false;
if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) {
@@ -3526,35 +3533,45 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
}
if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa))
- leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes);
+ leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root);
else
- leaf = get_walk(vcpu, addr, sptes);
+ leaf = get_walk(vcpu, addr, sptes, &root);
+
+ if (unlikely(leaf < 0)) {
+ *sptep = 0ull;
+ return reserved;
+ }
+
+ *sptep = sptes[leaf];
+
+ /*
+ * Skip reserved bits checks on the terminal leaf if it's not a valid
+ * SPTE. Note, this also (intentionally) skips MMIO SPTEs, which, by
+ * design, always have reserved bits set. The purpose of the checks is
+ * to detect reserved bits on non-MMIO SPTEs, i.e. buggy SPTEs.
+ */
+ if (!is_shadow_present_pte(sptes[leaf]))
+ leaf++;
rsvd_check = &vcpu->arch.mmu->shadow_zero_check;
- for (level = root; level >= leaf; level--) {
- if (!is_shadow_present_pte(sptes[level - 1]))
- break;
+ for (level = root; level >= leaf; level--)
/*
* Use a bitwise-OR instead of a logical-OR to aggregate the
* reserved bit and EPT's invalid memtype/XWR checks to avoid
* adding a Jcc in the loop.
*/
- reserved |= __is_bad_mt_xwr(rsvd_check, sptes[level - 1]) |
- __is_rsvd_bits_set(rsvd_check, sptes[level - 1],
- level);
- }
+ reserved |= __is_bad_mt_xwr(rsvd_check, sptes[level]) |
+ __is_rsvd_bits_set(rsvd_check, sptes[level], level);
if (reserved) {
pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n",
__func__, addr);
for (level = root; level >= leaf; level--)
pr_err("------ spte 0x%llx level %d.\n",
- sptes[level - 1], level);
+ sptes[level], level);
}
- *sptep = sptes[leaf - 1];
-
return reserved;
}
diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h
index 213699b27b44..e798489b56b5 100644
--- a/arch/x86/kvm/mmu/mmutrace.h
+++ b/arch/x86/kvm/mmu/mmutrace.h
@@ -381,6 +381,35 @@ TRACE_EVENT(
)
);
+TRACE_EVENT(
+ kvm_tdp_mmu_spte_changed,
+ TP_PROTO(int as_id, gfn_t gfn, int level, u64 old_spte, u64 new_spte),
+ TP_ARGS(as_id, gfn, level, old_spte, new_spte),
+
+ TP_STRUCT__entry(
+ __field(u64, gfn)
+ __field(u64, old_spte)
+ __field(u64, new_spte)
+ /* Level cannot be larger than 5 on x86, so it fits in a u8. */
+ __field(u8, level)
+ /* as_id can only be 0 or 1 on x86, so it fits in a u8. */
+ __field(u8, as_id)
+ ),
+
+ TP_fast_assign(
+ __entry->gfn = gfn;
+ __entry->old_spte = old_spte;
+ __entry->new_spte = new_spte;
+ __entry->level = level;
+ __entry->as_id = as_id;
+ ),
+
+ TP_printk("as id %d gfn %llx level %d old_spte %llx new_spte %llx",
+ __entry->as_id, __entry->gfn, __entry->level,
+ __entry->old_spte, __entry->new_spte
+ )
+);
+
#endif /* _TRACE_KVMMMU_H */
#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index d9c5665a55e9..c51ad544f25b 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -40,8 +40,8 @@ static u64 generation_mmio_spte_mask(u64 gen)
WARN_ON(gen & ~MMIO_SPTE_GEN_MASK);
BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK | MMIO_SPTE_GEN_LOW_MASK) & SPTE_SPECIAL_MASK);
- mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK;
- mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK;
+ mask = (gen << MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_SPTE_GEN_LOW_MASK;
+ mask |= (gen << MMIO_SPTE_GEN_HIGH_SHIFT) & MMIO_SPTE_GEN_HIGH_MASK;
return mask;
}
@@ -55,7 +55,7 @@ u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access)
mask |= shadow_mmio_value | access;
mask |= gpa | shadow_nonpresent_or_rsvd_mask;
mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
- << shadow_nonpresent_or_rsvd_mask_len;
+ << SHADOW_NONPRESENT_OR_RSVD_MASK_LEN;
return mask;
}
@@ -231,12 +231,12 @@ u64 mark_spte_for_access_track(u64 spte)
!spte_can_locklessly_be_made_writable(spte),
"kvm: Writable SPTE is not locklessly dirty-trackable\n");
- WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask <<
- shadow_acc_track_saved_bits_shift),
+ WARN_ONCE(spte & (SHADOW_ACC_TRACK_SAVED_BITS_MASK <<
+ SHADOW_ACC_TRACK_SAVED_BITS_SHIFT),
"kvm: Access Tracking saved bit locations are not zero\n");
- spte |= (spte & shadow_acc_track_saved_bits_mask) <<
- shadow_acc_track_saved_bits_shift;
+ spte |= (spte & SHADOW_ACC_TRACK_SAVED_BITS_MASK) <<
+ SHADOW_ACC_TRACK_SAVED_BITS_SHIFT;
spte &= ~shadow_acc_track_mask;
return spte;
@@ -245,7 +245,7 @@ u64 mark_spte_for_access_track(u64 spte)
void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask)
{
BUG_ON((u64)(unsigned)access_mask != access_mask);
- WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << shadow_nonpresent_or_rsvd_mask_len));
+ WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << SHADOW_NONPRESENT_OR_RSVD_MASK_LEN));
WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask);
shadow_mmio_value = mmio_value | SPTE_MMIO_MASK;
shadow_mmio_access_mask = access_mask;
@@ -306,9 +306,9 @@ void kvm_mmu_reset_all_pte_masks(void)
low_phys_bits = boot_cpu_data.x86_phys_bits;
if (boot_cpu_has_bug(X86_BUG_L1TF) &&
!WARN_ON_ONCE(boot_cpu_data.x86_cache_bits >=
- 52 - shadow_nonpresent_or_rsvd_mask_len)) {
+ 52 - SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)) {
low_phys_bits = boot_cpu_data.x86_cache_bits
- - shadow_nonpresent_or_rsvd_mask_len;
+ - SHADOW_NONPRESENT_OR_RSVD_MASK_LEN;
shadow_nonpresent_or_rsvd_mask =
rsvd_bits(low_phys_bits, boot_cpu_data.x86_cache_bits - 1);
}
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 4ecf40e0b8fe..2b3a30bd38b0 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -56,11 +56,11 @@
#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
/*
- * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of
+ * Due to limited space in PTEs, the MMIO generation is an 18-bit subset of
* the memslots generation and is derived as follows:
*
* Bits 0-8 of the MMIO generation are propagated to spte bits 3-11
- * Bits 9-18 of the MMIO generation are propagated to spte bits 52-61
+ * Bits 9-17 of the MMIO generation are propagated to spte bits 54-62
*
* The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in
* the MMIO generation number, as doing so would require stealing a bit from
@@ -69,18 +69,29 @@
* requires a full MMU zap). The flag is instead explicitly queried when
* checking for MMIO spte cache hits.
*/
-#define MMIO_SPTE_GEN_MASK GENMASK_ULL(17, 0)
#define MMIO_SPTE_GEN_LOW_START 3
#define MMIO_SPTE_GEN_LOW_END 11
-#define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \
- MMIO_SPTE_GEN_LOW_START)
#define MMIO_SPTE_GEN_HIGH_START PT64_SECOND_AVAIL_BITS_SHIFT
#define MMIO_SPTE_GEN_HIGH_END 62
+
+#define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \
+ MMIO_SPTE_GEN_LOW_START)
#define MMIO_SPTE_GEN_HIGH_MASK GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \
MMIO_SPTE_GEN_HIGH_START)
+#define MMIO_SPTE_GEN_LOW_BITS (MMIO_SPTE_GEN_LOW_END - MMIO_SPTE_GEN_LOW_START + 1)
+#define MMIO_SPTE_GEN_HIGH_BITS (MMIO_SPTE_GEN_HIGH_END - MMIO_SPTE_GEN_HIGH_START + 1)
+
+/* remember to adjust the comment above as well if you change these */
+static_assert(MMIO_SPTE_GEN_LOW_BITS == 9 && MMIO_SPTE_GEN_HIGH_BITS == 9);
+
+#define MMIO_SPTE_GEN_LOW_SHIFT (MMIO_SPTE_GEN_LOW_START - 0)
+#define MMIO_SPTE_GEN_HIGH_SHIFT (MMIO_SPTE_GEN_HIGH_START - MMIO_SPTE_GEN_LOW_BITS)
+
+#define MMIO_SPTE_GEN_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_BITS + MMIO_SPTE_GEN_HIGH_BITS - 1, 0)
+
extern u64 __read_mostly shadow_nx_mask;
extern u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
extern u64 __read_mostly shadow_user_mask;
@@ -105,19 +116,19 @@ extern u64 __read_mostly shadow_acc_track_mask;
extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
/*
+ * The number of high-order 1 bits to use in the mask above.
+ */
+#define SHADOW_NONPRESENT_OR_RSVD_MASK_LEN 5
+
+/*
* The mask/shift to use for saving the original R/X bits when marking the PTE
* as not-present for access tracking purposes. We do not save the W bit as the
* PTEs being access tracked also need to be dirty tracked, so the W bit will be
* restored only when a write is attempted to the page.
*/
-static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK |
- PT64_EPT_EXECUTABLE_MASK;
-static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT;
-
-/*
- * The number of high-order 1 bits to use in the mask above.
- */
-static const u64 shadow_nonpresent_or_rsvd_mask_len = 5;
+#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (PT64_EPT_READABLE_MASK | \
+ PT64_EPT_EXECUTABLE_MASK)
+#define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT PT64_SECOND_AVAIL_BITS_SHIFT
/*
* In some cases, we need to preserve the GFN of a non-present or reserved
@@ -228,8 +239,8 @@ static inline u64 get_mmio_spte_generation(u64 spte)
{
u64 gen;
- gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START;
- gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START;
+ gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_SHIFT;
+ gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_SHIFT;
return gen;
}
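
To make the new split concrete, here is a stand-alone sketch (not kernel code) that packs and unpacks an 18-bit generation with the same LOW/HIGH shifts as above; the bit positions 3-11 and 54-62 are taken from the comment in spte.h.

#include <stdio.h>
#include <stdint.h>

#define GENMASK_ULL(h, l)	(((~0ULL) << (l)) & (~0ULL >> (63 - (h))))

#define GEN_LOW_START	3
#define GEN_LOW_END	11
#define GEN_HIGH_START	54
#define GEN_HIGH_END	62

#define GEN_LOW_MASK	GENMASK_ULL(GEN_LOW_END, GEN_LOW_START)
#define GEN_HIGH_MASK	GENMASK_ULL(GEN_HIGH_END, GEN_HIGH_START)
#define GEN_LOW_BITS	(GEN_LOW_END - GEN_LOW_START + 1)

#define GEN_LOW_SHIFT	(GEN_LOW_START - 0)
#define GEN_HIGH_SHIFT	(GEN_HIGH_START - GEN_LOW_BITS)

static uint64_t pack_gen(uint64_t gen)
{
	uint64_t spte;

	spte  = (gen << GEN_LOW_SHIFT) & GEN_LOW_MASK;	/* gen bits 0-8  */
	spte |= (gen << GEN_HIGH_SHIFT) & GEN_HIGH_MASK;	/* gen bits 9-17 */
	return spte;
}

static uint64_t unpack_gen(uint64_t spte)
{
	uint64_t gen;

	gen  = (spte & GEN_LOW_MASK) >> GEN_LOW_SHIFT;
	gen |= (spte & GEN_HIGH_MASK) >> GEN_HIGH_SHIFT;
	return gen;
}

int main(void)
{
	uint64_t gen = 0x2abcd & GENMASK_ULL(17, 0);	/* an 18-bit value */
	uint64_t spte = pack_gen(gen);

	printf("gen %#llx -> spte bits %#llx -> gen %#llx\n",
	       (unsigned long long)gen, (unsigned long long)spte,
	       (unsigned long long)unpack_gen(spte));
	return 0;
}
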
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 27e381c9da6c..b56d604809b8 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -7,6 +7,8 @@
#include "tdp_mmu.h"
#include "spte.h"
+#include <trace/events/kvm.h>
+
#ifdef CONFIG_X86_64
static bool __read_mostly tdp_mmu_enabled = false;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
@@ -42,14 +44,62 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
}
-#define for_each_tdp_mmu_root(_kvm, _root) \
+static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
+{
+ if (kvm_mmu_put_root(kvm, root))
+ kvm_tdp_mmu_free_root(kvm, root);
+}
+
+static inline bool tdp_mmu_next_root_valid(struct kvm *kvm,
+ struct kvm_mmu_page *root)
+{
+ lockdep_assert_held(&kvm->mmu_lock);
+
+ if (list_entry_is_head(root, &kvm->arch.tdp_mmu_roots, link))
+ return false;
+
+ kvm_mmu_get_root(kvm, root);
+ return true;
+
+}
+
+static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
+ struct kvm_mmu_page *root)
+{
+ struct kvm_mmu_page *next_root;
+
+ next_root = list_next_entry(root, link);
+ tdp_mmu_put_root(kvm, root);
+ return next_root;
+}
+
+/*
+ * Note: this iterator gets and puts references to the roots it iterates over.
+ * This makes it safe to release the MMU lock and yield within the loop, but
+ * if exiting the loop early, the caller must drop the reference to the most
+ * recent root. (Unless keeping a live reference is desirable.)
+ */
+#define for_each_tdp_mmu_root_yield_safe(_kvm, _root) \
+ for (_root = list_first_entry(&_kvm->arch.tdp_mmu_roots, \
+ typeof(*_root), link); \
+ tdp_mmu_next_root_valid(_kvm, _root); \
+ _root = tdp_mmu_next_root(_kvm, _root))
+
+#define for_each_tdp_mmu_root(_kvm, _root) \
list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)
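
The reference pattern behind the new iterator (pin the root being visited, read the next link, then drop the pin) can be seen in isolation in the following user-space sketch; the structure and helpers are illustrative stand-ins, not kernel API.

#include <stdio.h>
#include <stdlib.h>

struct root {
	struct root *next;
	int refcount;		/* the list itself holds one reference */
	int id;
};

static void put_root(struct root *r)
{
	if (--r->refcount == 0) {
		printf("root %d freed\n", r->id);
		free(r);
	}
}

int main(void)
{
	struct root *head = NULL, *r, *next;
	int i;

	for (i = 3; i >= 1; i--) {		/* build roots 1, 2, 3 */
		r = malloc(sizeof(*r));
		r->next = head;
		r->refcount = 1;		/* the list's own reference */
		r->id = i;
		head = r;
	}

	for (r = head; r; r = next) {
		r->refcount++;	/* pin the root before any work that may yield */
		printf("visiting root %d\n", r->id);
		next = r->next;	/* read the link before dropping the pin */
		put_root(r);	/* frees r only if the list reference is gone */
	}

	for (r = head; r; r = next) {		/* drop the list's references */
		next = r->next;
		put_root(r);
	}
	return 0;
}
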
bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa)
{
struct kvm_mmu_page *sp;
+ if (!kvm->arch.tdp_mmu_enabled)
+ return false;
+ if (WARN_ON(!VALID_PAGE(hpa)))
+ return false;
+
sp = to_shadow_page(hpa);
+ if (WARN_ON(!sp))
+ return false;
return sp->tdp_mmu_page && sp->root_count;
}
@@ -59,7 +109,7 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
- gfn_t max_gfn = 1ULL << (boot_cpu_data.x86_phys_bits - PAGE_SHIFT);
+ gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
lockdep_assert_held(&kvm->mmu_lock);
@@ -101,6 +151,8 @@ static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
sp->gfn = gfn;
sp->tdp_mmu_page = true;
+ trace_kvm_mmu_get_page(sp, true);
+
return sp;
}
@@ -178,7 +230,7 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
if ((!is_writable_pte(old_spte) || pfn_changed) &&
is_writable_pte(new_spte)) {
slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
- mark_page_dirty_in_slot(slot, gfn);
+ mark_page_dirty_in_slot(kvm, slot, gfn);
}
}
@@ -237,6 +289,8 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
if (old_spte == new_spte)
return;
+ trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
+
/*
* The only times a SPTE should be changed from a non-present to
* non-present state is when an MMIO entry is installed/modified/
@@ -271,6 +325,8 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
pt = spte_to_child_pt(old_spte, level);
sp = sptep_to_sp(pt);
+ trace_kvm_mmu_prepare_zap_page(sp);
+
list_del(&sp->link);
if (sp->lpage_disallowed)
@@ -432,24 +488,15 @@ bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end)
struct kvm_mmu_page *root;
bool flush = false;
- for_each_tdp_mmu_root(kvm, root) {
- /*
- * Take a reference on the root so that it cannot be freed if
- * this thread releases the MMU lock and yields in this loop.
- */
- kvm_mmu_get_root(kvm, root);
-
+ for_each_tdp_mmu_root_yield_safe(kvm, root)
flush |= zap_gfn_range(kvm, root, start, end, true);
- kvm_mmu_put_root(kvm, root);
- }
-
return flush;
}
void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
- gfn_t max_gfn = 1ULL << (boot_cpu_data.x86_phys_bits - PAGE_SHIFT);
+ gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
bool flush;
flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn);
@@ -473,11 +520,13 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
if (unlikely(is_noslot_pfn(pfn))) {
new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
trace_mark_mmio_spte(iter->sptep, iter->gfn, new_spte);
- } else
+ } else {
make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
pfn, iter->old_spte, prefault, true,
map_writable, !shadow_accessed_mask,
&new_spte);
+ trace_kvm_mmu_set_spte(iter->level, iter->gfn, iter->sptep);
+ }
if (new_spte == iter->old_spte)
ret = RET_PF_SPURIOUS;
@@ -602,13 +651,7 @@ static int kvm_tdp_mmu_handle_hva_range(struct kvm *kvm, unsigned long start,
int ret = 0;
int as_id;
- for_each_tdp_mmu_root(kvm, root) {
- /*
- * Take a reference on the root so that it cannot be freed if
- * this thread releases the MMU lock and yields in this loop.
- */
- kvm_mmu_get_root(kvm, root);
-
+ for_each_tdp_mmu_root_yield_safe(kvm, root) {
as_id = kvm_mmu_page_as_id(root);
slots = __kvm_memslots(kvm, as_id);
kvm_for_each_memslot(memslot, slots) {
@@ -630,8 +673,6 @@ static int kvm_tdp_mmu_handle_hva_range(struct kvm *kvm, unsigned long start,
ret |= handler(kvm, memslot, root, gfn_start,
gfn_end, data);
}
-
- kvm_mmu_put_root(kvm, root);
}
return ret;
@@ -691,6 +732,8 @@ static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot,
tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte);
young = 1;
+
+ trace_kvm_age_page(iter.gfn, iter.level, slot, young);
}
return young;
@@ -819,21 +862,13 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
int root_as_id;
bool spte_set = false;
- for_each_tdp_mmu_root(kvm, root) {
+ for_each_tdp_mmu_root_yield_safe(kvm, root) {
root_as_id = kvm_mmu_page_as_id(root);
if (root_as_id != slot->as_id)
continue;
- /*
- * Take a reference on the root so that it cannot be freed if
- * this thread releases the MMU lock and yields in this loop.
- */
- kvm_mmu_get_root(kvm, root);
-
spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
slot->base_gfn + slot->npages, min_level);
-
- kvm_mmu_put_root(kvm, root);
}
return spte_set;
@@ -887,21 +922,13 @@ bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
int root_as_id;
bool spte_set = false;
- for_each_tdp_mmu_root(kvm, root) {
+ for_each_tdp_mmu_root_yield_safe(kvm, root) {
root_as_id = kvm_mmu_page_as_id(root);
if (root_as_id != slot->as_id)
continue;
- /*
- * Take a reference on the root so that it cannot be freed if
- * this thread releases the MMU lock and yields in this loop.
- */
- kvm_mmu_get_root(kvm, root);
-
spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
slot->base_gfn + slot->npages);
-
- kvm_mmu_put_root(kvm, root);
}
return spte_set;
@@ -1010,28 +1037,20 @@ bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot)
int root_as_id;
bool spte_set = false;
- for_each_tdp_mmu_root(kvm, root) {
+ for_each_tdp_mmu_root_yield_safe(kvm, root) {
root_as_id = kvm_mmu_page_as_id(root);
if (root_as_id != slot->as_id)
continue;
- /*
- * Take a reference on the root so that it cannot be freed if
- * this thread releases the MMU lock and yields in this loop.
- */
- kvm_mmu_get_root(kvm, root);
-
spte_set |= set_dirty_gfn_range(kvm, root, slot->base_gfn,
slot->base_gfn + slot->npages);
-
- kvm_mmu_put_root(kvm, root);
}
return spte_set;
}
/*
- * Clear non-leaf entries (and free associated page tables) which could
- * be replaced by large mappings, for GFNs within the slot.
+ * Clear leaf entries which could be replaced by large mappings, for
+ * GFNs within the slot.
*/
static void zap_collapsible_spte_range(struct kvm *kvm,
struct kvm_mmu_page *root,
@@ -1043,7 +1062,7 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
tdp_root_for_each_pte(iter, root, start, end) {
if (!is_shadow_present_pte(iter.old_spte) ||
- is_last_spte(iter.old_spte, iter.level))
+ !is_last_spte(iter.old_spte, iter.level))
continue;
pfn = spte_to_pfn(iter.old_spte);
@@ -1070,21 +1089,13 @@ void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
struct kvm_mmu_page *root;
int root_as_id;
- for_each_tdp_mmu_root(kvm, root) {
+ for_each_tdp_mmu_root_yield_safe(kvm, root) {
root_as_id = kvm_mmu_page_as_id(root);
if (root_as_id != slot->as_id)
continue;
- /*
- * Take a reference on the root so that it cannot be freed if
- * this thread releases the MMU lock and yields in this loop.
- */
- kvm_mmu_get_root(kvm, root);
-
zap_collapsible_spte_range(kvm, root, slot->base_gfn,
slot->base_gfn + slot->npages);
-
- kvm_mmu_put_root(kvm, root);
}
}
@@ -1141,16 +1152,19 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
* Return the level of the lowest level SPTE added to sptes.
* That SPTE may be non-present.
*/
-int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes)
+int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
+ int *root_level)
{
struct tdp_iter iter;
struct kvm_mmu *mmu = vcpu->arch.mmu;
- int leaf = vcpu->arch.mmu->shadow_root_level;
gfn_t gfn = addr >> PAGE_SHIFT;
+ int leaf = -1;
+
+ *root_level = vcpu->arch.mmu->shadow_root_level;
tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
leaf = iter.level;
- sptes[leaf - 1] = iter.old_spte;
+ sptes[leaf] = iter.old_spte;
}
return leaf;
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 556e065503f6..cbbdbadd1526 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -44,5 +44,7 @@ void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn);
-int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes);
+int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
+ int *root_level);
+
#endif /* __KVM_X86_MMU_TDP_MMU_H */
diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c
index 7f0059aa30e1..f472fdb6ae7e 100644
--- a/arch/x86/kvm/mtrr.c
+++ b/arch/x86/kvm/mtrr.c
@@ -84,12 +84,8 @@ bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
} else
/* MTRR mask */
mask |= 0x7ff;
- if (data & mask) {
- kvm_inject_gp(vcpu, 0);
- return false;
- }
- return true;
+ return (data & mask) == 0;
}
EXPORT_SYMBOL_GPL(kvm_mtrr_valid);
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 8c550999ace0..0ef84d57b72e 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -233,7 +233,8 @@ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
*/
static int avic_update_access_page(struct kvm *kvm, bool activate)
{
- int ret = 0;
+ void __user *ret;
+ int r = 0;
mutex_lock(&kvm->slots_lock);
/*
@@ -249,13 +250,15 @@ static int avic_update_access_page(struct kvm *kvm, bool activate)
APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
APIC_DEFAULT_PHYS_BASE,
activate ? PAGE_SIZE : 0);
- if (ret)
+ if (IS_ERR(ret)) {
+ r = PTR_ERR(ret);
goto out;
+ }
kvm->arch.apic_access_page_done = activate;
out:
mutex_unlock(&kvm->slots_lock);
- return ret;
+ return r;
}
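
The error handling rework above relies on the kernel's ERR_PTR/IS_ERR convention, in which an errno value travels inside the returned pointer rather than in a separate int. A compact user-space sketch of that convention (the region helper below is an invented stand-in, not the real __x86_set_memory_region()):

#include <stdio.h>
#include <errno.h>

#define MAX_ERRNO	4095

static inline void *ERR_PTR(long error)	{ return (void *)error; }
static inline long PTR_ERR(const void *ptr)	{ return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

/* Stand-in that either returns a mapping or an error encoded as a pointer. */
static void *set_region(int fail)
{
	static char buf[16];

	return fail ? ERR_PTR(-ENOMEM) : buf;
}

int main(void)
{
	void *ret = set_region(1);
	int r = 0;

	if (IS_ERR(ret))
		r = PTR_ERR(ret);

	printf("r = %d\n", r);	/* prints -12 (ENOMEM) on Linux */
	return 0;
}
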
static int avic_init_backing_page(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 9e4c226dbf7d..db30670dd8c4 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -199,6 +199,10 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (WARN_ON(!is_guest_mode(vcpu)))
+ return true;
+
if (!nested_svm_vmrun_msrpm(svm)) {
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror =
@@ -227,6 +231,7 @@ static bool nested_vmcb_check_controls(struct vmcb_control_area *control)
static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb12)
{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
bool vmcb12_lma;
if ((vmcb12->save.efer & EFER_SVME) == 0)
@@ -240,21 +245,13 @@ static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb12)
vmcb12_lma = (vmcb12->save.efer & EFER_LME) && (vmcb12->save.cr0 & X86_CR0_PG);
- if (!vmcb12_lma) {
- if (vmcb12->save.cr4 & X86_CR4_PAE) {
- if (vmcb12->save.cr3 & MSR_CR3_LEGACY_PAE_RESERVED_MASK)
- return false;
- } else {
- if (vmcb12->save.cr3 & MSR_CR3_LEGACY_RESERVED_MASK)
- return false;
- }
- } else {
+ if (vmcb12_lma) {
if (!(vmcb12->save.cr4 & X86_CR4_PAE) ||
!(vmcb12->save.cr0 & X86_CR0_PE) ||
- (vmcb12->save.cr3 & MSR_CR3_LONG_MBZ_MASK))
+ (vmcb12->save.cr3 & vcpu->arch.cr3_lm_rsvd_bits))
return false;
}
- if (kvm_valid_cr4(&svm->vcpu, vmcb12->save.cr4))
+ if (!kvm_is_valid_cr4(&svm->vcpu, vmcb12->save.cr4))
return false;
return nested_vmcb_check_controls(&vmcb12->control);
@@ -381,7 +378,7 @@ static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
svm->vmcb->save.ds = vmcb12->save.ds;
svm->vmcb->save.gdtr = vmcb12->save.gdtr;
svm->vmcb->save.idtr = vmcb12->save.idtr;
- kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags);
+ kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED);
svm_set_efer(&svm->vcpu, vmcb12->save.efer);
svm_set_cr0(&svm->vcpu, vmcb12->save.cr0);
svm_set_cr4(&svm->vcpu, vmcb12->save.cr4);
@@ -394,8 +391,8 @@ static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
svm->vmcb->save.rax = vmcb12->save.rax;
svm->vmcb->save.rsp = vmcb12->save.rsp;
svm->vmcb->save.rip = vmcb12->save.rip;
- svm->vmcb->save.dr7 = vmcb12->save.dr7;
- svm->vcpu.arch.dr6 = vmcb12->save.dr6;
+ svm->vmcb->save.dr7 = vmcb12->save.dr7 | DR7_FIXED_1;
+ svm->vcpu.arch.dr6 = vmcb12->save.dr6 | DR6_FIXED_1 | DR6_RTM;
svm->vmcb->save.cpl = vmcb12->save.cpl;
}
@@ -595,6 +592,8 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
svm->nested.vmcb12_gpa = 0;
WARN_ON_ONCE(svm->nested.nested_run_pending);
+ kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, &svm->vcpu);
+
/* in case we halted in L2 */
svm->vcpu.arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -660,13 +659,14 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
svm->vmcb->save.gdtr = hsave->save.gdtr;
svm->vmcb->save.idtr = hsave->save.idtr;
- kvm_set_rflags(&svm->vcpu, hsave->save.rflags);
+ kvm_set_rflags(&svm->vcpu, hsave->save.rflags | X86_EFLAGS_FIXED);
svm_set_efer(&svm->vcpu, hsave->save.efer);
svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
svm_set_cr4(&svm->vcpu, hsave->save.cr4);
kvm_rax_write(&svm->vcpu, hsave->save.rax);
kvm_rsp_write(&svm->vcpu, hsave->save.rsp);
kvm_rip_write(&svm->vcpu, hsave->save.rip);
- svm->vmcb->save.dr7 = 0;
+ svm->vmcb->save.dr7 = DR7_FIXED_1;
svm->vmcb->save.cpl = 0;
svm->vmcb->control.exit_int_info = 0;
@@ -753,6 +753,7 @@ void svm_leave_nested(struct vcpu_svm *svm)
leave_guest_mode(&svm->vcpu);
copy_vmcb_control_area(&vmcb->control, &hsave->control);
nested_svm_uninit_mmu_context(&svm->vcpu);
+ vmcb_mark_all_dirty(svm->vmcb);
}
kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, &svm->vcpu);
@@ -1193,6 +1194,10 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
* in the registers, the save area of the nested state instead
* contains saved L1 state.
*/
+
+ svm->nested.nested_run_pending =
+ !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
+
copy_vmcb_control_area(&hsave->control, &svm->vmcb->control);
hsave->save = *save;
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index c0b14106258a..48017fef1cd9 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -14,10 +14,20 @@
#include <linux/psp-sev.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
+#include <linux/processor.h>
+#include <linux/trace_events.h>
+#include <asm/fpu/internal.h>
+
+#include <asm/trapnr.h>
#include "x86.h"
#include "svm.h"
+#include "cpuid.h"
+#include "trace.h"
+
+#define __ex(x) __kvm_handle_fault_on_reboot(x)
+static u8 sev_enc_bit;
static int sev_flush_asids(void);
static DECLARE_RWSEM(sev_deactivate_lock);
static DEFINE_MUTEX(sev_bitmap_lock);
@@ -25,7 +35,6 @@ unsigned int max_sev_asid;
static unsigned int min_sev_asid;
static unsigned long *sev_asid_bitmap;
static unsigned long *sev_reclaim_asid_bitmap;
-#define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT)
struct enc_region {
struct list_head list;
@@ -57,19 +66,19 @@ static int sev_flush_asids(void)
}
/* Must be called with the sev_bitmap_lock held */
-static bool __sev_recycle_asids(void)
+static bool __sev_recycle_asids(int min_asid, int max_asid)
{
int pos;
/* Check if there are any ASIDs to reclaim before performing a flush */
- pos = find_next_bit(sev_reclaim_asid_bitmap,
- max_sev_asid, min_sev_asid - 1);
- if (pos >= max_sev_asid)
+ pos = find_next_bit(sev_reclaim_asid_bitmap, max_sev_asid, min_asid);
+ if (pos >= max_asid)
return false;
if (sev_flush_asids())
return false;
+ /* The flush process will flush all reclaimable SEV and SEV-ES ASIDs */
bitmap_xor(sev_asid_bitmap, sev_asid_bitmap, sev_reclaim_asid_bitmap,
max_sev_asid);
bitmap_zero(sev_reclaim_asid_bitmap, max_sev_asid);
@@ -77,20 +86,23 @@ static bool __sev_recycle_asids(void)
return true;
}
-static int sev_asid_new(void)
+static int sev_asid_new(struct kvm_sev_info *sev)
{
+ int pos, min_asid, max_asid;
bool retry = true;
- int pos;
mutex_lock(&sev_bitmap_lock);
/*
- * SEV-enabled guest must use asid from min_sev_asid to max_sev_asid.
+ * SEV-enabled guests must use ASIDs from min_sev_asid to max_sev_asid.
+ * SEV-ES-enabled guests can use ASIDs from 1 to min_sev_asid - 1.
*/
+ min_asid = sev->es_active ? 0 : min_sev_asid - 1;
+ max_asid = sev->es_active ? min_sev_asid - 1 : max_sev_asid;
again:
- pos = find_next_zero_bit(sev_asid_bitmap, max_sev_asid, min_sev_asid - 1);
- if (pos >= max_sev_asid) {
- if (retry && __sev_recycle_asids()) {
+ pos = find_next_zero_bit(sev_asid_bitmap, max_sev_asid, min_asid);
+ if (pos >= max_asid) {
+ if (retry && __sev_recycle_asids(min_asid, max_asid)) {
retry = false;
goto again;
}
@@ -172,7 +184,7 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (unlikely(sev->active))
return ret;
- asid = sev_asid_new();
+ asid = sev_asid_new(sev);
if (asid < 0)
return ret;
@@ -191,6 +203,16 @@ e_free:
return ret;
}
+static int sev_es_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ if (!sev_es)
+ return -ENOTTY;
+
+ to_kvm_svm(kvm)->sev_info.es_active = true;
+
+ return sev_guest_init(kvm, argp);
+}
+
static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error)
{
struct sev_data_activate *data;
@@ -320,6 +342,8 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
unsigned long first, last;
int ret;
+ lockdep_assert_held(&kvm->lock);
+
if (ulen == 0 || uaddr + ulen < uaddr)
return ERR_PTR(-EINVAL);
@@ -490,6 +514,96 @@ e_free:
return ret;
}
+static int sev_es_sync_vmsa(struct vcpu_svm *svm)
+{
+ struct vmcb_save_area *save = &svm->vmcb->save;
+
+ /* Check some debug related fields before encrypting the VMSA */
+ if (svm->vcpu.guest_debug || (save->dr7 & ~DR7_FIXED_1))
+ return -EINVAL;
+
+ /* Sync registers */
+ save->rax = svm->vcpu.arch.regs[VCPU_REGS_RAX];
+ save->rbx = svm->vcpu.arch.regs[VCPU_REGS_RBX];
+ save->rcx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
+ save->rdx = svm->vcpu.arch.regs[VCPU_REGS_RDX];
+ save->rsp = svm->vcpu.arch.regs[VCPU_REGS_RSP];
+ save->rbp = svm->vcpu.arch.regs[VCPU_REGS_RBP];
+ save->rsi = svm->vcpu.arch.regs[VCPU_REGS_RSI];
+ save->rdi = svm->vcpu.arch.regs[VCPU_REGS_RDI];
+#ifdef CONFIG_X86_64
+ save->r8 = svm->vcpu.arch.regs[VCPU_REGS_R8];
+ save->r9 = svm->vcpu.arch.regs[VCPU_REGS_R9];
+ save->r10 = svm->vcpu.arch.regs[VCPU_REGS_R10];
+ save->r11 = svm->vcpu.arch.regs[VCPU_REGS_R11];
+ save->r12 = svm->vcpu.arch.regs[VCPU_REGS_R12];
+ save->r13 = svm->vcpu.arch.regs[VCPU_REGS_R13];
+ save->r14 = svm->vcpu.arch.regs[VCPU_REGS_R14];
+ save->r15 = svm->vcpu.arch.regs[VCPU_REGS_R15];
+#endif
+ save->rip = svm->vcpu.arch.regs[VCPU_REGS_RIP];
+
+ /* Sync some non-GPR registers before encrypting */
+ save->xcr0 = svm->vcpu.arch.xcr0;
+ save->pkru = svm->vcpu.arch.pkru;
+ save->xss = svm->vcpu.arch.ia32_xss;
+
+ /*
+ * SEV-ES will use a VMSA that is pointed to by the VMCB, not
+ * the traditional VMSA that is part of the VMCB. Copy the
+ * traditional VMSA as it has been built so far (in prep
+ * for LAUNCH_UPDATE_VMSA) to be the initial SEV-ES state.
+ */
+ memcpy(svm->vmsa, save, sizeof(*save));
+
+ return 0;
+}
+
+static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_launch_update_vmsa *vmsa;
+ int i, ret;
+
+ if (!sev_es_guest(kvm))
+ return -ENOTTY;
+
+ vmsa = kzalloc(sizeof(*vmsa), GFP_KERNEL);
+ if (!vmsa)
+ return -ENOMEM;
+
+ for (i = 0; i < kvm->created_vcpus; i++) {
+ struct vcpu_svm *svm = to_svm(kvm->vcpus[i]);
+
+ /* Perform some pre-encryption checks against the VMSA */
+ ret = sev_es_sync_vmsa(svm);
+ if (ret)
+ goto e_free;
+
+ /*
+ * The LAUNCH_UPDATE_VMSA command will perform in-place
+ * encryption of the VMSA memory content (i.e it will write
+ * the same memory region with the guest's key), so invalidate
+ * it first.
+ */
+ clflush_cache_range(svm->vmsa, PAGE_SIZE);
+
+ vmsa->handle = sev->handle;
+ vmsa->address = __sme_pa(svm->vmsa);
+ vmsa->len = PAGE_SIZE;
+ ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, vmsa,
+ &argp->error);
+ if (ret)
+ goto e_free;
+
+ svm->vcpu.arch.guest_state_protected = true;
+ }
+
+e_free:
+ kfree(vmsa);
+ return ret;
+}
+
static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
void __user *measure = (void __user *)(uintptr_t)argp->data;
@@ -642,8 +756,8 @@ static int __sev_dbg_decrypt(struct kvm *kvm, unsigned long src_paddr,
* Its safe to read more than we are asked, caller should ensure that
* destination has enough space.
*/
- src_paddr = round_down(src_paddr, 16);
offset = src_paddr & 15;
+ src_paddr = round_down(src_paddr, 16);
sz = round_up(sz + offset, 16);
return __sev_issue_dbg_cmd(kvm, src_paddr, dst_paddr, sz, err, false);
@@ -932,7 +1046,7 @@ int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
struct kvm_sev_cmd sev_cmd;
int r;
- if (!svm_sev_enabled())
+ if (!svm_sev_enabled() || !sev)
return -ENOTTY;
if (!argp)
@@ -947,12 +1061,18 @@ int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
case KVM_SEV_INIT:
r = sev_guest_init(kvm, &sev_cmd);
break;
+ case KVM_SEV_ES_INIT:
+ r = sev_es_guest_init(kvm, &sev_cmd);
+ break;
case KVM_SEV_LAUNCH_START:
r = sev_launch_start(kvm, &sev_cmd);
break;
case KVM_SEV_LAUNCH_UPDATE_DATA:
r = sev_launch_update_data(kvm, &sev_cmd);
break;
+ case KVM_SEV_LAUNCH_UPDATE_VMSA:
+ r = sev_launch_update_vmsa(kvm, &sev_cmd);
+ break;
case KVM_SEV_LAUNCH_MEASURE:
r = sev_launch_measure(kvm, &sev_cmd);
break;
@@ -1001,12 +1121,20 @@ int svm_register_enc_region(struct kvm *kvm,
if (!region)
return -ENOMEM;
+ mutex_lock(&kvm->lock);
region->pages = sev_pin_memory(kvm, range->addr, range->size, &region->npages, 1);
if (IS_ERR(region->pages)) {
ret = PTR_ERR(region->pages);
+ mutex_unlock(&kvm->lock);
goto e_free;
}
+ region->uaddr = range->addr;
+ region->size = range->size;
+
+ list_add_tail(&region->list, &sev->regions_list);
+ mutex_unlock(&kvm->lock);
+
/*
* The guest may change the memory encryption attribute from C=0 -> C=1
* or vice versa for this memory range. Lets make sure caches are
@@ -1015,13 +1143,6 @@ int svm_register_enc_region(struct kvm *kvm,
*/
sev_clflush_pages(region->pages, region->npages);
- region->uaddr = range->addr;
- region->size = range->size;
-
- mutex_lock(&kvm->lock);
- list_add_tail(&region->list, &sev->regions_list);
- mutex_unlock(&kvm->lock);
-
return ret;
e_free:
@@ -1125,49 +1246,61 @@ void sev_vm_destroy(struct kvm *kvm)
sev_asid_free(sev->asid);
}
-int __init sev_hardware_setup(void)
+void __init sev_hardware_setup(void)
{
- struct sev_user_data_status *status;
- int rc;
+ unsigned int eax, ebx, ecx, edx;
+ bool sev_es_supported = false;
+ bool sev_supported = false;
+
+ /* Does the CPU support SEV? */
+ if (!boot_cpu_has(X86_FEATURE_SEV))
+ goto out;
+
+ /* Retrieve SEV CPUID information */
+ cpuid(0x8000001f, &eax, &ebx, &ecx, &edx);
+
+ /* Set encryption bit location for SEV-ES guests */
+ sev_enc_bit = ebx & 0x3f;
/* Maximum number of encrypted guests supported simultaneously */
- max_sev_asid = cpuid_ecx(0x8000001F);
+ max_sev_asid = ecx;
if (!svm_sev_enabled())
- return 1;
+ goto out;
/* Minimum ASID value that should be used for SEV guest */
- min_sev_asid = cpuid_edx(0x8000001F);
+ min_sev_asid = edx;
/* Initialize SEV ASID bitmaps */
sev_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL);
if (!sev_asid_bitmap)
- return 1;
+ goto out;
sev_reclaim_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL);
if (!sev_reclaim_asid_bitmap)
- return 1;
+ goto out;
- status = kmalloc(sizeof(*status), GFP_KERNEL);
- if (!status)
- return 1;
+ pr_info("SEV supported: %u ASIDs\n", max_sev_asid - min_sev_asid + 1);
+ sev_supported = true;
- /*
- * Check SEV platform status.
- *
- * PLATFORM_STATUS can be called in any state, if we failed to query
- * the PLATFORM status then either PSP firmware does not support SEV
- * feature or SEV firmware is dead.
- */
- rc = sev_platform_status(status, NULL);
- if (rc)
- goto err;
+ /* SEV-ES support requested? */
+ if (!sev_es)
+ goto out;
- pr_info("SEV supported\n");
+ /* Does the CPU support SEV-ES? */
+ if (!boot_cpu_has(X86_FEATURE_SEV_ES))
+ goto out;
-err:
- kfree(status);
- return rc;
+ /* Has the system been allocated ASIDs for SEV-ES? */
+ if (min_sev_asid == 1)
+ goto out;
+
+ pr_info("SEV-ES supported: %u ASIDs\n", min_sev_asid - 1);
+ sev_es_supported = true;
+
+out:
+ sev = sev_supported;
+ sev_es = sev_es_supported;
}
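
The resulting ASID partitioning can be summarised with a tiny sketch (the numbers are made-up examples, not values read from hardware): SEV-ES guests draw from 1 .. min_sev_asid - 1 and SEV guests from min_sev_asid .. max_sev_asid, which is also why min_sev_asid == 1 above means there are no ASIDs left for SEV-ES.

#include <stdio.h>

int main(void)
{
	unsigned int min_sev_asid = 16, max_sev_asid = 509;	/* example values */

	printf("SEV ASIDs:    %u (%u..%u)\n",
	       max_sev_asid - min_sev_asid + 1, min_sev_asid, max_sev_asid);
	printf("SEV-ES ASIDs: %u (1..%u)\n",
	       min_sev_asid - 1, min_sev_asid - 1);
	return 0;
}
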
void sev_hardware_teardown(void)
@@ -1181,13 +1314,327 @@ void sev_hardware_teardown(void)
sev_flush_asids();
}
+/*
+ * Pages used by hardware to hold guest encrypted state must be flushed before
+ * returning them to the system.
+ */
+static void sev_flush_guest_memory(struct vcpu_svm *svm, void *va,
+ unsigned long len)
+{
+ /*
+ * If hardware enforced cache coherency for encrypted mappings of the
+ * same physical page is supported, nothing to do.
+ */
+ if (boot_cpu_has(X86_FEATURE_SME_COHERENT))
+ return;
+
+ /*
+ * If the VM Page Flush MSR is supported, use it to flush the page
+ * (using the page virtual address and the guest ASID).
+ */
+ if (boot_cpu_has(X86_FEATURE_VM_PAGE_FLUSH)) {
+ struct kvm_sev_info *sev;
+ unsigned long va_start;
+ u64 start, stop;
+
+ /* Align start and stop to page boundaries. */
+ va_start = (unsigned long)va;
+ start = (u64)va_start & PAGE_MASK;
+ stop = PAGE_ALIGN((u64)va_start + len);
+
+ if (start < stop) {
+ sev = &to_kvm_svm(svm->vcpu.kvm)->sev_info;
+
+ while (start < stop) {
+ wrmsrl(MSR_AMD64_VM_PAGE_FLUSH,
+ start | sev->asid);
+
+ start += PAGE_SIZE;
+ }
+
+ return;
+ }
+
+ WARN(1, "Address overflow, using WBINVD\n");
+ }
+
+ /*
+ * Hardware should always have one of the above features,
+ * but if not, use WBINVD and issue a warning.
+ */
+ WARN_ONCE(1, "Using WBINVD to flush guest memory\n");
+ wbinvd_on_all_cpus();
+}
+
+void sev_free_vcpu(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm;
+
+ if (!sev_es_guest(vcpu->kvm))
+ return;
+
+ svm = to_svm(vcpu);
+
+ if (vcpu->arch.guest_state_protected)
+ sev_flush_guest_memory(svm, svm->vmsa, PAGE_SIZE);
+ __free_page(virt_to_page(svm->vmsa));
+
+ if (svm->ghcb_sa_free)
+ kfree(svm->ghcb_sa);
+}
+
+static void dump_ghcb(struct vcpu_svm *svm)
+{
+ struct ghcb *ghcb = svm->ghcb;
+ unsigned int nbits;
+
+ /* Re-use the dump_invalid_vmcb module parameter */
+ if (!dump_invalid_vmcb) {
+ pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
+ return;
+ }
+
+ nbits = sizeof(ghcb->save.valid_bitmap) * 8;
+
+ pr_err("GHCB (GPA=%016llx):\n", svm->vmcb->control.ghcb_gpa);
+ pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_code",
+ ghcb->save.sw_exit_code, ghcb_sw_exit_code_is_valid(ghcb));
+ pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_1",
+ ghcb->save.sw_exit_info_1, ghcb_sw_exit_info_1_is_valid(ghcb));
+ pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_2",
+ ghcb->save.sw_exit_info_2, ghcb_sw_exit_info_2_is_valid(ghcb));
+ pr_err("%-20s%016llx is_valid: %u\n", "sw_scratch",
+ ghcb->save.sw_scratch, ghcb_sw_scratch_is_valid(ghcb));
+ pr_err("%-20s%*pb\n", "valid_bitmap", nbits, ghcb->save.valid_bitmap);
+}
+
+static void sev_es_sync_to_ghcb(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct ghcb *ghcb = svm->ghcb;
+
+ /*
+ * The GHCB protocol so far allows for the following data
+ * to be returned:
+ * GPRs RAX, RBX, RCX, RDX
+ *
+ * Copy their values, even if they may not have been written during the
+ * VM-Exit. It's the guest's responsibility to not consume random data.
+ */
+ ghcb_set_rax(ghcb, vcpu->arch.regs[VCPU_REGS_RAX]);
+ ghcb_set_rbx(ghcb, vcpu->arch.regs[VCPU_REGS_RBX]);
+ ghcb_set_rcx(ghcb, vcpu->arch.regs[VCPU_REGS_RCX]);
+ ghcb_set_rdx(ghcb, vcpu->arch.regs[VCPU_REGS_RDX]);
+}
+
+static void sev_es_sync_from_ghcb(struct vcpu_svm *svm)
+{
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct ghcb *ghcb = svm->ghcb;
+ u64 exit_code;
+
+ /*
+ * The GHCB protocol so far allows for the following data
+ * to be supplied:
+ * GPRs RAX, RBX, RCX, RDX
+ * XCR0
+ * CPL
+ *
+ * VMMCALL allows the guest to provide extra registers. KVM also
+ * expects RSI for hypercalls, so include that, too.
+ *
+ * Copy their values to the appropriate location if supplied.
+ */
+ memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
+
+ vcpu->arch.regs[VCPU_REGS_RAX] = ghcb_get_rax_if_valid(ghcb);
+ vcpu->arch.regs[VCPU_REGS_RBX] = ghcb_get_rbx_if_valid(ghcb);
+ vcpu->arch.regs[VCPU_REGS_RCX] = ghcb_get_rcx_if_valid(ghcb);
+ vcpu->arch.regs[VCPU_REGS_RDX] = ghcb_get_rdx_if_valid(ghcb);
+ vcpu->arch.regs[VCPU_REGS_RSI] = ghcb_get_rsi_if_valid(ghcb);
+
+ svm->vmcb->save.cpl = ghcb_get_cpl_if_valid(ghcb);
+
+ if (ghcb_xcr0_is_valid(ghcb)) {
+ vcpu->arch.xcr0 = ghcb_get_xcr0(ghcb);
+ kvm_update_cpuid_runtime(vcpu);
+ }
+
+ /* Copy the GHCB exit information into the VMCB fields */
+ exit_code = ghcb_get_sw_exit_code(ghcb);
+ control->exit_code = lower_32_bits(exit_code);
+ control->exit_code_hi = upper_32_bits(exit_code);
+ control->exit_info_1 = ghcb_get_sw_exit_info_1(ghcb);
+ control->exit_info_2 = ghcb_get_sw_exit_info_2(ghcb);
+
+ /* Clear the valid entries fields */
+ memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap));
+}
+
+static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu;
+ struct ghcb *ghcb;
+ u64 exit_code = 0;
+
+ ghcb = svm->ghcb;
+
+ /* Only GHCB Usage code 0 is supported */
+ if (ghcb->ghcb_usage)
+ goto vmgexit_err;
+
+ /*
+ * Retrieve the exit code now even though it may not be marked valid,
+ * as it could help with debugging.
+ */
+ exit_code = ghcb_get_sw_exit_code(ghcb);
+
+ if (!ghcb_sw_exit_code_is_valid(ghcb) ||
+ !ghcb_sw_exit_info_1_is_valid(ghcb) ||
+ !ghcb_sw_exit_info_2_is_valid(ghcb))
+ goto vmgexit_err;
+
+ switch (ghcb_get_sw_exit_code(ghcb)) {
+ case SVM_EXIT_READ_DR7:
+ break;
+ case SVM_EXIT_WRITE_DR7:
+ if (!ghcb_rax_is_valid(ghcb))
+ goto vmgexit_err;
+ break;
+ case SVM_EXIT_RDTSC:
+ break;
+ case SVM_EXIT_RDPMC:
+ if (!ghcb_rcx_is_valid(ghcb))
+ goto vmgexit_err;
+ break;
+ case SVM_EXIT_CPUID:
+ if (!ghcb_rax_is_valid(ghcb) ||
+ !ghcb_rcx_is_valid(ghcb))
+ goto vmgexit_err;
+ if (ghcb_get_rax(ghcb) == 0xd)
+ if (!ghcb_xcr0_is_valid(ghcb))
+ goto vmgexit_err;
+ break;
+ case SVM_EXIT_INVD:
+ break;
+ case SVM_EXIT_IOIO:
+ if (ghcb_get_sw_exit_info_1(ghcb) & SVM_IOIO_STR_MASK) {
+ if (!ghcb_sw_scratch_is_valid(ghcb))
+ goto vmgexit_err;
+ } else {
+ if (!(ghcb_get_sw_exit_info_1(ghcb) & SVM_IOIO_TYPE_MASK))
+ if (!ghcb_rax_is_valid(ghcb))
+ goto vmgexit_err;
+ }
+ break;
+ case SVM_EXIT_MSR:
+ if (!ghcb_rcx_is_valid(ghcb))
+ goto vmgexit_err;
+ if (ghcb_get_sw_exit_info_1(ghcb)) {
+ if (!ghcb_rax_is_valid(ghcb) ||
+ !ghcb_rdx_is_valid(ghcb))
+ goto vmgexit_err;
+ }
+ break;
+ case SVM_EXIT_VMMCALL:
+ if (!ghcb_rax_is_valid(ghcb) ||
+ !ghcb_cpl_is_valid(ghcb))
+ goto vmgexit_err;
+ break;
+ case SVM_EXIT_RDTSCP:
+ break;
+ case SVM_EXIT_WBINVD:
+ break;
+ case SVM_EXIT_MONITOR:
+ if (!ghcb_rax_is_valid(ghcb) ||
+ !ghcb_rcx_is_valid(ghcb) ||
+ !ghcb_rdx_is_valid(ghcb))
+ goto vmgexit_err;
+ break;
+ case SVM_EXIT_MWAIT:
+ if (!ghcb_rax_is_valid(ghcb) ||
+ !ghcb_rcx_is_valid(ghcb))
+ goto vmgexit_err;
+ break;
+ case SVM_VMGEXIT_MMIO_READ:
+ case SVM_VMGEXIT_MMIO_WRITE:
+ if (!ghcb_sw_scratch_is_valid(ghcb))
+ goto vmgexit_err;
+ break;
+ case SVM_VMGEXIT_NMI_COMPLETE:
+ case SVM_VMGEXIT_AP_HLT_LOOP:
+ case SVM_VMGEXIT_AP_JUMP_TABLE:
+ case SVM_VMGEXIT_UNSUPPORTED_EVENT:
+ break;
+ default:
+ goto vmgexit_err;
+ }
+
+ return 0;
+
+vmgexit_err:
+ vcpu = &svm->vcpu;
+
+ if (ghcb->ghcb_usage) {
+ vcpu_unimpl(vcpu, "vmgexit: ghcb usage %#x is not valid\n",
+ ghcb->ghcb_usage);
+ } else {
+ vcpu_unimpl(vcpu, "vmgexit: exit reason %#llx is not valid\n",
+ exit_code);
+ dump_ghcb(svm);
+ }
+
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
+ vcpu->run->internal.ndata = 2;
+ vcpu->run->internal.data[0] = exit_code;
+ vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
+
+ return -EINVAL;
+}
+
+static void pre_sev_es_run(struct vcpu_svm *svm)
+{
+ if (!svm->ghcb)
+ return;
+
+ if (svm->ghcb_sa_free) {
+ /*
+ * The scratch area lives outside the GHCB, so there is a
+ * buffer that, depending on the operation performed, may
+ * need to be synced, then freed.
+ */
+ if (svm->ghcb_sa_sync) {
+ kvm_write_guest(svm->vcpu.kvm,
+ ghcb_get_sw_scratch(svm->ghcb),
+ svm->ghcb_sa, svm->ghcb_sa_len);
+ svm->ghcb_sa_sync = false;
+ }
+
+ kfree(svm->ghcb_sa);
+ svm->ghcb_sa = NULL;
+ svm->ghcb_sa_free = false;
+ }
+
+ trace_kvm_vmgexit_exit(svm->vcpu.vcpu_id, svm->ghcb);
+
+ sev_es_sync_to_ghcb(svm);
+
+ kvm_vcpu_unmap(&svm->vcpu, &svm->ghcb_map, true);
+ svm->ghcb = NULL;
+}
+
void pre_sev_run(struct vcpu_svm *svm, int cpu)
{
struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
int asid = sev_get_asid(svm->vcpu.kvm);
+ /* Perform any SEV-ES pre-run actions */
+ pre_sev_es_run(svm);
+
/* Assign the asid allocated with this SEV guest */
- svm->vmcb->control.asid = asid;
+ svm->asid = asid;
/*
* Flush guest TLB:
@@ -1203,3 +1650,415 @@ void pre_sev_run(struct vcpu_svm *svm, int cpu)
svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
}
+
+#define GHCB_SCRATCH_AREA_LIMIT (16ULL * PAGE_SIZE)
+static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
+{
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ struct ghcb *ghcb = svm->ghcb;
+ u64 ghcb_scratch_beg, ghcb_scratch_end;
+ u64 scratch_gpa_beg, scratch_gpa_end;
+ void *scratch_va;
+
+ scratch_gpa_beg = ghcb_get_sw_scratch(ghcb);
+ if (!scratch_gpa_beg) {
+ pr_err("vmgexit: scratch gpa not provided\n");
+ return false;
+ }
+
+ scratch_gpa_end = scratch_gpa_beg + len;
+ if (scratch_gpa_end < scratch_gpa_beg) {
+ pr_err("vmgexit: scratch length (%#llx) not valid for scratch address (%#llx)\n",
+ len, scratch_gpa_beg);
+ return false;
+ }
+
+ if ((scratch_gpa_beg & PAGE_MASK) == control->ghcb_gpa) {
+ /* Scratch area begins within GHCB */
+ ghcb_scratch_beg = control->ghcb_gpa +
+ offsetof(struct ghcb, shared_buffer);
+ ghcb_scratch_end = control->ghcb_gpa +
+ offsetof(struct ghcb, reserved_1);
+
+ /*
+ * If the scratch area begins within the GHCB, it must be
+ * completely contained in the GHCB shared buffer area.
+ */
+ if (scratch_gpa_beg < ghcb_scratch_beg ||
+ scratch_gpa_end > ghcb_scratch_end) {
+ pr_err("vmgexit: scratch area is outside of GHCB shared buffer area (%#llx - %#llx)\n",
+ scratch_gpa_beg, scratch_gpa_end);
+ return false;
+ }
+
+ scratch_va = (void *)svm->ghcb;
+ scratch_va += (scratch_gpa_beg - control->ghcb_gpa);
+ } else {
+ /*
+ * The guest memory must be read into a kernel buffer, so
+ * limit the size
+ */
+ if (len > GHCB_SCRATCH_AREA_LIMIT) {
+ pr_err("vmgexit: scratch area exceeds KVM limits (%#llx requested, %#llx limit)\n",
+ len, GHCB_SCRATCH_AREA_LIMIT);
+ return false;
+ }
+ scratch_va = kzalloc(len, GFP_KERNEL);
+ if (!scratch_va)
+ return false;
+
+ if (kvm_read_guest(svm->vcpu.kvm, scratch_gpa_beg, scratch_va, len)) {
+ /* Unable to copy scratch area from guest */
+ pr_err("vmgexit: kvm_read_guest for scratch area failed\n");
+
+ kfree(scratch_va);
+ return false;
+ }
+
+ /*
+ * The scratch area is outside the GHCB. The operation will
+ * dictate whether the buffer needs to be synced before running
+ * the vCPU next time (i.e. a read was requested so the data
+ * must be written back to the guest memory).
+ */
+ svm->ghcb_sa_sync = sync;
+ svm->ghcb_sa_free = true;
+ }
+
+ svm->ghcb_sa = scratch_va;
+ svm->ghcb_sa_len = len;
+
+ return true;
+}
+
+static void set_ghcb_msr_bits(struct vcpu_svm *svm, u64 value, u64 mask,
+ unsigned int pos)
+{
+ svm->vmcb->control.ghcb_gpa &= ~(mask << pos);
+ svm->vmcb->control.ghcb_gpa |= (value & mask) << pos;
+}
+
+static u64 get_ghcb_msr_bits(struct vcpu_svm *svm, u64 mask, unsigned int pos)
+{
+ return (svm->vmcb->control.ghcb_gpa >> pos) & mask;
+}
+
+static void set_ghcb_msr(struct vcpu_svm *svm, u64 value)
+{
+ svm->vmcb->control.ghcb_gpa = value;
+}
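
The three helpers above are plain masked read-modify-write accessors on the 64-bit GHCB MSR value. A stand-alone sketch of the same operation (the 8-bit field at bit 16 is an invented layout for illustration, not the GHCB MSR protocol):

#include <stdio.h>
#include <stdint.h>

static void set_bits(uint64_t *reg, uint64_t value, uint64_t mask,
		     unsigned int pos)
{
	*reg &= ~(mask << pos);			/* clear the field */
	*reg |= (value & mask) << pos;		/* write the new value */
}

static uint64_t get_bits(uint64_t reg, uint64_t mask, unsigned int pos)
{
	return (reg >> pos) & mask;
}

int main(void)
{
	uint64_t msr = 0x1122334455667788ULL;

	set_bits(&msr, 0xab, 0xff, 16);
	printf("msr = %#llx, field = %#llx\n",
	       (unsigned long long)msr,
	       (unsigned long long)get_bits(msr, 0xff, 16));
	return 0;
}
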
+
+static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
+{
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ u64 ghcb_info;
+ int ret = 1;
+
+ ghcb_info = control->ghcb_gpa & GHCB_MSR_INFO_MASK;
+
+ trace_kvm_vmgexit_msr_protocol_enter(svm->vcpu.vcpu_id,
+ control->ghcb_gpa);
+
+ switch (ghcb_info) {
+ case GHCB_MSR_SEV_INFO_REQ:
+ set_ghcb_msr(svm, GHCB_MSR_SEV_INFO(GHCB_VERSION_MAX,
+ GHCB_VERSION_MIN,
+ sev_enc_bit));
+ break;
+ case GHCB_MSR_CPUID_REQ: {
+ u64 cpuid_fn, cpuid_reg, cpuid_value;
+
+ cpuid_fn = get_ghcb_msr_bits(svm,
+ GHCB_MSR_CPUID_FUNC_MASK,
+ GHCB_MSR_CPUID_FUNC_POS);
+
+ /* Initialize the registers needed by the CPUID intercept */
+ vcpu->arch.regs[VCPU_REGS_RAX] = cpuid_fn;
+ vcpu->arch.regs[VCPU_REGS_RCX] = 0;
+
+ ret = svm_invoke_exit_handler(svm, SVM_EXIT_CPUID);
+ if (!ret) {
+ ret = -EINVAL;
+ break;
+ }
+
+ cpuid_reg = get_ghcb_msr_bits(svm,
+ GHCB_MSR_CPUID_REG_MASK,
+ GHCB_MSR_CPUID_REG_POS);
+ if (cpuid_reg == 0)
+ cpuid_value = vcpu->arch.regs[VCPU_REGS_RAX];
+ else if (cpuid_reg == 1)
+ cpuid_value = vcpu->arch.regs[VCPU_REGS_RBX];
+ else if (cpuid_reg == 2)
+ cpuid_value = vcpu->arch.regs[VCPU_REGS_RCX];
+ else
+ cpuid_value = vcpu->arch.regs[VCPU_REGS_RDX];
+
+ set_ghcb_msr_bits(svm, cpuid_value,
+ GHCB_MSR_CPUID_VALUE_MASK,
+ GHCB_MSR_CPUID_VALUE_POS);
+
+ set_ghcb_msr_bits(svm, GHCB_MSR_CPUID_RESP,
+ GHCB_MSR_INFO_MASK,
+ GHCB_MSR_INFO_POS);
+ break;
+ }
+ case GHCB_MSR_TERM_REQ: {
+ u64 reason_set, reason_code;
+
+ reason_set = get_ghcb_msr_bits(svm,
+ GHCB_MSR_TERM_REASON_SET_MASK,
+ GHCB_MSR_TERM_REASON_SET_POS);
+ reason_code = get_ghcb_msr_bits(svm,
+ GHCB_MSR_TERM_REASON_MASK,
+ GHCB_MSR_TERM_REASON_POS);
+ pr_info("SEV-ES guest requested termination: %#llx:%#llx\n",
+ reason_set, reason_code);
+ fallthrough;
+ }
+ default:
+ ret = -EINVAL;
+ }
+
+ trace_kvm_vmgexit_msr_protocol_exit(svm->vcpu.vcpu_id,
+ control->ghcb_gpa, ret);
+
+ return ret;
+}
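Everything in the MSR protocol travels through the single 64-bit GHCB MSR: bits 0-11 carry the request/response code and the higher bits are positional fields, which is exactly what set_ghcb_msr_bits()/get_ghcb_msr_bits() shift in and out above. A small userspace sketch of the CPUID round trip; the constants mirror the GHCB_MSR_CPUID_* definitions this patch adds to svm.h, while the sample leaf, register and value are arbitrary:

/* Illustration only: encode a CPUID request and decode a CPUID response
 * the way the MSR-protocol handler above does.  Constants mirror the
 * GHCB_MSR_CPUID_* definitions added to svm.h by this patch.
 */
#include <stdint.h>
#include <stdio.h>

#define GHCB_MSR_INFO_MASK		((1ULL << 12) - 1)
#define GHCB_MSR_CPUID_REQ		0x004
#define GHCB_MSR_CPUID_RESP		0x005
#define GHCB_MSR_CPUID_FUNC_POS		32
#define GHCB_MSR_CPUID_REG_POS		30
#define GHCB_MSR_CPUID_VALUE_POS	32
#define GHCB_MSR_CPUID_VALUE_MASK	0xffffffffULL

int main(void)
{
	/* Guest side: request CPUID 0x8000001f, register EBX (reg code 1). */
	uint64_t req = GHCB_MSR_CPUID_REQ |
		       (1ULL << GHCB_MSR_CPUID_REG_POS) |
		       (0x8000001fULL << GHCB_MSR_CPUID_FUNC_POS);

	/* Host side: response code plus a sample register value. */
	uint64_t resp = GHCB_MSR_CPUID_RESP |
			(0x12345678ULL << GHCB_MSR_CPUID_VALUE_POS);

	printf("request  = %#llx\n", (unsigned long long)req);
	printf("response = %#llx (info %#llx, value %#llx)\n",
	       (unsigned long long)resp,
	       (unsigned long long)(resp & GHCB_MSR_INFO_MASK),
	       (unsigned long long)((resp >> GHCB_MSR_CPUID_VALUE_POS) &
				    GHCB_MSR_CPUID_VALUE_MASK));
	return 0;
}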
+
+int sev_handle_vmgexit(struct vcpu_svm *svm)
+{
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ u64 ghcb_gpa, exit_code;
+ struct ghcb *ghcb;
+ int ret;
+
+ /* Validate the GHCB */
+ ghcb_gpa = control->ghcb_gpa;
+ if (ghcb_gpa & GHCB_MSR_INFO_MASK)
+ return sev_handle_vmgexit_msr_protocol(svm);
+
+ if (!ghcb_gpa) {
+ vcpu_unimpl(&svm->vcpu, "vmgexit: GHCB gpa is not set\n");
+ return -EINVAL;
+ }
+
+ if (kvm_vcpu_map(&svm->vcpu, ghcb_gpa >> PAGE_SHIFT, &svm->ghcb_map)) {
+ /* Unable to map GHCB from guest */
+ vcpu_unimpl(&svm->vcpu, "vmgexit: error mapping GHCB [%#llx] from guest\n",
+ ghcb_gpa);
+ return -EINVAL;
+ }
+
+ svm->ghcb = svm->ghcb_map.hva;
+ ghcb = svm->ghcb_map.hva;
+
+ trace_kvm_vmgexit_enter(svm->vcpu.vcpu_id, ghcb);
+
+ exit_code = ghcb_get_sw_exit_code(ghcb);
+
+ ret = sev_es_validate_vmgexit(svm);
+ if (ret)
+ return ret;
+
+ sev_es_sync_from_ghcb(svm);
+ ghcb_set_sw_exit_info_1(ghcb, 0);
+ ghcb_set_sw_exit_info_2(ghcb, 0);
+
+ ret = -EINVAL;
+ switch (exit_code) {
+ case SVM_VMGEXIT_MMIO_READ:
+ if (!setup_vmgexit_scratch(svm, true, control->exit_info_2))
+ break;
+
+ ret = kvm_sev_es_mmio_read(&svm->vcpu,
+ control->exit_info_1,
+ control->exit_info_2,
+ svm->ghcb_sa);
+ break;
+ case SVM_VMGEXIT_MMIO_WRITE:
+ if (!setup_vmgexit_scratch(svm, false, control->exit_info_2))
+ break;
+
+ ret = kvm_sev_es_mmio_write(&svm->vcpu,
+ control->exit_info_1,
+ control->exit_info_2,
+ svm->ghcb_sa);
+ break;
+ case SVM_VMGEXIT_NMI_COMPLETE:
+ ret = svm_invoke_exit_handler(svm, SVM_EXIT_IRET);
+ break;
+ case SVM_VMGEXIT_AP_HLT_LOOP:
+ ret = kvm_emulate_ap_reset_hold(&svm->vcpu);
+ break;
+ case SVM_VMGEXIT_AP_JUMP_TABLE: {
+ struct kvm_sev_info *sev = &to_kvm_svm(svm->vcpu.kvm)->sev_info;
+
+ switch (control->exit_info_1) {
+ case 0:
+ /* Set AP jump table address */
+ sev->ap_jump_table = control->exit_info_2;
+ break;
+ case 1:
+ /* Get AP jump table address */
+ ghcb_set_sw_exit_info_2(ghcb, sev->ap_jump_table);
+ break;
+ default:
+ pr_err("svm: vmgexit: unsupported AP jump table request - exit_info_1=%#llx\n",
+ control->exit_info_1);
+ ghcb_set_sw_exit_info_1(ghcb, 1);
+ ghcb_set_sw_exit_info_2(ghcb,
+ X86_TRAP_UD |
+ SVM_EVTINJ_TYPE_EXEPT |
+ SVM_EVTINJ_VALID);
+ }
+
+ ret = 1;
+ break;
+ }
+ case SVM_VMGEXIT_UNSUPPORTED_EVENT:
+ vcpu_unimpl(&svm->vcpu,
+ "vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n",
+ control->exit_info_1, control->exit_info_2);
+ break;
+ default:
+ ret = svm_invoke_exit_handler(svm, exit_code);
+ }
+
+ return ret;
+}
+
+int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in)
+{
+ if (!setup_vmgexit_scratch(svm, in, svm->vmcb->control.exit_info_2))
+ return -EINVAL;
+
+ return kvm_sev_es_string_io(&svm->vcpu, size, port,
+ svm->ghcb_sa, svm->ghcb_sa_len, in);
+}
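When the scratch area has to be bounced through a kernel buffer, ghcb_sa_sync records that the data still needs to be copied back to the guest's scratch GPA before the vCPU runs again, and ghcb_sa_free that the buffer must be released; the write-back itself is not part of this hunk. A rough sketch of what that complementary step could look like (the function name is hypothetical; only kvm_write_guest(), ghcb_get_sw_scratch() and the ghcb_sa_* fields come from this patch):

/*
 * Hypothetical sketch, not the kernel's actual helper: flush a scratch
 * buffer that lives outside the GHCB back to guest memory and release
 * it, honoring the ghcb_sa_sync/ghcb_sa_free flags set by
 * setup_vmgexit_scratch().
 */
static void sev_es_flush_scratch(struct vcpu_svm *svm)
{
	if (!svm->ghcb_sa)
		return;

	if (svm->ghcb_sa_sync) {
		kvm_write_guest(svm->vcpu.kvm,
				ghcb_get_sw_scratch(svm->ghcb),
				svm->ghcb_sa, svm->ghcb_sa_len);
		svm->ghcb_sa_sync = false;
	}

	if (svm->ghcb_sa_free) {
		kfree(svm->ghcb_sa);
		svm->ghcb_sa_free = false;
	}

	svm->ghcb_sa = NULL;
}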
+
+void sev_es_init_vmcb(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+
+ svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ES_ENABLE;
+ svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
+
+ /*
+	 * An SEV-ES guest requires a VMSA area that is separate from the
+ * VMCB page. Do not include the encryption mask on the VMSA physical
+ * address since hardware will access it using the guest key.
+ */
+ svm->vmcb->control.vmsa_pa = __pa(svm->vmsa);
+
+ /* Can't intercept CR register access, HV can't modify CR registers */
+ svm_clr_intercept(svm, INTERCEPT_CR0_READ);
+ svm_clr_intercept(svm, INTERCEPT_CR4_READ);
+ svm_clr_intercept(svm, INTERCEPT_CR8_READ);
+ svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
+ svm_clr_intercept(svm, INTERCEPT_CR4_WRITE);
+ svm_clr_intercept(svm, INTERCEPT_CR8_WRITE);
+
+ svm_clr_intercept(svm, INTERCEPT_SELECTIVE_CR0);
+
+ /* Track EFER/CR register changes */
+ svm_set_intercept(svm, TRAP_EFER_WRITE);
+ svm_set_intercept(svm, TRAP_CR0_WRITE);
+ svm_set_intercept(svm, TRAP_CR4_WRITE);
+ svm_set_intercept(svm, TRAP_CR8_WRITE);
+
+ /* No support for enable_vmware_backdoor */
+ clr_exception_intercept(svm, GP_VECTOR);
+
+ /* Can't intercept XSETBV, HV can't modify XCR0 directly */
+ svm_clr_intercept(svm, INTERCEPT_XSETBV);
+
+ /* Clear intercepts on selected MSRs */
+ set_msr_interception(vcpu, svm->msrpm, MSR_EFER, 1, 1);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_CR_PAT, 1, 1);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
+}
+
+void sev_es_create_vcpu(struct vcpu_svm *svm)
+{
+ /*
+ * Set the GHCB MSR value as per the GHCB specification when creating
+ * a vCPU for an SEV-ES guest.
+ */
+ set_ghcb_msr(svm, GHCB_MSR_SEV_INFO(GHCB_VERSION_MAX,
+ GHCB_VERSION_MIN,
+ sev_enc_bit));
+}
+
+void sev_es_vcpu_load(struct vcpu_svm *svm, int cpu)
+{
+ struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
+ struct vmcb_save_area *hostsa;
+ unsigned int i;
+
+ /*
+	 * For an SEV-ES guest, hardware will restore the host state on VMEXIT,
+	 * of which one step is to perform a VMLOAD. Since hardware does not
+	 * perform a VMSAVE on VMRUN, the host save area must be updated.
+ */
+ asm volatile(__ex("vmsave %0") : : "a" (__sme_page_pa(sd->save_area)) : "memory");
+
+ /*
+	 * Certain MSRs are restored on VMEXIT; only save the ones that aren't
+ * restored.
+ */
+ for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) {
+ if (host_save_user_msrs[i].sev_es_restored)
+ continue;
+
+ rdmsrl(host_save_user_msrs[i].index, svm->host_user_msrs[i]);
+ }
+
+ /* XCR0 is restored on VMEXIT, save the current host value */
+ hostsa = (struct vmcb_save_area *)(page_address(sd->save_area) + 0x400);
+ hostsa->xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+
+	/* PKRU is restored on VMEXIT, save the current host value */
+ hostsa->pkru = read_pkru();
+
+	/* MSR_IA32_XSS is restored on VMEXIT, save the current host value */
+ hostsa->xss = host_xss;
+}
+
+void sev_es_vcpu_put(struct vcpu_svm *svm)
+{
+ unsigned int i;
+
+ /*
+ * Certain MSRs are restored on VMEXIT and were saved with vmsave in
+ * sev_es_vcpu_load() above. Only restore ones that weren't.
+ */
+ for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) {
+ if (host_save_user_msrs[i].sev_es_restored)
+ continue;
+
+ wrmsrl(host_save_user_msrs[i].index, svm->host_user_msrs[i]);
+ }
+}
+
+void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /* First SIPI: Use the values as initially set by the VMM */
+ if (!svm->received_first_sipi) {
+ svm->received_first_sipi = true;
+ return;
+ }
+
+ /*
+ * Subsequent SIPI: Return from an AP Reset Hold VMGEXIT, where
+ * the guest will set the CS and RIP. Set SW_EXIT_INFO_2 to a
+ * non-zero value.
+ */
+ ghcb_set_sw_exit_info_2(svm->ghcb, 1);
+}
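Together with the SVM_VMGEXIT_AP_HLT_LOOP and AP jump table handling in sev_handle_vmgexit(), the SIPI handler above forms a two-phase protocol: the first SIPI is swallowed because the VMM-provided VMSA already contains the AP's initial state, and any later SIPI completes the AP reset hold by making sw_exit_info_2 non-zero. A tiny userspace model of just that latch (illustration only, not kernel code):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Minimal model of sev_vcpu_deliver_sipi_vector()'s bookkeeping. */
struct fake_ap {
	bool received_first_sipi;
	uint64_t sw_exit_info_2;
};

static void deliver_sipi(struct fake_ap *ap)
{
	if (!ap->received_first_sipi) {
		ap->received_first_sipi = true;	/* keep VMM-provided state */
		return;
	}
	ap->sw_exit_info_2 = 1;			/* complete the AP reset hold */
}

int main(void)
{
	struct fake_ap ap = { 0 };

	deliver_sipi(&ap);
	printf("after 1st SIPI: sw_exit_info_2 = %llu\n",
	       (unsigned long long)ap.sw_exit_info_2);
	deliver_sipi(&ap);
	printf("after 2nd SIPI: sw_exit_info_2 = %llu\n",
	       (unsigned long long)ap.sw_exit_info_2);
	return 0;
}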
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 2f32fd09e259..3442d44ca53b 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -33,9 +33,9 @@
#include <asm/debugreg.h>
#include <asm/kvm_para.h>
#include <asm/irq_remapping.h>
-#include <asm/mce.h>
#include <asm/spec-ctrl.h>
#include <asm/cpu_device_id.h>
+#include <asm/traps.h>
#include <asm/virtext.h>
#include "trace.h"
@@ -90,7 +90,7 @@ static DEFINE_PER_CPU(u64, current_tsc_ratio);
static const struct svm_direct_access_msrs {
u32 index; /* Index of the MSR */
- bool always; /* True if intercept is always on */
+ bool always; /* True if intercept is initially cleared */
} direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = {
{ .index = MSR_STAR, .always = true },
{ .index = MSR_IA32_SYSENTER_CS, .always = true },
@@ -108,6 +108,9 @@ static const struct svm_direct_access_msrs {
{ .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
{ .index = MSR_IA32_LASTINTFROMIP, .always = false },
{ .index = MSR_IA32_LASTINTTOIP, .always = false },
+ { .index = MSR_EFER, .always = false },
+ { .index = MSR_IA32_CR_PAT, .always = false },
+ { .index = MSR_AMD64_SEV_ES_GHCB, .always = true },
{ .index = MSR_INVALID, .always = false },
};
@@ -187,10 +190,14 @@ static int vgif = true;
module_param(vgif, int, 0444);
/* enable/disable SEV support */
-static int sev = IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT);
+int sev = IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT);
module_param(sev, int, 0444);
-static bool __read_mostly dump_invalid_vmcb = 0;
+/* enable/disable SEV-ES support */
+int sev_es = IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT);
+module_param(sev_es, int, 0444);
+
+bool __read_mostly dump_invalid_vmcb;
module_param(dump_invalid_vmcb, bool, 0644);
static u8 rsm_ins_bytes[] = "\x0f\xaa";
@@ -336,6 +343,13 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ /*
+ * SEV-ES does not expose the next RIP. The RIP update is controlled by
+ * the type of exit and the #VC handler in the guest.
+ */
+ if (sev_es_guest(vcpu->kvm))
+ goto done;
+
if (nrips && svm->vmcb->control.next_rip != 0) {
WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
svm->next_rip = svm->vmcb->control.next_rip;
@@ -347,6 +361,8 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
} else {
kvm_rip_write(vcpu, svm->next_rip);
}
+
+done:
svm_set_interrupt_shadow(vcpu, 0);
return 1;
@@ -438,6 +454,11 @@ static int has_svm(void)
return 0;
}
+ if (sev_active()) {
+ pr_info("KVM is unsupported when running as an SEV guest\n");
+ return 0;
+ }
+
return 1;
}
@@ -484,7 +505,7 @@ static int svm_hardware_enable(void)
wrmsrl(MSR_EFER, efer | EFER_SVME);
- wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT);
+ wrmsrl(MSR_VM_HSAVE_PA, __sme_page_pa(sd->save_area));
if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
@@ -530,12 +551,12 @@ static int svm_hardware_enable(void)
static void svm_cpu_uninit(int cpu)
{
- struct svm_cpu_data *sd = per_cpu(svm_data, raw_smp_processor_id());
+ struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
if (!sd)
return;
- per_cpu(svm_data, raw_smp_processor_id()) = NULL;
+ per_cpu(svm_data, cpu) = NULL;
kfree(sd->sev_vmcbs);
__free_page(sd->save_area);
kfree(sd);
@@ -552,6 +573,7 @@ static int svm_cpu_init(int cpu)
sd->save_area = alloc_page(GFP_KERNEL);
if (!sd->save_area)
goto free_cpu_data;
+ clear_page(page_address(sd->save_area));
if (svm_sev_enabled()) {
sd->sev_vmcbs = kmalloc_array(max_sev_asid + 1,
@@ -662,8 +684,8 @@ static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm,
msrpm[offset] = tmp;
}
-static void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
- int read, int write)
+void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
+ int read, int write)
{
set_shadow_msr_intercept(vcpu, msr, read, write);
set_msr_interception_bitmap(vcpu, msrpm, msr, read, write);
@@ -959,15 +981,11 @@ static __init int svm_hardware_setup(void)
kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
}
- if (sev) {
- if (boot_cpu_has(X86_FEATURE_SEV) &&
- IS_ENABLED(CONFIG_KVM_AMD_SEV)) {
- r = sev_hardware_setup();
- if (r)
- sev = false;
- } else {
- sev = false;
- }
+ if (IS_ENABLED(CONFIG_KVM_AMD_SEV) && sev) {
+ sev_hardware_setup();
+ } else {
+ sev = false;
+ sev_es = false;
}
svm_adjust_mmio_mask();
@@ -1215,6 +1233,7 @@ static void init_vmcb(struct vcpu_svm *svm)
save->cr4 = 0;
}
svm->asid_generation = 0;
+ svm->asid = 0;
svm->nested.vmcb12_gpa = 0;
svm->vcpu.arch.hflags = 0;
@@ -1252,6 +1271,11 @@ static void init_vmcb(struct vcpu_svm *svm)
if (sev_guest(svm->vcpu.kvm)) {
svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE;
clr_exception_intercept(svm, UD_VECTOR);
+
+ if (sev_es_guest(svm->vcpu.kvm)) {
+ /* Perform SEV-ES specific VMCB updates */
+ sev_es_init_vmcb(svm);
+ }
}
vmcb_mark_all_dirty(svm->vmcb);
@@ -1288,6 +1312,7 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm;
struct page *vmcb_page;
+ struct page *vmsa_page = NULL;
int err;
BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0);
@@ -1298,9 +1323,27 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
if (!vmcb_page)
goto out;
+ if (sev_es_guest(svm->vcpu.kvm)) {
+ /*
+ * SEV-ES guests require a separate VMSA page used to contain
+ * the encrypted register state of the guest.
+ */
+ vmsa_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!vmsa_page)
+ goto error_free_vmcb_page;
+
+ /*
+ * SEV-ES guests maintain an encrypted version of their FPU
+ * state which is restored and saved on VMRUN and VMEXIT.
+ * Free the fpu structure to prevent KVM from attempting to
+ * access the FPU state.
+ */
+ kvm_free_guest_fpu(vcpu);
+ }
+
err = avic_init_vcpu(svm);
if (err)
- goto error_free_vmcb_page;
+ goto error_free_vmsa_page;
/* We initialize this flag to true to make sure that the is_running
* bit would be set the first time the vcpu is loaded.
@@ -1309,21 +1352,34 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
svm->avic_is_running = true;
svm->msrpm = svm_vcpu_alloc_msrpm();
- if (!svm->msrpm)
- goto error_free_vmcb_page;
+ if (!svm->msrpm) {
+ err = -ENOMEM;
+ goto error_free_vmsa_page;
+ }
svm_vcpu_init_msrpm(vcpu, svm->msrpm);
svm->vmcb = page_address(vmcb_page);
svm->vmcb_pa = __sme_set(page_to_pfn(vmcb_page) << PAGE_SHIFT);
+
+ if (vmsa_page)
+ svm->vmsa = page_address(vmsa_page);
+
svm->asid_generation = 0;
init_vmcb(svm);
svm_init_osvw(vcpu);
vcpu->arch.microcode_version = 0x01000065;
+ if (sev_es_guest(svm->vcpu.kvm))
+ /* Perform SEV-ES specific VMCB creation updates */
+ sev_es_create_vcpu(svm);
+
return 0;
+error_free_vmsa_page:
+ if (vmsa_page)
+ __free_page(vmsa_page);
error_free_vmcb_page:
__free_page(vmcb_page);
out:
@@ -1351,6 +1407,8 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
svm_free_nested(svm);
+ sev_free_vcpu(vcpu);
+
__free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT));
__free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
}
@@ -1366,15 +1424,20 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
vmcb_mark_all_dirty(svm->vmcb);
}
+ if (sev_es_guest(svm->vcpu.kvm)) {
+ sev_es_vcpu_load(svm, cpu);
+ } else {
#ifdef CONFIG_X86_64
- rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
+ rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
#endif
- savesegment(fs, svm->host.fs);
- savesegment(gs, svm->host.gs);
- svm->host.ldt = kvm_read_ldt();
+ savesegment(fs, svm->host.fs);
+ savesegment(gs, svm->host.gs);
+ svm->host.ldt = kvm_read_ldt();
- for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
- rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
+ for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
+ rdmsrl(host_save_user_msrs[i].index,
+ svm->host_user_msrs[i]);
+ }
if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
u64 tsc_ratio = vcpu->arch.tsc_scaling_ratio;
@@ -1402,18 +1465,24 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
avic_vcpu_put(vcpu);
++vcpu->stat.host_state_reload;
- kvm_load_ldt(svm->host.ldt);
+ if (sev_es_guest(svm->vcpu.kvm)) {
+ sev_es_vcpu_put(svm);
+ } else {
+ kvm_load_ldt(svm->host.ldt);
#ifdef CONFIG_X86_64
- loadsegment(fs, svm->host.fs);
- wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gsbase);
- load_gs_index(svm->host.gs);
+ loadsegment(fs, svm->host.fs);
+ wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gsbase);
+ load_gs_index(svm->host.gs);
#else
#ifdef CONFIG_X86_32_LAZY_GS
- loadsegment(gs, svm->host.gs);
+ loadsegment(gs, svm->host.gs);
#endif
#endif
- for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
- wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
+
+ for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
+ wrmsrl(host_save_user_msrs[i].index,
+ svm->host_user_msrs[i]);
+ }
}
static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
@@ -1631,9 +1700,18 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
static void update_cr0_intercept(struct vcpu_svm *svm)
{
- ulong gcr0 = svm->vcpu.arch.cr0;
- u64 *hcr0 = &svm->vmcb->save.cr0;
+ ulong gcr0;
+ u64 *hcr0;
+
+ /*
+ * SEV-ES guests must always keep the CR intercepts cleared. CR
+ * tracking is done using the CR write traps.
+ */
+ if (sev_es_guest(svm->vcpu.kvm))
+ return;
+ gcr0 = svm->vcpu.arch.cr0;
+ hcr0 = &svm->vmcb->save.cr0;
*hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
| (gcr0 & SVM_CR0_SELECTIVE_MASK);
@@ -1653,7 +1731,7 @@ void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
struct vcpu_svm *svm = to_svm(vcpu);
#ifdef CONFIG_X86_64
- if (vcpu->arch.efer & EFER_LME) {
+ if (vcpu->arch.efer & EFER_LME && !vcpu->arch.guest_state_protected) {
if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
vcpu->arch.efer |= EFER_LMA;
svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
@@ -1682,13 +1760,15 @@ void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
update_cr0_intercept(svm);
}
-int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+static bool svm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
- unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
- unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
+ return true;
+}
- if (cr4 & X86_CR4_VMXE)
- return 1;
+void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
+ unsigned long old_cr4 = vcpu->arch.cr4;
if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
svm_flush_tlb(vcpu);
@@ -1699,7 +1779,9 @@ int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
cr4 |= host_cr4_mce;
to_svm(vcpu)->vmcb->save.cr4 = cr4;
vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
- return 0;
+
+ if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
+ kvm_update_cpuid_runtime(vcpu);
}
static void svm_set_segment(struct kvm_vcpu *vcpu,
@@ -1751,18 +1833,20 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
++sd->asid_generation;
sd->next_asid = sd->min_asid;
svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
+ vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
}
svm->asid_generation = sd->asid_generation;
- svm->vmcb->control.asid = sd->next_asid++;
-
- vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
+ svm->asid = sd->next_asid++;
}
static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value)
{
struct vmcb *vmcb = svm->vmcb;
+ if (svm->vcpu.arch.guest_state_protected)
+ return;
+
if (unlikely(value != vmcb->save.dr6)) {
vmcb->save.dr6 = value;
vmcb_mark_dirty(vmcb, VMCB_DR);
@@ -1773,6 +1857,9 @@ static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ if (vcpu->arch.guest_state_protected)
+ return;
+
get_debugreg(vcpu->arch.db[0], 0);
get_debugreg(vcpu->arch.db[1], 1);
get_debugreg(vcpu->arch.db[2], 2);
@@ -1791,6 +1878,9 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ if (vcpu->arch.guest_state_protected)
+ return;
+
svm->vmcb->save.dr7 = value;
vmcb_mark_dirty(svm->vmcb, VMCB_DR);
}
@@ -1929,25 +2019,6 @@ static bool is_erratum_383(void)
return true;
}
-/*
- * Trigger machine check on the host. We assume all the MSRs are already set up
- * by the CPU and that we still run on the same CPU as the MCE occurred on.
- * We pass a fake environment to the machine check handler because we want
- * the guest to be always treated like user space, no matter what context
- * it used internally.
- */
-static void kvm_machine_check(void)
-{
-#if defined(CONFIG_X86_MCE)
- struct pt_regs regs = {
- .cs = 3, /* Fake ring 3 no matter what the guest ran on */
- .flags = X86_EFLAGS_IF,
- };
-
- do_machine_check(&regs);
-#endif
-}
-
static void svm_handle_mce(struct vcpu_svm *svm)
{
if (is_erratum_383()) {
@@ -1979,6 +2050,13 @@ static int shutdown_interception(struct vcpu_svm *svm)
struct kvm_run *kvm_run = svm->vcpu.run;
/*
+ * The VM save area has already been encrypted so it
+ * cannot be reinitialized - just terminate.
+ */
+ if (sev_es_guest(svm->vcpu.kvm))
+ return -EINVAL;
+
+ /*
* VMCB is undefined after a SHUTDOWN intercept
* so reinitialize it.
*/
@@ -1999,11 +2077,16 @@ static int io_interception(struct vcpu_svm *svm)
++svm->vcpu.stat.io_exits;
string = (io_info & SVM_IOIO_STR_MASK) != 0;
in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
- if (string)
- return kvm_emulate_instruction(vcpu, 0);
-
port = io_info >> 16;
size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
+
+ if (string) {
+ if (sev_es_guest(vcpu->kvm))
+ return sev_es_string_io(svm, size, port, in);
+ else
+ return kvm_emulate_instruction(vcpu, 0);
+ }
+
svm->next_rip = svm->vmcb->control.exit_info_2;
return kvm_fast_pio(&svm->vcpu, size, port, in);
@@ -2267,9 +2350,11 @@ static int cpuid_interception(struct vcpu_svm *svm)
static int iret_interception(struct vcpu_svm *svm)
{
++svm->vcpu.stat.nmi_window_exits;
- svm_clr_intercept(svm, INTERCEPT_IRET);
svm->vcpu.arch.hflags |= HF_IRET_MASK;
- svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu);
+ if (!sev_es_guest(svm->vcpu.kvm)) {
+ svm_clr_intercept(svm, INTERCEPT_IRET);
+ svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu);
+ }
kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
return 1;
}
@@ -2406,6 +2491,41 @@ static int cr_interception(struct vcpu_svm *svm)
return kvm_complete_insn_gp(&svm->vcpu, err);
}
+static int cr_trap(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ unsigned long old_value, new_value;
+ unsigned int cr;
+ int ret = 0;
+
+ new_value = (unsigned long)svm->vmcb->control.exit_info_1;
+
+ cr = svm->vmcb->control.exit_code - SVM_EXIT_CR0_WRITE_TRAP;
+ switch (cr) {
+ case 0:
+ old_value = kvm_read_cr0(vcpu);
+ svm_set_cr0(vcpu, new_value);
+
+ kvm_post_set_cr0(vcpu, old_value, new_value);
+ break;
+ case 4:
+ old_value = kvm_read_cr4(vcpu);
+ svm_set_cr4(vcpu, new_value);
+
+ kvm_post_set_cr4(vcpu, old_value, new_value);
+ break;
+ case 8:
+ ret = kvm_set_cr8(&svm->vcpu, new_value);
+ break;
+ default:
+ WARN(1, "unhandled CR%d write trap", cr);
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ return kvm_complete_insn_gp(vcpu, ret);
+}
+
static int dr_interception(struct vcpu_svm *svm)
{
int reg, dr;
@@ -2459,6 +2579,25 @@ static int cr8_write_interception(struct vcpu_svm *svm)
return 0;
}
+static int efer_trap(struct vcpu_svm *svm)
+{
+ struct msr_data msr_info;
+ int ret;
+
+ /*
+ * Clear the EFER_SVME bit from EFER. The SVM code always sets this
+ * bit in svm_set_efer(), but __kvm_valid_efer() checks it against
+ * whether the guest has X86_FEATURE_SVM - this avoids a failure if
+ * the guest doesn't have X86_FEATURE_SVM.
+ */
+ msr_info.host_initiated = false;
+ msr_info.index = MSR_EFER;
+ msr_info.data = svm->vmcb->control.exit_info_1 & ~EFER_SVME;
+ ret = kvm_set_msr_common(&svm->vcpu, &msr_info);
+
+ return kvm_complete_insn_gp(&svm->vcpu, ret);
+}
+
static int svm_get_msr_feature(struct kvm_msr_entry *msr)
{
msr->data = 0;
@@ -2541,10 +2680,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case MSR_IA32_SPEC_CTRL:
if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
- !guest_cpuid_has(vcpu, X86_FEATURE_AMD_STIBP) &&
- !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) &&
- !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD))
+ !guest_has_spec_ctrl_msr(vcpu))
return 1;
msr_info->data = svm->spec_ctrl;
@@ -2582,6 +2718,20 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 0;
}
+static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ if (!sev_es_guest(svm->vcpu.kvm) || !err)
+ return kvm_complete_insn_gp(&svm->vcpu, err);
+
+ ghcb_set_sw_exit_info_1(svm->ghcb, 1);
+ ghcb_set_sw_exit_info_2(svm->ghcb,
+ X86_TRAP_GP |
+ SVM_EVTINJ_TYPE_EXEPT |
+ SVM_EVTINJ_VALID);
+ return 1;
+}
+
static int rdmsr_interception(struct vcpu_svm *svm)
{
return kvm_emulate_rdmsr(&svm->vcpu);
@@ -2628,10 +2778,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
break;
case MSR_IA32_SPEC_CTRL:
if (!msr->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
- !guest_cpuid_has(vcpu, X86_FEATURE_AMD_STIBP) &&
- !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) &&
- !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD))
+ !guest_has_spec_ctrl_msr(vcpu))
return 1;
if (kvm_spec_ctrl_test_value(data))
@@ -2656,12 +2803,12 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
break;
case MSR_IA32_PRED_CMD:
if (!msr->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB))
+ !guest_has_pred_cmd_msr(vcpu))
return 1;
if (data & ~PRED_CMD_IBPB)
return 1;
- if (!boot_cpu_has(X86_FEATURE_AMD_IBPB))
+ if (!boot_cpu_has(X86_FEATURE_IBPB))
return 1;
if (!data)
break;
@@ -2803,7 +2950,14 @@ static int interrupt_window_interception(struct vcpu_svm *svm)
static int pause_interception(struct vcpu_svm *svm)
{
struct kvm_vcpu *vcpu = &svm->vcpu;
- bool in_kernel = (svm_get_cpl(vcpu) == 0);
+ bool in_kernel;
+
+ /*
+	 * CPL is not made available for an SEV-ES guest; therefore,
+ * vcpu->arch.preempted_in_kernel can never be true. Just
+ * set in_kernel to false as well.
+ */
+ in_kernel = !sev_es_guest(svm->vcpu.kvm) && svm_get_cpl(vcpu) == 0;
if (!kvm_pause_in_guest(vcpu->kvm))
grow_ple_window(vcpu);
@@ -2918,11 +3072,16 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_MWAIT] = mwait_interception,
[SVM_EXIT_XSETBV] = xsetbv_interception,
[SVM_EXIT_RDPRU] = rdpru_interception,
+ [SVM_EXIT_EFER_WRITE_TRAP] = efer_trap,
+ [SVM_EXIT_CR0_WRITE_TRAP] = cr_trap,
+ [SVM_EXIT_CR4_WRITE_TRAP] = cr_trap,
+ [SVM_EXIT_CR8_WRITE_TRAP] = cr_trap,
[SVM_EXIT_INVPCID] = invpcid_interception,
[SVM_EXIT_NPF] = npf_interception,
[SVM_EXIT_RSM] = rsm_interception,
[SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception,
[SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception,
+ [SVM_EXIT_VMGEXIT] = sev_handle_vmgexit,
};
static void dump_vmcb(struct kvm_vcpu *vcpu)
@@ -2964,6 +3123,7 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar);
+ pr_err("%-20s%016llx\n", "ghcb:", control->ghcb_gpa);
pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext);
@@ -2971,6 +3131,7 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page);
pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id);
pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id);
+ pr_err("%-20s%016llx\n", "vmsa_pa:", control->vmsa_pa);
pr_err("VMCB State Save Area:\n");
pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
"es:",
@@ -3043,6 +3204,43 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
"excp_to:", save->last_excp_to);
}
+static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code)
+{
+ if (exit_code < ARRAY_SIZE(svm_exit_handlers) &&
+ svm_exit_handlers[exit_code])
+ return 0;
+
+ vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%llx\n", exit_code);
+ dump_vmcb(vcpu);
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
+ vcpu->run->internal.ndata = 2;
+ vcpu->run->internal.data[0] = exit_code;
+ vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
+
+ return -EINVAL;
+}
+
+int svm_invoke_exit_handler(struct vcpu_svm *svm, u64 exit_code)
+{
+ if (svm_handle_invalid_exit(&svm->vcpu, exit_code))
+ return 0;
+
+#ifdef CONFIG_RETPOLINE
+ if (exit_code == SVM_EXIT_MSR)
+ return msr_interception(svm);
+ else if (exit_code == SVM_EXIT_VINTR)
+ return interrupt_window_interception(svm);
+ else if (exit_code == SVM_EXIT_INTR)
+ return intr_interception(svm);
+ else if (exit_code == SVM_EXIT_HLT)
+ return halt_interception(svm);
+ else if (exit_code == SVM_EXIT_NPF)
+ return npf_interception(svm);
+#endif
+ return svm_exit_handlers[exit_code](svm);
+}
+
static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2,
u32 *intr_info, u32 *error_code)
{
@@ -3066,10 +3264,13 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
- if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE))
- vcpu->arch.cr0 = svm->vmcb->save.cr0;
- if (npt_enabled)
- vcpu->arch.cr3 = svm->vmcb->save.cr3;
+ /* SEV-ES guests must use the CR write traps to track CR registers. */
+ if (!sev_es_guest(vcpu->kvm)) {
+ if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE))
+ vcpu->arch.cr0 = svm->vmcb->save.cr0;
+ if (npt_enabled)
+ vcpu->arch.cr3 = svm->vmcb->save.cr3;
+ }
if (is_guest_mode(vcpu)) {
int vmexit;
@@ -3106,32 +3307,7 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
if (exit_fastpath != EXIT_FASTPATH_NONE)
return 1;
- if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
- || !svm_exit_handlers[exit_code]) {
- vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%x\n", exit_code);
- dump_vmcb(vcpu);
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror =
- KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
- vcpu->run->internal.ndata = 2;
- vcpu->run->internal.data[0] = exit_code;
- vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
- return 0;
- }
-
-#ifdef CONFIG_RETPOLINE
- if (exit_code == SVM_EXIT_MSR)
- return msr_interception(svm);
- else if (exit_code == SVM_EXIT_VINTR)
- return interrupt_window_interception(svm);
- else if (exit_code == SVM_EXIT_INTR)
- return intr_interception(svm);
- else if (exit_code == SVM_EXIT_HLT)
- return halt_interception(svm);
- else if (exit_code == SVM_EXIT_NPF)
- return npf_interception(svm);
-#endif
- return svm_exit_handlers[exit_code](svm);
+ return svm_invoke_exit_handler(svm, exit_code);
}
static void reload_tss(struct kvm_vcpu *vcpu)
@@ -3160,7 +3336,8 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu)
svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
vcpu->arch.hflags |= HF_NMI_MASK;
- svm_set_intercept(svm, INTERCEPT_IRET);
+ if (!sev_es_guest(svm->vcpu.kvm))
+ svm_set_intercept(svm, INTERCEPT_IRET);
++vcpu->stat.nmi_injections;
}
@@ -3181,6 +3358,13 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ /*
+ * SEV-ES guests must always keep the CR intercepts cleared. CR
+ * tracking is done using the CR write traps.
+ */
+ if (sev_es_guest(vcpu->kvm))
+ return;
+
if (nested_svm_virtualize_tpr(vcpu))
return;
@@ -3237,10 +3421,12 @@ static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
if (masked) {
svm->vcpu.arch.hflags |= HF_NMI_MASK;
- svm_set_intercept(svm, INTERCEPT_IRET);
+ if (!sev_es_guest(svm->vcpu.kvm))
+ svm_set_intercept(svm, INTERCEPT_IRET);
} else {
svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
- svm_clr_intercept(svm, INTERCEPT_IRET);
+ if (!sev_es_guest(svm->vcpu.kvm))
+ svm_clr_intercept(svm, INTERCEPT_IRET);
}
}
@@ -3252,7 +3438,14 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
if (!gif_set(svm))
return true;
- if (is_guest_mode(vcpu)) {
+ if (sev_es_guest(svm->vcpu.kvm)) {
+ /*
+		 * SEV-ES guests do not expose RFLAGS. Use the VMCB interrupt mask
+ * bit to determine the state of the IF flag.
+ */
+ if (!(vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK))
+ return true;
+ } else if (is_guest_mode(vcpu)) {
/* As long as interrupts are being delivered... */
if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK)
? !(svm->nested.hsave->save.rflags & X86_EFLAGS_IF)
@@ -3411,8 +3604,9 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
* If we've made progress since setting HF_IRET_MASK, we've
* executed an IRET and can allow NMI injection.
*/
- if ((svm->vcpu.arch.hflags & HF_IRET_MASK)
- && kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip) {
+ if ((svm->vcpu.arch.hflags & HF_IRET_MASK) &&
+ (sev_es_guest(svm->vcpu.kvm) ||
+ kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip)) {
svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
}
@@ -3435,6 +3629,12 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
break;
case SVM_EXITINTINFO_TYPE_EXEPT:
/*
+ * Never re-inject a #VC exception.
+ */
+ if (vector == X86_TRAP_VC)
+ break;
+
+ /*
* In case of software exceptions, do not reinject the vector,
* but re-execute the instruction instead. Rewind RIP first
* if we emulated INT3 before.
@@ -3482,8 +3682,6 @@ static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
return EXIT_FASTPATH_NONE;
}
-void __svm_vcpu_run(unsigned long vmcb_pa, unsigned long *regs);
-
static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu,
struct vcpu_svm *svm)
{
@@ -3507,16 +3705,20 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu,
guest_enter_irqoff();
lockdep_hardirqs_on(CALLER_ADDR0);
- __svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs);
+ if (sev_es_guest(svm->vcpu.kvm)) {
+ __svm_sev_es_vcpu_run(svm->vmcb_pa);
+ } else {
+ __svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs);
#ifdef CONFIG_X86_64
- native_wrmsrl(MSR_GS_BASE, svm->host.gs_base);
+ native_wrmsrl(MSR_GS_BASE, svm->host.gs_base);
#else
- loadsegment(fs, svm->host.fs);
+ loadsegment(fs, svm->host.fs);
#ifndef CONFIG_X86_32_LAZY_GS
- loadsegment(gs, svm->host.gs);
+ loadsegment(gs, svm->host.gs);
#endif
#endif
+ }
/*
* VMEXIT disables interrupts (host state), but tracing and lockdep
@@ -3542,6 +3744,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ trace_kvm_entry(vcpu);
+
svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
@@ -3566,6 +3770,10 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
sync_lapic_to_cr8(vcpu);
+ if (unlikely(svm->asid != svm->vmcb->control.asid)) {
+ svm->vmcb->control.asid = svm->asid;
+ vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
+ }
svm->vmcb->save.cr2 = vcpu->arch.cr2;
/*
@@ -3610,14 +3818,17 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
- reload_tss(vcpu);
+ if (!sev_es_guest(svm->vcpu.kvm))
+ reload_tss(vcpu);
x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl);
- vcpu->arch.cr2 = svm->vmcb->save.cr2;
- vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
- vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
- vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
+ if (!sev_es_guest(svm->vcpu.kvm)) {
+ vcpu->arch.cr2 = svm->vmcb->save.cr2;
+ vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
+ vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
+ vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
+ }
if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
kvm_before_interrupt(&svm->vcpu);
@@ -3720,12 +3931,21 @@ static bool svm_cpu_has_accelerated_tpr(void)
return false;
}
-static bool svm_has_emulated_msr(u32 index)
+/*
+ * The kvm parameter can be NULL (module initialization, or invocation before
+ * VM creation). Be sure to check the kvm parameter before using it.
+ */
+static bool svm_has_emulated_msr(struct kvm *kvm, u32 index)
{
switch (index) {
case MSR_IA32_MCG_EXT_CTL:
case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
return false;
+ case MSR_IA32_SMBASE:
+ /* SEV-ES guests do not support SMM, so report false */
+ if (kvm && sev_es_guest(kvm))
+ return false;
+ break;
default:
break;
}
@@ -3741,6 +3961,7 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ struct kvm_cpuid_entry2 *best;
vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
boot_cpu_has(X86_FEATURE_XSAVE) &&
@@ -3753,6 +3974,13 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
 	/* Check again if INVPCID interception is required */
svm_check_invpcid(svm);
+ /* For sev guests, the memory encryption bit is not reserved in CR3. */
+ if (sev_guest(vcpu->kvm)) {
+ best = kvm_find_cpuid_entry(vcpu, 0x8000001F, 0);
+ if (best)
+ vcpu->arch.cr3_lm_rsvd_bits &= ~(1UL << (best->ebx & 0x3f));
+ }
+
if (!kvm_vcpu_apicv_active(vcpu))
return;
@@ -4076,6 +4304,12 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int i
unsigned long cr4;
/*
+ * When the guest is an SEV-ES guest, emulation is not possible.
+ */
+ if (sev_es_guest(vcpu->kvm))
+ return false;
+
+ /*
* Detect and workaround Errata 1096 Fam_17h_00_0Fh.
*
* Errata:
@@ -4155,6 +4389,14 @@ static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
(vmcb_is_intercept(&svm->vmcb->control, INTERCEPT_INIT));
}
+static void svm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
+{
+ if (!sev_es_guest(vcpu->kvm))
+ return kvm_vcpu_deliver_sipi_vector(vcpu, vector);
+
+ sev_vcpu_deliver_sipi_vector(vcpu, vector);
+}
+
static void svm_vm_destroy(struct kvm *kvm)
{
avic_vm_destroy(kvm);
@@ -4207,6 +4449,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.get_cpl = svm_get_cpl,
.get_cs_db_l_bits = kvm_get_cs_db_l_bits,
.set_cr0 = svm_set_cr0,
+ .is_valid_cr4 = svm_is_valid_cr4,
.set_cr4 = svm_set_cr4,
.set_efer = svm_set_efer,
.get_idt = svm_get_idt,
@@ -4295,6 +4538,9 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.apic_init_signal_blocked = svm_apic_init_signal_blocked,
.msr_filter_changed = svm_msr_filter_changed,
+ .complete_emulated_msr = svm_complete_emulated_msr,
+
+ .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
};
static struct kvm_x86_init_ops svm_init_ops __initdata = {
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 1d853fe4c778..6e7d070f8b86 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -17,21 +17,32 @@
#include <linux/kvm_types.h>
#include <linux/kvm_host.h>
+#include <linux/bits.h>
#include <asm/svm.h>
-static const u32 host_save_user_msrs[] = {
+#define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT)
+
+static const struct svm_host_save_msrs {
+ u32 index; /* Index of the MSR */
+ bool sev_es_restored; /* True if MSR is restored on SEV-ES VMEXIT */
+} host_save_user_msrs[] = {
#ifdef CONFIG_X86_64
- MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
- MSR_FS_BASE,
+ { .index = MSR_STAR, .sev_es_restored = true },
+ { .index = MSR_LSTAR, .sev_es_restored = true },
+ { .index = MSR_CSTAR, .sev_es_restored = true },
+ { .index = MSR_SYSCALL_MASK, .sev_es_restored = true },
+ { .index = MSR_KERNEL_GS_BASE, .sev_es_restored = true },
+ { .index = MSR_FS_BASE, .sev_es_restored = true },
#endif
- MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
- MSR_TSC_AUX,
+ { .index = MSR_IA32_SYSENTER_CS, .sev_es_restored = true },
+ { .index = MSR_IA32_SYSENTER_ESP, .sev_es_restored = true },
+ { .index = MSR_IA32_SYSENTER_EIP, .sev_es_restored = true },
+ { .index = MSR_TSC_AUX, .sev_es_restored = false },
};
-
#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
-#define MAX_DIRECT_ACCESS_MSRS 15
+#define MAX_DIRECT_ACCESS_MSRS 18
#define MSRPM_OFFSETS 16
extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
extern bool npt_enabled;
@@ -61,11 +72,13 @@ enum {
struct kvm_sev_info {
bool active; /* SEV enabled guest */
+ bool es_active; /* SEV-ES enabled guest */
unsigned int asid; /* ASID used for this guest */
unsigned int handle; /* SEV firmware handle */
int fd; /* SEV device fd */
unsigned long pages_locked; /* Number of pages locked */
struct list_head regions_list; /* List of registered regions */
+ u64 ap_jump_table; /* SEV-ES AP Jump Table address */
};
struct kvm_svm {
@@ -106,6 +119,7 @@ struct vcpu_svm {
struct vmcb *vmcb;
unsigned long vmcb_pa;
struct svm_cpu_data *svm_data;
+ u32 asid;
uint64_t asid_generation;
uint64_t sysenter_esp;
uint64_t sysenter_eip;
@@ -166,6 +180,18 @@ struct vcpu_svm {
DECLARE_BITMAP(read, MAX_DIRECT_ACCESS_MSRS);
DECLARE_BITMAP(write, MAX_DIRECT_ACCESS_MSRS);
} shadow_msr_intercept;
+
+ /* SEV-ES support */
+ struct vmcb_save_area *vmsa;
+ struct ghcb *ghcb;
+ struct kvm_host_map ghcb_map;
+ bool received_first_sipi;
+
+ /* SEV-ES scratch area support */
+ void *ghcb_sa;
+ u64 ghcb_sa_len;
+ bool ghcb_sa_sync;
+ bool ghcb_sa_free;
};
struct svm_cpu_data {
@@ -193,6 +219,28 @@ static inline struct kvm_svm *to_kvm_svm(struct kvm *kvm)
return container_of(kvm, struct kvm_svm, kvm);
}
+static inline bool sev_guest(struct kvm *kvm)
+{
+#ifdef CONFIG_KVM_AMD_SEV
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+
+ return sev->active;
+#else
+ return false;
+#endif
+}
+
+static inline bool sev_es_guest(struct kvm *kvm)
+{
+#ifdef CONFIG_KVM_AMD_SEV
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+
+ return sev_guest(kvm) && sev->es_active;
+#else
+ return false;
+#endif
+}
+
static inline void vmcb_mark_all_dirty(struct vmcb *vmcb)
{
vmcb->control.clean = 0;
@@ -244,21 +292,24 @@ static inline void set_dr_intercepts(struct vcpu_svm *svm)
{
struct vmcb *vmcb = get_host_vmcb(svm);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_READ);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_READ);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_READ);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_READ);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_READ);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_READ);
+ if (!sev_es_guest(svm->vcpu.kvm)) {
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_WRITE);
+ }
+
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_WRITE);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_WRITE);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_WRITE);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_WRITE);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_WRITE);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_WRITE);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_WRITE);
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE);
recalc_intercepts(svm);
@@ -270,6 +321,12 @@ static inline void clr_dr_intercepts(struct vcpu_svm *svm)
vmcb->control.intercepts[INTERCEPT_DR] = 0;
+ /* DR7 access must remain intercepted for an SEV-ES guest */
+ if (sev_es_guest(svm->vcpu.kvm)) {
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE);
+ }
+
recalc_intercepts(svm);
}
@@ -346,11 +403,12 @@ static inline bool gif_set(struct vcpu_svm *svm)
}
/* svm.c */
-#define MSR_CR3_LEGACY_RESERVED_MASK 0xfe7U
-#define MSR_CR3_LEGACY_PAE_RESERVED_MASK 0x7U
-#define MSR_CR3_LONG_MBZ_MASK 0xfff0000000000000U
#define MSR_INVALID 0xffffffffU
+extern int sev;
+extern int sev_es;
+extern bool dump_invalid_vmcb;
+
u32 svm_msrpm_offset(u32 msr);
u32 *svm_vcpu_alloc_msrpm(void);
void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm);
@@ -358,13 +416,16 @@ void svm_vcpu_free_msrpm(u32 *msrpm);
int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer);
void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
-int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
void svm_flush_tlb(struct kvm_vcpu *vcpu);
void disable_nmi_singlestep(struct vcpu_svm *svm);
bool svm_smi_blocked(struct kvm_vcpu *vcpu);
bool svm_nmi_blocked(struct kvm_vcpu *vcpu);
bool svm_interrupt_blocked(struct kvm_vcpu *vcpu);
void svm_set_gif(struct vcpu_svm *svm, bool value);
+int svm_invoke_exit_handler(struct vcpu_svm *svm, u64 exit_code);
+void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
+ int read, int write);
/* nested.c */
@@ -470,18 +531,42 @@ void svm_vcpu_unblocking(struct kvm_vcpu *vcpu);
/* sev.c */
-extern unsigned int max_sev_asid;
+#define GHCB_VERSION_MAX 1ULL
+#define GHCB_VERSION_MIN 1ULL
+
+#define GHCB_MSR_INFO_POS 0
+#define GHCB_MSR_INFO_MASK (BIT_ULL(12) - 1)
+
+#define GHCB_MSR_SEV_INFO_RESP 0x001
+#define GHCB_MSR_SEV_INFO_REQ 0x002
+#define GHCB_MSR_VER_MAX_POS 48
+#define GHCB_MSR_VER_MAX_MASK 0xffff
+#define GHCB_MSR_VER_MIN_POS 32
+#define GHCB_MSR_VER_MIN_MASK 0xffff
+#define GHCB_MSR_CBIT_POS 24
+#define GHCB_MSR_CBIT_MASK 0xff
+#define GHCB_MSR_SEV_INFO(_max, _min, _cbit) \
+ ((((_max) & GHCB_MSR_VER_MAX_MASK) << GHCB_MSR_VER_MAX_POS) | \
+ (((_min) & GHCB_MSR_VER_MIN_MASK) << GHCB_MSR_VER_MIN_POS) | \
+ (((_cbit) & GHCB_MSR_CBIT_MASK) << GHCB_MSR_CBIT_POS) | \
+ GHCB_MSR_SEV_INFO_RESP)
+
+#define GHCB_MSR_CPUID_REQ 0x004
+#define GHCB_MSR_CPUID_RESP 0x005
+#define GHCB_MSR_CPUID_FUNC_POS 32
+#define GHCB_MSR_CPUID_FUNC_MASK 0xffffffff
+#define GHCB_MSR_CPUID_VALUE_POS 32
+#define GHCB_MSR_CPUID_VALUE_MASK 0xffffffff
+#define GHCB_MSR_CPUID_REG_POS 30
+#define GHCB_MSR_CPUID_REG_MASK 0x3
+
+#define GHCB_MSR_TERM_REQ 0x100
+#define GHCB_MSR_TERM_REASON_SET_POS 12
+#define GHCB_MSR_TERM_REASON_SET_MASK 0xf
+#define GHCB_MSR_TERM_REASON_POS 16
+#define GHCB_MSR_TERM_REASON_MASK 0xff
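For a concrete feel of this layout, the value that sev.c publishes with set_ghcb_msr(svm, GHCB_MSR_SEV_INFO(GHCB_VERSION_MAX, GHCB_VERSION_MIN, sev_enc_bit)) decomposes as below; the C-bit position 51 is only a sample, the real sev_enc_bit is discovered during SEV hardware setup:

/* Illustration only: the raw GHCB MSR value GHCB_MSR_SEV_INFO() builds
 * for protocol versions 1..1 and a sample C-bit position of 51:
 *
 *   (1  << 48) = 0x0001000000000000   max supported version
 *   (1  << 32) = 0x0000000100000000   min supported version
 *   (51 << 24) = 0x0000000033000000   C-bit position
 *   0x001                             SEV_INFO response code
 *   OR'd together: 0x0001000133000001
 */
#include <stdint.h>
#include <stdio.h>

#define SEV_INFO(_max, _min, _cbit)				\
	((((uint64_t)(_max)  & 0xffff) << 48) |			\
	 (((uint64_t)(_min)  & 0xffff) << 32) |			\
	 (((uint64_t)(_cbit) & 0xff)   << 24) | 0x001)

int main(void)
{
	printf("%#llx\n", (unsigned long long)SEV_INFO(1, 1, 51));
	return 0;
}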
-static inline bool sev_guest(struct kvm *kvm)
-{
-#ifdef CONFIG_KVM_AMD_SEV
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-
- return sev->active;
-#else
- return false;
-#endif
-}
+extern unsigned int max_sev_asid;
static inline bool svm_sev_enabled(void)
{
@@ -495,7 +580,20 @@ int svm_register_enc_region(struct kvm *kvm,
int svm_unregister_enc_region(struct kvm *kvm,
struct kvm_enc_region *range);
void pre_sev_run(struct vcpu_svm *svm, int cpu);
-int __init sev_hardware_setup(void);
+void __init sev_hardware_setup(void);
void sev_hardware_teardown(void);
+void sev_free_vcpu(struct kvm_vcpu *vcpu);
+int sev_handle_vmgexit(struct vcpu_svm *svm);
+int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in);
+void sev_es_init_vmcb(struct vcpu_svm *svm);
+void sev_es_create_vcpu(struct vcpu_svm *svm);
+void sev_es_vcpu_load(struct vcpu_svm *svm, int cpu);
+void sev_es_vcpu_put(struct vcpu_svm *svm);
+void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
+
+/* vmenter.S */
+
+void __svm_sev_es_vcpu_run(unsigned long vmcb_pa);
+void __svm_vcpu_run(unsigned long vmcb_pa, unsigned long *regs);
#endif
diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
index 1ec1ac40e328..6feb8c08f45a 100644
--- a/arch/x86/kvm/svm/vmenter.S
+++ b/arch/x86/kvm/svm/vmenter.S
@@ -168,3 +168,53 @@ SYM_FUNC_START(__svm_vcpu_run)
pop %_ASM_BP
ret
SYM_FUNC_END(__svm_vcpu_run)
+
+/**
+ * __svm_sev_es_vcpu_run - Run an SEV-ES vCPU via a transition to SVM guest mode
+ * @vmcb_pa: unsigned long
+ */
+SYM_FUNC_START(__svm_sev_es_vcpu_run)
+ push %_ASM_BP
+#ifdef CONFIG_X86_64
+ push %r15
+ push %r14
+ push %r13
+ push %r12
+#else
+ push %edi
+ push %esi
+#endif
+ push %_ASM_BX
+
+ /* Enter guest mode */
+ mov %_ASM_ARG1, %_ASM_AX
+ sti
+
+1: vmrun %_ASM_AX
+ jmp 3f
+2: cmpb $0, kvm_rebooting
+ jne 3f
+ ud2
+ _ASM_EXTABLE(1b, 2b)
+
+3: cli
+
+#ifdef CONFIG_RETPOLINE
+ /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
+ FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE
+#endif
+
+ pop %_ASM_BX
+
+#ifdef CONFIG_X86_64
+ pop %r12
+ pop %r13
+ pop %r14
+ pop %r15
+#else
+ pop %esi
+ pop %edi
+#endif
+ pop %_ASM_BP
+ ret
+SYM_FUNC_END(__svm_sev_es_vcpu_run)
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index aef960f90f26..2de30c20bc26 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -1578,6 +1578,103 @@ TRACE_EVENT(kvm_hv_syndbg_get_msr,
__entry->vcpu_id, __entry->vp_index, __entry->msr,
__entry->data)
);
+
+/*
+ * Tracepoint for the start of VMGEXIT processing
+ */
+TRACE_EVENT(kvm_vmgexit_enter,
+ TP_PROTO(unsigned int vcpu_id, struct ghcb *ghcb),
+ TP_ARGS(vcpu_id, ghcb),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, vcpu_id)
+ __field(u64, exit_reason)
+ __field(u64, info1)
+ __field(u64, info2)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->exit_reason = ghcb->save.sw_exit_code;
+ __entry->info1 = ghcb->save.sw_exit_info_1;
+ __entry->info2 = ghcb->save.sw_exit_info_2;
+ ),
+
+ TP_printk("vcpu %u, exit_reason %llx, exit_info1 %llx, exit_info2 %llx",
+ __entry->vcpu_id, __entry->exit_reason,
+ __entry->info1, __entry->info2)
+);
+
+/*
+ * Tracepoint for the end of VMGEXIT processing
+ */
+TRACE_EVENT(kvm_vmgexit_exit,
+ TP_PROTO(unsigned int vcpu_id, struct ghcb *ghcb),
+ TP_ARGS(vcpu_id, ghcb),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, vcpu_id)
+ __field(u64, exit_reason)
+ __field(u64, info1)
+ __field(u64, info2)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->exit_reason = ghcb->save.sw_exit_code;
+ __entry->info1 = ghcb->save.sw_exit_info_1;
+ __entry->info2 = ghcb->save.sw_exit_info_2;
+ ),
+
+ TP_printk("vcpu %u, exit_reason %llx, exit_info1 %llx, exit_info2 %llx",
+ __entry->vcpu_id, __entry->exit_reason,
+ __entry->info1, __entry->info2)
+);
+
+/*
+ * Tracepoint for the start of VMGEXIT MSR protocol processing
+ */
+TRACE_EVENT(kvm_vmgexit_msr_protocol_enter,
+ TP_PROTO(unsigned int vcpu_id, u64 ghcb_gpa),
+ TP_ARGS(vcpu_id, ghcb_gpa),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, vcpu_id)
+ __field(u64, ghcb_gpa)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->ghcb_gpa = ghcb_gpa;
+ ),
+
+ TP_printk("vcpu %u, ghcb_gpa %016llx",
+ __entry->vcpu_id, __entry->ghcb_gpa)
+);
+
+/*
+ * Tracepoint for the end of VMGEXIT MSR protocol processing
+ */
+TRACE_EVENT(kvm_vmgexit_msr_protocol_exit,
+ TP_PROTO(unsigned int vcpu_id, u64 ghcb_gpa, int result),
+ TP_ARGS(vcpu_id, ghcb_gpa, result),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, vcpu_id)
+ __field(u64, ghcb_gpa)
+ __field(int, result)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->ghcb_gpa = ghcb_gpa;
+ __entry->result = result;
+ ),
+
+ TP_printk("vcpu %u, ghcb_gpa %016llx, result %d",
+ __entry->vcpu_id, __entry->ghcb_gpa, __entry->result)
+);
+
#endif /* _TRACE_KVM_H */
#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c
index e5325bd0f304..41f24661af04 100644
--- a/arch/x86/kvm/vmx/evmcs.c
+++ b/arch/x86/kvm/vmx/evmcs.c
@@ -297,14 +297,13 @@ const struct evmcs_field vmcs_field_to_evmcs_1[] = {
};
const unsigned int nr_evmcs_1_fields = ARRAY_SIZE(vmcs_field_to_evmcs_1);
-void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf)
+__init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf)
{
vmcs_conf->pin_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_PINCTRL;
vmcs_conf->cpu_based_2nd_exec_ctrl &= ~EVMCS1_UNSUPPORTED_2NDEXEC;
vmcs_conf->vmexit_ctrl &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL;
vmcs_conf->vmentry_ctrl &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL;
-
}
#endif
@@ -327,7 +326,6 @@ bool nested_enlightened_vmentry(struct kvm_vcpu *vcpu, u64 *evmcs_gpa)
uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
/*
* vmcs_version represents the range of supported Enlightened VMCS
* versions: lower 8 bits is the minimal version, higher 8 bits is the
@@ -335,7 +333,7 @@ uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu)
* KVM_EVMCS_VERSION.
*/
if (kvm_cpu_cap_get(X86_FEATURE_VMX) &&
- vmx->nested.enlightened_vmcs_enabled)
+ (!vcpu || to_vmx(vcpu)->nested.enlightened_vmcs_enabled))
return (KVM_EVMCS_VERSION << 8) | 1;
return 0;
diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h
index e5f7a7ebf27d..bd41d9462355 100644
--- a/arch/x86/kvm/vmx/evmcs.h
+++ b/arch/x86/kvm/vmx/evmcs.h
@@ -185,7 +185,7 @@ static inline void evmcs_load(u64 phys_addr)
vp_ap->enlighten_vmentry = 1;
}
-void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf);
+__init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf);
#else /* !IS_ENABLED(CONFIG_HYPERV) */
static inline void evmcs_write64(unsigned long field, u64 value) {}
static inline void evmcs_write32(unsigned long field, u32 value) {}
@@ -194,7 +194,6 @@ static inline u64 evmcs_read64(unsigned long field) { return 0; }
static inline u32 evmcs_read32(unsigned long field) { return 0; }
static inline u16 evmcs_read16(unsigned long field) { return 0; }
static inline void evmcs_load(u64 phys_addr) {}
-static inline void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) {}
static inline void evmcs_touch_msr_bitmap(void) {}
#endif /* IS_ENABLED(CONFIG_HYPERV) */
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 89af692deb7e..f2b9bfb58206 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -2952,7 +2952,8 @@ static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12)
{
if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
- vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT))
+ vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT &&
+ vmcs12->guest_activity_state != GUEST_ACTIVITY_WAIT_SIPI))
return -EINVAL;
return 0;
@@ -3123,13 +3124,9 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
return 0;
}
-static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
+static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu)
{
- struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct kvm_host_map *map;
- struct page *page;
- u64 hpa;
/*
* hv_evmcs may end up being not mapped after migration (when
@@ -3152,6 +3149,17 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
}
}
+ return true;
+}
+
+static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct kvm_host_map *map;
+ struct page *page;
+ u64 hpa;
+
if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
/*
* Translate L1 physical address to host physical
@@ -3220,6 +3228,18 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
else
exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
+
+ return true;
+}
+
+static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
+{
+ if (!nested_get_evmcs_page(vcpu))
+ return false;
+
+ if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu))
+ return false;
+
return true;
}
@@ -3559,19 +3579,29 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
*/
nested_cache_shadow_vmcs12(vcpu, vmcs12);
- /*
- * If we're entering a halted L2 vcpu and the L2 vcpu won't be
- * awakened by event injection or by an NMI-window VM-exit or
- * by an interrupt-window VM-exit, halt the vcpu.
- */
- if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) &&
- !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) &&
- !(vmcs12->cpu_based_vm_exec_control & CPU_BASED_NMI_WINDOW_EXITING) &&
- !((vmcs12->cpu_based_vm_exec_control & CPU_BASED_INTR_WINDOW_EXITING) &&
- (vmcs12->guest_rflags & X86_EFLAGS_IF))) {
+ switch (vmcs12->guest_activity_state) {
+ case GUEST_ACTIVITY_HLT:
+ /*
+ * If we're entering a halted L2 vcpu and the L2 vcpu won't be
+ * awakened by event injection or by an NMI-window VM-exit or
+ * by an interrupt-window VM-exit, halt the vcpu.
+ */
+ if (!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) &&
+ !nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) &&
+ !(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) &&
+ (vmcs12->guest_rflags & X86_EFLAGS_IF))) {
+ vmx->nested.nested_run_pending = 0;
+ return kvm_vcpu_halt(vcpu);
+ }
+ break;
+ case GUEST_ACTIVITY_WAIT_SIPI:
vmx->nested.nested_run_pending = 0;
- return kvm_vcpu_halt(vcpu);
+ vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
+ break;
+ default:
+ break;
}
+
return 1;
vmentry_failed:
@@ -3797,7 +3827,20 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
return -EBUSY;
nested_vmx_update_pending_dbg(vcpu);
clear_bit(KVM_APIC_INIT, &apic->pending_events);
- nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0);
+ if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED)
+ nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0);
+ return 0;
+ }
+
+ if (lapic_in_kernel(vcpu) &&
+ test_bit(KVM_APIC_SIPI, &apic->pending_events)) {
+ if (block_nested_events)
+ return -EBUSY;
+
+ clear_bit(KVM_APIC_SIPI, &apic->pending_events);
+ if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
+ nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0,
+ apic->sipi_vector & 0xFFUL);
return 0;
}
@@ -4036,6 +4079,8 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT;
+ else if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
+ vmcs12->guest_activity_state = GUEST_ACTIVITY_WAIT_SIPI;
else
vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
@@ -4416,6 +4461,8 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
/* trying to cancel vmlaunch/vmresume is a bug */
WARN_ON_ONCE(vmx->nested.nested_run_pending);
+ kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
+
/* Service the TLB flush request for L2 before switching to L1. */
if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
kvm_vcpu_flush_tlb_current(vcpu);
@@ -4814,7 +4861,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
/*
* The Intel VMX Instruction Reference lists a bunch of bits that are
* prerequisite to running VMXON, most notably cr4.VMXE must be set to
- * 1 (see vmx_set_cr4() for when we allow the guest to set this).
+ * 1 (see vmx_is_valid_cr4() for when we allow the guest to set this).
* Otherwise, we should fail with #UD. But most faulting conditions
* have already been checked by hardware, prior to the VM-exit for
* VMXON. We do test guest cr4.VMXE because processor CR4 always has
@@ -6049,11 +6096,14 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
if (is_guest_mode(vcpu)) {
sync_vmcs02_to_vmcs12(vcpu, vmcs12);
sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
- } else if (!vmx->nested.need_vmcs12_to_shadow_sync) {
- if (vmx->nested.hv_evmcs)
- copy_enlightened_to_vmcs12(vmx);
- else if (enable_shadow_vmcs)
- copy_shadow_to_vmcs12(vmx);
+ } else {
+ copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
+ if (!vmx->nested.need_vmcs12_to_shadow_sync) {
+ if (vmx->nested.hv_evmcs)
+ copy_enlightened_to_vmcs12(vmx);
+ else if (enable_shadow_vmcs)
+ copy_shadow_to_vmcs12(vmx);
+ }
}
BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE);
@@ -6483,7 +6533,8 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
msrs->misc_low |=
MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
- VMX_MISC_ACTIVITY_HLT;
+ VMX_MISC_ACTIVITY_HLT |
+ VMX_MISC_ACTIVITY_WAIT_SIPI;
msrs->misc_high = 0;
/*
@@ -6573,7 +6624,7 @@ struct kvm_x86_nested_ops vmx_nested_ops = {
.hv_timer_pending = nested_vmx_preemption_timer_pending,
.get_state = vmx_get_nested_state,
.set_state = vmx_set_nested_state,
- .get_nested_state_pages = nested_get_vmcs12_pages,
+ .get_nested_state_pages = vmx_get_nested_state_pages,
.write_log_dirty = nested_vmx_write_pml_buffer,
.enable_evmcs = nested_enable_evmcs,
.get_evmcs_version = nested_get_evmcs_version,
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index a886a47daebd..cdf5f34518f4 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -29,7 +29,7 @@ static struct kvm_event_hw_type_mapping intel_arch_events[] = {
[4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES },
[5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
[6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
- [7] = { 0x00, 0x30, PERF_COUNT_HW_REF_CPU_CYCLES },
+ [7] = { 0x00, 0x03, PERF_COUNT_HW_REF_CPU_CYCLES },
};
/* mapping between fixed pmc index and intel_arch_events array */
@@ -345,7 +345,9 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters,
x86_pmu.num_counters_gp);
+ eax.split.bit_width = min_t(int, eax.split.bit_width, x86_pmu.bit_width_gp);
pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1;
+ eax.split.mask_length = min_t(int, eax.split.mask_length, x86_pmu.events_mask_len);
pmu->available_event_types = ~entry->ebx &
((1ull << eax.split.mask_length) - 1);
@@ -355,6 +357,8 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->nr_arch_fixed_counters =
min_t(int, edx.split.num_counters_fixed,
x86_pmu.num_counters_fixed);
+ edx.split.bit_width_fixed = min_t(int,
+ edx.split.bit_width_fixed, x86_pmu.bit_width_fixed);
pmu->counter_bitmask[KVM_PMC_FIXED] =
((u64)1 << edx.split.bit_width_fixed) - 1;
}
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index 90ad7a6246e3..e85aa5faa22d 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -132,7 +132,7 @@ SYM_FUNC_START(__vmx_vcpu_run)
mov (%_ASM_SP), %_ASM_AX
/* Check if vmlaunch or vmresume is needed */
- cmpb $0, %bl
+ testb %bl, %bl
/* Load guest registers. Don't clobber flags. */
mov VCPU_RCX(%_ASM_AX), %_ASM_CX
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index d14c94d0aff1..eb69fef57485 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -40,7 +40,6 @@
#include <asm/irq_remapping.h>
#include <asm/kexec.h>
#include <asm/perf_event.h>
-#include <asm/mce.h>
#include <asm/mmu_context.h>
#include <asm/mshyperv.h>
#include <asm/mwait.h>
@@ -1826,7 +1825,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case MSR_IA32_SPEC_CTRL:
if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
+ !guest_has_spec_ctrl_msr(vcpu))
return 1;
msr_info->data = to_vmx(vcpu)->spec_ctrl;
@@ -2028,7 +2027,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case MSR_IA32_SPEC_CTRL:
if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
+ !guest_has_spec_ctrl_msr(vcpu))
return 1;
if (kvm_spec_ctrl_test_value(data))
@@ -2063,12 +2062,12 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
goto find_uret_msr;
case MSR_IA32_PRED_CMD:
if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
+ !guest_has_pred_cmd_msr(vcpu))
return 1;
if (data & ~PRED_CMD_IBPB)
return 1;
- if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL))
+ if (!boot_cpu_has(X86_FEATURE_IBPB))
return 1;
if (!data)
break;
@@ -2560,8 +2559,10 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
vmcs_conf->vmexit_ctrl = _vmexit_control;
vmcs_conf->vmentry_ctrl = _vmentry_control;
- if (static_branch_unlikely(&enable_evmcs))
+#if IS_ENABLED(CONFIG_HYPERV)
+ if (enlightened_vmcs)
evmcs_sanitize_exec_ctrls(vmcs_conf);
+#endif
return 0;
}
@@ -3093,8 +3094,25 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd,
vmcs_writel(GUEST_CR3, guest_cr3);
}
-int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ /*
+ * We operate under the default treatment of SMM, so VMX cannot be
+ * enabled under SMM. Note, whether or not VMXE is allowed at all is
+ * handled by kvm_is_valid_cr4().
+ */
+ if ((cr4 & X86_CR4_VMXE) && is_smm(vcpu))
+ return false;
+
+ if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
+ return false;
+
+ return true;
+}
+
+void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
+ unsigned long old_cr4 = vcpu->arch.cr4;
struct vcpu_vmx *vmx = to_vmx(vcpu);
/*
* Pass through host's Machine Check Enable value to hw_cr4, which
@@ -3121,21 +3139,6 @@ int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
}
}
- if (cr4 & X86_CR4_VMXE) {
- /*
- * To use VMXON (and later other VMX instructions), a guest
- * must first be able to turn on cr4.VMXE (see handle_vmon()).
- * So basically the check on whether to allow nested VMX
- * is here. We operate under the default treatment of SMM,
- * so VMX cannot be enabled under SMM.
- */
- if (!nested_vmx_allowed(vcpu) || is_smm(vcpu))
- return 1;
- }
-
- if (vmx->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
- return 1;
-
vcpu->arch.cr4 = cr4;
kvm_register_mark_available(vcpu, VCPU_EXREG_CR4);
@@ -3166,7 +3169,9 @@ int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
vmcs_writel(CR4_READ_SHADOW, cr4);
vmcs_writel(GUEST_CR4, hw_cr4);
- return 0;
+
+ if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
+ kvm_update_cpuid_runtime(vcpu);
}
void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
@@ -3513,42 +3518,33 @@ bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu)
return true;
}
-static int init_rmode_tss(struct kvm *kvm)
+static int init_rmode_tss(struct kvm *kvm, void __user *ua)
{
- gfn_t fn;
- u16 data = 0;
- int idx, r;
+ const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
+ u16 data;
+ int i;
+
+ for (i = 0; i < 3; i++) {
+ if (__copy_to_user(ua + PAGE_SIZE * i, zero_page, PAGE_SIZE))
+ return -EFAULT;
+ }
- idx = srcu_read_lock(&kvm->srcu);
- fn = to_kvm_vmx(kvm)->tss_addr >> PAGE_SHIFT;
- r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
- if (r < 0)
- goto out;
data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
- r = kvm_write_guest_page(kvm, fn++, &data,
- TSS_IOPB_BASE_OFFSET, sizeof(u16));
- if (r < 0)
- goto out;
- r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
- if (r < 0)
- goto out;
- r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
- if (r < 0)
- goto out;
+ if (__copy_to_user(ua + TSS_IOPB_BASE_OFFSET, &data, sizeof(u16)))
+ return -EFAULT;
+
data = ~0;
- r = kvm_write_guest_page(kvm, fn, &data,
- RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
- sizeof(u8));
-out:
- srcu_read_unlock(&kvm->srcu, idx);
- return r;
+ if (__copy_to_user(ua + RMODE_TSS_SIZE - 1, &data, sizeof(u8)))
+ return -EFAULT;
+
+ return 0;
}
static int init_rmode_identity_map(struct kvm *kvm)
{
struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
int i, r = 0;
- kvm_pfn_t identity_map_pfn;
+ void __user *uaddr;
u32 tmp;
/* Protect kvm_vmx->ept_identity_pagetable_done. */
@@ -3559,24 +3555,24 @@ static int init_rmode_identity_map(struct kvm *kvm)
if (!kvm_vmx->ept_identity_map_addr)
kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
- identity_map_pfn = kvm_vmx->ept_identity_map_addr >> PAGE_SHIFT;
- r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
- kvm_vmx->ept_identity_map_addr, PAGE_SIZE);
- if (r < 0)
+ uaddr = __x86_set_memory_region(kvm,
+ IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
+ kvm_vmx->ept_identity_map_addr,
+ PAGE_SIZE);
+ if (IS_ERR(uaddr)) {
+ r = PTR_ERR(uaddr);
goto out;
+ }
- r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
- if (r < 0)
- goto out;
/* Set up identity-mapping pagetable for EPT in real mode */
for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
_PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
- r = kvm_write_guest_page(kvm, identity_map_pfn,
- &tmp, i * sizeof(tmp), sizeof(tmp));
- if (r < 0)
+ if (__copy_to_user(uaddr + i * sizeof(tmp), &tmp, sizeof(tmp))) {
+ r = -EFAULT;
goto out;
+ }
}
kvm_vmx->ept_identity_pagetable_done = true;
@@ -3603,19 +3599,22 @@ static void seg_setup(int seg)
static int alloc_apic_access_page(struct kvm *kvm)
{
struct page *page;
- int r = 0;
+ void __user *hva;
+ int ret = 0;
mutex_lock(&kvm->slots_lock);
if (kvm->arch.apic_access_page_done)
goto out;
- r = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
- APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);
- if (r)
+ hva = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
+ APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);
+ if (IS_ERR(hva)) {
+ ret = PTR_ERR(hva);
goto out;
+ }
page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
if (is_error_page(page)) {
- r = -EFAULT;
+ ret = -EFAULT;
goto out;
}
@@ -3627,7 +3626,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
kvm->arch.apic_access_page_done = true;
out:
mutex_unlock(&kvm->slots_lock);
- return r;
+ return ret;
}
int allocate_vpid(void)
@@ -4636,7 +4635,7 @@ static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
{
- int ret;
+ void __user *ret;
if (enable_unrestricted_guest)
return 0;
@@ -4646,10 +4645,12 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
PAGE_SIZE * 3);
mutex_unlock(&kvm->slots_lock);
- if (ret)
- return ret;
+ if (IS_ERR(ret))
+ return PTR_ERR(ret);
+
to_kvm_vmx(kvm)->tss_addr = addr;
- return init_rmode_tss(kvm);
+
+ return init_rmode_tss(kvm, ret);
}
static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
@@ -4714,25 +4715,6 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
return 1;
}
-/*
- * Trigger machine check on the host. We assume all the MSRs are already set up
- * by the CPU and that we still run on the same CPU as the MCE occurred on.
- * We pass a fake environment to the machine check handler because we want
- * the guest to be always treated like user space, no matter what context
- * it used internally.
- */
-static void kvm_machine_check(void)
-{
-#if defined(CONFIG_X86_MCE)
- struct pt_regs regs = {
- .cs = 3, /* Fake ring 3 no matter what the guest ran on */
- .flags = X86_EFLAGS_IF,
- };
-
- do_machine_check(&regs);
-#endif
-}
-
static int handle_machine_check(struct kvm_vcpu *vcpu)
{
/* handled by vmx_vcpu_run() */
@@ -6397,7 +6379,11 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
handle_exception_nmi_irqoff(vmx);
}
-static bool vmx_has_emulated_msr(u32 index)
+/*
+ * The kvm parameter can be NULL (module initialization, or invocation before
+ * VM creation). Be sure to check the kvm parameter before using it.
+ */
+static bool vmx_has_emulated_msr(struct kvm *kvm, u32 index)
{
switch (index) {
case MSR_IA32_SMBASE:
@@ -6667,6 +6653,8 @@ reenter_guest:
if (vmx->emulation_required)
return EXIT_FASTPATH_NONE;
+ trace_kvm_entry(vcpu);
+
if (vmx->ple_window_dirty) {
vmx->ple_window_dirty = false;
vmcs_write32(PLE_WINDOW, vmx->ple_window);
@@ -6834,7 +6822,6 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx;
- unsigned long *msr_bitmap;
int i, cpu, err;
BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);
@@ -6873,11 +6860,20 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
switch (index) {
case MSR_IA32_TSX_CTRL:
/*
- * No need to pass TSX_CTRL_CPUID_CLEAR through, so
- * let's avoid changing CPUID bits under the host
- * kernel's feet.
+ * TSX_CTRL_CPUID_CLEAR is handled in the CPUID
+ * interception. Keep the host value unchanged to avoid
+ * changing CPUID bits under the host kernel's feet.
+ *
+ * hle=0, rtm=0, tsx_ctrl=1 can be found with some
+ * combinations of new kernel and old userspace. If
+ * those guests run on a tsx=off host, do allow guests
+ * to use TSX_CTRL, but do not change the value on the
+ * host so that TSX remains always disabled.
*/
- vmx->guest_uret_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
+ if (boot_cpu_has(X86_FEATURE_RTM))
+ vmx->guest_uret_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
+ else
+ vmx->guest_uret_msrs[j].mask = 0;
break;
default:
vmx->guest_uret_msrs[j].mask = -1ull;
@@ -6894,7 +6890,6 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
- msr_bitmap = vmx->vmcs01.msr_bitmap;
vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
@@ -7558,7 +7553,7 @@ static void enable_smi_window(struct kvm_vcpu *vcpu)
static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
{
- return to_vmx(vcpu)->nested.vmxon;
+ return to_vmx(vcpu)->nested.vmxon && !is_guest_mode(vcpu);
}
static void vmx_migrate_timers(struct kvm_vcpu *vcpu)
@@ -7587,6 +7582,11 @@ static bool vmx_check_apicv_inhibit_reasons(ulong bit)
return supported & BIT(bit);
}
+static int vmx_cpu_dirty_log_size(void)
+{
+ return enable_pml ? PML_ENTITY_NUM : 0;
+}
+
static struct kvm_x86_ops vmx_x86_ops __initdata = {
.hardware_unsetup = hardware_unsetup,
@@ -7616,6 +7616,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.get_cpl = vmx_get_cpl,
.get_cs_db_l_bits = vmx_get_cs_db_l_bits,
.set_cr0 = vmx_set_cr0,
+ .is_valid_cr4 = vmx_is_valid_cr4,
.set_cr4 = vmx_set_cr4,
.set_efer = vmx_set_efer,
.get_idt = vmx_get_idt,
@@ -7715,6 +7716,10 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.migrate_timers = vmx_migrate_timers,
.msr_filter_changed = vmx_msr_filter_changed,
+ .complete_emulated_msr = kvm_complete_insn_gp,
+ .cpu_dirty_log_size = vmx_cpu_dirty_log_size,
+
+ .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
};
static __init int hardware_setup(void)
@@ -7832,6 +7837,7 @@ static __init int hardware_setup(void)
vmx_x86_ops.slot_disable_log_dirty = NULL;
vmx_x86_ops.flush_log_dirty = NULL;
vmx_x86_ops.enable_log_dirty_pt_masked = NULL;
+ vmx_x86_ops.cpu_dirty_log_size = NULL;
}
if (!cpu_has_vmx_preemption_timer())
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index f6f66e5c6510..9d3a557949ac 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -321,7 +321,7 @@ u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu);
void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask);
int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer);
void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
-int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
void set_cr4_guest_host_mask(struct vcpu_vmx *vmx);
void ept_save_pdptrs(struct kvm_vcpu *vcpu);
void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 397f599b20e5..b967c1c774a1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -105,6 +105,7 @@ static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;
static void update_cr8_intercept(struct kvm_vcpu *vcpu);
static void process_nmi(struct kvm_vcpu *vcpu);
+static void process_smi(struct kvm_vcpu *vcpu);
static void enter_smm(struct kvm_vcpu *vcpu);
static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
static void store_regs(struct kvm_vcpu *vcpu);
@@ -197,7 +198,8 @@ EXPORT_SYMBOL_GPL(host_efer);
bool __read_mostly allow_smaller_maxphyaddr = 0;
EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);
-static u64 __read_mostly host_xss;
+u64 __read_mostly host_xss;
+EXPORT_SYMBOL_GPL(host_xss);
u64 __read_mostly supported_xss;
EXPORT_SYMBOL_GPL(supported_xss);
@@ -255,24 +257,23 @@ static struct kmem_cache *x86_emulator_cache;
/*
* When called, it means the previous get/set msr reached an invalid msr.
- * Return 0 if we want to ignore/silent this failed msr access, or 1 if we want
- * to fail the caller.
+ * Return true if we want to ignore/silent this failed msr access.
*/
-static int kvm_msr_ignored_check(struct kvm_vcpu *vcpu, u32 msr,
- u64 data, bool write)
+static bool kvm_msr_ignored_check(struct kvm_vcpu *vcpu, u32 msr,
+ u64 data, bool write)
{
const char *op = write ? "wrmsr" : "rdmsr";
if (ignore_msrs) {
if (report_ignored_msrs)
- vcpu_unimpl(vcpu, "ignored %s: 0x%x data 0x%llx\n",
- op, msr, data);
+ kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n",
+ op, msr, data);
/* Mask the error */
- return 0;
+ return true;
} else {
- vcpu_debug_ratelimited(vcpu, "unhandled %s: 0x%x data 0x%llx\n",
- op, msr, data);
- return -ENOENT;
+ kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n",
+ op, msr, data);
+ return false;
}
}
@@ -805,11 +806,29 @@ bool pdptrs_changed(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(pdptrs_changed);
+void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0)
+{
+ unsigned long update_bits = X86_CR0_PG | X86_CR0_WP;
+
+ if ((cr0 ^ old_cr0) & X86_CR0_PG) {
+ kvm_clear_async_pf_completion_queue(vcpu);
+ kvm_async_pf_hash_reset(vcpu);
+ }
+
+ if ((cr0 ^ old_cr0) & update_bits)
+ kvm_mmu_reset_context(vcpu);
+
+ if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
+ kvm_arch_has_noncoherent_dma(vcpu->kvm) &&
+ !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
+ kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
+}
+EXPORT_SYMBOL_GPL(kvm_post_set_cr0);
+
int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
unsigned long old_cr0 = kvm_read_cr0(vcpu);
unsigned long pdptr_bits = X86_CR0_CD | X86_CR0_NW | X86_CR0_PG;
- unsigned long update_bits = X86_CR0_PG | X86_CR0_WP;
cr0 |= X86_CR0_ET;
@@ -848,18 +867,7 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
kvm_x86_ops.set_cr0(vcpu, cr0);
- if ((cr0 ^ old_cr0) & X86_CR0_PG) {
- kvm_clear_async_pf_completion_queue(vcpu);
- kvm_async_pf_hash_reset(vcpu);
- }
-
- if ((cr0 ^ old_cr0) & update_bits)
- kvm_mmu_reset_context(vcpu);
-
- if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
- kvm_arch_has_noncoherent_dma(vcpu->kvm) &&
- !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
- kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
+ kvm_post_set_cr0(vcpu, old_cr0, cr0);
return 0;
}
@@ -873,6 +881,9 @@ EXPORT_SYMBOL_GPL(kvm_lmsw);
void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
{
+ if (vcpu->arch.guest_state_protected)
+ return;
+
if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
if (vcpu->arch.xcr0 != host_xcr0)
@@ -893,6 +904,9 @@ EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state);
void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
{
+ if (vcpu->arch.guest_state_protected)
+ return;
+
if (static_cpu_has(X86_FEATURE_PKU) &&
(kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ||
(vcpu->arch.xcr0 & XFEATURE_MASK_PKRU))) {
@@ -965,26 +979,36 @@ int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
}
EXPORT_SYMBOL_GPL(kvm_set_xcr);
-int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
if (cr4 & cr4_reserved_bits)
- return -EINVAL;
+ return false;
if (cr4 & vcpu->arch.cr4_guest_rsvd_bits)
- return -EINVAL;
+ return false;
- return 0;
+ return kvm_x86_ops.is_valid_cr4(vcpu, cr4);
}
-EXPORT_SYMBOL_GPL(kvm_valid_cr4);
+EXPORT_SYMBOL_GPL(kvm_is_valid_cr4);
+
+void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4)
+{
+ unsigned long mmu_role_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
+ X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE;
+
+ if (((cr4 ^ old_cr4) & mmu_role_bits) ||
+ (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
+ kvm_mmu_reset_context(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_post_set_cr4);
int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
unsigned long old_cr4 = kvm_read_cr4(vcpu);
unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
X86_CR4_SMEP;
- unsigned long mmu_role_bits = pdptr_bits | X86_CR4_SMAP | X86_CR4_PKE;
- if (kvm_valid_cr4(vcpu, cr4))
+ if (!kvm_is_valid_cr4(vcpu, cr4))
return 1;
if (is_long_mode(vcpu)) {
@@ -1007,15 +1031,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
return 1;
}
- if (kvm_x86_ops.set_cr4(vcpu, cr4))
- return 1;
+ kvm_x86_ops.set_cr4(vcpu, cr4);
- if (((cr4 ^ old_cr4) & mmu_role_bits) ||
- (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
- kvm_mmu_reset_context(vcpu);
-
- if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
- kvm_update_cpuid_runtime(vcpu);
+ kvm_post_set_cr4(vcpu, old_cr4, cr4);
return 0;
}
@@ -1042,7 +1060,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
}
if (is_long_mode(vcpu) &&
- (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 63)))
+ (cr3 & vcpu->arch.cr3_lm_rsvd_bits))
return 1;
else if (is_pae_paging(vcpu) &&
!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
@@ -1376,16 +1394,24 @@ static u64 kvm_get_arch_capabilities(void)
if (!boot_cpu_has_bug(X86_BUG_MDS))
data |= ARCH_CAP_MDS_NO;
- /*
- * On TAA affected systems:
- * - nothing to do if TSX is disabled on the host.
- * - we emulate TSX_CTRL if present on the host.
- * This lets the guest use VERW to clear CPU buffers.
- */
- if (!boot_cpu_has(X86_FEATURE_RTM))
- data &= ~(ARCH_CAP_TAA_NO | ARCH_CAP_TSX_CTRL_MSR);
- else if (!boot_cpu_has_bug(X86_BUG_TAA))
+ if (!boot_cpu_has(X86_FEATURE_RTM)) {
+ /*
+ * If RTM=0 because the kernel has disabled TSX, the host might
+ * have TAA_NO or TSX_CTRL. Clear TAA_NO (the guest sees RTM=0
+ * and therefore knows that there cannot be TAA) but keep
+ * TSX_CTRL: some buggy userspaces leave it set on tsx=on hosts,
+ * and we want to allow migrating those guests to tsx=off hosts.
+ */
+ data &= ~ARCH_CAP_TAA_NO;
+ } else if (!boot_cpu_has_bug(X86_BUG_TAA)) {
data |= ARCH_CAP_TAA_NO;
+ } else {
+ /*
+ * Nothing to do here; we emulate TSX_CTRL if present on the
+ * host so the guest can choose between disabling TSX or
+ * using VERW to clear CPU buffers.
+ */
+ }
return data;
}
@@ -1416,7 +1442,8 @@ static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
if (r == KVM_MSR_RET_INVALID) {
/* Unconditionally clear the output for simplicity */
*data = 0;
- r = kvm_msr_ignored_check(vcpu, index, 0, false);
+ if (kvm_msr_ignored_check(vcpu, index, 0, false))
+ r = 0;
}
if (r)
@@ -1540,7 +1567,7 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
struct msr_data msr;
if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE))
- return -EPERM;
+ return KVM_MSR_RET_FILTERED;
switch (index) {
case MSR_FS_BASE:
@@ -1581,7 +1608,8 @@ static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu,
int ret = __kvm_set_msr(vcpu, index, data, host_initiated);
if (ret == KVM_MSR_RET_INVALID)
- ret = kvm_msr_ignored_check(vcpu, index, data, true);
+ if (kvm_msr_ignored_check(vcpu, index, data, true))
+ ret = 0;
return ret;
}
@@ -1599,7 +1627,7 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
int ret;
if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ))
- return -EPERM;
+ return KVM_MSR_RET_FILTERED;
msr.index = index;
msr.host_initiated = host_initiated;
@@ -1618,7 +1646,8 @@ static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu,
if (ret == KVM_MSR_RET_INVALID) {
/* Unconditionally clear *data for simplicity */
*data = 0;
- ret = kvm_msr_ignored_check(vcpu, index, 0, false);
+ if (kvm_msr_ignored_check(vcpu, index, 0, false))
+ ret = 0;
}
return ret;
@@ -1636,35 +1665,28 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
}
EXPORT_SYMBOL_GPL(kvm_set_msr);
-static int complete_emulated_msr(struct kvm_vcpu *vcpu, bool is_read)
+static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu)
{
- if (vcpu->run->msr.error) {
- kvm_inject_gp(vcpu, 0);
- return 1;
- } else if (is_read) {
+ int err = vcpu->run->msr.error;
+ if (!err) {
kvm_rax_write(vcpu, (u32)vcpu->run->msr.data);
kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32);
}
- return kvm_skip_emulated_instruction(vcpu);
-}
-
-static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu)
-{
- return complete_emulated_msr(vcpu, true);
+ return kvm_x86_ops.complete_emulated_msr(vcpu, err);
}
static int complete_emulated_wrmsr(struct kvm_vcpu *vcpu)
{
- return complete_emulated_msr(vcpu, false);
+ return kvm_x86_ops.complete_emulated_msr(vcpu, vcpu->run->msr.error);
}
static u64 kvm_msr_reason(int r)
{
switch (r) {
- case -ENOENT:
+ case KVM_MSR_RET_INVALID:
return KVM_MSR_EXIT_REASON_UNKNOWN;
- case -EPERM:
+ case KVM_MSR_RET_FILTERED:
return KVM_MSR_EXIT_REASON_FILTER;
default:
return KVM_MSR_EXIT_REASON_INVAL;
@@ -1719,18 +1741,16 @@ int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
return 0;
}
- /* MSR read failed? Inject a #GP */
- if (r) {
+ if (!r) {
+ trace_kvm_msr_read(ecx, data);
+
+ kvm_rax_write(vcpu, data & -1u);
+ kvm_rdx_write(vcpu, (data >> 32) & -1u);
+ } else {
trace_kvm_msr_read_ex(ecx);
- kvm_inject_gp(vcpu, 0);
- return 1;
}
- trace_kvm_msr_read(ecx, data);
-
- kvm_rax_write(vcpu, data & -1u);
- kvm_rdx_write(vcpu, (data >> 32) & -1u);
- return kvm_skip_emulated_instruction(vcpu);
+ return kvm_x86_ops.complete_emulated_msr(vcpu, r);
}
EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr);
@@ -1751,20 +1771,18 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
if (r < 0)
return r;
- /* MSR write failed? Inject a #GP */
- if (r > 0) {
+ if (!r)
+ trace_kvm_msr_write(ecx, data);
+ else
trace_kvm_msr_write_ex(ecx, data);
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
- trace_kvm_msr_write(ecx, data);
- return kvm_skip_emulated_instruction(vcpu);
+ return kvm_x86_ops.complete_emulated_msr(vcpu, r);
}
EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);
bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
{
+ xfer_to_guest_mode_prepare();
return vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) ||
xfer_to_guest_mode_work_pending();
}
@@ -1965,7 +1983,7 @@ static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time,
struct kvm_arch *ka = &vcpu->kvm->arch;
if (vcpu->vcpu_id == 0 && !host_initiated) {
- if (ka->boot_vcpu_runs_old_kvmclock && old_msr)
+ if (ka->boot_vcpu_runs_old_kvmclock != old_msr)
kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
ka->boot_vcpu_runs_old_kvmclock = old_msr;
@@ -3063,9 +3081,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
/* Values other than LBR and BTF are vendor-specific,
thus reserved and should throw a #GP */
return 1;
- }
- vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
- __func__, data);
+ } else if (report_ignored_msrs)
+ vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
+ __func__, data);
break;
case 0x200 ... 0x2ff:
return kvm_mtrr_set_msr(vcpu, msr, data);
@@ -3463,29 +3481,63 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = vcpu->arch.efer;
break;
case MSR_KVM_WALL_CLOCK:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
+ return 1;
+
+ msr_info->data = vcpu->kvm->arch.wall_clock;
+ break;
case MSR_KVM_WALL_CLOCK_NEW:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
+ return 1;
+
msr_info->data = vcpu->kvm->arch.wall_clock;
break;
case MSR_KVM_SYSTEM_TIME:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
+ return 1;
+
+ msr_info->data = vcpu->arch.time;
+ break;
case MSR_KVM_SYSTEM_TIME_NEW:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
+ return 1;
+
msr_info->data = vcpu->arch.time;
break;
case MSR_KVM_ASYNC_PF_EN:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
+ return 1;
+
msr_info->data = vcpu->arch.apf.msr_en_val;
break;
case MSR_KVM_ASYNC_PF_INT:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
+ return 1;
+
msr_info->data = vcpu->arch.apf.msr_int_val;
break;
case MSR_KVM_ASYNC_PF_ACK:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
+ return 1;
+
msr_info->data = 0;
break;
case MSR_KVM_STEAL_TIME:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME))
+ return 1;
+
msr_info->data = vcpu->arch.st.msr_val;
break;
case MSR_KVM_PV_EOI_EN:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI))
+ return 1;
+
msr_info->data = vcpu->arch.pv_eoi.msr_val;
break;
case MSR_KVM_POLL_CONTROL:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL))
+ return 1;
+
msr_info->data = vcpu->arch.msr_kvm_poll_control;
break;
case MSR_IA32_P5_MC_ADDR:
@@ -3642,6 +3694,27 @@ static inline bool kvm_can_mwait_in_guest(void)
boot_cpu_has(X86_FEATURE_ARAT);
}
+static int kvm_ioctl_get_supported_hv_cpuid(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid2 __user *cpuid_arg)
+{
+ struct kvm_cpuid2 cpuid;
+ int r;
+
+ r = -EFAULT;
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
+ return r;
+
+ r = kvm_get_hv_cpuid(vcpu, &cpuid, cpuid_arg->entries);
+ if (r)
+ return r;
+
+ r = -EFAULT;
+ if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
+ return r;
+
+ return 0;
+}
+
int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
{
int r = 0;
@@ -3678,6 +3751,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_HYPERV_TLBFLUSH:
case KVM_CAP_HYPERV_SEND_IPI:
case KVM_CAP_HYPERV_CPUID:
+ case KVM_CAP_SYS_HYPERV_CPUID:
case KVM_CAP_PCI_SEGMENT:
case KVM_CAP_DEBUGREGS:
case KVM_CAP_X86_ROBUST_SINGLESTEP:
@@ -3726,7 +3800,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
* fringe case that is not enabled except via specific settings
* of the module parameters.
*/
- r = kvm_x86_ops.has_emulated_msr(MSR_IA32_SMBASE);
+ r = kvm_x86_ops.has_emulated_msr(kvm, MSR_IA32_SMBASE);
break;
case KVM_CAP_VAPIC:
r = !kvm_x86_ops.cpu_has_accelerated_tpr();
@@ -3863,6 +3937,9 @@ long kvm_arch_dev_ioctl(struct file *filp,
case KVM_GET_MSRS:
r = msr_io(NULL, argp, do_get_msr_feature, 1);
break;
+ case KVM_GET_SUPPORTED_HV_CPUID:
+ r = kvm_ioctl_get_supported_hv_cpuid(NULL, argp);
+ break;
default:
r = -EINVAL;
break;
@@ -3961,7 +4038,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
int idx;
- if (vcpu->preempted)
+ if (vcpu->preempted && !vcpu->arch.guest_state_protected)
vcpu->arch.preempted_in_kernel = !kvm_x86_ops.get_cpl(vcpu);
/*
@@ -4015,21 +4092,23 @@ static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
static int kvm_cpu_accept_dm_intr(struct kvm_vcpu *vcpu)
{
+ /*
+ * We can accept userspace's request for interrupt injection
+ * as long as we have a place to store the interrupt number.
+ * The actual injection will happen when the CPU is able to
+ * deliver the interrupt.
+ */
+ if (kvm_cpu_has_extint(vcpu))
+ return false;
+
+ /* Acknowledging ExtINT does not happen if LINT0 is masked. */
return (!lapic_in_kernel(vcpu) ||
kvm_apic_accept_pic_intr(vcpu));
}
-/*
- * if userspace requested an interrupt window, check that the
- * interrupt window is open.
- *
- * No need to exit to userspace if we already have an interrupt queued.
- */
static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu)
{
return kvm_arch_interrupt_allowed(vcpu) &&
- !kvm_cpu_has_interrupt(vcpu) &&
- !kvm_event_needs_reinjection(vcpu) &&
kvm_cpu_accept_dm_intr(vcpu);
}
@@ -4161,6 +4240,9 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
{
process_nmi(vcpu);
+ if (kvm_check_request(KVM_REQ_SMI, vcpu))
+ process_smi(vcpu);
+
/*
* In guest mode, payload delivery should be deferred,
* so that the L1 hypervisor can intercept #PF before
@@ -4443,6 +4525,9 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src)
static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
struct kvm_xsave *guest_xsave)
{
+ if (!vcpu->arch.guest_fpu)
+ return;
+
if (boot_cpu_has(X86_FEATURE_XSAVE)) {
memset(guest_xsave, 0, sizeof(struct kvm_xsave));
fill_xsave((u8 *) guest_xsave->region, vcpu);
@@ -4460,9 +4545,14 @@ static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
struct kvm_xsave *guest_xsave)
{
- u64 xstate_bv =
- *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
- u32 mxcsr = *(u32 *)&guest_xsave->region[XSAVE_MXCSR_OFFSET / sizeof(u32)];
+ u64 xstate_bv;
+ u32 mxcsr;
+
+ if (!vcpu->arch.guest_fpu)
+ return 0;
+
+ xstate_bv = *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
+ mxcsr = *(u32 *)&guest_xsave->region[XSAVE_MXCSR_OFFSET / sizeof(u32)];
if (boot_cpu_has(X86_FEATURE_XSAVE)) {
/*
@@ -4575,6 +4665,8 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
vcpu->arch.pv_cpuid.enforce = cap->args[0];
+ if (vcpu->arch.pv_cpuid.enforce)
+ kvm_update_pv_runtime(vcpu);
return 0;
@@ -4937,25 +5029,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
srcu_read_unlock(&vcpu->kvm->srcu, idx);
break;
}
- case KVM_GET_SUPPORTED_HV_CPUID: {
- struct kvm_cpuid2 __user *cpuid_arg = argp;
- struct kvm_cpuid2 cpuid;
-
- r = -EFAULT;
- if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
- goto out;
-
- r = kvm_vcpu_ioctl_get_hv_cpuid(vcpu, &cpuid,
- cpuid_arg->entries);
- if (r)
- goto out;
-
- r = -EFAULT;
- if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
- goto out;
- r = 0;
+ case KVM_GET_SUPPORTED_HV_CPUID:
+ r = kvm_ioctl_get_supported_hv_cpuid(vcpu, argp);
break;
- }
default:
r = -EINVAL;
}
@@ -5736,7 +5812,7 @@ static void kvm_init_msr_list(void)
}
for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) {
- if (!kvm_x86_ops.has_emulated_msr(emulated_msrs_all[i]))
+ if (!kvm_x86_ops.has_emulated_msr(NULL, emulated_msrs_all[i]))
continue;
emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i];
@@ -7913,17 +7989,22 @@ void kvm_arch_exit(void)
kmem_cache_destroy(x86_fpu_cache);
}
-int kvm_vcpu_halt(struct kvm_vcpu *vcpu)
+static int __kvm_vcpu_halt(struct kvm_vcpu *vcpu, int state, int reason)
{
++vcpu->stat.halt_exits;
if (lapic_in_kernel(vcpu)) {
- vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
+ vcpu->arch.mp_state = state;
return 1;
} else {
- vcpu->run->exit_reason = KVM_EXIT_HLT;
+ vcpu->run->exit_reason = reason;
return 0;
}
}
+
+int kvm_vcpu_halt(struct kvm_vcpu *vcpu)
+{
+ return __kvm_vcpu_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT);
+}
EXPORT_SYMBOL_GPL(kvm_vcpu_halt);
int kvm_emulate_halt(struct kvm_vcpu *vcpu)
@@ -7937,6 +8018,14 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_emulate_halt);
+int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu)
+{
+ int ret = kvm_skip_emulated_instruction(vcpu);
+
+ return __kvm_vcpu_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD, KVM_EXIT_AP_RESET_HOLD) && ret;
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_ap_reset_hold);
+
#ifdef CONFIG_X86_64
static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,
unsigned long clock_type)
@@ -8118,7 +8207,14 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)
{
struct kvm_run *kvm_run = vcpu->run;
- kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
+ /*
+ * if_flag is obsolete and useless, so do not bother
+ * setting it for SEV-ES guests. Userspace can just
+ * use kvm_run->ready_for_interrupt_injection.
+ */
+ kvm_run->if_flag = !vcpu->arch.guest_state_protected
+ && (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
+
kvm_run->flags = is_smm(vcpu) ? KVM_RUN_X86_SMM : 0;
kvm_run->cr8 = kvm_get_cr8(vcpu);
kvm_run->apic_base = kvm_get_apic_base(vcpu);
@@ -8708,6 +8804,15 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
bool req_immediate_exit = false;
+ /* Forbid vmenter if vcpu dirty ring is soft-full */
+ if (unlikely(vcpu->kvm->dirty_ring_size &&
+ kvm_dirty_ring_soft_full(&vcpu->dirty_ring))) {
+ vcpu->run->exit_reason = KVM_EXIT_DIRTY_RING_FULL;
+ trace_kvm_dirty_ring_exit(vcpu);
+ r = 0;
+ goto out;
+ }
+
if (kvm_request_pending(vcpu)) {
if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) {
@@ -8894,8 +8999,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_x86_ops.request_immediate_exit(vcpu);
}
- trace_kvm_entry(vcpu);
-
fpregs_assert_state_consistent();
if (test_thread_flag(TIF_NEED_FPU_LOAD))
switch_fpu_return();
@@ -9015,6 +9118,7 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
kvm_apic_accept_events(vcpu);
switch(vcpu->arch.mp_state) {
case KVM_MP_STATE_HALTED:
+ case KVM_MP_STATE_AP_RESET_HOLD:
vcpu->arch.pv.pv_unhalted = false;
vcpu->arch.mp_state =
KVM_MP_STATE_RUNNABLE;
@@ -9183,9 +9287,14 @@ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
kvm_save_current_fpu(vcpu->arch.user_fpu);
- /* PKRU is separately restored in kvm_x86_ops.run. */
- __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state,
- ~XFEATURE_MASK_PKRU);
+ /*
+ * Guests with protected state can't have it set by the hypervisor,
+ * so skip trying to set it.
+ */
+ if (vcpu->arch.guest_fpu)
+ /* PKRU is separately restored in kvm_x86_ops.run. */
+ __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state,
+ ~XFEATURE_MASK_PKRU);
fpregs_mark_activate();
fpregs_unlock();
@@ -9198,7 +9307,12 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
{
fpregs_lock();
- kvm_save_current_fpu(vcpu->arch.guest_fpu);
+ /*
+ * Guests with protected state can't have it read by the hypervisor,
+ * so skip trying to save it.
+ */
+ if (vcpu->arch.guest_fpu)
+ kvm_save_current_fpu(vcpu->arch.guest_fpu);
copy_kernel_to_fpregs(&vcpu->arch.user_fpu->state);
@@ -9377,6 +9491,9 @@ static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
{
struct desc_ptr dt;
+ if (vcpu->arch.guest_state_protected)
+ goto skip_protected_regs;
+
kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
@@ -9394,9 +9511,11 @@ static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
sregs->gdt.limit = dt.size;
sregs->gdt.base = dt.address;
- sregs->cr0 = kvm_read_cr0(vcpu);
sregs->cr2 = vcpu->arch.cr2;
sregs->cr3 = kvm_read_cr3(vcpu);
+
+skip_protected_regs:
+ sregs->cr0 = kvm_read_cr0(vcpu);
sregs->cr4 = kvm_read_cr4(vcpu);
sregs->cr8 = kvm_get_cr8(vcpu);
sregs->efer = vcpu->arch.efer;
@@ -9426,8 +9545,9 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
kvm_load_guest_fpu(vcpu);
kvm_apic_accept_events(vcpu);
- if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED &&
- vcpu->arch.pv.pv_unhalted)
+ if ((vcpu->arch.mp_state == KVM_MP_STATE_HALTED ||
+ vcpu->arch.mp_state == KVM_MP_STATE_AP_RESET_HOLD) &&
+ vcpu->arch.pv.pv_unhalted)
mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
else
mp_state->mp_state = vcpu->arch.mp_state;
@@ -9495,7 +9615,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
}
EXPORT_SYMBOL_GPL(kvm_task_switch);
-static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
{
if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) {
/*
@@ -9503,31 +9623,31 @@ static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
* 64-bit mode (though maybe in a 32-bit code segment).
* CR4.PAE and EFER.LMA must be set.
*/
- if (!(sregs->cr4 & X86_CR4_PAE)
- || !(sregs->efer & EFER_LMA))
- return -EINVAL;
+ if (!(sregs->cr4 & X86_CR4_PAE) || !(sregs->efer & EFER_LMA))
+ return false;
+ if (sregs->cr3 & vcpu->arch.cr3_lm_rsvd_bits)
+ return false;
} else {
/*
* Not in 64-bit mode: EFER.LMA is clear and the code
* segment cannot be 64-bit.
*/
if (sregs->efer & EFER_LMA || sregs->cs.l)
- return -EINVAL;
+ return false;
}
- return kvm_valid_cr4(vcpu, sregs->cr4);
+ return kvm_is_valid_cr4(vcpu, sregs->cr4);
}
static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
{
struct msr_data apic_base_msr;
int mmu_reset_needed = 0;
- int cpuid_update_needed = 0;
int pending_vec, max_bits, idx;
struct desc_ptr dt;
int ret = -EINVAL;
- if (kvm_valid_sregs(vcpu, sregs))
+ if (!kvm_is_valid_sregs(vcpu, sregs))
goto out;
apic_base_msr.data = sregs->apic_base;
@@ -9535,6 +9655,9 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
if (kvm_set_apic_base(vcpu, &apic_base_msr))
goto out;
+ if (vcpu->arch.guest_state_protected)
+ goto skip_protected_regs;
+
dt.size = sregs->idt.limit;
dt.address = sregs->idt.base;
kvm_x86_ops.set_idt(vcpu, &dt);
@@ -9557,11 +9680,7 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
vcpu->arch.cr0 = sregs->cr0;
mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
- cpuid_update_needed |= ((kvm_read_cr4(vcpu) ^ sregs->cr4) &
- (X86_CR4_OSXSAVE | X86_CR4_PKE));
kvm_x86_ops.set_cr4(vcpu, sregs->cr4);
- if (cpuid_update_needed)
- kvm_update_cpuid_runtime(vcpu);
idx = srcu_read_lock(&vcpu->kvm->srcu);
if (is_pae_paging(vcpu)) {
@@ -9573,14 +9692,6 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
if (mmu_reset_needed)
kvm_mmu_reset_context(vcpu);
- max_bits = KVM_NR_INTERRUPTS;
- pending_vec = find_first_bit(
- (const unsigned long *)sregs->interrupt_bitmap, max_bits);
- if (pending_vec < max_bits) {
- kvm_queue_interrupt(vcpu, pending_vec, false);
- pr_debug("Set back pending irq %d\n", pending_vec);
- }
-
kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
@@ -9599,6 +9710,15 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
!is_protmode(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+skip_protected_regs:
+ max_bits = KVM_NR_INTERRUPTS;
+ pending_vec = find_first_bit(
+ (const unsigned long *)sregs->interrupt_bitmap, max_bits);
+ if (pending_vec < max_bits) {
+ kvm_queue_interrupt(vcpu, pending_vec, false);
+ pr_debug("Set back pending irq %d\n", pending_vec);
+ }
+
kvm_make_request(KVM_REQ_EVENT, vcpu);
ret = 0;
@@ -9623,6 +9743,9 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
unsigned long rflags;
int i, r;
+ if (vcpu->arch.guest_state_protected)
+ return -EINVAL;
+
vcpu_load(vcpu);
if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
@@ -9702,6 +9825,9 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
struct fxregs_state *fxsave;
+ if (!vcpu->arch.guest_fpu)
+ return 0;
+
vcpu_load(vcpu);
fxsave = &vcpu->arch.guest_fpu->state.fxsave;
@@ -9722,6 +9848,9 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
struct fxregs_state *fxsave;
+ if (!vcpu->arch.guest_fpu)
+ return 0;
+
vcpu_load(vcpu);
fxsave = &vcpu->arch.guest_fpu->state.fxsave;
@@ -9780,6 +9909,9 @@ static int sync_regs(struct kvm_vcpu *vcpu)
static void fx_init(struct kvm_vcpu *vcpu)
{
+ if (!vcpu->arch.guest_fpu)
+ return;
+
fpstate_init(&vcpu->arch.guest_fpu->state);
if (boot_cpu_has(X86_FEATURE_XSAVES))
vcpu->arch.guest_fpu->state.xsave.header.xcomp_bv =
@@ -9793,6 +9925,15 @@ static void fx_init(struct kvm_vcpu *vcpu)
vcpu->arch.cr0 |= X86_CR0_ET;
}
+void kvm_free_guest_fpu(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.guest_fpu) {
+ kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu);
+ vcpu->arch.guest_fpu = NULL;
+ }
+}
+EXPORT_SYMBOL_GPL(kvm_free_guest_fpu);
+
int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
{
if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
@@ -9829,7 +9970,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
r = -ENOMEM;
- page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!page)
goto fail_free_lapic;
vcpu->arch.pio_data = page_address(page);
@@ -9863,6 +10004,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
fx_init(vcpu);
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
+ vcpu->arch.cr3_lm_rsvd_bits = rsvd_bits(cpuid_maxphyaddr(vcpu), 63);
vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
@@ -9888,7 +10030,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
return 0;
free_guest_fpu:
- kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu);
+ kvm_free_guest_fpu(vcpu);
free_user_fpu:
kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu);
free_emulate_ctxt:
@@ -9942,7 +10084,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu);
- kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu);
+ kvm_free_guest_fpu(vcpu);
kvm_hv_vcpu_uninit(vcpu);
kvm_pmu_destroy(vcpu);
@@ -9990,7 +10132,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
kvm_async_pf_hash_reset(vcpu);
vcpu->arch.apf.halted = false;
- if (kvm_mpx_supported()) {
+ if (vcpu->arch.guest_fpu && kvm_mpx_supported()) {
void *mpx_state_buffer;
/*
@@ -10039,6 +10181,7 @@ void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
kvm_rip_write(vcpu, 0);
}
+EXPORT_SYMBOL_GPL(kvm_vcpu_deliver_sipi_vector);
int kvm_arch_hardware_enable(void)
{
@@ -10309,7 +10452,32 @@ void kvm_arch_sync_events(struct kvm *kvm)
kvm_free_pit(kvm);
}
-int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
+#define ERR_PTR_USR(e) ((void __user *)ERR_PTR(e))
+
+/**
+ * __x86_set_memory_region: Setup KVM internal memory slot
+ *
+ * @kvm: the kvm pointer to the VM.
+ * @id: the slot ID to setup.
+ * @gpa: the GPA to install the slot (unused when @size == 0).
+ * @size: the size of the slot. Set to zero to uninstall a slot.
+ *
+ * This function helps to setup a KVM internal memory slot. Specify
+ * @size > 0 to install a new slot, while @size == 0 to uninstall a
+ * slot. The return code can be one of the following:
+ *
+ * HVA: on success (uninstall will return a bogus HVA)
+ * -errno: on error
+ *
+ * The caller should always use IS_ERR() to check the return value
+ * before use. Note, the KVM internal memory slots are guaranteed to
+ * remain valid and unchanged until the VM is destroyed, i.e., the
+ * GPA->HVA translation will not change. However, the HVA is a user
+ * address, i.e. its accessibility is not guaranteed, and must be
+ * accessed via __copy_{to,from}_user().
+ */
+void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
+ u32 size)
{
int i, r;
unsigned long hva, old_npages;
@@ -10318,12 +10486,12 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
/* Called with kvm->slots_lock held. */
if (WARN_ON(id >= KVM_MEM_SLOTS_NUM))
- return -EINVAL;
+ return ERR_PTR_USR(-EINVAL);
slot = id_to_memslot(slots, id);
if (size) {
if (slot && slot->npages)
- return -EEXIST;
+ return ERR_PTR_USR(-EEXIST);
/*
* MAP_SHARED to prevent internal slot pages from being moved
@@ -10332,13 +10500,13 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
hva = vm_mmap(NULL, 0, size, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_ANONYMOUS, 0);
if (IS_ERR((void *)hva))
- return PTR_ERR((void *)hva);
+ return (void __user *)hva;
} else {
if (!slot || !slot->npages)
return 0;
old_npages = slot->npages;
- hva = 0;
+ hva = slot->userspace_addr;
}
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
@@ -10351,13 +10519,13 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
m.memory_size = size;
r = __kvm_set_memory_region(kvm, &m);
if (r < 0)
- return r;
+ return ERR_PTR_USR(r);
}
if (!size)
vm_munmap(hva, old_npages * PAGE_SIZE);
- return 0;
+ return (void __user *)hva;
}
EXPORT_SYMBOL_GPL(__x86_set_memory_region);
@@ -10714,6 +10882,10 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu)
{
+ /* Can't read the RIP when guest state is protected, just return 0 */
+ if (vcpu->arch.guest_state_protected)
+ return 0;
+
if (is_64_bit_mode(vcpu))
return kvm_rip_read(vcpu);
return (u32)(get_segment_base(vcpu, VCPU_SREG_CS) +
@@ -11223,6 +11395,180 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
}
EXPORT_SYMBOL_GPL(kvm_handle_invpcid);
+static int complete_sev_es_emulated_mmio(struct kvm_vcpu *vcpu)
+{
+ struct kvm_run *run = vcpu->run;
+ struct kvm_mmio_fragment *frag;
+ unsigned int len;
+
+ BUG_ON(!vcpu->mmio_needed);
+
+ /* Complete previous fragment */
+ frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment];
+ len = min(8u, frag->len);
+ if (!vcpu->mmio_is_write)
+ memcpy(frag->data, run->mmio.data, len);
+
+ if (frag->len <= 8) {
+ /* Switch to the next fragment. */
+ frag++;
+ vcpu->mmio_cur_fragment++;
+ } else {
+ /* Go forward to the next mmio piece. */
+ frag->data += len;
+ frag->gpa += len;
+ frag->len -= len;
+ }
+
+ if (vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) {
+ vcpu->mmio_needed = 0;
+
+ // VMG change, at this point, we're always done
+ // RIP has already been advanced
+ return 1;
+ }
+
+ // More MMIO is needed
+ run->mmio.phys_addr = frag->gpa;
+ run->mmio.len = min(8u, frag->len);
+ run->mmio.is_write = vcpu->mmio_is_write;
+ if (run->mmio.is_write)
+ memcpy(run->mmio.data, frag->data, min(8u, frag->len));
+ run->exit_reason = KVM_EXIT_MMIO;
+
+ vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio;
+
+ return 0;
+}
+
+int kvm_sev_es_mmio_write(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes,
+ void *data)
+{
+ int handled;
+ struct kvm_mmio_fragment *frag;
+
+ if (!data)
+ return -EINVAL;
+
+ handled = write_emultor.read_write_mmio(vcpu, gpa, bytes, data);
+ if (handled == bytes)
+ return 1;
+
+ bytes -= handled;
+ gpa += handled;
+ data += handled;
+
+ /*TODO: Check if need to increment number of frags */
+ frag = vcpu->mmio_fragments;
+ vcpu->mmio_nr_fragments = 1;
+ frag->len = bytes;
+ frag->gpa = gpa;
+ frag->data = data;
+
+ vcpu->mmio_needed = 1;
+ vcpu->mmio_cur_fragment = 0;
+
+ vcpu->run->mmio.phys_addr = gpa;
+ vcpu->run->mmio.len = min(8u, frag->len);
+ vcpu->run->mmio.is_write = 1;
+ memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len));
+ vcpu->run->exit_reason = KVM_EXIT_MMIO;
+
+ vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_sev_es_mmio_write);
+
+int kvm_sev_es_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes,
+ void *data)
+{
+ int handled;
+ struct kvm_mmio_fragment *frag;
+
+ if (!data)
+ return -EINVAL;
+
+ handled = read_emultor.read_write_mmio(vcpu, gpa, bytes, data);
+ if (handled == bytes)
+ return 1;
+
+ bytes -= handled;
+ gpa += handled;
+ data += handled;
+
+ /*TODO: Check if need to increment number of frags */
+ frag = vcpu->mmio_fragments;
+ vcpu->mmio_nr_fragments = 1;
+ frag->len = bytes;
+ frag->gpa = gpa;
+ frag->data = data;
+
+ vcpu->mmio_needed = 1;
+ vcpu->mmio_cur_fragment = 0;
+
+ vcpu->run->mmio.phys_addr = gpa;
+ vcpu->run->mmio.len = min(8u, frag->len);
+ vcpu->run->mmio.is_write = 0;
+ vcpu->run->exit_reason = KVM_EXIT_MMIO;
+
+ vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_sev_es_mmio_read);
+
+static int complete_sev_es_emulated_ins(struct kvm_vcpu *vcpu)
+{
+ memcpy(vcpu->arch.guest_ins_data, vcpu->arch.pio_data,
+ vcpu->arch.pio.count * vcpu->arch.pio.size);
+ vcpu->arch.pio.count = 0;
+
+ return 1;
+}
+
+static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size,
+ unsigned int port, void *data, unsigned int count)
+{
+ int ret;
+
+ ret = emulator_pio_out_emulated(vcpu->arch.emulate_ctxt, size, port,
+ data, count);
+ if (ret)
+ return ret;
+
+ vcpu->arch.pio.count = 0;
+
+ return 0;
+}
+
+static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size,
+ unsigned int port, void *data, unsigned int count)
+{
+ int ret;
+
+ ret = emulator_pio_in_emulated(vcpu->arch.emulate_ctxt, size, port,
+ data, count);
+ if (ret) {
+ vcpu->arch.pio.count = 0;
+ } else {
+ vcpu->arch.guest_ins_data = data;
+ vcpu->arch.complete_userspace_io = complete_sev_es_emulated_ins;
+ }
+
+ return 0;
+}
+
+int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size,
+ unsigned int port, void *data, unsigned int count,
+ int in)
+{
+ return in ? kvm_sev_es_ins(vcpu, size, port, data, count)
+ : kvm_sev_es_outs(vcpu, size, port, data, count);
+}
+EXPORT_SYMBOL_GPL(kvm_sev_es_string_io);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_entry);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
@@ -11245,3 +11591,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_update_request);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_enter);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_exit);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 3900ab0c6004..0f727b50bd3d 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -3,6 +3,7 @@
#define ARCH_X86_KVM_X86_H
#include <linux/kvm_host.h>
+#include <asm/mce.h>
#include <asm/pvclock.h>
#include "kvm_cache_regs.h"
#include "kvm_emulate.h"
@@ -278,6 +279,7 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu);
extern u64 host_xcr0;
extern u64 supported_xcr0;
+extern u64 host_xss;
extern u64 supported_xss;
static inline bool kvm_mpx_supported(void)
@@ -366,17 +368,42 @@ static inline bool kvm_dr6_valid(u64 data)
return !(data >> 32);
}
+/*
+ * Trigger machine check on the host. We assume all the MSRs are already set up
+ * by the CPU and that we still run on the same CPU as the MCE occurred on.
+ * We pass a fake environment to the machine check handler because we want
+ * the guest to be always treated like user space, no matter what context
+ * it used internally.
+ */
+static inline void kvm_machine_check(void)
+{
+#if defined(CONFIG_X86_MCE)
+ struct pt_regs regs = {
+ .cs = 3, /* Fake ring 3 no matter what the guest ran on */
+ .flags = X86_EFLAGS_IF,
+ };
+
+ do_machine_check(&regs);
+#endif
+}
+
void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu);
void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu);
int kvm_spec_ctrl_test_value(u64 value);
-int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu);
int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
struct x86_exception *e);
int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva);
bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type);
-#define KVM_MSR_RET_INVALID 2
+/*
+ * Internal error codes that are used to indicate that MSR emulation encountered
+ * an error that should result in #GP in the guest, unless userspace
+ * handles it.
+ */
+#define KVM_MSR_RET_INVALID 2 /* in-kernel MSR emulation #GP condition */
+#define KVM_MSR_RET_FILTERED 3 /* #GP due to userspace MSR filter */
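
Both codes lead to the same guest-visible outcome unless userspace has asked to intercept MSR accesses; a minimal userspace sketch of that contract with a hypothetical caller policy (not KVM code, only the two define values are taken from the patch):

#include <stdio.h>

/* Hypothetical model of the error-code contract above, not KVM code. */
#define KVM_MSR_RET_INVALID  2
#define KVM_MSR_RET_FILTERED 3

static const char *msr_error_action(int ret, int userspace_handles_msrs)
{
	switch (ret) {
	case 0:
		return "emulation succeeded";
	case KVM_MSR_RET_INVALID:	/* in-kernel emulation hit a #GP condition */
	case KVM_MSR_RET_FILTERED:	/* access denied by a userspace MSR filter */
		/* Either way the guest sees #GP unless userspace wants the exit. */
		return userspace_handles_msrs ? "exit to userspace to handle the MSR access"
					      : "inject #GP into the guest";
	default:
		return "other error";
	}
}

int main(void)
{
	printf("%s\n", msr_error_action(KVM_MSR_RET_FILTERED, 0));
	printf("%s\n", msr_error_action(KVM_MSR_RET_INVALID, 1));
	return 0;
}
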
#define __cr4_reserved_bits(__cpu_has, __c) \
({ \
@@ -398,7 +425,17 @@ bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type);
__reserved_bits |= X86_CR4_UMIP; \
if (!__cpu_has(__c, X86_FEATURE_VMX)) \
__reserved_bits |= X86_CR4_VMXE; \
+ if (!__cpu_has(__c, X86_FEATURE_PCID)) \
+ __reserved_bits |= X86_CR4_PCIDE; \
__reserved_bits; \
})
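
The macro above accumulates CR4 bits that must be treated as reserved because the corresponding CPU feature is absent. A toy userspace model of the same pattern, using placeholder bit values rather than the real X86_CR4_* layout and the same GNU statement-expression style as the kernel macro:

#include <stdio.h>

/* Placeholder bit values; the real X86_CR4_* constants differ. */
#define CR4_UMIP  (1u << 0)
#define CR4_VMXE  (1u << 1)
#define CR4_PCIDE (1u << 2)

#define cr4_reserved_bits(has_umip, has_vmx, has_pcid)		\
({								\
	unsigned int __reserved = 0;				\
	if (!(has_umip)) __reserved |= CR4_UMIP;		\
	if (!(has_vmx))  __reserved |= CR4_VMXE;		\
	if (!(has_pcid)) __reserved |= CR4_PCIDE;		\
	__reserved;						\
})

int main(void)
{
	/* A CPU without PCID: setting that CR4 bit must be rejected. */
	unsigned int reserved = cr4_reserved_bits(1, 1, 0);

	printf("reserved mask = %#x\n", reserved);
	return 0;
}
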
+int kvm_sev_es_mmio_write(struct kvm_vcpu *vcpu, gpa_t src, unsigned int bytes,
+ void *dst);
+int kvm_sev_es_mmio_read(struct kvm_vcpu *vcpu, gpa_t src, unsigned int bytes,
+ void *dst);
+int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size,
+ unsigned int port, void *data, unsigned int count,
+ int in);
+
#endif
diff --git a/arch/x86/lib/copy_mc.c b/arch/x86/lib/copy_mc.c
index c13e8c9ee926..80efd45a7761 100644
--- a/arch/x86/lib/copy_mc.c
+++ b/arch/x86/lib/copy_mc.c
@@ -10,10 +10,6 @@
#include <asm/mce.h>
#ifdef CONFIG_X86_MCE
-/*
- * See COPY_MC_TEST for self-test of the copy_mc_fragile()
- * implementation.
- */
static DEFINE_STATIC_KEY_FALSE(copy_mc_fragile_key);
void enable_copy_mc_fragile(void)
diff --git a/arch/x86/lib/copy_mc_64.S b/arch/x86/lib/copy_mc_64.S
index 892d8915f609..e5f77e293034 100644
--- a/arch/x86/lib/copy_mc_64.S
+++ b/arch/x86/lib/copy_mc_64.S
@@ -2,14 +2,11 @@
/* Copyright(c) 2016-2020 Intel Corporation. All rights reserved. */
#include <linux/linkage.h>
-#include <asm/copy_mc_test.h>
-#include <asm/export.h>
#include <asm/asm.h>
#ifndef CONFIG_UML
#ifdef CONFIG_X86_MCE
-COPY_MC_TEST_CTL
/*
* copy_mc_fragile - copy memory with indication if an exception / fault happened
@@ -38,8 +35,6 @@ SYM_FUNC_START(copy_mc_fragile)
subl %ecx, %edx
.L_read_leading_bytes:
movb (%rsi), %al
- COPY_MC_TEST_SRC %rsi 1 .E_leading_bytes
- COPY_MC_TEST_DST %rdi 1 .E_leading_bytes
.L_write_leading_bytes:
movb %al, (%rdi)
incq %rsi
@@ -55,8 +50,6 @@ SYM_FUNC_START(copy_mc_fragile)
.L_read_words:
movq (%rsi), %r8
- COPY_MC_TEST_SRC %rsi 8 .E_read_words
- COPY_MC_TEST_DST %rdi 8 .E_write_words
.L_write_words:
movq %r8, (%rdi)
addq $8, %rsi
@@ -73,8 +66,6 @@ SYM_FUNC_START(copy_mc_fragile)
movl %edx, %ecx
.L_read_trailing_bytes:
movb (%rsi), %al
- COPY_MC_TEST_SRC %rsi 1 .E_trailing_bytes
- COPY_MC_TEST_DST %rdi 1 .E_trailing_bytes
.L_write_trailing_bytes:
movb %al, (%rdi)
incq %rsi
@@ -88,7 +79,6 @@ SYM_FUNC_START(copy_mc_fragile)
.L_done:
ret
SYM_FUNC_END(copy_mc_fragile)
-EXPORT_SYMBOL_GPL(copy_mc_fragile)
.section .fixup, "ax"
/*
diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c
index 58f7fb95c7f4..4229950a5d78 100644
--- a/arch/x86/lib/insn-eval.c
+++ b/arch/x86/lib/insn-eval.c
@@ -63,13 +63,12 @@ static bool is_string_insn(struct insn *insn)
*/
bool insn_has_rep_prefix(struct insn *insn)
{
+ insn_byte_t p;
int i;
insn_get_prefixes(insn);
- for (i = 0; i < insn->prefixes.nbytes; i++) {
- insn_byte_t p = insn->prefixes.bytes[i];
-
+ for_each_insn_prefix(insn, i, p) {
if (p == 0xf2 || p == 0xf3)
return true;
}
@@ -95,14 +94,15 @@ static int get_seg_reg_override_idx(struct insn *insn)
{
int idx = INAT_SEG_REG_DEFAULT;
int num_overrides = 0, i;
+ insn_byte_t p;
insn_get_prefixes(insn);
/* Look for any segment override prefixes. */
- for (i = 0; i < insn->prefixes.nbytes; i++) {
+ for_each_insn_prefix(insn, i, p) {
insn_attr_t attr;
- attr = inat_get_opcode_attribute(insn->prefixes.bytes[i]);
+ attr = inat_get_opcode_attribute(p);
switch (attr) {
case INAT_MAKE_PREFIX(INAT_PFX_CS):
idx = INAT_SEG_REG_CS;
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 037faac46b0c..1e299ac73c86 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -16,8 +16,6 @@
* to a jmp to memcpy_erms which does the REP; MOVSB mem copy.
*/
-.weak memcpy
-
/*
* memcpy - Copy a memory block.
*
@@ -30,7 +28,7 @@
* rax original destination
*/
SYM_FUNC_START_ALIAS(__memcpy)
-SYM_FUNC_START_LOCAL(memcpy)
+SYM_FUNC_START_WEAK(memcpy)
ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
"jmp memcpy_erms", X86_FEATURE_ERMS
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index 7ff00ea64e4f..41902fe8b859 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -24,9 +24,7 @@
* Output:
* rax: dest
*/
-.weak memmove
-
-SYM_FUNC_START_ALIAS(memmove)
+SYM_FUNC_START_WEAK(memmove)
SYM_FUNC_START(__memmove)
mov %rdi, %rax
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 9ff15ee404a4..0bfd26e4ca9e 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -6,8 +6,6 @@
#include <asm/alternative-asm.h>
#include <asm/export.h>
-.weak memset
-
/*
* ISO C memset - set a memory block to a byte value. This function uses fast
* string to get better performance than the original function. The code is
@@ -19,7 +17,7 @@
*
* rax original destination
*/
-SYM_FUNC_START_ALIAS(memset)
+SYM_FUNC_START_WEAK(memset)
SYM_FUNC_START(__memset)
/*
* Some CPUs support enhanced REP MOVSB/STOSB feature. It is recommended
diff --git a/arch/x86/lib/mmx_32.c b/arch/x86/lib/mmx_32.c
index 4321fa02e18d..419365c48b2a 100644
--- a/arch/x86/lib/mmx_32.c
+++ b/arch/x86/lib/mmx_32.c
@@ -26,6 +26,16 @@
#include <asm/fpu/api.h>
#include <asm/asm.h>
+/*
+ * Use KFPU_387. MMX instructions are not affected by MXCSR,
+ * but both AMD and Intel documentation states that even integer MMX
+ * operations will result in #MF if an exception is pending in FCW.
+ *
+ * EMMS is not needed afterwards because, after calling kernel_fpu_end(),
+ * any subsequent user of the 387 stack will reinitialize it using
+ * KFPU_387.
+ */
+
void *_mmx_memcpy(void *to, const void *from, size_t len)
{
void *p;
@@ -37,7 +47,7 @@ void *_mmx_memcpy(void *to, const void *from, size_t len)
p = to;
i = len >> 6; /* len/64 */
- kernel_fpu_begin();
+ kernel_fpu_begin_mask(KFPU_387);
__asm__ __volatile__ (
"1: prefetch (%0)\n" /* This set is 28 bytes */
@@ -127,7 +137,7 @@ static void fast_clear_page(void *page)
{
int i;
- kernel_fpu_begin();
+ kernel_fpu_begin_mask(KFPU_387);
__asm__ __volatile__ (
" pxor %%mm0, %%mm0\n" : :
@@ -160,7 +170,7 @@ static void fast_copy_page(void *to, void *from)
{
int i;
- kernel_fpu_begin();
+ kernel_fpu_begin_mask(KFPU_387);
/*
* maybe the prefetch stuff can go before the expensive fnsave...
@@ -247,7 +257,7 @@ static void fast_clear_page(void *page)
{
int i;
- kernel_fpu_begin();
+ kernel_fpu_begin_mask(KFPU_387);
__asm__ __volatile__ (
" pxor %%mm0, %%mm0\n" : :
@@ -282,7 +292,7 @@ static void fast_copy_page(void *to, void *from)
{
int i;
- kernel_fpu_begin();
+ kernel_fpu_begin_mask(KFPU_387);
__asm__ __volatile__ (
"1: prefetch (%0)\n"
diff --git a/arch/x86/lib/msr-smp.c b/arch/x86/lib/msr-smp.c
index fee8b9c0520c..75a0915b0d01 100644
--- a/arch/x86/lib/msr-smp.c
+++ b/arch/x86/lib/msr-smp.c
@@ -169,12 +169,11 @@ static void __wrmsr_safe_on_cpu(void *info)
int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
{
struct msr_info_completion rv;
- call_single_data_t csd = {
- .func = __rdmsr_safe_on_cpu,
- .info = &rv,
- };
+ call_single_data_t csd;
int err;
+ INIT_CSD(&csd, __rdmsr_safe_on_cpu, &rv);
+
memset(&rv, 0, sizeof(rv));
init_completion(&rv.done);
rv.msr.msr_no = msr_no;
diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c
index 3f435d7fca5e..c3e8a62ca561 100644
--- a/arch/x86/lib/usercopy.c
+++ b/arch/x86/lib/usercopy.c
@@ -9,9 +9,23 @@
#include <asm/tlbflush.h>
-/*
- * We rely on the nested NMI work to allow atomic faults from the NMI path; the
- * nested NMI paths are careful to preserve CR2.
+/**
+ * copy_from_user_nmi - NMI safe copy from user
+ * @to: Pointer to the destination buffer
+ * @from: Pointer to a user space address of the current task
+ * @n: Number of bytes to copy
+ *
+ * Returns: The number of bytes not copied. 0 means success, i.e. all bytes were copied
+ *
+ * Contrary to other copy_from_user() variants this function can be called
+ * from NMI context. Despite the name it is not restricted to be called
+ * from NMI context. It is safe to be called from any other context as
+ * well. It disables pagefaults across the copy which means a fault will
+ * abort the copy.
+ *
+ * For NMI context invocations this relies on the nested NMI work to allow
+ * atomic faults from the NMI path; the nested NMI paths are careful to
+ * preserve CR2.
*/
unsigned long
copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
@@ -27,7 +41,7 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
/*
* Even though this function is typically called from NMI/IRQ context
* disable pagefaults so that its behaviour is consistent even when
- * called form other contexts.
+ * called from other contexts.
*/
pagefault_disable();
ret = __copy_from_user_inatomic(to, from, n);
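
A minimal userspace model of the return convention documented above: the helper reports the number of bytes it could not copy, with 0 meaning complete success. The fault boundary is simulated; this is an illustration of the convention, not the kernel implementation.

#include <stdio.h>
#include <string.h>

#define FAULT_AT 64u	/* pretend "user" addresses >= 64 fault */

static unsigned long model_copy(void *to, unsigned long from, unsigned long n)
{
	static const unsigned char fake_user_mem[FAULT_AT];	/* zero-filled */

	if (from >= FAULT_AT)
		return n;			/* faulted immediately: nothing copied */
	if (from + n > FAULT_AT) {
		unsigned long ok = FAULT_AT - from;

		memcpy(to, fake_user_mem + from, ok);
		return n - ok;			/* bytes left uncopied */
	}
	memcpy(to, fake_user_mem + from, n);
	return 0;				/* all bytes copied */
}

int main(void)
{
	unsigned char buf[16];

	printf("full copy:     %lu bytes not copied\n", model_copy(buf, 0, 16));
	printf("partial copy:  %lu bytes not copied\n", model_copy(buf, 56, 16));
	printf("faulting copy: %lu bytes not copied\n", model_copy(buf, 80, 16));
	return 0;
}
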
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 82bf37a5c9ec..525197381baa 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -16,7 +16,7 @@
#include <linux/prefetch.h> /* prefetchw */
#include <linux/context_tracking.h> /* exception_enter(), ... */
#include <linux/uaccess.h> /* faulthandler_disabled() */
-#include <linux/efi.h> /* efi_recover_from_page_fault()*/
+#include <linux/efi.h> /* efi_crash_gracefully_on_page_fault()*/
#include <linux/mm_types.h>
#include <asm/cpufeature.h> /* boot_cpu_has, ... */
@@ -25,11 +25,12 @@
#include <asm/vsyscall.h> /* emulate_vsyscall */
#include <asm/vm86.h> /* struct vm86 */
#include <asm/mmu_context.h> /* vma_pkey() */
-#include <asm/efi.h> /* efi_recover_from_page_fault()*/
+#include <asm/efi.h> /* efi_crash_gracefully_on_page_fault()*/
#include <asm/desc.h> /* store_idt(), ... */
#include <asm/cpu_entry_area.h> /* exception stack */
#include <asm/pgtable_areas.h> /* VMALLOC_START, ... */
#include <asm/kvm_para.h> /* kvm_handle_async_pf */
+#include <asm/vdso.h> /* fixup_vdso_exception() */
#define CREATE_TRACE_POINTS
#include <asm/trace/exceptions.h>
@@ -53,7 +54,7 @@ kmmio_fault(struct pt_regs *regs, unsigned long addr)
* 32-bit mode:
*
* Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
- * Check that here and ignore it.
+ * Check that here and ignore it. This is AMD erratum #91.
*
* 64-bit mode:
*
@@ -82,11 +83,7 @@ check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
#ifdef CONFIG_X86_64
case 0x40:
/*
- * In AMD64 long mode 0x40..0x4F are valid REX prefixes
- * Need to figure out under what instruction mode the
- * instruction was issued. Could check the LDT for lm,
- * but for now it's good enough to assume that long
- * mode only uses well known segments or kernel.
+ * In 64-bit mode 0x40..0x4F are valid REX prefixes
*/
return (!user_mode(regs) || user_64bit_mode(regs));
#endif
@@ -109,6 +106,15 @@ check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
}
}
+static bool is_amd_k8_pre_npt(void)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+
+ return unlikely(IS_ENABLED(CONFIG_CPU_SUP_AMD) &&
+ c->x86_vendor == X86_VENDOR_AMD &&
+ c->x86 == 0xf && c->x86_model < 0x40);
+}
+
static int
is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
{
@@ -116,6 +122,10 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
unsigned char *instr;
int prefetch = 0;
+ /* Erratum #91 affects AMD K8, pre-NPT CPUs */
+ if (!is_amd_k8_pre_npt())
+ return 0;
+
/*
* If it was an exec (instruction fetch) fault on NX page, then
* do not ignore the fault:
@@ -126,20 +136,31 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
instr = (void *)convert_ip_to_linear(current, regs);
max_instr = instr + 15;
- if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE_MAX)
- return 0;
+ /*
+ * This code has historically always bailed out if IP points to a
+ * not-present page (e.g. due to a race). No one has ever
+ * complained about this.
+ */
+ pagefault_disable();
while (instr < max_instr) {
unsigned char opcode;
- if (get_kernel_nofault(opcode, instr))
- break;
+ if (user_mode(regs)) {
+ if (get_user(opcode, instr))
+ break;
+ } else {
+ if (get_kernel_nofault(opcode, instr))
+ break;
+ }
instr++;
if (!check_prefetch_opcode(regs, instr, opcode, &prefetch))
break;
}
+
+ pagefault_enable();
return prefetch;
}
@@ -261,25 +282,6 @@ void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
}
}
-/*
- * Did it hit the DOS screen memory VA from vm86 mode?
- */
-static inline void
-check_v8086_mode(struct pt_regs *regs, unsigned long address,
- struct task_struct *tsk)
-{
-#ifdef CONFIG_VM86
- unsigned long bit;
-
- if (!v8086_mode(regs) || !tsk->thread.vm86)
- return;
-
- bit = (address - 0xA0000) >> PAGE_SHIFT;
- if (bit < 32)
- tsk->thread.vm86->screen_bitmap |= 1 << bit;
-#endif
-}
-
static bool low_pfn(unsigned long pfn)
{
return pfn < max_low_pfn;
@@ -334,15 +336,6 @@ KERN_ERR
"******* Disabling USB legacy in the BIOS may also help.\n";
#endif
-/*
- * No vm86 mode in 64-bit mode:
- */
-static inline void
-check_v8086_mode(struct pt_regs *regs, unsigned long address,
- struct task_struct *tsk)
-{
-}
-
static int bad_address(void *p)
{
unsigned long dummy;
@@ -426,6 +419,9 @@ static int is_errata93(struct pt_regs *regs, unsigned long address)
|| boot_cpu_data.x86 != 0xf)
return 0;
+ if (user_mode(regs))
+ return 0;
+
if (address != regs->ip)
return 0;
@@ -461,10 +457,12 @@ static int is_errata100(struct pt_regs *regs, unsigned long address)
}
/* Pentium F0 0F C7 C8 bug workaround: */
-static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
+static int is_f00f_bug(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
- if (boot_cpu_has_bug(X86_BUG_F00F) && idt_is_f00f_address(address)) {
+ if (boot_cpu_has_bug(X86_BUG_F00F) && !(error_code & X86_PF_USER) &&
+ idt_is_f00f_address(address)) {
handle_invalid_op(regs);
return 1;
}
@@ -602,11 +600,9 @@ pgtable_bad(struct pt_regs *regs, unsigned long error_code,
oops_end(flags, regs, sig);
}
-static void set_signal_archinfo(unsigned long address,
- unsigned long error_code)
+static void sanitize_error_code(unsigned long address,
+ unsigned long *error_code)
{
- struct task_struct *tsk = current;
-
/*
* To avoid leaking information about the kernel page
* table layout, pretend that user-mode accesses to
@@ -617,7 +613,13 @@ static void set_signal_archinfo(unsigned long address,
* information and does not appear to cause any problems.
*/
if (address >= TASK_SIZE_MAX)
- error_code |= X86_PF_PROT;
+ *error_code |= X86_PF_PROT;
+}
+
+static void set_signal_archinfo(unsigned long address,
+ unsigned long error_code)
+{
+ struct task_struct *tsk = current;
tsk->thread.trap_nr = X86_TRAP_PF;
tsk->thread.error_code = error_code | X86_PF_USER;
@@ -625,51 +627,20 @@ static void set_signal_archinfo(unsigned long address,
}
static noinline void
-no_context(struct pt_regs *regs, unsigned long error_code,
- unsigned long address, int signal, int si_code)
+page_fault_oops(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address)
{
- struct task_struct *tsk = current;
unsigned long flags;
int sig;
if (user_mode(regs)) {
/*
- * This is an implicit supervisor-mode access from user
- * mode. Bypass all the kernel-mode recovery code and just
- * OOPS.
+ * Implicit kernel access from user mode? Skip the stack
+ * overflow and EFI special cases.
*/
goto oops;
}
- /* Are we prepared to handle this kernel fault? */
- if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) {
- /*
- * Any interrupt that takes a fault gets the fixup. This makes
- * the below recursive fault logic only apply to a faults from
- * task context.
- */
- if (in_interrupt())
- return;
-
- /*
- * Per the above we're !in_interrupt(), aka. task context.
- *
- * In this case we need to make sure we're not recursively
- * faulting through the emulate_vsyscall() logic.
- */
- if (current->thread.sig_on_uaccess_err && signal) {
- set_signal_archinfo(address, error_code);
-
- /* XXX: hwpoison faults will set the wrong code. */
- force_sig_fault(signal, si_code, (void __user *)address);
- }
-
- /*
- * Barring that, we can do the fixup and be happy.
- */
- return;
- }
-
#ifdef CONFIG_VMAP_STACK
/*
* Stack overflow? During boot, we can fault near the initial
@@ -677,8 +648,8 @@ no_context(struct pt_regs *regs, unsigned long error_code,
* that we're in vmalloc space to avoid this.
*/
if (is_vmalloc_addr((void *)address) &&
- (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) ||
- address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) {
+ (((unsigned long)current->stack - 1 - address < PAGE_SIZE) ||
+ address - ((unsigned long)current->stack + THREAD_SIZE) < PAGE_SIZE)) {
unsigned long stack = __this_cpu_ist_top_va(DF) - sizeof(void *);
/*
* We're likely to be running with very little stack space
@@ -702,28 +673,12 @@ no_context(struct pt_regs *regs, unsigned long error_code,
#endif
/*
- * 32-bit:
- *
- * Valid to do another page fault here, because if this fault
- * had been triggered by is_prefetch fixup_exception would have
- * handled it.
- *
- * 64-bit:
- *
- * Hall of shame of CPU/BIOS bugs.
- */
- if (is_prefetch(regs, error_code, address))
- return;
-
- if (is_errata93(regs, address))
- return;
-
- /*
- * Buggy firmware could access regions which might page fault, try to
- * recover from such faults.
+ * Buggy firmware could access regions which might page fault. If
+ * this happens, EFI has a special OOPS path that will try to
+ * avoid hanging the system.
*/
if (IS_ENABLED(CONFIG_EFI))
- efi_recover_from_page_fault(address);
+ efi_crash_gracefully_on_page_fault(address);
oops:
/*
@@ -734,7 +689,7 @@ oops:
show_fault_oops(regs, error_code, address);
- if (task_stack_end_corrupted(tsk))
+ if (task_stack_end_corrupted(current))
printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
sig = SIGKILL;
@@ -747,6 +702,53 @@ oops:
oops_end(flags, regs, sig);
}
+static noinline void
+kernelmode_fixup_or_oops(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address, int signal, int si_code)
+{
+ WARN_ON_ONCE(user_mode(regs));
+
+ /* Are we prepared to handle this kernel fault? */
+ if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) {
+ /*
+ * Any interrupt that takes a fault gets the fixup. This makes
+	 * the below recursive fault logic only apply to faults from
+ * task context.
+ */
+ if (in_interrupt())
+ return;
+
+ /*
+ * Per the above we're !in_interrupt(), aka. task context.
+ *
+ * In this case we need to make sure we're not recursively
+ * faulting through the emulate_vsyscall() logic.
+ */
+ if (current->thread.sig_on_uaccess_err && signal) {
+ sanitize_error_code(address, &error_code);
+
+ set_signal_archinfo(address, error_code);
+
+ /* XXX: hwpoison faults will set the wrong code. */
+ force_sig_fault(signal, si_code, (void __user *)address);
+ }
+
+ /*
+ * Barring that, we can do the fixup and be happy.
+ */
+ return;
+ }
+
+ /*
+ * AMD erratum #91 manifests as a spurious page fault on a PREFETCH
+ * instruction.
+ */
+ if (is_prefetch(regs, error_code, address))
+ return;
+
+ page_fault_oops(regs, error_code, address);
+}
+
/*
* Print out info about fatal segfaults, if the show_unhandled_signals
* sysctl is set:
@@ -789,50 +791,49 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
{
struct task_struct *tsk = current;
- /* User mode accesses just cause a SIGSEGV */
- if (user_mode(regs) && (error_code & X86_PF_USER)) {
- /*
- * It's possible to have interrupts off here:
- */
- local_irq_enable();
+ if (!user_mode(regs)) {
+ kernelmode_fixup_or_oops(regs, error_code, address, pkey, si_code);
+ return;
+ }
- /*
- * Valid to do another page fault here because this one came
- * from user space:
- */
- if (is_prefetch(regs, error_code, address))
- return;
+ if (!(error_code & X86_PF_USER)) {
+ /* Implicit user access to kernel memory -- just oops */
+ page_fault_oops(regs, error_code, address);
+ return;
+ }
- if (is_errata100(regs, address))
- return;
+ /*
+ * User mode accesses just cause a SIGSEGV.
+ * It's possible to have interrupts off here:
+ */
+ local_irq_enable();
- /*
- * To avoid leaking information about the kernel page table
- * layout, pretend that user-mode accesses to kernel addresses
- * are always protection faults.
- */
- if (address >= TASK_SIZE_MAX)
- error_code |= X86_PF_PROT;
+ /*
+ * Valid to do another page fault here because this one came
+ * from user space:
+ */
+ if (is_prefetch(regs, error_code, address))
+ return;
- if (likely(show_unhandled_signals))
- show_signal_msg(regs, error_code, address, tsk);
+ if (is_errata100(regs, address))
+ return;
- set_signal_archinfo(address, error_code);
+ sanitize_error_code(address, &error_code);
- if (si_code == SEGV_PKUERR)
- force_sig_pkuerr((void __user *)address, pkey);
+ if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address))
+ return;
- force_sig_fault(SIGSEGV, si_code, (void __user *)address);
+ if (likely(show_unhandled_signals))
+ show_signal_msg(regs, error_code, address, tsk);
- local_irq_disable();
+ set_signal_archinfo(address, error_code);
- return;
- }
+ if (si_code == SEGV_PKUERR)
+ force_sig_pkuerr((void __user *)address, pkey);
- if (is_f00f_bug(regs, address))
- return;
+ force_sig_fault(SIGSEGV, si_code, (void __user *)address);
- no_context(regs, error_code, address, SIGSEGV, si_code);
+ local_irq_disable();
}
static noinline void
@@ -922,8 +923,8 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
vm_fault_t fault)
{
/* Kernel mode? Handle exceptions or die: */
- if (!(error_code & X86_PF_USER)) {
- no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
+ if (!user_mode(regs)) {
+ kernelmode_fixup_or_oops(regs, error_code, address, SIGBUS, BUS_ADRERR);
return;
}
@@ -931,6 +932,11 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
if (is_prefetch(regs, error_code, address))
return;
+ sanitize_error_code(address, &error_code);
+
+ if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address))
+ return;
+
set_signal_archinfo(address, error_code);
#ifdef CONFIG_MEMORY_FAILURE
@@ -952,40 +958,6 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address);
}
-static noinline void
-mm_fault_error(struct pt_regs *regs, unsigned long error_code,
- unsigned long address, vm_fault_t fault)
-{
- if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) {
- no_context(regs, error_code, address, 0, 0);
- return;
- }
-
- if (fault & VM_FAULT_OOM) {
- /* Kernel mode? Handle exceptions or die: */
- if (!(error_code & X86_PF_USER)) {
- no_context(regs, error_code, address,
- SIGSEGV, SEGV_MAPERR);
- return;
- }
-
- /*
- * We ran out of memory, call the OOM killer, and return the
- * userspace (which will retry the fault, or kill us if we got
- * oom-killed):
- */
- pagefault_out_of_memory();
- } else {
- if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
- VM_FAULT_HWPOISON_LARGE))
- do_sigbus(regs, error_code, address, fault);
- else if (fault & VM_FAULT_SIGSEGV)
- bad_area_nosemaphore(regs, error_code, address);
- else
- BUG();
- }
-}
-
static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte)
{
if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
@@ -1102,6 +1074,18 @@ access_error(unsigned long error_code, struct vm_area_struct *vma)
return 1;
/*
+ * SGX hardware blocked the access. This usually happens
+ * when the enclave memory contents have been destroyed, like
+ * after a suspend/resume cycle. In any case, the kernel can't
+ * fix the cause of the fault. Handle the fault as an access
+ * error even in cases where no actual access violation
+ * occurred. This allows userspace to rebuild the enclave in
+ * response to the signal.
+ */
+ if (unlikely(error_code & X86_PF_SGX))
+ return 1;
+
+ /*
* Make sure to check the VMA so that we do not perform
* faults just to hit a X86_PF_PK as soon as we fill in a
* page.
@@ -1188,6 +1172,9 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
}
#endif
+ if (is_f00f_bug(regs, hw_error_code, address))
+ return;
+
/* Was the fault spurious, caused by lazy TLB invalidation? */
if (spurious_kernel_fault(hw_error_code, address))
return;
@@ -1208,10 +1195,17 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
}
NOKPROBE_SYMBOL(do_kern_addr_fault);
-/* Handle faults in the user portion of the address space */
+/*
+ * Handle faults in the user portion of the address space. Nothing in here
+ * should check X86_PF_USER without a specific justification: for almost
+ * all purposes, we should treat a normal kernel access to user memory
+ * (e.g. get_user(), put_user(), etc.) the same as the WRUSS instruction.
+ * The one exception is AC flag handling, which is, per the x86
+ * architecture, special for WRUSS.
+ */
static inline
void do_user_addr_fault(struct pt_regs *regs,
- unsigned long hw_error_code,
+ unsigned long error_code,
unsigned long address)
{
struct vm_area_struct *vma;
@@ -1223,6 +1217,21 @@ void do_user_addr_fault(struct pt_regs *regs,
tsk = current;
mm = tsk->mm;
+ if (unlikely((error_code & (X86_PF_USER | X86_PF_INSTR)) == X86_PF_INSTR)) {
+ /*
+ * Whoops, this is kernel mode code trying to execute from
+ * user memory. Unless this is AMD erratum #93, which
+ * corrupts RIP such that it looks like a user address,
+ * this is unrecoverable. Don't even try to look up the
+ * VMA or look for extable entries.
+ */
+ if (is_errata93(regs, address))
+ return;
+
+ page_fault_oops(regs, error_code, address);
+ return;
+ }
+
/* kprobes don't want to hook the spurious faults: */
if (unlikely(kprobe_page_fault(regs, X86_TRAP_PF)))
return;
@@ -1231,8 +1240,8 @@ void do_user_addr_fault(struct pt_regs *regs,
* Reserved bits are never expected to be set on
* entries in the user portion of the page tables.
*/
- if (unlikely(hw_error_code & X86_PF_RSVD))
- pgtable_bad(regs, hw_error_code, address);
+ if (unlikely(error_code & X86_PF_RSVD))
+ pgtable_bad(regs, error_code, address);
/*
* If SMAP is on, check for invalid kernel (supervisor) access to user
@@ -1242,10 +1251,13 @@ void do_user_addr_fault(struct pt_regs *regs,
* enforcement appears to be consistent with the USER bit.
*/
if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) &&
- !(hw_error_code & X86_PF_USER) &&
- !(regs->flags & X86_EFLAGS_AC)))
- {
- bad_area_nosemaphore(regs, hw_error_code, address);
+ !(error_code & X86_PF_USER) &&
+ !(regs->flags & X86_EFLAGS_AC))) {
+ /*
+ * No extable entry here. This was a kernel access to an
+ * invalid pointer. get_kernel_nofault() will not get here.
+ */
+ page_fault_oops(regs, error_code, address);
return;
}
@@ -1254,7 +1266,7 @@ void do_user_addr_fault(struct pt_regs *regs,
* in a region with pagefaults disabled then we must not take the fault
*/
if (unlikely(faulthandler_disabled() || !mm)) {
- bad_area_nosemaphore(regs, hw_error_code, address);
+ bad_area_nosemaphore(regs, error_code, address);
return;
}
@@ -1275,9 +1287,9 @@ void do_user_addr_fault(struct pt_regs *regs,
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
- if (hw_error_code & X86_PF_WRITE)
+ if (error_code & X86_PF_WRITE)
flags |= FAULT_FLAG_WRITE;
- if (hw_error_code & X86_PF_INSTR)
+ if (error_code & X86_PF_INSTR)
flags |= FAULT_FLAG_INSTRUCTION;
#ifdef CONFIG_X86_64
@@ -1293,7 +1305,7 @@ void do_user_addr_fault(struct pt_regs *regs,
* to consider the PF_PK bit.
*/
if (is_vsyscall_vaddr(address)) {
- if (emulate_vsyscall(hw_error_code, regs, address))
+ if (emulate_vsyscall(error_code, regs, address))
return;
}
#endif
@@ -1316,7 +1328,7 @@ void do_user_addr_fault(struct pt_regs *regs,
* Fault from code in kernel from
* which we do not expect faults.
*/
- bad_area_nosemaphore(regs, hw_error_code, address);
+ bad_area_nosemaphore(regs, error_code, address);
return;
}
retry:
@@ -1332,17 +1344,17 @@ retry:
vma = find_vma(mm, address);
if (unlikely(!vma)) {
- bad_area(regs, hw_error_code, address);
+ bad_area(regs, error_code, address);
return;
}
if (likely(vma->vm_start <= address))
goto good_area;
if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
- bad_area(regs, hw_error_code, address);
+ bad_area(regs, error_code, address);
return;
}
if (unlikely(expand_stack(vma, address))) {
- bad_area(regs, hw_error_code, address);
+ bad_area(regs, error_code, address);
return;
}
@@ -1351,8 +1363,8 @@ retry:
* we can handle it..
*/
good_area:
- if (unlikely(access_error(hw_error_code, vma))) {
- bad_area_access_error(regs, hw_error_code, address, vma);
+ if (unlikely(access_error(error_code, vma))) {
+ bad_area_access_error(regs, error_code, address, vma);
return;
}
@@ -1371,11 +1383,14 @@ good_area:
*/
fault = handle_mm_fault(vma, address, flags, regs);
- /* Quick path to respond to signals */
if (fault_signal_pending(fault, regs)) {
+ /*
+ * Quick path to respond to signals. The core mm code
+ * has unlocked the mm for us if we get here.
+ */
if (!user_mode(regs))
- no_context(regs, hw_error_code, address, SIGBUS,
- BUS_ADRERR);
+ kernelmode_fixup_or_oops(regs, error_code, address,
+ SIGBUS, BUS_ADRERR);
return;
}
@@ -1391,12 +1406,37 @@ good_area:
}
mmap_read_unlock(mm);
- if (unlikely(fault & VM_FAULT_ERROR)) {
- mm_fault_error(regs, hw_error_code, address, fault);
+ if (likely(!(fault & VM_FAULT_ERROR)))
+ return;
+
+ if (fatal_signal_pending(current) && !user_mode(regs)) {
+ kernelmode_fixup_or_oops(regs, error_code, address, 0, 0);
return;
}
- check_v8086_mode(regs, address, tsk);
+ if (fault & VM_FAULT_OOM) {
+ /* Kernel mode? Handle exceptions or die: */
+ if (!user_mode(regs)) {
+ kernelmode_fixup_or_oops(regs, error_code, address,
+ SIGSEGV, SEGV_MAPERR);
+ return;
+ }
+
+ /*
+	 * We ran out of memory, call the OOM killer, and return to
+ * userspace (which will retry the fault, or kill us if we got
+ * oom-killed):
+ */
+ pagefault_out_of_memory();
+ } else {
+ if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
+ VM_FAULT_HWPOISON_LARGE))
+ do_sigbus(regs, error_code, address, fault);
+ else if (fault & VM_FAULT_SIGSEGV)
+ bad_area_nosemaphore(regs, error_code, address);
+ else
+ BUG();
+ }
}
NOKPROBE_SYMBOL(do_user_addr_fault);
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 075fe51317b0..2c54b76d8f84 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -4,65 +4,6 @@
#include <linux/swap.h> /* for totalram_pages */
#include <linux/memblock.h>
-void *kmap_atomic_high_prot(struct page *page, pgprot_t prot)
-{
- unsigned long vaddr;
- int idx, type;
-
- type = kmap_atomic_idx_push();
- idx = type + KM_TYPE_NR*smp_processor_id();
- vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
- BUG_ON(!pte_none(*(kmap_pte-idx)));
- set_pte(kmap_pte-idx, mk_pte(page, prot));
- arch_flush_lazy_mmu_mode();
-
- return (void *)vaddr;
-}
-EXPORT_SYMBOL(kmap_atomic_high_prot);
-
-/*
- * This is the same as kmap_atomic() but can map memory that doesn't
- * have a struct page associated with it.
- */
-void *kmap_atomic_pfn(unsigned long pfn)
-{
- return kmap_atomic_prot_pfn(pfn, kmap_prot);
-}
-EXPORT_SYMBOL_GPL(kmap_atomic_pfn);
-
-void kunmap_atomic_high(void *kvaddr)
-{
- unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
-
- if (vaddr >= __fix_to_virt(FIX_KMAP_END) &&
- vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) {
- int idx, type;
-
- type = kmap_atomic_idx();
- idx = type + KM_TYPE_NR * smp_processor_id();
-
-#ifdef CONFIG_DEBUG_HIGHMEM
- WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx));
-#endif
- /*
- * Force other mappings to Oops if they'll try to access this
- * pte without first remap it. Keeping stale mappings around
- * is a bad idea also, in case the page changes cacheability
- * attributes or becomes a protected page in a hypervisor.
- */
- kpte_clear_flush(kmap_pte-idx, vaddr);
- kmap_atomic_idx_pop();
- arch_flush_lazy_mmu_mode();
- }
-#ifdef CONFIG_DEBUG_HIGHMEM
- else {
- BUG_ON(vaddr < PAGE_OFFSET);
- BUG_ON(vaddr >= (unsigned long)high_memory);
- }
-#endif
-}
-EXPORT_SYMBOL(kunmap_atomic_high);
-
void __init set_highmem_pages_init(void)
{
struct zone *zone;
diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c
index fe7a12599d8e..968d7005f4a7 100644
--- a/arch/x86/mm/ident_map.c
+++ b/arch/x86/mm/ident_map.c
@@ -62,6 +62,7 @@ static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page,
unsigned long addr, unsigned long end)
{
unsigned long next;
+ int result;
for (; addr < end; addr = next) {
p4d_t *p4d = p4d_page + p4d_index(addr);
@@ -73,13 +74,20 @@ static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page,
if (p4d_present(*p4d)) {
pud = pud_offset(p4d, 0);
- ident_pud_init(info, pud, addr, next);
+ result = ident_pud_init(info, pud, addr, next);
+ if (result)
+ return result;
+
continue;
}
pud = (pud_t *)info->alloc_pgt_page(info->context);
if (!pud)
return -ENOMEM;
- ident_pud_init(info, pud, addr, next);
+
+ result = ident_pud_init(info, pud, addr, next);
+ if (result)
+ return result;
+
set_p4d(p4d, __p4d(__pa(pud) | info->kernpg_flag));
}
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index c7a47603537f..dd694fb93916 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -157,16 +157,25 @@ __ref void *alloc_low_pages(unsigned int num)
}
/*
- * By default need 3 4k for initial PMD_SIZE, 3 4k for 0-ISA_END_ADDRESS.
- * With KASLR memory randomization, depending on the machine e820 memory
- * and the PUD alignment. We may need twice more pages when KASLR memory
+ * By default we need to be able to allocate page tables below the PGD, first for
+ * the 0-ISA_END_ADDRESS range and then for the initial PMD_SIZE mapping.
+ * With KASLR memory randomization, depending on the machine e820 memory and the
+ * PUD alignment, twice that many pages may be needed when KASLR memory
* randomization is enabled.
*/
+
+#ifndef CONFIG_X86_5LEVEL
+#define INIT_PGD_PAGE_TABLES 3
+#else
+#define INIT_PGD_PAGE_TABLES 4
+#endif
+
#ifndef CONFIG_RANDOMIZE_MEMORY
-#define INIT_PGD_PAGE_COUNT 6
+#define INIT_PGD_PAGE_COUNT (2 * INIT_PGD_PAGE_TABLES)
#else
-#define INIT_PGD_PAGE_COUNT 12
+#define INIT_PGD_PAGE_COUNT (4 * INIT_PGD_PAGE_TABLES)
#endif
+
#define INIT_PGT_BUF_SIZE (INIT_PGD_PAGE_COUNT * PAGE_SIZE)
RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE);
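
For orientation, the reservation sizes these macros work out to, assuming 4 KiB pages:

#include <stdio.h>

int main(void)
{
	const unsigned long page = 4096;
	int tables[] = { 3, 4 };	/* !CONFIG_X86_5LEVEL, CONFIG_X86_5LEVEL */
	int mult[]   = { 2, 4 };	/* !CONFIG_RANDOMIZE_MEMORY, CONFIG_RANDOMIZE_MEMORY */

	for (int t = 0; t < 2; t++)
		for (int m = 0; m < 2; m++)
			printf("%d-level paging, %s KASLR: %d pages = %lu KiB\n",
			       t ? 5 : 4, m ? "with" : "no",
			       tables[t] * mult[m],
			       tables[t] * mult[m] * page / 1024);
	return 0;
}
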
void __init early_alloc_pgt_buf(void)
@@ -596,7 +605,7 @@ static unsigned long __init get_new_step_size(unsigned long step_size)
static void __init memory_map_top_down(unsigned long map_start,
unsigned long map_end)
{
- unsigned long real_end, start, last_start;
+ unsigned long real_end, last_start;
unsigned long step_size;
unsigned long addr;
unsigned long mapped_ram_size = 0;
@@ -609,7 +618,7 @@ static void __init memory_map_top_down(unsigned long map_start,
step_size = PMD_SIZE;
max_pfn_mapped = 0; /* will get exact value next */
min_pfn_mapped = real_end >> PAGE_SHIFT;
- last_start = start = real_end;
+ last_start = real_end;
/*
* We start from the top (end of memory) and go to the bottom.
@@ -618,6 +627,8 @@ static void __init memory_map_top_down(unsigned long map_start,
* for page table.
*/
while (last_start > map_start) {
+ unsigned long start;
+
if (last_start > step_size) {
start = round_down(last_start - 1, step_size);
if (start < map_start)
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 7c055259de3a..da31c2635ee4 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -394,19 +394,6 @@ repeat:
return last_map_addr;
}
-pte_t *kmap_pte;
-
-static void __init kmap_init(void)
-{
- unsigned long kmap_vstart;
-
- /*
- * Cache the first kmap pte:
- */
- kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
- kmap_pte = virt_to_kpte(kmap_vstart);
-}
-
#ifdef CONFIG_HIGHMEM
static void __init permanent_kmaps_init(pgd_t *pgd_base)
{
@@ -712,8 +699,6 @@ void __init paging_init(void)
__flush_tlb_all();
- kmap_init();
-
/*
* NOTE: at this point the bootmem allocator is fully available.
*/
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
index f60398aeb644..9aaa756ddf21 100644
--- a/arch/x86/mm/iomap_32.c
+++ b/arch/x86/mm/iomap_32.c
@@ -44,28 +44,7 @@ void iomap_free(resource_size_t base, unsigned long size)
}
EXPORT_SYMBOL_GPL(iomap_free);
-void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
-{
- unsigned long vaddr;
- int idx, type;
-
- preempt_disable();
- pagefault_disable();
-
- type = kmap_atomic_idx_push();
- idx = type + KM_TYPE_NR * smp_processor_id();
- vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
- set_pte(kmap_pte - idx, pfn_pte(pfn, prot));
- arch_flush_lazy_mmu_mode();
-
- return (void *)vaddr;
-}
-
-/*
- * Map 'pfn' using protections 'prot'
- */
-void __iomem *
-iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
+void __iomem *__iomap_local_pfn_prot(unsigned long pfn, pgprot_t prot)
{
/*
* For non-PAT systems, translate non-WB request to UC- just in
@@ -81,36 +60,6 @@ iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
/* Filter out unsupported __PAGE_KERNEL* bits: */
pgprot_val(prot) &= __default_kernel_pte_mask;
- return (void __force __iomem *) kmap_atomic_prot_pfn(pfn, prot);
-}
-EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn);
-
-void
-iounmap_atomic(void __iomem *kvaddr)
-{
- unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
-
- if (vaddr >= __fix_to_virt(FIX_KMAP_END) &&
- vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) {
- int idx, type;
-
- type = kmap_atomic_idx();
- idx = type + KM_TYPE_NR * smp_processor_id();
-
-#ifdef CONFIG_DEBUG_HIGHMEM
- WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx));
-#endif
- /*
- * Force other mappings to Oops if they'll try to access this
- * pte without first remap it. Keeping stale mappings around
- * is a bad idea also, in case the page changes cacheability
- * attributes or becomes a protected page in a hypervisor.
- */
- kpte_clear_flush(kmap_pte-idx, vaddr);
- kmap_atomic_idx_pop();
- }
-
- pagefault_enable();
- preempt_enable();
+ return (void __force __iomem *)__kmap_local_pfn_prot(pfn, prot);
}
-EXPORT_SYMBOL_GPL(iounmap_atomic);
+EXPORT_SYMBOL_GPL(__iomap_local_pfn_prot);
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index efbb3de472df..4b01f7dbaf30 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -39,6 +39,7 @@
*/
u64 sme_me_mask __section(".data") = 0;
u64 sev_status __section(".data") = 0;
+u64 sev_check_data __section(".data") = 0;
EXPORT_SYMBOL(sme_me_mask);
DEFINE_STATIC_KEY_FALSE(sev_enable_key);
EXPORT_SYMBOL_GPL(sev_enable_key);
@@ -197,6 +198,37 @@ void __init sme_early_init(void)
swiotlb_force = SWIOTLB_FORCE;
}
+void __init sev_setup_arch(void)
+{
+ phys_addr_t total_mem = memblock_phys_mem_size();
+ unsigned long size;
+
+ if (!sev_active())
+ return;
+
+ /*
+ * For SEV, all DMA has to occur via shared/unencrypted pages.
+ * SEV uses SWIOTLB to make this happen without changing device
+ * drivers. However, depending on the workload being run, the
+ * default 64MB of SWIOTLB may not be enough and SWIOTLB may
+ * run out of buffers for DMA, resulting in I/O errors and/or
+ * performance degradation especially with high I/O workloads.
+ *
+ * Adjust the default size of SWIOTLB for SEV guests using
+ * a percentage of guest memory for SWIOTLB buffers.
+ * Also, as the SWIOTLB bounce buffer memory is allocated
+ * from low memory, ensure that the adjusted size is within
+ * the limits of low available memory.
+ *
+ * The percentage of guest memory used here for SWIOTLB buffers
+	 * is an approximation of the static adjustment of 64MB for <1G
+	 * and ~128M to 256M for 1G-to-4G guests, i.e., about 6%.
+ */
+ size = total_mem * 6 / 100;
+ size = clamp_val(size, IO_TLB_DEFAULT_SIZE, SZ_1G);
+ swiotlb_adjust_size(size);
+}
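
A worked example of the sizing logic above, assuming IO_TLB_DEFAULT_SIZE is 64 MiB (the kernel's default SWIOTLB size) and SZ_1G is 1 GiB:

#include <stdio.h>

static unsigned long long clamp_ull(unsigned long long v,
				    unsigned long long lo, unsigned long long hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
	const unsigned long long MiB = 1ULL << 20, GiB = 1ULL << 30;
	unsigned long long guests[] = { 512 * MiB, 2 * GiB, 16 * GiB, 64 * GiB };

	for (int i = 0; i < 4; i++) {
		unsigned long long size =
			clamp_ull(guests[i] * 6 / 100, 64 * MiB, 1 * GiB);

		printf("%6llu MiB guest -> %4llu MiB SWIOTLB\n",
		       guests[i] / MiB, size / MiB);
	}
	return 0;
}
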
+
static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc)
{
pgprot_t old_prot, new_prot;
@@ -350,6 +382,7 @@ bool sev_active(void)
{
return sev_status & MSR_AMD64_SEV_ENABLED;
}
+EXPORT_SYMBOL_GPL(sev_active);
/* Needs to be called from non-instrumentable code */
bool noinstr sev_es_active(void)
@@ -442,9 +475,10 @@ void __init mem_encrypt_init(void)
swiotlb_update_mem_attributes();
/*
- * With SEV, we need to unroll the rep string I/O instructions.
+ * With SEV, we need to unroll the rep string I/O instructions,
+ * but SEV-ES supports them through the #VC handler.
*/
- if (sev_active())
+ if (sev_active() && !sev_es_active())
static_branch_enable(&sev_enable_key);
print_mem_encrypt_feature_info();
diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c
index 733b983f3a89..6c5eb6f3f14f 100644
--- a/arch/x86/mm/mem_encrypt_identity.c
+++ b/arch/x86/mm/mem_encrypt_identity.c
@@ -45,8 +45,8 @@
#define PMD_FLAGS_LARGE (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL)
#define PMD_FLAGS_DEC PMD_FLAGS_LARGE
-#define PMD_FLAGS_DEC_WP ((PMD_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \
- (_PAGE_PAT | _PAGE_PWT))
+#define PMD_FLAGS_DEC_WP ((PMD_FLAGS_DEC & ~_PAGE_LARGE_CACHE_MASK) | \
+ (_PAGE_PAT_LARGE | _PAGE_PWT))
#define PMD_FLAGS_ENC (PMD_FLAGS_LARGE | _PAGE_ENC)
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index bd7aff5c51f7..cd768dafca9e 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -10,8 +10,6 @@
#define pr_fmt(fmt) "mmiotrace: " fmt
-#define DEBUG 1
-
#include <linux/moduleparam.h>
#include <linux/debugfs.h>
#include <linux/slab.h>
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 44148691d78b..5eb4dc2b97da 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -938,6 +938,7 @@ int phys_to_target_node(phys_addr_t start)
return meminfo_to_nid(&numa_reserved_meminfo, start);
}
+EXPORT_SYMBOL_GPL(phys_to_target_node);
int memory_add_physaddr_to_nid(u64 start)
{
@@ -947,4 +948,5 @@ int memory_add_physaddr_to_nid(u64 start)
nid = numa_meminfo.blk[0].nid;
return nid;
}
+EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 40baa90e74f4..16f878c26667 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -2194,6 +2194,7 @@ int set_direct_map_default_noflush(struct page *page)
return __set_pages_p(page, 1);
}
+#ifdef CONFIG_DEBUG_PAGEALLOC
void __kernel_map_pages(struct page *page, int numpages, int enable)
{
if (PageHighMem(page))
@@ -2225,8 +2226,8 @@ void __kernel_map_pages(struct page *page, int numpages, int enable)
arch_flush_lazy_mmu_mode();
}
+#endif /* CONFIG_DEBUG_PAGEALLOC */
-#ifdef CONFIG_HIBERNATION
bool kernel_page_present(struct page *page)
{
unsigned int level;
@@ -2238,7 +2239,6 @@ bool kernel_page_present(struct page *page)
pte = lookup_address((unsigned long)page_address(page), &level);
return (pte_val(*pte) & _PAGE_PRESENT);
}
-#endif /* CONFIG_HIBERNATION */
int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
unsigned numpages, unsigned long page_flags)
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index dfd82f51ba66..f6a9e2e36642 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -829,6 +829,8 @@ int pud_free_pmd_page(pud_t *pud, unsigned long addr)
}
free_page((unsigned long)pmd_sv);
+
+ pgtable_pmd_page_dtor(virt_to_page(pmd));
free_page((unsigned long)pmd);
return 1;
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 11666ba19b62..569ac1d57f55 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -474,8 +474,14 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
/*
* The membarrier system call requires a full memory barrier and
* core serialization before returning to user-space, after
- * storing to rq->curr. Writing to CR3 provides that full
- * memory barrier and core serializing instruction.
+ * storing to rq->curr, when changing mm. This is because
+ * membarrier() sends IPIs to all CPUs that are in the target mm
+ * to make them issue memory barriers. However, if another CPU
+ * switches to/from the target mm concurrently with
+ * membarrier(), it can cause that CPU not to receive an IPI
+ * when it really should issue a memory barrier. Writing to CR3
+ * provides that full memory barrier and core serializing
+ * instruction.
*/
if (real_prev == next) {
VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 796506dcfc42..79e7a0ec1da5 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -205,6 +205,18 @@ static u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg)
return byte + reg2hex[dst_reg] + (reg2hex[src_reg] << 3);
}
+/* Some 1-byte opcodes for binary ALU operations */
+static u8 simple_alu_opcodes[] = {
+ [BPF_ADD] = 0x01,
+ [BPF_SUB] = 0x29,
+ [BPF_AND] = 0x21,
+ [BPF_OR] = 0x09,
+ [BPF_XOR] = 0x31,
+ [BPF_LSH] = 0xE0,
+ [BPF_RSH] = 0xE8,
+ [BPF_ARSH] = 0xF8,
+};
+
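Of the bytes above, 0x01/0x29/0x21/0x09/0x31 are genuine one-byte "op r/m, reg" opcodes for add/sub/and/or/xor; the LSH/RSH/ARSH entries are not opcodes but register-direct ModRM templates, 0xC0 | (ext << 3), later paired with the 0xC1/0xD1/0xD3 shift-group opcodes. A standalone check of that decomposition:

#include <stdio.h>

int main(void)
{
	struct { const char *op; unsigned char modrm_base; } t[] = {
		{ "BPF_LSH  (shl, /4)", 0xE0 },
		{ "BPF_RSH  (shr, /5)", 0xE8 },
		{ "BPF_ARSH (sar, /7)", 0xF8 },
	};

	for (int i = 0; i < 3; i++)
		printf("%s: 0x%02X = 0xC0 | (%u << 3)\n",
		       t[i].op, t[i].modrm_base,
		       (unsigned)((t[i].modrm_base >> 3) & 7));
	return 0;
}
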
static void jit_fill_hole(void *area, unsigned int size)
{
/* Fill whole space with INT3 instructions */
@@ -681,6 +693,42 @@ static void emit_mov_reg(u8 **pprog, bool is64, u32 dst_reg, u32 src_reg)
*pprog = prog;
}
+/* Emit the suffix (ModR/M etc) for addressing *(ptr_reg + off) and val_reg */
+static void emit_insn_suffix(u8 **pprog, u32 ptr_reg, u32 val_reg, int off)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+
+ if (is_imm8(off)) {
+ /* 1-byte signed displacement.
+ *
+ * If off == 0 we could skip this and save one extra byte, but
+ * special case of x86 R13 which always needs an offset is not
+ * worth the hassle
+ */
+ EMIT2(add_2reg(0x40, ptr_reg, val_reg), off);
+ } else {
+ /* 4-byte signed displacement */
+ EMIT1_off32(add_2reg(0x80, ptr_reg, val_reg), off);
+ }
+ *pprog = prog;
+}
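
A standalone sketch of the displacement choice made above, with raw x86 register numbers instead of the JIT's reg2hex mapping (and ignoring the rsp/r12 SIB and r13 special cases): a short offset gets a ModRM with mod=01 plus one displacement byte, a long one mod=10 plus four. Assumes a little-endian host for the 4-byte case.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static int is_imm8(int v) { return v >= -128 && v <= 127; }

static size_t emit_suffix(uint8_t *out, unsigned ptr_reg, unsigned val_reg, int off)
{
	size_t n = 0;

	if (is_imm8(off)) {
		out[n++] = 0x40 | (val_reg << 3) | ptr_reg;	/* mod=01: disp8 follows */
		out[n++] = (uint8_t)off;
	} else {
		out[n++] = 0x80 | (val_reg << 3) | ptr_reg;	/* mod=10: disp32 follows */
		memcpy(&out[n], &off, 4);			/* little-endian disp32 */
		n += 4;
	}
	return n;
}

static void dump(const char *what, const uint8_t *buf, size_t n)
{
	printf("%s:", what);
	for (size_t i = 0; i < n; i++)
		printf(" %02x", (unsigned)buf[i]);
	printf("\n");
}

int main(void)
{
	uint8_t buf[8];

	/* e.g. the suffix of "mov [rdi + 8], esi": ModRM 0x77, disp8 0x08 */
	dump("disp8 ", buf, emit_suffix(buf, 7 /* rdi */, 6 /* esi */, 8));
	dump("disp32", buf, emit_suffix(buf, 7, 6, 0x1000));
	return 0;
}
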
+
+/*
+ * Emit a REX byte if it will be necessary to address these registers
+ */
+static void maybe_emit_mod(u8 **pprog, u32 dst_reg, u32 src_reg, bool is64)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+
+ if (is64)
+ EMIT1(add_2mod(0x48, dst_reg, src_reg));
+ else if (is_ereg(dst_reg) || is_ereg(src_reg))
+ EMIT1(add_2mod(0x40, dst_reg, src_reg));
+ *pprog = prog;
+}
+
/* LDX: dst_reg = *(u8*)(src_reg + off) */
static void emit_ldx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
{
@@ -708,15 +756,7 @@ static void emit_ldx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
EMIT2(add_2mod(0x48, src_reg, dst_reg), 0x8B);
break;
}
- /*
- * If insn->off == 0 we can save one extra byte, but
- * special case of x86 R13 which always needs an offset
- * is not worth the hassle
- */
- if (is_imm8(off))
- EMIT2(add_2reg(0x40, src_reg, dst_reg), off);
- else
- EMIT1_off32(add_2reg(0x80, src_reg, dst_reg), off);
+ emit_insn_suffix(&prog, src_reg, dst_reg, off);
*pprog = prog;
}
@@ -751,11 +791,51 @@ static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
EMIT2(add_2mod(0x48, dst_reg, src_reg), 0x89);
break;
}
- if (is_imm8(off))
- EMIT2(add_2reg(0x40, dst_reg, src_reg), off);
- else
- EMIT1_off32(add_2reg(0x80, dst_reg, src_reg), off);
+ emit_insn_suffix(&prog, dst_reg, src_reg, off);
+ *pprog = prog;
+}
+
+static int emit_atomic(u8 **pprog, u8 atomic_op,
+ u32 dst_reg, u32 src_reg, s16 off, u8 bpf_size)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+
+ EMIT1(0xF0); /* lock prefix */
+
+ maybe_emit_mod(&prog, dst_reg, src_reg, bpf_size == BPF_DW);
+
+ /* emit opcode */
+ switch (atomic_op) {
+ case BPF_ADD:
+ case BPF_SUB:
+ case BPF_AND:
+ case BPF_OR:
+ case BPF_XOR:
+ /* lock *(u32/u64*)(dst_reg + off) <op>= src_reg */
+ EMIT1(simple_alu_opcodes[atomic_op]);
+ break;
+ case BPF_ADD | BPF_FETCH:
+ /* src_reg = atomic_fetch_add(dst_reg + off, src_reg); */
+ EMIT2(0x0F, 0xC1);
+ break;
+ case BPF_XCHG:
+ /* src_reg = atomic_xchg(dst_reg + off, src_reg); */
+ EMIT1(0x87);
+ break;
+ case BPF_CMPXCHG:
+ /* r0 = atomic_cmpxchg(dst_reg + off, r0, src_reg); */
+ EMIT2(0x0F, 0xB1);
+ break;
+ default:
+ pr_err("bpf_jit: unknown atomic opcode %02x\n", atomic_op);
+ return -EFAULT;
+ }
+
+ emit_insn_suffix(&prog, dst_reg, src_reg, off);
+
*pprog = prog;
+ return 0;
}
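
For reference, the instruction selection above, as a small table that can be compiled and printed; the enum values are placeholders standing in for the BPF atomic opcodes, only the mnemonics and opcode bytes in the comments matter.

#include <stdio.h>

enum { OP_ADD, OP_SUB, OP_AND, OP_OR, OP_XOR, OP_ADD_FETCH, OP_XCHG, OP_CMPXCHG };

static const char *x86_insn(int op)
{
	switch (op) {
	case OP_ADD:       return "lock add     [dst+off], src";	/* 0x01 */
	case OP_SUB:       return "lock sub     [dst+off], src";	/* 0x29 */
	case OP_AND:       return "lock and     [dst+off], src";	/* 0x21 */
	case OP_OR:        return "lock or      [dst+off], src";	/* 0x09 */
	case OP_XOR:       return "lock xor     [dst+off], src";	/* 0x31 */
	case OP_ADD_FETCH: return "lock xadd    [dst+off], src";	/* 0F C1, src gets old value */
	case OP_XCHG:      return "xchg         [dst+off], src";	/* 0x87, locking implicit */
	case OP_CMPXCHG:   return "lock cmpxchg [dst+off], src";	/* 0F B1, compares with rax */
	default:           return "unknown";
	}
}

int main(void)
{
	for (int op = OP_ADD; op <= OP_CMPXCHG; op++)
		printf("%s\n", x86_insn(op));
	return 0;
}
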
static bool ex_handler_bpf(const struct exception_table_entry *x,
@@ -789,8 +869,31 @@ static void detect_reg_usage(struct bpf_insn *insn, int insn_cnt,
}
}
+static int emit_nops(u8 **pprog, int len)
+{
+ u8 *prog = *pprog;
+ int i, noplen, cnt = 0;
+
+ while (len > 0) {
+ noplen = len;
+
+ if (noplen > ASM_NOP_MAX)
+ noplen = ASM_NOP_MAX;
+
+ for (i = 0; i < noplen; i++)
+ EMIT1(ideal_nops[noplen][i]);
+ len -= noplen;
+ }
+
+ *pprog = prog;
+
+ return cnt;
+}
+
+#define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp)))
+
static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
- int oldproglen, struct jit_context *ctx)
+ int oldproglen, struct jit_context *ctx, bool jmp_padding)
{
bool tail_call_reachable = bpf_prog->aux->tail_call_reachable;
struct bpf_insn *insn = bpf_prog->insnsi;
@@ -800,8 +903,9 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
bool seen_exit = false;
u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
int i, cnt = 0, excnt = 0;
- int proglen = 0;
+ int ilen, proglen = 0;
u8 *prog = temp;
+ int err;
detect_reg_usage(insn, insn_cnt, callee_regs_used,
&tail_call_seen);
@@ -813,17 +917,24 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
bpf_prog_was_classic(bpf_prog), tail_call_reachable,
bpf_prog->aux->func_idx != 0);
push_callee_regs(&prog, callee_regs_used);
- addrs[0] = prog - temp;
+
+ ilen = prog - temp;
+ if (image)
+ memcpy(image + proglen, temp, ilen);
+ proglen += ilen;
+ addrs[0] = proglen;
+ prog = temp;
for (i = 1; i <= insn_cnt; i++, insn++) {
const s32 imm32 = insn->imm;
u32 dst_reg = insn->dst_reg;
u32 src_reg = insn->src_reg;
u8 b2 = 0, b3 = 0;
+ u8 *start_of_ldx;
s64 jmp_offset;
u8 jmp_cond;
- int ilen;
u8 *func;
+ int nops;
switch (insn->code) {
/* ALU */
@@ -837,17 +948,9 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
case BPF_ALU64 | BPF_AND | BPF_X:
case BPF_ALU64 | BPF_OR | BPF_X:
case BPF_ALU64 | BPF_XOR | BPF_X:
- switch (BPF_OP(insn->code)) {
- case BPF_ADD: b2 = 0x01; break;
- case BPF_SUB: b2 = 0x29; break;
- case BPF_AND: b2 = 0x21; break;
- case BPF_OR: b2 = 0x09; break;
- case BPF_XOR: b2 = 0x31; break;
- }
- if (BPF_CLASS(insn->code) == BPF_ALU64)
- EMIT1(add_2mod(0x48, dst_reg, src_reg));
- else if (is_ereg(dst_reg) || is_ereg(src_reg))
- EMIT1(add_2mod(0x40, dst_reg, src_reg));
+ maybe_emit_mod(&prog, dst_reg, src_reg,
+ BPF_CLASS(insn->code) == BPF_ALU64);
+ b2 = simple_alu_opcodes[BPF_OP(insn->code)];
EMIT2(b2, add_2reg(0xC0, dst_reg, src_reg));
break;
@@ -1027,12 +1130,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
else if (is_ereg(dst_reg))
EMIT1(add_1mod(0x40, dst_reg));
- switch (BPF_OP(insn->code)) {
- case BPF_LSH: b3 = 0xE0; break;
- case BPF_RSH: b3 = 0xE8; break;
- case BPF_ARSH: b3 = 0xF8; break;
- }
-
+ b3 = simple_alu_opcodes[BPF_OP(insn->code)];
if (imm32 == 1)
EMIT2(0xD1, add_1reg(b3, dst_reg));
else
@@ -1066,11 +1164,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
else if (is_ereg(dst_reg))
EMIT1(add_1mod(0x40, dst_reg));
- switch (BPF_OP(insn->code)) {
- case BPF_LSH: b3 = 0xE0; break;
- case BPF_RSH: b3 = 0xE8; break;
- case BPF_ARSH: b3 = 0xF8; break;
- }
+ b3 = simple_alu_opcodes[BPF_OP(insn->code)];
EMIT2(0xD3, add_1reg(b3, dst_reg));
if (src_reg != BPF_REG_4)
@@ -1185,12 +1279,30 @@ st: if (is_imm8(insn->off))
case BPF_LDX | BPF_PROBE_MEM | BPF_W:
case BPF_LDX | BPF_MEM | BPF_DW:
case BPF_LDX | BPF_PROBE_MEM | BPF_DW:
+ if (BPF_MODE(insn->code) == BPF_PROBE_MEM) {
+ /* test src_reg, src_reg */
+ maybe_emit_mod(&prog, src_reg, src_reg, true); /* always 1 byte */
+ EMIT2(0x85, add_2reg(0xC0, src_reg, src_reg));
+ /* jne start_of_ldx */
+ EMIT2(X86_JNE, 0);
+ /* xor dst_reg, dst_reg */
+ emit_mov_imm32(&prog, false, dst_reg, 0);
+ /* jmp byte_after_ldx */
+ EMIT2(0xEB, 0);
+
+ /* populate jmp_offset for JNE above */
+ temp[4] = prog - temp - 5 /* sizeof(test + jne) */;
+ start_of_ldx = prog;
+ }
emit_ldx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
if (BPF_MODE(insn->code) == BPF_PROBE_MEM) {
struct exception_table_entry *ex;
u8 *_insn = image + proglen;
s64 delta;
+ /* populate jmp_offset for JMP above */
+ start_of_ldx[-1] = prog - start_of_ldx;
+
if (!bpf_prog->aux->extable)
break;
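
In C terms, the PROBE_MEM guard emitted above behaves roughly like the sketch below: a NULL base register produces zero instead of a load. (Faulting loads through a non-NULL pointer are additionally zeroed via the extable fixup, which a userspace sketch cannot reproduce.)

#include <stdint.h>
#include <stdio.h>

static uint64_t probe_mem_ld64(const void *src, int off)
{
	if (!src)
		return 0;	/* xor dst, dst; jmp past the load */
	return *(const uint64_t *)((const char *)src + off);
}

int main(void)
{
	uint64_t v = 0x1122334455667788ULL;

	printf("valid ptr: %#llx\n", (unsigned long long)probe_mem_ld64(&v, 0));
	printf("NULL ptr:  %#llx\n", (unsigned long long)probe_mem_ld64(NULL, 0));
	return 0;
}
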
@@ -1230,21 +1342,56 @@ st: if (is_imm8(insn->off))
}
break;
- /* STX XADD: lock *(u32*)(dst_reg + off) += src_reg */
- case BPF_STX | BPF_XADD | BPF_W:
- /* Emit 'lock add dword ptr [rax + off], eax' */
- if (is_ereg(dst_reg) || is_ereg(src_reg))
- EMIT3(0xF0, add_2mod(0x40, dst_reg, src_reg), 0x01);
- else
- EMIT2(0xF0, 0x01);
- goto xadd;
- case BPF_STX | BPF_XADD | BPF_DW:
- EMIT3(0xF0, add_2mod(0x48, dst_reg, src_reg), 0x01);
-xadd: if (is_imm8(insn->off))
- EMIT2(add_2reg(0x40, dst_reg, src_reg), insn->off);
- else
- EMIT1_off32(add_2reg(0x80, dst_reg, src_reg),
- insn->off);
+ case BPF_STX | BPF_ATOMIC | BPF_W:
+ case BPF_STX | BPF_ATOMIC | BPF_DW:
+ if (insn->imm == (BPF_AND | BPF_FETCH) ||
+ insn->imm == (BPF_OR | BPF_FETCH) ||
+ insn->imm == (BPF_XOR | BPF_FETCH)) {
+ u8 *branch_target;
+ bool is64 = BPF_SIZE(insn->code) == BPF_DW;
+
+ /*
+ * Can't be implemented with a single x86 insn.
+ * Need to do a CMPXCHG loop.
+ */
+
+ /* Will need RAX as a CMPXCHG operand so save R0 */
+ emit_mov_reg(&prog, true, BPF_REG_AX, BPF_REG_0);
+ branch_target = prog;
+ /* Load old value */
+ emit_ldx(&prog, BPF_SIZE(insn->code),
+ BPF_REG_0, dst_reg, insn->off);
+ /*
+ * Perform the (commutative) operation locally,
+ * put the result in the AUX_REG.
+ */
+ emit_mov_reg(&prog, is64, AUX_REG, BPF_REG_0);
+ maybe_emit_mod(&prog, AUX_REG, src_reg, is64);
+ EMIT2(simple_alu_opcodes[BPF_OP(insn->imm)],
+ add_2reg(0xC0, AUX_REG, src_reg));
+ /* Attempt to swap in new value */
+ err = emit_atomic(&prog, BPF_CMPXCHG,
+ dst_reg, AUX_REG, insn->off,
+ BPF_SIZE(insn->code));
+ if (WARN_ON(err))
+ return err;
+ /*
+ * ZF tells us whether we won the race. If it's
+ * cleared we need to try again.
+ */
+ EMIT2(X86_JNE, -(prog - branch_target) - 2);
+ /* Return the pre-modification value */
+ emit_mov_reg(&prog, is64, src_reg, BPF_REG_0);
+ /* Restore R0 after clobbering RAX */
+ emit_mov_reg(&prog, true, BPF_REG_0, BPF_REG_AX);
+ break;
+
+ }
+
+ err = emit_atomic(&prog, insn->imm, dst_reg, src_reg,
+ insn->off, BPF_SIZE(insn->code));
+ if (err)
+ return err;
break;
/* call */
@@ -1295,20 +1442,16 @@ xadd: if (is_imm8(insn->off))
case BPF_JMP32 | BPF_JSGE | BPF_X:
case BPF_JMP32 | BPF_JSLE | BPF_X:
/* cmp dst_reg, src_reg */
- if (BPF_CLASS(insn->code) == BPF_JMP)
- EMIT1(add_2mod(0x48, dst_reg, src_reg));
- else if (is_ereg(dst_reg) || is_ereg(src_reg))
- EMIT1(add_2mod(0x40, dst_reg, src_reg));
+ maybe_emit_mod(&prog, dst_reg, src_reg,
+ BPF_CLASS(insn->code) == BPF_JMP);
EMIT2(0x39, add_2reg(0xC0, dst_reg, src_reg));
goto emit_cond_jmp;
case BPF_JMP | BPF_JSET | BPF_X:
case BPF_JMP32 | BPF_JSET | BPF_X:
/* test dst_reg, src_reg */
- if (BPF_CLASS(insn->code) == BPF_JMP)
- EMIT1(add_2mod(0x48, dst_reg, src_reg));
- else if (is_ereg(dst_reg) || is_ereg(src_reg))
- EMIT1(add_2mod(0x40, dst_reg, src_reg));
+ maybe_emit_mod(&prog, dst_reg, src_reg,
+ BPF_CLASS(insn->code) == BPF_JMP);
EMIT2(0x85, add_2reg(0xC0, dst_reg, src_reg));
goto emit_cond_jmp;
@@ -1344,10 +1487,8 @@ xadd: if (is_imm8(insn->off))
case BPF_JMP32 | BPF_JSLE | BPF_K:
/* test dst_reg, dst_reg to save one extra byte */
if (imm32 == 0) {
- if (BPF_CLASS(insn->code) == BPF_JMP)
- EMIT1(add_2mod(0x48, dst_reg, dst_reg));
- else if (is_ereg(dst_reg))
- EMIT1(add_2mod(0x40, dst_reg, dst_reg));
+ maybe_emit_mod(&prog, dst_reg, dst_reg,
+ BPF_CLASS(insn->code) == BPF_JMP);
EMIT2(0x85, add_2reg(0xC0, dst_reg, dst_reg));
goto emit_cond_jmp;
}
@@ -1409,6 +1550,30 @@ emit_cond_jmp: /* Convert BPF opcode to x86 */
}
jmp_offset = addrs[i + insn->off] - addrs[i];
if (is_imm8(jmp_offset)) {
+ if (jmp_padding) {
+ /* To keep the jmp_offset valid, the extra bytes are
+				 * padded before the jump insn, so we subtract the
+ * 2 bytes of jmp_cond insn from INSN_SZ_DIFF.
+ *
+ * If the previous pass already emits an imm8
+ * jmp_cond, then this BPF insn won't shrink, so
+ * "nops" is 0.
+ *
+ * On the other hand, if the previous pass emits an
+ * imm32 jmp_cond, the extra 4 bytes(*) is padded to
+ * keep the image from shrinking further.
+ *
+ * (*) imm32 jmp_cond is 6 bytes, and imm8 jmp_cond
+ * is 2 bytes, so the size difference is 4 bytes.
+ */
+ nops = INSN_SZ_DIFF - 2;
+ if (nops != 0 && nops != 4) {
+ pr_err("unexpected jmp_cond padding: %d bytes\n",
+ nops);
+ return -EFAULT;
+ }
+ cnt += emit_nops(&prog, nops);
+ }
EMIT2(jmp_cond, jmp_offset);
} else if (is_simm32(jmp_offset)) {
EMIT2_off32(0x0F, jmp_cond + 0x10, jmp_offset);
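
To make the padding arithmetic above concrete (sizes taken from the comment, not re-derived): if the previous pass emitted this conditional jump as a 6-byte imm32 jcc and the current pass can encode it as a 2-byte imm8 jcc, the leftover INSN_SZ_DIFF is 6, so nops = 6 - 2 = 4; emitting 4 NOP bytes plus the 2-byte jcc keeps the instruction's footprint at 6 bytes and the recorded addrs[] offsets stay valid. If the previous pass already used the imm8 form, the leftover is 2 and no padding is emitted.
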
@@ -1431,11 +1596,55 @@ emit_cond_jmp: /* Convert BPF opcode to x86 */
else
jmp_offset = addrs[i + insn->off] - addrs[i];
- if (!jmp_offset)
- /* Optimize out nop jumps */
+ if (!jmp_offset) {
+ /*
+ * If jmp_padding is enabled, the extra nops will
+ * be inserted. Otherwise, optimize out nop jumps.
+ */
+ if (jmp_padding) {
+ /* There are 3 possible conditions.
+ * (1) This BPF_JA is already optimized out in
+ * the previous run, so there is no need
+ * to pad any extra byte (0 byte).
+ * (2) The previous pass emits an imm8 jmp,
+ * so we pad 2 bytes to match the previous
+ * insn size.
+ * (3) Similarly, the previous pass emits an
+ * imm32 jmp, and 5 bytes is padded.
+ */
+ nops = INSN_SZ_DIFF;
+ if (nops != 0 && nops != 2 && nops != 5) {
+ pr_err("unexpected nop jump padding: %d bytes\n",
+ nops);
+ return -EFAULT;
+ }
+ cnt += emit_nops(&prog, nops);
+ }
break;
+ }
emit_jmp:
if (is_imm8(jmp_offset)) {
+ if (jmp_padding) {
+ /* To avoid breaking jmp_offset, the extra bytes
+ * are padded before the actual jmp insn, so
+				 * 2 bytes is subtracted from INSN_SZ_DIFF.
+ *
+ * If the previous pass already emits an imm8
+ * jmp, there is nothing to pad (0 byte).
+ *
+ * If it emits an imm32 jmp (5 bytes) previously
+ * and now an imm8 jmp (2 bytes), then we pad
+ * (5 - 2 = 3) bytes to stop the image from
+ * shrinking further.
+ */
+ nops = INSN_SZ_DIFF - 2;
+ if (nops != 0 && nops != 3) {
+ pr_err("unexpected jump padding: %d bytes\n",
+ nops);
+ return -EFAULT;
+ }
+ cnt += emit_nops(&prog, INSN_SZ_DIFF - 2);
+ }
EMIT2(0xEB, jmp_offset);
} else if (is_simm32(jmp_offset)) {
EMIT1_off32(0xE9, jmp_offset);
@@ -1531,17 +1740,25 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
struct bpf_prog *p, int stack_size, bool mod_ret)
{
u8 *prog = *pprog;
+ u8 *jmp_insn;
int cnt = 0;
- if (p->aux->sleepable) {
- if (emit_call(&prog, __bpf_prog_enter_sleepable, prog))
- return -EINVAL;
- } else {
- if (emit_call(&prog, __bpf_prog_enter, prog))
+ /* arg1: mov rdi, progs[i] */
+ emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p);
+ if (emit_call(&prog,
+ p->aux->sleepable ? __bpf_prog_enter_sleepable :
+ __bpf_prog_enter, prog))
return -EINVAL;
- /* remember prog start time returned by __bpf_prog_enter */
- emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0);
- }
+ /* remember prog start time returned by __bpf_prog_enter */
+ emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0);
+
+ /* if (__bpf_prog_enter*(prog) == 0)
+ * goto skip_exec_of_prog;
+ */
+ EMIT3(0x48, 0x85, 0xC0); /* test rax,rax */
+ /* emit 2 nops that will be replaced with JE insn */
+ jmp_insn = prog;
+ emit_nops(&prog, 2);
/* arg1: lea rdi, [rbp - stack_size] */
EMIT4(0x48, 0x8D, 0x7D, -stack_size);
@@ -1561,43 +1778,23 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
if (mod_ret)
emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);
- if (p->aux->sleepable) {
- if (emit_call(&prog, __bpf_prog_exit_sleepable, prog))
+ /* replace 2 nops with JE insn, since jmp target is known */
+ jmp_insn[0] = X86_JE;
+ jmp_insn[1] = prog - jmp_insn - 2;
+
+ /* arg1: mov rdi, progs[i] */
+ emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p);
+ /* arg2: mov rsi, rbx <- start time in nsec */
+ emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6);
+ if (emit_call(&prog,
+ p->aux->sleepable ? __bpf_prog_exit_sleepable :
+ __bpf_prog_exit, prog))
return -EINVAL;
- } else {
- /* arg1: mov rdi, progs[i] */
- emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32,
- (u32) (long) p);
- /* arg2: mov rsi, rbx <- start time in nsec */
- emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6);
- if (emit_call(&prog, __bpf_prog_exit, prog))
- return -EINVAL;
- }
*pprog = prog;
return 0;
}
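
The two NOP bytes reserved above are a small backpatching idiom: the skip target is unknown until the body has been emitted, so space for the branch is set aside first and filled in once the distance is known. A self-contained sketch of the same idea over a plain byte buffer (the buffer handling and function name are hypothetical, not kernel helpers; 0x74 is the je rel8 opcode, and the body must stay within the 127-byte rel8 range):

#include <stdint.h>
#include <stddef.h>

/* Reserve a 2-byte slot for 'je rel8', emit the body that may be
 * skipped, then patch the slot once the jump distance is known.
 */
static size_t emit_je_over(uint8_t *buf, const uint8_t *body, size_t body_len)
{
	uint8_t *jmp_insn = buf;	/* placeholder for the branch */
	size_t off = 2;			/* leave room for opcode + rel8 */
	size_t i;

	for (i = 0; i < body_len; i++)	/* conditionally skipped code */
		buf[off++] = body[i];

	jmp_insn[0] = 0x74;			/* je rel8 */
	jmp_insn[1] = (uint8_t)(off - 2);	/* displacement over the body */

	return off;				/* bytes written */
}
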
-static void emit_nops(u8 **pprog, unsigned int len)
-{
- unsigned int i, noplen;
- u8 *prog = *pprog;
- int cnt = 0;
-
- while (len > 0) {
- noplen = len;
-
- if (noplen > ASM_NOP_MAX)
- noplen = ASM_NOP_MAX;
-
- for (i = 0; i < noplen; i++)
- EMIT1(ideal_nops[noplen][i]);
- len -= noplen;
- }
-
- *pprog = prog;
-}
-
static void emit_align(u8 **pprog, u32 align)
{
u8 *target, *prog = *pprog;
@@ -1972,6 +2169,9 @@ struct x64_jit_data {
struct jit_context ctx;
};
+#define MAX_PASSES 20
+#define PADDING_PASSES (MAX_PASSES - 5)
+
struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
{
struct bpf_binary_header *header = NULL;
@@ -1981,6 +2181,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
struct jit_context ctx = {};
bool tmp_blinded = false;
bool extra_pass = false;
+ bool padding = false;
u8 *image = NULL;
int *addrs;
int pass;
@@ -2017,6 +2218,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
image = jit_data->image;
header = jit_data->header;
extra_pass = true;
+ padding = true;
goto skip_init_addrs;
}
addrs = kmalloc_array(prog->len + 1, sizeof(*addrs), GFP_KERNEL);
@@ -2042,8 +2244,10 @@ skip_init_addrs:
* may converge on the last pass. In such case do one more
* pass to emit the final image.
*/
- for (pass = 0; pass < 20 || image; pass++) {
- proglen = do_jit(prog, addrs, image, oldproglen, &ctx);
+ for (pass = 0; pass < MAX_PASSES || image; pass++) {
+ if (!padding && pass >= PADDING_PASSES)
+ padding = true;
+ proglen = do_jit(prog, addrs, image, oldproglen, &ctx, padding);
if (proglen <= 0) {
out_image:
image = NULL;
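
The reworked pass loop above keeps re-running do_jit() until the image size stops changing, and from PADDING_PASSES onward it turns padding on so instructions can no longer shrink, which forces convergence within MAX_PASSES. A simplified standalone sketch of that convergence strategy (do_pass() and its return convention are placeholders; the real loop additionally does a final pass that emits into the allocated image):

#include <stdbool.h>

#define MAX_PASSES	20
#define PADDING_PASSES	(MAX_PASSES - 5)

/* Repeat code generation until the emitted length converges,
 * enabling NOP padding for the last few passes.
 */
static int jit_until_converged(int (*do_pass)(bool padding), int *image_len)
{
	int oldlen = -1, len = 0;
	bool padding = false;
	int pass;

	for (pass = 0; pass < MAX_PASSES; pass++) {
		if (!padding && pass >= PADDING_PASSES)
			padding = true;		/* only pad from here on */
		len = do_pass(padding);
		if (len <= 0)
			return -1;		/* generation failed */
		if (len == oldlen)
			break;			/* size converged */
		oldlen = len;
	}
	*image_len = len;
	return 0;
}
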
diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c
index 96fde03aa987..d17b67c69f89 100644
--- a/arch/x86/net/bpf_jit_comp32.c
+++ b/arch/x86/net/bpf_jit_comp32.c
@@ -2243,10 +2243,8 @@ emit_jmp:
return -EFAULT;
}
break;
- /* STX XADD: lock *(u32 *)(dst + off) += src */
- case BPF_STX | BPF_XADD | BPF_W:
- /* STX XADD: lock *(u64 *)(dst + off) += src */
- case BPF_STX | BPF_XADD | BPF_DW:
+ case BPF_STX | BPF_ATOMIC | BPF_W:
+ case BPF_STX | BPF_ATOMIC | BPF_DW:
goto notyet;
case BPF_JMP | BPF_EXIT:
if (seen_exit) {
diff --git a/arch/x86/oprofile/Makefile b/arch/x86/oprofile/Makefile
deleted file mode 100644
index 4d49b5a27025..000000000000
--- a/arch/x86/oprofile/Makefile
+++ /dev/null
@@ -1,12 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-obj-$(CONFIG_OPROFILE) += oprofile.o
-
-DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \
- oprof.o cpu_buffer.o buffer_sync.o \
- event_buffer.o oprofile_files.o \
- oprofilefs.o oprofile_stats.o \
- timer_int.o nmi_timer_int.o )
-
-oprofile-y := $(DRIVER_OBJS) init.o backtrace.o
-oprofile-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_amd.o \
- op_model_ppro.o op_model_p4.o
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c
deleted file mode 100644
index a2488b6e27d6..000000000000
--- a/arch/x86/oprofile/backtrace.c
+++ /dev/null
@@ -1,127 +0,0 @@
-/**
- * @file backtrace.c
- *
- * @remark Copyright 2002 OProfile authors
- * @remark Read the file COPYING
- *
- * @author John Levon
- * @author David Smith
- */
-
-#include <linux/oprofile.h>
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/compat.h>
-#include <linux/uaccess.h>
-
-#include <asm/ptrace.h>
-#include <asm/stacktrace.h>
-#include <asm/unwind.h>
-
-#ifdef CONFIG_COMPAT
-static struct stack_frame_ia32 *
-dump_user_backtrace_32(struct stack_frame_ia32 *head)
-{
- /* Also check accessibility of one struct frame_head beyond: */
- struct stack_frame_ia32 bufhead[2];
- struct stack_frame_ia32 *fp;
- unsigned long bytes;
-
- bytes = copy_from_user_nmi(bufhead, head, sizeof(bufhead));
- if (bytes != 0)
- return NULL;
-
- fp = (struct stack_frame_ia32 *) compat_ptr(bufhead[0].next_frame);
-
- oprofile_add_trace(bufhead[0].return_address);
-
- /* frame pointers should strictly progress back up the stack
- * (towards higher addresses) */
- if (head >= fp)
- return NULL;
-
- return fp;
-}
-
-static inline int
-x86_backtrace_32(struct pt_regs * const regs, unsigned int depth)
-{
- struct stack_frame_ia32 *head;
-
- /* User process is IA32 */
- if (!current || !test_thread_flag(TIF_IA32))
- return 0;
-
- head = (struct stack_frame_ia32 *) regs->bp;
- while (depth-- && head)
- head = dump_user_backtrace_32(head);
-
- return 1;
-}
-
-#else
-static inline int
-x86_backtrace_32(struct pt_regs * const regs, unsigned int depth)
-{
- return 0;
-}
-#endif /* CONFIG_COMPAT */
-
-static struct stack_frame *dump_user_backtrace(struct stack_frame *head)
-{
- /* Also check accessibility of one struct frame_head beyond: */
- struct stack_frame bufhead[2];
- unsigned long bytes;
-
- bytes = copy_from_user_nmi(bufhead, head, sizeof(bufhead));
- if (bytes != 0)
- return NULL;
-
- oprofile_add_trace(bufhead[0].return_address);
-
- /* frame pointers should strictly progress back up the stack
- * (towards higher addresses) */
- if (head >= bufhead[0].next_frame)
- return NULL;
-
- return bufhead[0].next_frame;
-}
-
-void
-x86_backtrace(struct pt_regs * const regs, unsigned int depth)
-{
- struct stack_frame *head = (struct stack_frame *)frame_pointer(regs);
-
- if (!user_mode(regs)) {
- struct unwind_state state;
- unsigned long addr;
-
- if (!depth)
- return;
-
- oprofile_add_trace(regs->ip);
-
- if (!--depth)
- return;
-
- for (unwind_start(&state, current, regs, NULL);
- !unwind_done(&state); unwind_next_frame(&state)) {
- addr = unwind_get_return_address(&state);
- if (!addr)
- break;
-
- oprofile_add_trace(addr);
-
- if (!--depth)
- break;
- }
-
- return;
- }
-
- if (x86_backtrace_32(regs, depth))
- return;
-
- while (depth-- && head)
- head = dump_user_backtrace(head);
-}
diff --git a/arch/x86/oprofile/init.c b/arch/x86/oprofile/init.c
deleted file mode 100644
index 9e138d00ad36..000000000000
--- a/arch/x86/oprofile/init.c
+++ /dev/null
@@ -1,38 +0,0 @@
-/**
- * @file init.c
- *
- * @remark Copyright 2002 OProfile authors
- * @remark Read the file COPYING
- *
- * @author John Levon <levon@movementarian.org>
- */
-
-#include <linux/oprofile.h>
-#include <linux/init.h>
-#include <linux/errno.h>
-
-/*
- * We support CPUs that have performance counters like the Pentium Pro
- * with the NMI mode driver.
- */
-
-#ifdef CONFIG_X86_LOCAL_APIC
-extern int op_nmi_init(struct oprofile_operations *ops);
-extern void op_nmi_exit(void);
-#else
-static int op_nmi_init(struct oprofile_operations *ops) { return -ENODEV; }
-static void op_nmi_exit(void) { }
-#endif
-
-extern void x86_backtrace(struct pt_regs * const regs, unsigned int depth);
-
-int __init oprofile_arch_init(struct oprofile_operations *ops)
-{
- ops->backtrace = x86_backtrace;
- return op_nmi_init(ops);
-}
-
-void oprofile_arch_exit(void)
-{
- op_nmi_exit();
-}
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
deleted file mode 100644
index a7a7677265b6..000000000000
--- a/arch/x86/oprofile/nmi_int.c
+++ /dev/null
@@ -1,780 +0,0 @@
-/**
- * @file nmi_int.c
- *
- * @remark Copyright 2002-2009 OProfile authors
- * @remark Read the file COPYING
- *
- * @author John Levon <levon@movementarian.org>
- * @author Robert Richter <robert.richter@amd.com>
- * @author Barry Kasindorf <barry.kasindorf@amd.com>
- * @author Jason Yeh <jason.yeh@amd.com>
- * @author Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
- */
-
-#include <linux/init.h>
-#include <linux/notifier.h>
-#include <linux/smp.h>
-#include <linux/oprofile.h>
-#include <linux/syscore_ops.h>
-#include <linux/slab.h>
-#include <linux/moduleparam.h>
-#include <linux/kdebug.h>
-#include <linux/cpu.h>
-#include <asm/nmi.h>
-#include <asm/msr.h>
-#include <asm/apic.h>
-
-#include "op_counter.h"
-#include "op_x86_model.h"
-
-static struct op_x86_model_spec *model;
-static DEFINE_PER_CPU(struct op_msrs, cpu_msrs);
-static DEFINE_PER_CPU(unsigned long, saved_lvtpc);
-
-/* must be protected with get_online_cpus()/put_online_cpus(): */
-static int nmi_enabled;
-static int ctr_running;
-
-struct op_counter_config counter_config[OP_MAX_COUNTER];
-
-/* common functions */
-
-u64 op_x86_get_ctrl(struct op_x86_model_spec const *model,
- struct op_counter_config *counter_config)
-{
- u64 val = 0;
- u16 event = (u16)counter_config->event;
-
- val |= ARCH_PERFMON_EVENTSEL_INT;
- val |= counter_config->user ? ARCH_PERFMON_EVENTSEL_USR : 0;
- val |= counter_config->kernel ? ARCH_PERFMON_EVENTSEL_OS : 0;
- val |= (counter_config->unit_mask & 0xFF) << 8;
- counter_config->extra &= (ARCH_PERFMON_EVENTSEL_INV |
- ARCH_PERFMON_EVENTSEL_EDGE |
- ARCH_PERFMON_EVENTSEL_CMASK);
- val |= counter_config->extra;
- event &= model->event_mask ? model->event_mask : 0xFF;
- val |= event & 0xFF;
- val |= (u64)(event & 0x0F00) << 24;
-
- return val;
-}
-
-
-static int profile_exceptions_notify(unsigned int val, struct pt_regs *regs)
-{
- if (ctr_running)
- model->check_ctrs(regs, this_cpu_ptr(&cpu_msrs));
- else if (!nmi_enabled)
- return NMI_DONE;
- else
- model->stop(this_cpu_ptr(&cpu_msrs));
- return NMI_HANDLED;
-}
-
-static void nmi_cpu_save_registers(struct op_msrs *msrs)
-{
- struct op_msr *counters = msrs->counters;
- struct op_msr *controls = msrs->controls;
- unsigned int i;
-
- for (i = 0; i < model->num_counters; ++i) {
- if (counters[i].addr)
- rdmsrl(counters[i].addr, counters[i].saved);
- }
-
- for (i = 0; i < model->num_controls; ++i) {
- if (controls[i].addr)
- rdmsrl(controls[i].addr, controls[i].saved);
- }
-}
-
-static void nmi_cpu_start(void *dummy)
-{
- struct op_msrs const *msrs = this_cpu_ptr(&cpu_msrs);
- if (!msrs->controls)
- WARN_ON_ONCE(1);
- else
- model->start(msrs);
-}
-
-static int nmi_start(void)
-{
- get_online_cpus();
- ctr_running = 1;
- /* make ctr_running visible to the nmi handler: */
- smp_mb();
- on_each_cpu(nmi_cpu_start, NULL, 1);
- put_online_cpus();
- return 0;
-}
-
-static void nmi_cpu_stop(void *dummy)
-{
- struct op_msrs const *msrs = this_cpu_ptr(&cpu_msrs);
- if (!msrs->controls)
- WARN_ON_ONCE(1);
- else
- model->stop(msrs);
-}
-
-static void nmi_stop(void)
-{
- get_online_cpus();
- on_each_cpu(nmi_cpu_stop, NULL, 1);
- ctr_running = 0;
- put_online_cpus();
-}
-
-#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
-
-static DEFINE_PER_CPU(int, switch_index);
-
-static inline int has_mux(void)
-{
- return !!model->switch_ctrl;
-}
-
-inline int op_x86_phys_to_virt(int phys)
-{
- return __this_cpu_read(switch_index) + phys;
-}
-
-inline int op_x86_virt_to_phys(int virt)
-{
- return virt % model->num_counters;
-}
-
-static void nmi_shutdown_mux(void)
-{
- int i;
-
- if (!has_mux())
- return;
-
- for_each_possible_cpu(i) {
- kfree(per_cpu(cpu_msrs, i).multiplex);
- per_cpu(cpu_msrs, i).multiplex = NULL;
- per_cpu(switch_index, i) = 0;
- }
-}
-
-static int nmi_setup_mux(void)
-{
- size_t multiplex_size =
- sizeof(struct op_msr) * model->num_virt_counters;
- int i;
-
- if (!has_mux())
- return 1;
-
- for_each_possible_cpu(i) {
- per_cpu(cpu_msrs, i).multiplex =
- kzalloc(multiplex_size, GFP_KERNEL);
- if (!per_cpu(cpu_msrs, i).multiplex)
- return 0;
- }
-
- return 1;
-}
-
-static void nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs)
-{
- int i;
- struct op_msr *multiplex = msrs->multiplex;
-
- if (!has_mux())
- return;
-
- for (i = 0; i < model->num_virt_counters; ++i) {
- if (counter_config[i].enabled) {
- multiplex[i].saved = -(u64)counter_config[i].count;
- } else {
- multiplex[i].saved = 0;
- }
- }
-
- per_cpu(switch_index, cpu) = 0;
-}
-
-static void nmi_cpu_save_mpx_registers(struct op_msrs *msrs)
-{
- struct op_msr *counters = msrs->counters;
- struct op_msr *multiplex = msrs->multiplex;
- int i;
-
- for (i = 0; i < model->num_counters; ++i) {
- int virt = op_x86_phys_to_virt(i);
- if (counters[i].addr)
- rdmsrl(counters[i].addr, multiplex[virt].saved);
- }
-}
-
-static void nmi_cpu_restore_mpx_registers(struct op_msrs *msrs)
-{
- struct op_msr *counters = msrs->counters;
- struct op_msr *multiplex = msrs->multiplex;
- int i;
-
- for (i = 0; i < model->num_counters; ++i) {
- int virt = op_x86_phys_to_virt(i);
- if (counters[i].addr)
- wrmsrl(counters[i].addr, multiplex[virt].saved);
- }
-}
-
-static void nmi_cpu_switch(void *dummy)
-{
- int cpu = smp_processor_id();
- int si = per_cpu(switch_index, cpu);
- struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu);
-
- nmi_cpu_stop(NULL);
- nmi_cpu_save_mpx_registers(msrs);
-
- /* move to next set */
- si += model->num_counters;
- if ((si >= model->num_virt_counters) || (counter_config[si].count == 0))
- per_cpu(switch_index, cpu) = 0;
- else
- per_cpu(switch_index, cpu) = si;
-
- model->switch_ctrl(model, msrs);
- nmi_cpu_restore_mpx_registers(msrs);
-
- nmi_cpu_start(NULL);
-}
-
-
-/*
- * Quick check to see if multiplexing is necessary.
- * The check should be sufficient since counters are used
- * in order.
- */
-static int nmi_multiplex_on(void)
-{
- return counter_config[model->num_counters].count ? 0 : -EINVAL;
-}
-
-static int nmi_switch_event(void)
-{
- if (!has_mux())
- return -ENOSYS; /* not implemented */
- if (nmi_multiplex_on() < 0)
- return -EINVAL; /* not necessary */
-
- get_online_cpus();
- if (ctr_running)
- on_each_cpu(nmi_cpu_switch, NULL, 1);
- put_online_cpus();
-
- return 0;
-}
-
-static inline void mux_init(struct oprofile_operations *ops)
-{
- if (has_mux())
- ops->switch_events = nmi_switch_event;
-}
-
-static void mux_clone(int cpu)
-{
- if (!has_mux())
- return;
-
- memcpy(per_cpu(cpu_msrs, cpu).multiplex,
- per_cpu(cpu_msrs, 0).multiplex,
- sizeof(struct op_msr) * model->num_virt_counters);
-}
-
-#else
-
-inline int op_x86_phys_to_virt(int phys) { return phys; }
-inline int op_x86_virt_to_phys(int virt) { return virt; }
-static inline void nmi_shutdown_mux(void) { }
-static inline int nmi_setup_mux(void) { return 1; }
-static inline void
-nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs) { }
-static inline void mux_init(struct oprofile_operations *ops) { }
-static void mux_clone(int cpu) { }
-
-#endif
-
-static void free_msrs(void)
-{
- int i;
- for_each_possible_cpu(i) {
- kfree(per_cpu(cpu_msrs, i).counters);
- per_cpu(cpu_msrs, i).counters = NULL;
- kfree(per_cpu(cpu_msrs, i).controls);
- per_cpu(cpu_msrs, i).controls = NULL;
- }
- nmi_shutdown_mux();
-}
-
-static int allocate_msrs(void)
-{
- size_t controls_size = sizeof(struct op_msr) * model->num_controls;
- size_t counters_size = sizeof(struct op_msr) * model->num_counters;
-
- int i;
- for_each_possible_cpu(i) {
- per_cpu(cpu_msrs, i).counters = kzalloc(counters_size,
- GFP_KERNEL);
- if (!per_cpu(cpu_msrs, i).counters)
- goto fail;
- per_cpu(cpu_msrs, i).controls = kzalloc(controls_size,
- GFP_KERNEL);
- if (!per_cpu(cpu_msrs, i).controls)
- goto fail;
- }
-
- if (!nmi_setup_mux())
- goto fail;
-
- return 1;
-
-fail:
- free_msrs();
- return 0;
-}
-
-static void nmi_cpu_setup(void)
-{
- int cpu = smp_processor_id();
- struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu);
-
- nmi_cpu_save_registers(msrs);
- raw_spin_lock(&oprofilefs_lock);
- model->setup_ctrs(model, msrs);
- nmi_cpu_setup_mux(cpu, msrs);
- raw_spin_unlock(&oprofilefs_lock);
- per_cpu(saved_lvtpc, cpu) = apic_read(APIC_LVTPC);
- apic_write(APIC_LVTPC, APIC_DM_NMI);
-}
-
-static void nmi_cpu_restore_registers(struct op_msrs *msrs)
-{
- struct op_msr *counters = msrs->counters;
- struct op_msr *controls = msrs->controls;
- unsigned int i;
-
- for (i = 0; i < model->num_controls; ++i) {
- if (controls[i].addr)
- wrmsrl(controls[i].addr, controls[i].saved);
- }
-
- for (i = 0; i < model->num_counters; ++i) {
- if (counters[i].addr)
- wrmsrl(counters[i].addr, counters[i].saved);
- }
-}
-
-static void nmi_cpu_shutdown(void)
-{
- unsigned int v;
- int cpu = smp_processor_id();
- struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu);
-
- /* restoring APIC_LVTPC can trigger an apic error because the delivery
- * mode and vector nr combination can be illegal. That's by design: on
-	 * power-on the APIC LVT contains a zero vector nr, which is legal only
-	 * for NMI delivery mode. So inhibit APIC errors before restoring LVTPC
- */
- v = apic_read(APIC_LVTERR);
- apic_write(APIC_LVTERR, v | APIC_LVT_MASKED);
- apic_write(APIC_LVTPC, per_cpu(saved_lvtpc, cpu));
- apic_write(APIC_LVTERR, v);
- nmi_cpu_restore_registers(msrs);
-}
-
-static int nmi_cpu_online(unsigned int cpu)
-{
- local_irq_disable();
- if (nmi_enabled)
- nmi_cpu_setup();
- if (ctr_running)
- nmi_cpu_start(NULL);
- local_irq_enable();
- return 0;
-}
-
-static int nmi_cpu_down_prep(unsigned int cpu)
-{
- local_irq_disable();
- if (ctr_running)
- nmi_cpu_stop(NULL);
- if (nmi_enabled)
- nmi_cpu_shutdown();
- local_irq_enable();
- return 0;
-}
-
-static int nmi_create_files(struct dentry *root)
-{
- unsigned int i;
-
- for (i = 0; i < model->num_virt_counters; ++i) {
- struct dentry *dir;
- char buf[4];
-
- /* quick little hack to _not_ expose a counter if it is not
- * available for use. This should protect userspace app.
- * NOTE: assumes 1:1 mapping here (that counters are organized
- * sequentially in their struct assignment).
- */
- if (!avail_to_resrv_perfctr_nmi_bit(op_x86_virt_to_phys(i)))
- continue;
-
- snprintf(buf, sizeof(buf), "%d", i);
- dir = oprofilefs_mkdir(root, buf);
- oprofilefs_create_ulong(dir, "enabled", &counter_config[i].enabled);
- oprofilefs_create_ulong(dir, "event", &counter_config[i].event);
- oprofilefs_create_ulong(dir, "count", &counter_config[i].count);
- oprofilefs_create_ulong(dir, "unit_mask", &counter_config[i].unit_mask);
- oprofilefs_create_ulong(dir, "kernel", &counter_config[i].kernel);
- oprofilefs_create_ulong(dir, "user", &counter_config[i].user);
- oprofilefs_create_ulong(dir, "extra", &counter_config[i].extra);
- }
-
- return 0;
-}
-
-static enum cpuhp_state cpuhp_nmi_online;
-
-static int nmi_setup(void)
-{
- int err = 0;
- int cpu;
-
- if (!allocate_msrs())
- return -ENOMEM;
-
- /* We need to serialize save and setup for HT because the subset
- * of msrs are distinct for save and setup operations
- */
-
- /* Assume saved/restored counters are the same on all CPUs */
- err = model->fill_in_addresses(&per_cpu(cpu_msrs, 0));
- if (err)
- goto fail;
-
- for_each_possible_cpu(cpu) {
- if (!IS_ENABLED(CONFIG_SMP) || !cpu)
- continue;
-
- memcpy(per_cpu(cpu_msrs, cpu).counters,
- per_cpu(cpu_msrs, 0).counters,
- sizeof(struct op_msr) * model->num_counters);
-
- memcpy(per_cpu(cpu_msrs, cpu).controls,
- per_cpu(cpu_msrs, 0).controls,
- sizeof(struct op_msr) * model->num_controls);
-
- mux_clone(cpu);
- }
-
- nmi_enabled = 0;
- ctr_running = 0;
- /* make variables visible to the nmi handler: */
- smp_mb();
- err = register_nmi_handler(NMI_LOCAL, profile_exceptions_notify,
- 0, "oprofile");
- if (err)
- goto fail;
-
- nmi_enabled = 1;
- /* make nmi_enabled visible to the nmi handler: */
- smp_mb();
- err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/oprofile:online",
- nmi_cpu_online, nmi_cpu_down_prep);
- if (err < 0)
- goto fail_nmi;
- cpuhp_nmi_online = err;
- return 0;
-fail_nmi:
- unregister_nmi_handler(NMI_LOCAL, "oprofile");
-fail:
- free_msrs();
- return err;
-}
-
-static void nmi_shutdown(void)
-{
- struct op_msrs *msrs;
-
- cpuhp_remove_state(cpuhp_nmi_online);
- nmi_enabled = 0;
- ctr_running = 0;
-
- /* make variables visible to the nmi handler: */
- smp_mb();
- unregister_nmi_handler(NMI_LOCAL, "oprofile");
- msrs = &get_cpu_var(cpu_msrs);
- model->shutdown(msrs);
- free_msrs();
- put_cpu_var(cpu_msrs);
-}
-
-#ifdef CONFIG_PM
-
-static int nmi_suspend(void)
-{
- /* Only one CPU left, just stop that one */
- if (nmi_enabled == 1)
- nmi_cpu_stop(NULL);
- return 0;
-}
-
-static void nmi_resume(void)
-{
- if (nmi_enabled == 1)
- nmi_cpu_start(NULL);
-}
-
-static struct syscore_ops oprofile_syscore_ops = {
- .resume = nmi_resume,
- .suspend = nmi_suspend,
-};
-
-static void __init init_suspend_resume(void)
-{
- register_syscore_ops(&oprofile_syscore_ops);
-}
-
-static void exit_suspend_resume(void)
-{
- unregister_syscore_ops(&oprofile_syscore_ops);
-}
-
-#else
-
-static inline void init_suspend_resume(void) { }
-static inline void exit_suspend_resume(void) { }
-
-#endif /* CONFIG_PM */
-
-static int __init p4_init(char **cpu_type)
-{
- __u8 cpu_model = boot_cpu_data.x86_model;
-
- if (cpu_model > 6 || cpu_model == 5)
- return 0;
-
-#ifndef CONFIG_SMP
- *cpu_type = "i386/p4";
- model = &op_p4_spec;
- return 1;
-#else
- switch (smp_num_siblings) {
- case 1:
- *cpu_type = "i386/p4";
- model = &op_p4_spec;
- return 1;
-
- case 2:
- *cpu_type = "i386/p4-ht";
- model = &op_p4_ht2_spec;
- return 1;
- }
-#endif
-
- printk(KERN_INFO "oprofile: P4 HyperThreading detected with > 2 threads\n");
- printk(KERN_INFO "oprofile: Reverting to timer mode.\n");
- return 0;
-}
-
-enum __force_cpu_type {
- reserved = 0, /* do not force */
- timer,
- arch_perfmon,
-};
-
-static int force_cpu_type;
-
-static int set_cpu_type(const char *str, const struct kernel_param *kp)
-{
- if (!strcmp(str, "timer")) {
- force_cpu_type = timer;
- printk(KERN_INFO "oprofile: forcing NMI timer mode\n");
- } else if (!strcmp(str, "arch_perfmon")) {
- force_cpu_type = arch_perfmon;
- printk(KERN_INFO "oprofile: forcing architectural perfmon\n");
- } else {
- force_cpu_type = 0;
- }
-
- return 0;
-}
-module_param_call(cpu_type, set_cpu_type, NULL, NULL, 0);
-
-static int __init ppro_init(char **cpu_type)
-{
- __u8 cpu_model = boot_cpu_data.x86_model;
- struct op_x86_model_spec *spec = &op_ppro_spec; /* default */
-
- if (force_cpu_type == arch_perfmon && boot_cpu_has(X86_FEATURE_ARCH_PERFMON))
- return 0;
-
- /*
- * Documentation on identifying Intel processors by CPU family
- * and model can be found in the Intel Software Developer's
- * Manuals (SDM):
- *
- * http://www.intel.com/products/processor/manuals/
- *
- * As of May 2010 the documentation for this was in the:
- * "Intel 64 and IA-32 Architectures Software Developer's
- * Manual Volume 3B: System Programming Guide", "Table B-1
- * CPUID Signature Values of DisplayFamily_DisplayModel".
- */
- switch (cpu_model) {
- case 0 ... 2:
- *cpu_type = "i386/ppro";
- break;
- case 3 ... 5:
- *cpu_type = "i386/pii";
- break;
- case 6 ... 8:
- case 10 ... 11:
- *cpu_type = "i386/piii";
- break;
- case 9:
- case 13:
- *cpu_type = "i386/p6_mobile";
- break;
- case 14:
- *cpu_type = "i386/core";
- break;
- case 0x0f:
- case 0x16:
- case 0x17:
- case 0x1d:
- *cpu_type = "i386/core_2";
- break;
- case 0x1a:
- case 0x1e:
- case 0x2e:
- spec = &op_arch_perfmon_spec;
- *cpu_type = "i386/core_i7";
- break;
- case 0x1c:
- *cpu_type = "i386/atom";
- break;
- default:
- /* Unknown */
- return 0;
- }
-
- model = spec;
- return 1;
-}
-
-int __init op_nmi_init(struct oprofile_operations *ops)
-{
- __u8 vendor = boot_cpu_data.x86_vendor;
- __u8 family = boot_cpu_data.x86;
- char *cpu_type = NULL;
- int ret = 0;
-
- if (!boot_cpu_has(X86_FEATURE_APIC))
- return -ENODEV;
-
- if (force_cpu_type == timer)
- return -ENODEV;
-
- switch (vendor) {
- case X86_VENDOR_AMD:
- /* Needs to be at least an Athlon (or hammer in 32bit mode) */
-
- switch (family) {
- case 6:
- cpu_type = "i386/athlon";
- break;
- case 0xf:
- /*
- * Actually it could be i386/hammer too, but
-			 * give user space a consistent name.
- */
- cpu_type = "x86-64/hammer";
- break;
- case 0x10:
- cpu_type = "x86-64/family10";
- break;
- case 0x11:
- cpu_type = "x86-64/family11h";
- break;
- case 0x12:
- cpu_type = "x86-64/family12h";
- break;
- case 0x14:
- cpu_type = "x86-64/family14h";
- break;
- case 0x15:
- cpu_type = "x86-64/family15h";
- break;
- default:
- return -ENODEV;
- }
- model = &op_amd_spec;
- break;
-
- case X86_VENDOR_INTEL:
- switch (family) {
- /* Pentium IV */
- case 0xf:
- p4_init(&cpu_type);
- break;
-
- /* A P6-class processor */
- case 6:
- ppro_init(&cpu_type);
- break;
-
- default:
- break;
- }
-
- if (cpu_type)
- break;
-
- if (!boot_cpu_has(X86_FEATURE_ARCH_PERFMON))
- return -ENODEV;
-
- /* use arch perfmon as fallback */
- cpu_type = "i386/arch_perfmon";
- model = &op_arch_perfmon_spec;
- break;
-
- default:
- return -ENODEV;
- }
-
- /* default values, can be overwritten by model */
- ops->create_files = nmi_create_files;
- ops->setup = nmi_setup;
- ops->shutdown = nmi_shutdown;
- ops->start = nmi_start;
- ops->stop = nmi_stop;
- ops->cpu_type = cpu_type;
-
- if (model->init)
- ret = model->init(ops);
- if (ret)
- return ret;
-
- if (!model->num_virt_counters)
- model->num_virt_counters = model->num_counters;
-
- mux_init(ops);
-
- init_suspend_resume();
-
- printk(KERN_INFO "oprofile: using NMI interrupt.\n");
- return 0;
-}
-
-void op_nmi_exit(void)
-{
- exit_suspend_resume();
-}
diff --git a/arch/x86/oprofile/op_counter.h b/arch/x86/oprofile/op_counter.h
deleted file mode 100644
index 0b7b7b179cbe..000000000000
--- a/arch/x86/oprofile/op_counter.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/**
- * @file op_counter.h
- *
- * @remark Copyright 2002 OProfile authors
- * @remark Read the file COPYING
- *
- * @author John Levon
- */
-
-#ifndef OP_COUNTER_H
-#define OP_COUNTER_H
-
-#define OP_MAX_COUNTER 32
-
-/* Per-perfctr configuration as set via
- * oprofilefs.
- */
-struct op_counter_config {
- unsigned long count;
- unsigned long enabled;
- unsigned long event;
- unsigned long kernel;
- unsigned long user;
- unsigned long unit_mask;
- unsigned long extra;
-};
-
-extern struct op_counter_config counter_config[];
-
-#endif /* OP_COUNTER_H */
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
deleted file mode 100644
index 660a83c8287b..000000000000
--- a/arch/x86/oprofile/op_model_amd.c
+++ /dev/null
@@ -1,542 +0,0 @@
-/*
- * @file op_model_amd.c
- * athlon / K7 / K8 / Family 10h model-specific MSR operations
- *
- * @remark Copyright 2002-2009 OProfile authors
- * @remark Read the file COPYING
- *
- * @author John Levon
- * @author Philippe Elie
- * @author Graydon Hoare
- * @author Robert Richter <robert.richter@amd.com>
- * @author Barry Kasindorf <barry.kasindorf@amd.com>
- * @author Jason Yeh <jason.yeh@amd.com>
- * @author Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
- */
-
-#include <linux/oprofile.h>
-#include <linux/device.h>
-#include <linux/pci.h>
-#include <linux/percpu.h>
-
-#include <asm/ptrace.h>
-#include <asm/msr.h>
-#include <asm/nmi.h>
-#include <asm/apic.h>
-#include <asm/processor.h>
-
-#include "op_x86_model.h"
-#include "op_counter.h"
-
-#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
-#define NUM_VIRT_COUNTERS 32
-#else
-#define NUM_VIRT_COUNTERS 0
-#endif
-
-#define OP_EVENT_MASK 0x0FFF
-#define OP_CTR_OVERFLOW (1ULL<<31)
-
-#define MSR_AMD_EVENTSEL_RESERVED ((0xFFFFFCF0ULL<<32)|(1ULL<<21))
-
-static int num_counters;
-static unsigned long reset_value[OP_MAX_COUNTER];
-
-#define IBS_FETCH_SIZE 6
-#define IBS_OP_SIZE 12
-
-static u32 ibs_caps;
-
-struct ibs_config {
- unsigned long op_enabled;
- unsigned long fetch_enabled;
- unsigned long max_cnt_fetch;
- unsigned long max_cnt_op;
- unsigned long rand_en;
- unsigned long dispatched_ops;
- unsigned long branch_target;
-};
-
-struct ibs_state {
- u64 ibs_op_ctl;
- int branch_target;
- unsigned long sample_size;
-};
-
-static struct ibs_config ibs_config;
-static struct ibs_state ibs_state;
-
-/*
- * IBS randomization macros
- */
-#define IBS_RANDOM_BITS 12
-#define IBS_RANDOM_MASK ((1ULL << IBS_RANDOM_BITS) - 1)
-#define IBS_RANDOM_MAXCNT_OFFSET (1ULL << (IBS_RANDOM_BITS - 5))
-
-/*
- * 16-bit Linear Feedback Shift Register (LFSR)
- *
- * Feedback polynomial = X^16 + X^14 + X^13 + X^11 + 1
- */
-static unsigned int lfsr_random(void)
-{
- static unsigned int lfsr_value = 0xF00D;
- unsigned int bit;
-
- /* Compute next bit to shift in */
- bit = ((lfsr_value >> 0) ^
- (lfsr_value >> 2) ^
- (lfsr_value >> 3) ^
- (lfsr_value >> 5)) & 0x0001;
-
- /* Advance to next register value */
- lfsr_value = (lfsr_value >> 1) | (bit << 15);
-
- return lfsr_value;
-}
-
-/*
- * IBS software randomization
- *
- * The IBS periodic op counter is randomized in software. The lower 12
- * bits of the 20 bit counter are randomized. IbsOpCurCnt is
- * initialized with a 12 bit random value.
- */
-static inline u64 op_amd_randomize_ibs_op(u64 val)
-{
- unsigned int random = lfsr_random();
-
- if (!(ibs_caps & IBS_CAPS_RDWROPCNT))
- /*
- * Work around if the hw can not write to IbsOpCurCnt
- *
- * Randomize the lower 8 bits of the 16 bit
- * IbsOpMaxCnt [15:0] value in the range of -128 to
- * +127 by adding/subtracting an offset to the
- * maximum count (IbsOpMaxCnt).
- *
- * To avoid over or underflows and protect upper bits
- * starting at bit 16, the initial value for
- * IbsOpMaxCnt must fit in the range from 0x0081 to
- * 0xff80.
- */
- val += (s8)(random >> 4);
- else
- val |= (u64)(random & IBS_RANDOM_MASK) << 32;
-
- return val;
-}
-
-static inline void
-op_amd_handle_ibs(struct pt_regs * const regs,
- struct op_msrs const * const msrs)
-{
- u64 val, ctl;
- struct op_entry entry;
-
- if (!ibs_caps)
- return;
-
- if (ibs_config.fetch_enabled) {
- rdmsrl(MSR_AMD64_IBSFETCHCTL, ctl);
- if (ctl & IBS_FETCH_VAL) {
- rdmsrl(MSR_AMD64_IBSFETCHLINAD, val);
- oprofile_write_reserve(&entry, regs, val,
- IBS_FETCH_CODE, IBS_FETCH_SIZE);
- oprofile_add_data64(&entry, val);
- oprofile_add_data64(&entry, ctl);
- rdmsrl(MSR_AMD64_IBSFETCHPHYSAD, val);
- oprofile_add_data64(&entry, val);
- oprofile_write_commit(&entry);
-
- /* reenable the IRQ */
- ctl &= ~(IBS_FETCH_VAL | IBS_FETCH_CNT);
- ctl |= IBS_FETCH_ENABLE;
- wrmsrl(MSR_AMD64_IBSFETCHCTL, ctl);
- }
- }
-
- if (ibs_config.op_enabled) {
- rdmsrl(MSR_AMD64_IBSOPCTL, ctl);
- if (ctl & IBS_OP_VAL) {
- rdmsrl(MSR_AMD64_IBSOPRIP, val);
- oprofile_write_reserve(&entry, regs, val, IBS_OP_CODE,
- ibs_state.sample_size);
- oprofile_add_data64(&entry, val);
- rdmsrl(MSR_AMD64_IBSOPDATA, val);
- oprofile_add_data64(&entry, val);
- rdmsrl(MSR_AMD64_IBSOPDATA2, val);
- oprofile_add_data64(&entry, val);
- rdmsrl(MSR_AMD64_IBSOPDATA3, val);
- oprofile_add_data64(&entry, val);
- rdmsrl(MSR_AMD64_IBSDCLINAD, val);
- oprofile_add_data64(&entry, val);
- rdmsrl(MSR_AMD64_IBSDCPHYSAD, val);
- oprofile_add_data64(&entry, val);
- if (ibs_state.branch_target) {
- rdmsrl(MSR_AMD64_IBSBRTARGET, val);
- oprofile_add_data(&entry, (unsigned long)val);
- }
- oprofile_write_commit(&entry);
-
- /* reenable the IRQ */
- ctl = op_amd_randomize_ibs_op(ibs_state.ibs_op_ctl);
- wrmsrl(MSR_AMD64_IBSOPCTL, ctl);
- }
- }
-}
-
-static inline void op_amd_start_ibs(void)
-{
- u64 val;
-
- if (!ibs_caps)
- return;
-
- memset(&ibs_state, 0, sizeof(ibs_state));
-
- /*
-	 * Note: Since the max count settings may be out of range, we
- * write back the actual used values so that userland can read
- * it.
- */
-
- if (ibs_config.fetch_enabled) {
- val = ibs_config.max_cnt_fetch >> 4;
- val = min(val, IBS_FETCH_MAX_CNT);
- ibs_config.max_cnt_fetch = val << 4;
- val |= ibs_config.rand_en ? IBS_FETCH_RAND_EN : 0;
- val |= IBS_FETCH_ENABLE;
- wrmsrl(MSR_AMD64_IBSFETCHCTL, val);
- }
-
- if (ibs_config.op_enabled) {
- val = ibs_config.max_cnt_op >> 4;
- if (!(ibs_caps & IBS_CAPS_RDWROPCNT)) {
- /*
- * IbsOpCurCnt not supported. See
- * op_amd_randomize_ibs_op() for details.
- */
- val = clamp(val, 0x0081ULL, 0xFF80ULL);
- ibs_config.max_cnt_op = val << 4;
- } else {
- /*
- * The start value is randomized with a
- * positive offset, we need to compensate it
- * with the half of the randomized range. Also
- * avoid underflows.
- */
- val += IBS_RANDOM_MAXCNT_OFFSET;
- if (ibs_caps & IBS_CAPS_OPCNTEXT)
- val = min(val, IBS_OP_MAX_CNT_EXT);
- else
- val = min(val, IBS_OP_MAX_CNT);
- ibs_config.max_cnt_op =
- (val - IBS_RANDOM_MAXCNT_OFFSET) << 4;
- }
- val = ((val & ~IBS_OP_MAX_CNT) << 4) | (val & IBS_OP_MAX_CNT);
- val |= ibs_config.dispatched_ops ? IBS_OP_CNT_CTL : 0;
- val |= IBS_OP_ENABLE;
- ibs_state.ibs_op_ctl = val;
- ibs_state.sample_size = IBS_OP_SIZE;
- if (ibs_config.branch_target) {
- ibs_state.branch_target = 1;
- ibs_state.sample_size++;
- }
- val = op_amd_randomize_ibs_op(ibs_state.ibs_op_ctl);
- wrmsrl(MSR_AMD64_IBSOPCTL, val);
- }
-}
-
-static void op_amd_stop_ibs(void)
-{
- if (!ibs_caps)
- return;
-
- if (ibs_config.fetch_enabled)
- /* clear max count and enable */
- wrmsrl(MSR_AMD64_IBSFETCHCTL, 0);
-
- if (ibs_config.op_enabled)
- /* clear max count and enable */
- wrmsrl(MSR_AMD64_IBSOPCTL, 0);
-}
-
-#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
-
-static void op_mux_switch_ctrl(struct op_x86_model_spec const *model,
- struct op_msrs const * const msrs)
-{
- u64 val;
- int i;
-
- /* enable active counters */
- for (i = 0; i < num_counters; ++i) {
- int virt = op_x86_phys_to_virt(i);
- if (!reset_value[virt])
- continue;
- rdmsrl(msrs->controls[i].addr, val);
- val &= model->reserved;
- val |= op_x86_get_ctrl(model, &counter_config[virt]);
- wrmsrl(msrs->controls[i].addr, val);
- }
-}
-
-#endif
-
-/* functions for op_amd_spec */
-
-static void op_amd_shutdown(struct op_msrs const * const msrs)
-{
- int i;
-
- for (i = 0; i < num_counters; ++i) {
- if (!msrs->counters[i].addr)
- continue;
- release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
- release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
- }
-}
-
-static int op_amd_fill_in_addresses(struct op_msrs * const msrs)
-{
- int i;
-
- for (i = 0; i < num_counters; i++) {
- if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
- goto fail;
- if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) {
- release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
- goto fail;
- }
- /* both registers must be reserved */
- if (num_counters == AMD64_NUM_COUNTERS_CORE) {
- msrs->counters[i].addr = MSR_F15H_PERF_CTR + (i << 1);
- msrs->controls[i].addr = MSR_F15H_PERF_CTL + (i << 1);
- } else {
- msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
- msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
- }
- continue;
- fail:
- if (!counter_config[i].enabled)
- continue;
- op_x86_warn_reserved(i);
- op_amd_shutdown(msrs);
- return -EBUSY;
- }
-
- return 0;
-}
-
-static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
- struct op_msrs const * const msrs)
-{
- u64 val;
- int i;
-
- /* setup reset_value */
- for (i = 0; i < OP_MAX_COUNTER; ++i) {
- if (counter_config[i].enabled
- && msrs->counters[op_x86_virt_to_phys(i)].addr)
- reset_value[i] = counter_config[i].count;
- else
- reset_value[i] = 0;
- }
-
- /* clear all counters */
- for (i = 0; i < num_counters; ++i) {
- if (!msrs->controls[i].addr)
- continue;
- rdmsrl(msrs->controls[i].addr, val);
- if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
- op_x86_warn_in_use(i);
- val &= model->reserved;
- wrmsrl(msrs->controls[i].addr, val);
- /*
- * avoid a false detection of ctr overflows in NMI
- * handler
- */
- wrmsrl(msrs->counters[i].addr, -1LL);
- }
-
- /* enable active counters */
- for (i = 0; i < num_counters; ++i) {
- int virt = op_x86_phys_to_virt(i);
- if (!reset_value[virt])
- continue;
-
- /* setup counter registers */
- wrmsrl(msrs->counters[i].addr, -(u64)reset_value[virt]);
-
- /* setup control registers */
- rdmsrl(msrs->controls[i].addr, val);
- val &= model->reserved;
- val |= op_x86_get_ctrl(model, &counter_config[virt]);
- wrmsrl(msrs->controls[i].addr, val);
- }
-}
-
-static int op_amd_check_ctrs(struct pt_regs * const regs,
- struct op_msrs const * const msrs)
-{
- u64 val;
- int i;
-
- for (i = 0; i < num_counters; ++i) {
- int virt = op_x86_phys_to_virt(i);
- if (!reset_value[virt])
- continue;
- rdmsrl(msrs->counters[i].addr, val);
- /* bit is clear if overflowed: */
- if (val & OP_CTR_OVERFLOW)
- continue;
- oprofile_add_sample(regs, virt);
- wrmsrl(msrs->counters[i].addr, -(u64)reset_value[virt]);
- }
-
- op_amd_handle_ibs(regs, msrs);
-
- /* See op_model_ppro.c */
- return 1;
-}
-
-static void op_amd_start(struct op_msrs const * const msrs)
-{
- u64 val;
- int i;
-
- for (i = 0; i < num_counters; ++i) {
- if (!reset_value[op_x86_phys_to_virt(i)])
- continue;
- rdmsrl(msrs->controls[i].addr, val);
- val |= ARCH_PERFMON_EVENTSEL_ENABLE;
- wrmsrl(msrs->controls[i].addr, val);
- }
-
- op_amd_start_ibs();
-}
-
-static void op_amd_stop(struct op_msrs const * const msrs)
-{
- u64 val;
- int i;
-
- /*
- * Subtle: stop on all counters to avoid race with setting our
- * pm callback
- */
- for (i = 0; i < num_counters; ++i) {
- if (!reset_value[op_x86_phys_to_virt(i)])
- continue;
- rdmsrl(msrs->controls[i].addr, val);
- val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
- wrmsrl(msrs->controls[i].addr, val);
- }
-
- op_amd_stop_ibs();
-}
-
-/*
- * check and reserve APIC extended interrupt LVT offset for IBS if
- * available
- */
-
-static void init_ibs(void)
-{
- ibs_caps = get_ibs_caps();
-
- if (!ibs_caps)
- return;
-
- printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", ibs_caps);
-}
-
-static int (*create_arch_files)(struct dentry *root);
-
-static int setup_ibs_files(struct dentry *root)
-{
- struct dentry *dir;
- int ret = 0;
-
- /* architecture specific files */
- if (create_arch_files)
- ret = create_arch_files(root);
-
- if (ret)
- return ret;
-
- if (!ibs_caps)
- return ret;
-
- /* model specific files */
-
- /* setup some reasonable defaults */
- memset(&ibs_config, 0, sizeof(ibs_config));
- ibs_config.max_cnt_fetch = 250000;
- ibs_config.max_cnt_op = 250000;
-
- if (ibs_caps & IBS_CAPS_FETCHSAM) {
- dir = oprofilefs_mkdir(root, "ibs_fetch");
- oprofilefs_create_ulong(dir, "enable",
- &ibs_config.fetch_enabled);
- oprofilefs_create_ulong(dir, "max_count",
- &ibs_config.max_cnt_fetch);
- oprofilefs_create_ulong(dir, "rand_enable",
- &ibs_config.rand_en);
- }
-
- if (ibs_caps & IBS_CAPS_OPSAM) {
- dir = oprofilefs_mkdir(root, "ibs_op");
- oprofilefs_create_ulong(dir, "enable",
- &ibs_config.op_enabled);
- oprofilefs_create_ulong(dir, "max_count",
- &ibs_config.max_cnt_op);
- if (ibs_caps & IBS_CAPS_OPCNT)
- oprofilefs_create_ulong(dir, "dispatched_ops",
- &ibs_config.dispatched_ops);
- if (ibs_caps & IBS_CAPS_BRNTRGT)
- oprofilefs_create_ulong(dir, "branch_target",
- &ibs_config.branch_target);
- }
-
- return 0;
-}
-
-struct op_x86_model_spec op_amd_spec;
-
-static int op_amd_init(struct oprofile_operations *ops)
-{
- init_ibs();
- create_arch_files = ops->create_files;
- ops->create_files = setup_ibs_files;
-
- if (boot_cpu_data.x86 == 0x15) {
- num_counters = AMD64_NUM_COUNTERS_CORE;
- } else {
- num_counters = AMD64_NUM_COUNTERS;
- }
-
- op_amd_spec.num_counters = num_counters;
- op_amd_spec.num_controls = num_counters;
- op_amd_spec.num_virt_counters = max(num_counters, NUM_VIRT_COUNTERS);
-
- return 0;
-}
-
-struct op_x86_model_spec op_amd_spec = {
- /* num_counters/num_controls filled in at runtime */
- .reserved = MSR_AMD_EVENTSEL_RESERVED,
- .event_mask = OP_EVENT_MASK,
- .init = op_amd_init,
- .fill_in_addresses = &op_amd_fill_in_addresses,
- .setup_ctrs = &op_amd_setup_ctrs,
- .check_ctrs = &op_amd_check_ctrs,
- .start = &op_amd_start,
- .stop = &op_amd_stop,
- .shutdown = &op_amd_shutdown,
-#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
- .switch_ctrl = &op_mux_switch_ctrl,
-#endif
-};
diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c
deleted file mode 100644
index ad1d91f475ab..000000000000
--- a/arch/x86/oprofile/op_model_p4.c
+++ /dev/null
@@ -1,723 +0,0 @@
-/**
- * @file op_model_p4.c
- * P4 model-specific MSR operations
- *
- * @remark Copyright 2002 OProfile authors
- * @remark Read the file COPYING
- *
- * @author Graydon Hoare
- */
-
-#include <linux/oprofile.h>
-#include <linux/smp.h>
-#include <linux/ptrace.h>
-#include <asm/nmi.h>
-#include <asm/msr.h>
-#include <asm/fixmap.h>
-#include <asm/apic.h>
-
-
-#include "op_x86_model.h"
-#include "op_counter.h"
-
-#define NUM_EVENTS 39
-
-#define NUM_COUNTERS_NON_HT 8
-#define NUM_ESCRS_NON_HT 45
-#define NUM_CCCRS_NON_HT 18
-#define NUM_CONTROLS_NON_HT (NUM_ESCRS_NON_HT + NUM_CCCRS_NON_HT)
-
-#define NUM_COUNTERS_HT2 4
-#define NUM_ESCRS_HT2 23
-#define NUM_CCCRS_HT2 9
-#define NUM_CONTROLS_HT2 (NUM_ESCRS_HT2 + NUM_CCCRS_HT2)
-
-#define OP_CTR_OVERFLOW (1ULL<<31)
-
-static unsigned int num_counters = NUM_COUNTERS_NON_HT;
-static unsigned int num_controls = NUM_CONTROLS_NON_HT;
-
-/* this has to be checked dynamically since the
- hyper-threadedness of a chip is discovered at
- kernel boot-time. */
-static inline void setup_num_counters(void)
-{
-#ifdef CONFIG_SMP
- if (smp_num_siblings == 2) {
- num_counters = NUM_COUNTERS_HT2;
- num_controls = NUM_CONTROLS_HT2;
- }
-#endif
-}
-
-static inline int addr_increment(void)
-{
-#ifdef CONFIG_SMP
- return smp_num_siblings == 2 ? 2 : 1;
-#else
- return 1;
-#endif
-}
-
-
-/* tables to simulate simplified hardware view of p4 registers */
-struct p4_counter_binding {
- int virt_counter;
- int counter_address;
- int cccr_address;
-};
-
-struct p4_event_binding {
- int escr_select; /* value to put in CCCR */
- int event_select; /* value to put in ESCR */
- struct {
- int virt_counter; /* for this counter... */
- int escr_address; /* use this ESCR */
- } bindings[2];
-};
-
-/* nb: these CTR_* defines are a duplicate of defines in
- event/i386.p4*events. */
-
-
-#define CTR_BPU_0 (1 << 0)
-#define CTR_MS_0 (1 << 1)
-#define CTR_FLAME_0 (1 << 2)
-#define CTR_IQ_4 (1 << 3)
-#define CTR_BPU_2 (1 << 4)
-#define CTR_MS_2 (1 << 5)
-#define CTR_FLAME_2 (1 << 6)
-#define CTR_IQ_5 (1 << 7)
-
-static struct p4_counter_binding p4_counters[NUM_COUNTERS_NON_HT] = {
- { CTR_BPU_0, MSR_P4_BPU_PERFCTR0, MSR_P4_BPU_CCCR0 },
- { CTR_MS_0, MSR_P4_MS_PERFCTR0, MSR_P4_MS_CCCR0 },
- { CTR_FLAME_0, MSR_P4_FLAME_PERFCTR0, MSR_P4_FLAME_CCCR0 },
- { CTR_IQ_4, MSR_P4_IQ_PERFCTR4, MSR_P4_IQ_CCCR4 },
- { CTR_BPU_2, MSR_P4_BPU_PERFCTR2, MSR_P4_BPU_CCCR2 },
- { CTR_MS_2, MSR_P4_MS_PERFCTR2, MSR_P4_MS_CCCR2 },
- { CTR_FLAME_2, MSR_P4_FLAME_PERFCTR2, MSR_P4_FLAME_CCCR2 },
- { CTR_IQ_5, MSR_P4_IQ_PERFCTR5, MSR_P4_IQ_CCCR5 }
-};
-
-#define NUM_UNUSED_CCCRS (NUM_CCCRS_NON_HT - NUM_COUNTERS_NON_HT)
-
-/* p4 event codes in libop/op_event.h are indices into this table. */
-
-static struct p4_event_binding p4_events[NUM_EVENTS] = {
-
- { /* BRANCH_RETIRED */
- 0x05, 0x06,
- { {CTR_IQ_4, MSR_P4_CRU_ESCR2},
- {CTR_IQ_5, MSR_P4_CRU_ESCR3} }
- },
-
- { /* MISPRED_BRANCH_RETIRED */
- 0x04, 0x03,
- { { CTR_IQ_4, MSR_P4_CRU_ESCR0},
- { CTR_IQ_5, MSR_P4_CRU_ESCR1} }
- },
-
- { /* TC_DELIVER_MODE */
- 0x01, 0x01,
- { { CTR_MS_0, MSR_P4_TC_ESCR0},
- { CTR_MS_2, MSR_P4_TC_ESCR1} }
- },
-
- { /* BPU_FETCH_REQUEST */
- 0x00, 0x03,
- { { CTR_BPU_0, MSR_P4_BPU_ESCR0},
- { CTR_BPU_2, MSR_P4_BPU_ESCR1} }
- },
-
- { /* ITLB_REFERENCE */
- 0x03, 0x18,
- { { CTR_BPU_0, MSR_P4_ITLB_ESCR0},
- { CTR_BPU_2, MSR_P4_ITLB_ESCR1} }
- },
-
- { /* MEMORY_CANCEL */
- 0x05, 0x02,
- { { CTR_FLAME_0, MSR_P4_DAC_ESCR0},
- { CTR_FLAME_2, MSR_P4_DAC_ESCR1} }
- },
-
- { /* MEMORY_COMPLETE */
- 0x02, 0x08,
- { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0},
- { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} }
- },
-
- { /* LOAD_PORT_REPLAY */
- 0x02, 0x04,
- { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0},
- { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} }
- },
-
- { /* STORE_PORT_REPLAY */
- 0x02, 0x05,
- { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0},
- { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} }
- },
-
- { /* MOB_LOAD_REPLAY */
- 0x02, 0x03,
- { { CTR_BPU_0, MSR_P4_MOB_ESCR0},
- { CTR_BPU_2, MSR_P4_MOB_ESCR1} }
- },
-
- { /* PAGE_WALK_TYPE */
- 0x04, 0x01,
- { { CTR_BPU_0, MSR_P4_PMH_ESCR0},
- { CTR_BPU_2, MSR_P4_PMH_ESCR1} }
- },
-
- { /* BSQ_CACHE_REFERENCE */
- 0x07, 0x0c,
- { { CTR_BPU_0, MSR_P4_BSU_ESCR0},
- { CTR_BPU_2, MSR_P4_BSU_ESCR1} }
- },
-
- { /* IOQ_ALLOCATION */
- 0x06, 0x03,
- { { CTR_BPU_0, MSR_P4_FSB_ESCR0},
- { 0, 0 } }
- },
-
- { /* IOQ_ACTIVE_ENTRIES */
- 0x06, 0x1a,
- { { CTR_BPU_2, MSR_P4_FSB_ESCR1},
- { 0, 0 } }
- },
-
- { /* FSB_DATA_ACTIVITY */
- 0x06, 0x17,
- { { CTR_BPU_0, MSR_P4_FSB_ESCR0},
- { CTR_BPU_2, MSR_P4_FSB_ESCR1} }
- },
-
- { /* BSQ_ALLOCATION */
- 0x07, 0x05,
- { { CTR_BPU_0, MSR_P4_BSU_ESCR0},
- { 0, 0 } }
- },
-
- { /* BSQ_ACTIVE_ENTRIES */
- 0x07, 0x06,
- { { CTR_BPU_2, MSR_P4_BSU_ESCR1 /* guess */},
- { 0, 0 } }
- },
-
- { /* X87_ASSIST */
- 0x05, 0x03,
- { { CTR_IQ_4, MSR_P4_CRU_ESCR2},
- { CTR_IQ_5, MSR_P4_CRU_ESCR3} }
- },
-
- { /* SSE_INPUT_ASSIST */
- 0x01, 0x34,
- { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
- { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
- },
-
- { /* PACKED_SP_UOP */
- 0x01, 0x08,
- { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
- { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
- },
-
- { /* PACKED_DP_UOP */
- 0x01, 0x0c,
- { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
- { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
- },
-
- { /* SCALAR_SP_UOP */
- 0x01, 0x0a,
- { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
- { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
- },
-
- { /* SCALAR_DP_UOP */
- 0x01, 0x0e,
- { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
- { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
- },
-
- { /* 64BIT_MMX_UOP */
- 0x01, 0x02,
- { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
- { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
- },
-
- { /* 128BIT_MMX_UOP */
- 0x01, 0x1a,
- { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
- { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
- },
-
- { /* X87_FP_UOP */
- 0x01, 0x04,
- { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
- { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
- },
-
- { /* X87_SIMD_MOVES_UOP */
- 0x01, 0x2e,
- { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
- { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
- },
-
- { /* MACHINE_CLEAR */
- 0x05, 0x02,
- { { CTR_IQ_4, MSR_P4_CRU_ESCR2},
- { CTR_IQ_5, MSR_P4_CRU_ESCR3} }
- },
-
- { /* GLOBAL_POWER_EVENTS */
- 0x06, 0x13 /* older manual says 0x05, newer 0x13 */,
- { { CTR_BPU_0, MSR_P4_FSB_ESCR0},
- { CTR_BPU_2, MSR_P4_FSB_ESCR1} }
- },
-
- { /* TC_MS_XFER */
- 0x00, 0x05,
- { { CTR_MS_0, MSR_P4_MS_ESCR0},
- { CTR_MS_2, MSR_P4_MS_ESCR1} }
- },
-
- { /* UOP_QUEUE_WRITES */
- 0x00, 0x09,
- { { CTR_MS_0, MSR_P4_MS_ESCR0},
- { CTR_MS_2, MSR_P4_MS_ESCR1} }
- },
-
- { /* FRONT_END_EVENT */
- 0x05, 0x08,
- { { CTR_IQ_4, MSR_P4_CRU_ESCR2},
- { CTR_IQ_5, MSR_P4_CRU_ESCR3} }
- },
-
- { /* EXECUTION_EVENT */
- 0x05, 0x0c,
- { { CTR_IQ_4, MSR_P4_CRU_ESCR2},
- { CTR_IQ_5, MSR_P4_CRU_ESCR3} }
- },
-
- { /* REPLAY_EVENT */
- 0x05, 0x09,
- { { CTR_IQ_4, MSR_P4_CRU_ESCR2},
- { CTR_IQ_5, MSR_P4_CRU_ESCR3} }
- },
-
- { /* INSTR_RETIRED */
- 0x04, 0x02,
- { { CTR_IQ_4, MSR_P4_CRU_ESCR0},
- { CTR_IQ_5, MSR_P4_CRU_ESCR1} }
- },
-
- { /* UOPS_RETIRED */
- 0x04, 0x01,
- { { CTR_IQ_4, MSR_P4_CRU_ESCR0},
- { CTR_IQ_5, MSR_P4_CRU_ESCR1} }
- },
-
- { /* UOP_TYPE */
- 0x02, 0x02,
- { { CTR_IQ_4, MSR_P4_RAT_ESCR0},
- { CTR_IQ_5, MSR_P4_RAT_ESCR1} }
- },
-
- { /* RETIRED_MISPRED_BRANCH_TYPE */
- 0x02, 0x05,
- { { CTR_MS_0, MSR_P4_TBPU_ESCR0},
- { CTR_MS_2, MSR_P4_TBPU_ESCR1} }
- },
-
- { /* RETIRED_BRANCH_TYPE */
- 0x02, 0x04,
- { { CTR_MS_0, MSR_P4_TBPU_ESCR0},
- { CTR_MS_2, MSR_P4_TBPU_ESCR1} }
- }
-};
-
-
-#define MISC_PMC_ENABLED_P(x) ((x) & 1 << 7)
-
-#define ESCR_RESERVED_BITS 0x80000003
-#define ESCR_CLEAR(escr) ((escr) &= ESCR_RESERVED_BITS)
-#define ESCR_SET_USR_0(escr, usr) ((escr) |= (((usr) & 1) << 2))
-#define ESCR_SET_OS_0(escr, os) ((escr) |= (((os) & 1) << 3))
-#define ESCR_SET_USR_1(escr, usr) ((escr) |= (((usr) & 1)))
-#define ESCR_SET_OS_1(escr, os) ((escr) |= (((os) & 1) << 1))
-#define ESCR_SET_EVENT_SELECT(escr, sel) ((escr) |= (((sel) & 0x3f) << 25))
-#define ESCR_SET_EVENT_MASK(escr, mask) ((escr) |= (((mask) & 0xffff) << 9))
-
-#define CCCR_RESERVED_BITS 0x38030FFF
-#define CCCR_CLEAR(cccr) ((cccr) &= CCCR_RESERVED_BITS)
-#define CCCR_SET_REQUIRED_BITS(cccr) ((cccr) |= 0x00030000)
-#define CCCR_SET_ESCR_SELECT(cccr, sel) ((cccr) |= (((sel) & 0x07) << 13))
-#define CCCR_SET_PMI_OVF_0(cccr) ((cccr) |= (1<<26))
-#define CCCR_SET_PMI_OVF_1(cccr) ((cccr) |= (1<<27))
-#define CCCR_SET_ENABLE(cccr) ((cccr) |= (1<<12))
-#define CCCR_SET_DISABLE(cccr) ((cccr) &= ~(1<<12))
-#define CCCR_OVF_P(cccr) ((cccr) & (1U<<31))
-#define CCCR_CLEAR_OVF(cccr) ((cccr) &= (~(1U<<31)))
-
-
-/* this assigns a "stagger" to the current CPU, which is used throughout
- the code in this module as an extra array offset, to select the "even"
- or "odd" part of all the divided resources. */
-static unsigned int get_stagger(void)
-{
-#ifdef CONFIG_SMP
- int cpu = smp_processor_id();
- return cpu != cpumask_first(this_cpu_cpumask_var_ptr(cpu_sibling_map));
-#endif
- return 0;
-}
-
-
-/* finally, mediate access to a real hardware counter
-   by passing a "virtual" counter number to this macro,
- along with your stagger setting. */
-#define VIRT_CTR(stagger, i) ((i) + ((num_counters) * (stagger)))
-
-static unsigned long reset_value[NUM_COUNTERS_NON_HT];
-
-static void p4_shutdown(struct op_msrs const * const msrs)
-{
- int i;
-
- for (i = 0; i < num_counters; ++i) {
- if (msrs->counters[i].addr)
- release_perfctr_nmi(msrs->counters[i].addr);
- }
- /*
- * some of the control registers are specially reserved in
- * conjunction with the counter registers (hence the starting offset).
- * This saves a few bits.
- */
- for (i = num_counters; i < num_controls; ++i) {
- if (msrs->controls[i].addr)
- release_evntsel_nmi(msrs->controls[i].addr);
- }
-}
-
-static int p4_fill_in_addresses(struct op_msrs * const msrs)
-{
- unsigned int i;
- unsigned int addr, cccraddr, stag;
-
- setup_num_counters();
- stag = get_stagger();
-
- /* the counter & cccr registers we pay attention to */
- for (i = 0; i < num_counters; ++i) {
- addr = p4_counters[VIRT_CTR(stag, i)].counter_address;
- cccraddr = p4_counters[VIRT_CTR(stag, i)].cccr_address;
- if (reserve_perfctr_nmi(addr)) {
- msrs->counters[i].addr = addr;
- msrs->controls[i].addr = cccraddr;
- }
- }
-
-	/* 43 ESCR registers in three or four discontiguous groups */
- for (addr = MSR_P4_BSU_ESCR0 + stag;
- addr < MSR_P4_IQ_ESCR0; ++i, addr += addr_increment()) {
- if (reserve_evntsel_nmi(addr))
- msrs->controls[i].addr = addr;
- }
-
-	/* no IQ_ESCR0/1 on some models, we save BSU_ESCR0/1 a second time
-	 * to avoid a special case in nmi_{save|restore}_registers() */
- if (boot_cpu_data.x86_model >= 0x3) {
- for (addr = MSR_P4_BSU_ESCR0 + stag;
- addr <= MSR_P4_BSU_ESCR1; ++i, addr += addr_increment()) {
- if (reserve_evntsel_nmi(addr))
- msrs->controls[i].addr = addr;
- }
- } else {
- for (addr = MSR_P4_IQ_ESCR0 + stag;
- addr <= MSR_P4_IQ_ESCR1; ++i, addr += addr_increment()) {
- if (reserve_evntsel_nmi(addr))
- msrs->controls[i].addr = addr;
- }
- }
-
- for (addr = MSR_P4_RAT_ESCR0 + stag;
- addr <= MSR_P4_SSU_ESCR0; ++i, addr += addr_increment()) {
- if (reserve_evntsel_nmi(addr))
- msrs->controls[i].addr = addr;
- }
-
- for (addr = MSR_P4_MS_ESCR0 + stag;
- addr <= MSR_P4_TC_ESCR1; ++i, addr += addr_increment()) {
- if (reserve_evntsel_nmi(addr))
- msrs->controls[i].addr = addr;
- }
-
- for (addr = MSR_P4_IX_ESCR0 + stag;
- addr <= MSR_P4_CRU_ESCR3; ++i, addr += addr_increment()) {
- if (reserve_evntsel_nmi(addr))
- msrs->controls[i].addr = addr;
- }
-
- /* there are 2 remaining non-contiguously located ESCRs */
-
- if (num_counters == NUM_COUNTERS_NON_HT) {
- /* standard non-HT CPUs handle both remaining ESCRs*/
- if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR5))
- msrs->controls[i++].addr = MSR_P4_CRU_ESCR5;
- if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR4))
- msrs->controls[i++].addr = MSR_P4_CRU_ESCR4;
-
- } else if (stag == 0) {
- /* HT CPUs give the first remainder to the even thread, as
- the 32nd control register */
- if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR4))
- msrs->controls[i++].addr = MSR_P4_CRU_ESCR4;
-
- } else {
- /* and two copies of the second to the odd thread,
-		   for the 22nd and 23rd control registers */
- if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR5)) {
- msrs->controls[i++].addr = MSR_P4_CRU_ESCR5;
- msrs->controls[i++].addr = MSR_P4_CRU_ESCR5;
- }
- }
-
- for (i = 0; i < num_counters; ++i) {
- if (!counter_config[i].enabled)
- continue;
- if (msrs->controls[i].addr)
- continue;
- op_x86_warn_reserved(i);
- p4_shutdown(msrs);
- return -EBUSY;
- }
-
- return 0;
-}
-
-
-static void pmc_setup_one_p4_counter(unsigned int ctr)
-{
- int i;
- int const maxbind = 2;
- unsigned int cccr = 0;
- unsigned int escr = 0;
- unsigned int high = 0;
- unsigned int counter_bit;
- struct p4_event_binding *ev = NULL;
- unsigned int stag;
-
- stag = get_stagger();
-
- /* convert from counter *number* to counter *bit* */
- counter_bit = 1 << VIRT_CTR(stag, ctr);
-
- /* find our event binding structure. */
- if (counter_config[ctr].event <= 0 || counter_config[ctr].event > NUM_EVENTS) {
- printk(KERN_ERR
- "oprofile: P4 event code 0x%lx out of range\n",
- counter_config[ctr].event);
- return;
- }
-
- ev = &(p4_events[counter_config[ctr].event - 1]);
-
- for (i = 0; i < maxbind; i++) {
- if (ev->bindings[i].virt_counter & counter_bit) {
-
- /* modify ESCR */
- rdmsr(ev->bindings[i].escr_address, escr, high);
- ESCR_CLEAR(escr);
- if (stag == 0) {
- ESCR_SET_USR_0(escr, counter_config[ctr].user);
- ESCR_SET_OS_0(escr, counter_config[ctr].kernel);
- } else {
- ESCR_SET_USR_1(escr, counter_config[ctr].user);
- ESCR_SET_OS_1(escr, counter_config[ctr].kernel);
- }
- ESCR_SET_EVENT_SELECT(escr, ev->event_select);
- ESCR_SET_EVENT_MASK(escr, counter_config[ctr].unit_mask);
- wrmsr(ev->bindings[i].escr_address, escr, high);
-
- /* modify CCCR */
- rdmsr(p4_counters[VIRT_CTR(stag, ctr)].cccr_address,
- cccr, high);
- CCCR_CLEAR(cccr);
- CCCR_SET_REQUIRED_BITS(cccr);
- CCCR_SET_ESCR_SELECT(cccr, ev->escr_select);
- if (stag == 0)
- CCCR_SET_PMI_OVF_0(cccr);
- else
- CCCR_SET_PMI_OVF_1(cccr);
- wrmsr(p4_counters[VIRT_CTR(stag, ctr)].cccr_address,
- cccr, high);
- return;
- }
- }
-
- printk(KERN_ERR
- "oprofile: P4 event code 0x%lx no binding, stag %d ctr %d\n",
- counter_config[ctr].event, stag, ctr);
-}
-
-
-static void p4_setup_ctrs(struct op_x86_model_spec const *model,
- struct op_msrs const * const msrs)
-{
- unsigned int i;
- unsigned int low, high;
- unsigned int stag;
-
- stag = get_stagger();
-
- rdmsr(MSR_IA32_MISC_ENABLE, low, high);
- if (!MISC_PMC_ENABLED_P(low)) {
- printk(KERN_ERR "oprofile: P4 PMC not available\n");
- return;
- }
-
- /* clear the cccrs we will use */
- for (i = 0; i < num_counters; i++) {
- if (unlikely(!msrs->controls[i].addr))
- continue;
- rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high);
- CCCR_CLEAR(low);
- CCCR_SET_REQUIRED_BITS(low);
- wrmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high);
- }
-
- /* clear all escrs (including those outside our concern) */
- for (i = num_counters; i < num_controls; i++) {
- if (unlikely(!msrs->controls[i].addr))
- continue;
- wrmsr(msrs->controls[i].addr, 0, 0);
- }
-
- /* setup all counters */
- for (i = 0; i < num_counters; ++i) {
- if (counter_config[i].enabled && msrs->controls[i].addr) {
- reset_value[i] = counter_config[i].count;
- pmc_setup_one_p4_counter(i);
- wrmsrl(p4_counters[VIRT_CTR(stag, i)].counter_address,
- -(u64)counter_config[i].count);
- } else {
- reset_value[i] = 0;
- }
- }
-}
-
-
-static int p4_check_ctrs(struct pt_regs * const regs,
- struct op_msrs const * const msrs)
-{
- unsigned long ctr, low, high, stag, real;
- int i;
-
- stag = get_stagger();
-
- for (i = 0; i < num_counters; ++i) {
-
- if (!reset_value[i])
- continue;
-
- /*
- * there is some eccentricity in the hardware which
- * requires that we perform 2 extra corrections:
- *
- * - check both the CCCR:OVF flag for overflow and the
- * counter high bit for un-flagged overflows.
- *
- * - write the counter back twice to ensure it gets
- * updated properly.
- *
- * the former seems to be related to extra NMIs happening
- * during the current NMI; the latter is reported as erratum
- * N15 in Intel doc 249199-029, the Pentium 4 specification
- * update, though their suggested work-around does not
- * appear to solve the problem.
- */
-
- real = VIRT_CTR(stag, i);
-
- rdmsr(p4_counters[real].cccr_address, low, high);
- rdmsr(p4_counters[real].counter_address, ctr, high);
- if (CCCR_OVF_P(low) || !(ctr & OP_CTR_OVERFLOW)) {
- oprofile_add_sample(regs, i);
- wrmsrl(p4_counters[real].counter_address,
- -(u64)reset_value[i]);
- CCCR_CLEAR_OVF(low);
- wrmsr(p4_counters[real].cccr_address, low, high);
- wrmsrl(p4_counters[real].counter_address,
- -(u64)reset_value[i]);
- }
- }
-
- /* P4 quirk: you have to re-unmask the apic vector */
- apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
-
- /* See op_model_ppro.c */
- return 1;
-}
-
-
-static void p4_start(struct op_msrs const * const msrs)
-{
- unsigned int low, high, stag;
- int i;
-
- stag = get_stagger();
-
- for (i = 0; i < num_counters; ++i) {
- if (!reset_value[i])
- continue;
- rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high);
- CCCR_SET_ENABLE(low);
- wrmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high);
- }
-}
-
-
-static void p4_stop(struct op_msrs const * const msrs)
-{
- unsigned int low, high, stag;
- int i;
-
- stag = get_stagger();
-
- for (i = 0; i < num_counters; ++i) {
- if (!reset_value[i])
- continue;
- rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high);
- CCCR_SET_DISABLE(low);
- wrmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high);
- }
-}
-
-#ifdef CONFIG_SMP
-struct op_x86_model_spec op_p4_ht2_spec = {
- .num_counters = NUM_COUNTERS_HT2,
- .num_controls = NUM_CONTROLS_HT2,
- .fill_in_addresses = &p4_fill_in_addresses,
- .setup_ctrs = &p4_setup_ctrs,
- .check_ctrs = &p4_check_ctrs,
- .start = &p4_start,
- .stop = &p4_stop,
- .shutdown = &p4_shutdown
-};
-#endif
-
-struct op_x86_model_spec op_p4_spec = {
- .num_counters = NUM_COUNTERS_NON_HT,
- .num_controls = NUM_CONTROLS_NON_HT,
- .fill_in_addresses = &p4_fill_in_addresses,
- .setup_ctrs = &p4_setup_ctrs,
- .check_ctrs = &p4_check_ctrs,
- .start = &p4_start,
- .stop = &p4_stop,
- .shutdown = &p4_shutdown
-};
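
For illustration, here is a minimal standalone sketch of the overflow predicate used by the removed p4_check_ctrs() above: a counter is treated as overflowed when either the CCCR OVF flag is set or the counter's high bit has cleared (counters are preloaded with -count and count upward). The bit positions and sample values below are assumptions for the sketch, not the kernel's MSR access path.

#include <stdint.h>
#include <stdio.h>

#define CCCR_OVF      (1U << 31)   /* assumed position of the CCCR OVF flag */
#define CTR_OVERFLOW  (1ULL << 31) /* assumed "counter high bit" */

/* overflowed if flagged in the CCCR, or un-flagged but the high bit fell */
static int p4_counter_overflowed(uint32_t cccr, uint64_t ctr)
{
	return (cccr & CCCR_OVF) || !(ctr & CTR_OVERFLOW);
}

int main(void)
{
	uint64_t armed   = CTR_OVERFLOW | 0x123; /* still counting up */
	uint64_t wrapped = 0x7;                  /* counted past zero */

	printf("armed:   %d\n", p4_counter_overflowed(0, armed));        /* 0 */
	printf("wrapped: %d\n", p4_counter_overflowed(0, wrapped));      /* 1 */
	printf("flagged: %d\n", p4_counter_overflowed(CCCR_OVF, armed)); /* 1 */
	return 0;
}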
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
deleted file mode 100644
index 7913b6921959..000000000000
--- a/arch/x86/oprofile/op_model_ppro.c
+++ /dev/null
@@ -1,245 +0,0 @@
-/*
- * @file op_model_ppro.c
- * Family 6 perfmon and architectural perfmon MSR operations
- *
- * @remark Copyright 2002 OProfile authors
- * @remark Copyright 2008 Intel Corporation
- * @remark Read the file COPYING
- *
- * @author John Levon
- * @author Philippe Elie
- * @author Graydon Hoare
- * @author Andi Kleen
- * @author Robert Richter <robert.richter@amd.com>
- */
-
-#include <linux/oprofile.h>
-#include <linux/slab.h>
-#include <asm/ptrace.h>
-#include <asm/msr.h>
-#include <asm/apic.h>
-#include <asm/nmi.h>
-
-#include "op_x86_model.h"
-#include "op_counter.h"
-
-static int num_counters = 2;
-static int counter_width = 32;
-
-#define MSR_PPRO_EVENTSEL_RESERVED ((0xFFFFFFFFULL<<32)|(1ULL<<21))
-
-static u64 reset_value[OP_MAX_COUNTER];
-
-static void ppro_shutdown(struct op_msrs const * const msrs)
-{
- int i;
-
- for (i = 0; i < num_counters; ++i) {
- if (!msrs->counters[i].addr)
- continue;
- release_perfctr_nmi(MSR_P6_PERFCTR0 + i);
- release_evntsel_nmi(MSR_P6_EVNTSEL0 + i);
- }
-}
-
-static int ppro_fill_in_addresses(struct op_msrs * const msrs)
-{
- int i;
-
- for (i = 0; i < num_counters; i++) {
- if (!reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i))
- goto fail;
- if (!reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i)) {
- release_perfctr_nmi(MSR_P6_PERFCTR0 + i);
- goto fail;
- }
- /* both registers must be reserved */
- msrs->counters[i].addr = MSR_P6_PERFCTR0 + i;
- msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i;
- continue;
- fail:
- if (!counter_config[i].enabled)
- continue;
- op_x86_warn_reserved(i);
- ppro_shutdown(msrs);
- return -EBUSY;
- }
-
- return 0;
-}
-
-
-static void ppro_setup_ctrs(struct op_x86_model_spec const *model,
- struct op_msrs const * const msrs)
-{
- u64 val;
- int i;
-
- if (boot_cpu_has(X86_FEATURE_ARCH_PERFMON)) {
- union cpuid10_eax eax;
- eax.full = cpuid_eax(0xa);
-
- /*
- * For Core2 (family 6, model 15), don't reset the
- * counter width:
- */
- if (!(eax.split.version_id == 0 &&
- __this_cpu_read(cpu_info.x86) == 6 &&
- __this_cpu_read(cpu_info.x86_model) == 15)) {
-
- if (counter_width < eax.split.bit_width)
- counter_width = eax.split.bit_width;
- }
- }
-
- /* clear all counters */
- for (i = 0; i < num_counters; ++i) {
- if (!msrs->controls[i].addr)
- continue;
- rdmsrl(msrs->controls[i].addr, val);
- if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
- op_x86_warn_in_use(i);
- val &= model->reserved;
- wrmsrl(msrs->controls[i].addr, val);
- /*
- * avoid a false detection of ctr overflows in the NMI
- * handler
- */
- wrmsrl(msrs->counters[i].addr, -1LL);
- }
-
- /* enable active counters */
- for (i = 0; i < num_counters; ++i) {
- if (counter_config[i].enabled && msrs->counters[i].addr) {
- reset_value[i] = counter_config[i].count;
- wrmsrl(msrs->counters[i].addr, -reset_value[i]);
- rdmsrl(msrs->controls[i].addr, val);
- val &= model->reserved;
- val |= op_x86_get_ctrl(model, &counter_config[i]);
- wrmsrl(msrs->controls[i].addr, val);
- } else {
- reset_value[i] = 0;
- }
- }
-}
-
-
-static int ppro_check_ctrs(struct pt_regs * const regs,
- struct op_msrs const * const msrs)
-{
- u64 val;
- int i;
-
- for (i = 0; i < num_counters; ++i) {
- if (!reset_value[i])
- continue;
- rdmsrl(msrs->counters[i].addr, val);
- if (val & (1ULL << (counter_width - 1)))
- continue;
- oprofile_add_sample(regs, i);
- wrmsrl(msrs->counters[i].addr, -reset_value[i]);
- }
-
- /* Only the P6-based Pentium M needs to re-unmask the APIC vector, but
- * it doesn't hurt the other P6 variants */
- apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
-
- /* We can't work out if we really handled an interrupt. We
- * might have caught a *second* counter just after it overflowed;
- * the interrupt for this counter then arrives, we don't find a
- * counter that has overflowed, and we would return 0 and get
- * dazed + confused. Instead we always assume we found an
- * overflow. This sucks.
- */
- return 1;
-}
-
-
-static void ppro_start(struct op_msrs const * const msrs)
-{
- u64 val;
- int i;
-
- for (i = 0; i < num_counters; ++i) {
- if (reset_value[i]) {
- rdmsrl(msrs->controls[i].addr, val);
- val |= ARCH_PERFMON_EVENTSEL_ENABLE;
- wrmsrl(msrs->controls[i].addr, val);
- }
- }
-}
-
-
-static void ppro_stop(struct op_msrs const * const msrs)
-{
- u64 val;
- int i;
-
- for (i = 0; i < num_counters; ++i) {
- if (!reset_value[i])
- continue;
- rdmsrl(msrs->controls[i].addr, val);
- val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
- wrmsrl(msrs->controls[i].addr, val);
- }
-}
-
-struct op_x86_model_spec op_ppro_spec = {
- .num_counters = 2,
- .num_controls = 2,
- .reserved = MSR_PPRO_EVENTSEL_RESERVED,
- .fill_in_addresses = &ppro_fill_in_addresses,
- .setup_ctrs = &ppro_setup_ctrs,
- .check_ctrs = &ppro_check_ctrs,
- .start = &ppro_start,
- .stop = &ppro_stop,
- .shutdown = &ppro_shutdown
-};
-
-/*
- * Architectural performance monitoring.
- *
- * Newer Intel CPUs (Core1+) have support for architectural
- * events described in CPUID 0xA. See the IA32 SDM Vol3b.18 for details.
- * The advantage of this is that it can be done without knowing about
- * the specific CPU.
- */
-
-static void arch_perfmon_setup_counters(void)
-{
- union cpuid10_eax eax;
-
- eax.full = cpuid_eax(0xa);
-
- /* Workaround for BIOS bugs in 6/15. Taken from perfmon2 */
- if (eax.split.version_id == 0 && boot_cpu_data.x86 == 6 &&
- boot_cpu_data.x86_model == 15) {
- eax.split.version_id = 2;
- eax.split.num_counters = 2;
- eax.split.bit_width = 40;
- }
-
- num_counters = min((int)eax.split.num_counters, OP_MAX_COUNTER);
-
- op_arch_perfmon_spec.num_counters = num_counters;
- op_arch_perfmon_spec.num_controls = num_counters;
-}
-
-static int arch_perfmon_init(struct oprofile_operations *ignore)
-{
- arch_perfmon_setup_counters();
- return 0;
-}
-
-struct op_x86_model_spec op_arch_perfmon_spec = {
- .reserved = MSR_PPRO_EVENTSEL_RESERVED,
- .init = &arch_perfmon_init,
- /* num_counters/num_controls filled in at runtime */
- .fill_in_addresses = &ppro_fill_in_addresses,
- /* user space does the cpuid check for available events */
- .setup_ctrs = &ppro_setup_ctrs,
- .check_ctrs = &ppro_check_ctrs,
- .start = &ppro_start,
- .stop = &ppro_stop,
- .shutdown = &ppro_shutdown
-};
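
As a side note on the CPUID decode done by the removed arch_perfmon_setup_counters(): it reads leaf 0xA, then fakes sane values on family-6/model-15 parts whose BIOS reports version 0. A standalone sketch of that decode follows; the bitfield layout mirrors how the code above uses union cpuid10_eax, and the EAX value is made up for the example.

#include <stdio.h>

/* assumed layout, mirroring the split fields used above */
union cpuid10_eax_sketch {
	struct {
		unsigned int version_id:8;
		unsigned int num_counters:8;
		unsigned int bit_width:8;
		unsigned int mask_length:8;
	} split;
	unsigned int full;
};

int main(void)
{
	union cpuid10_eax_sketch eax;

	eax.full = 0x07300403; /* made up: version 3, 4 counters, 48 bits */

	/* same BIOS workaround as arch_perfmon_setup_counters() */
	if (eax.split.version_id == 0) {
		eax.split.version_id = 2;
		eax.split.num_counters = 2;
		eax.split.bit_width = 40;
	}

	printf("version %d, %d counters, %d bits\n",
	       eax.split.version_id, eax.split.num_counters,
	       eax.split.bit_width);
	return 0;
}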
diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h
deleted file mode 100644
index 276cf79b5d24..000000000000
--- a/arch/x86/oprofile/op_x86_model.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/**
- * @file op_x86_model.h
- * interface to x86 model-specific MSR operations
- *
- * @remark Copyright 2002 OProfile authors
- * @remark Read the file COPYING
- *
- * @author Graydon Hoare
- * @author Robert Richter <robert.richter@amd.com>
- */
-
-#ifndef OP_X86_MODEL_H
-#define OP_X86_MODEL_H
-
-#include <asm/types.h>
-#include <asm/perf_event.h>
-
-struct op_msr {
- unsigned long addr;
- u64 saved;
-};
-
-struct op_msrs {
- struct op_msr *counters;
- struct op_msr *controls;
- struct op_msr *multiplex;
-};
-
-struct pt_regs;
-
-struct oprofile_operations;
-
-/* The model vtable abstracts the differences between
- * various x86 CPU models' perfctr support.
- */
-struct op_x86_model_spec {
- unsigned int num_counters;
- unsigned int num_controls;
- unsigned int num_virt_counters;
- u64 reserved;
- u16 event_mask;
- int (*init)(struct oprofile_operations *ops);
- int (*fill_in_addresses)(struct op_msrs * const msrs);
- void (*setup_ctrs)(struct op_x86_model_spec const *model,
- struct op_msrs const * const msrs);
- int (*check_ctrs)(struct pt_regs * const regs,
- struct op_msrs const * const msrs);
- void (*start)(struct op_msrs const * const msrs);
- void (*stop)(struct op_msrs const * const msrs);
- void (*shutdown)(struct op_msrs const * const msrs);
-#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
- void (*switch_ctrl)(struct op_x86_model_spec const *model,
- struct op_msrs const * const msrs);
-#endif
-};
-
-struct op_counter_config;
-
-static inline void op_x86_warn_in_use(int counter)
-{
- /*
- * The warning indicates an already running counter. If
- * oprofile doesn't collect data, then try using a different
- * performance counter on your platform to monitor the desired
- * event. Delete counter #%d from the desired event by editing
- * the /usr/share/oprofile/%s/<cpu>/events file. If the event
- * cannot be monitored by any other counter, contact your
- * hardware or BIOS vendor.
- */
- pr_warn("oprofile: counter #%d on cpu #%d may already be used\n",
- counter, smp_processor_id());
-}
-
-static inline void op_x86_warn_reserved(int counter)
-{
- pr_warn("oprofile: counter #%d is already reserved\n", counter);
-}
-
-extern u64 op_x86_get_ctrl(struct op_x86_model_spec const *model,
- struct op_counter_config *counter_config);
-extern int op_x86_phys_to_virt(int phys);
-extern int op_x86_virt_to_phys(int virt);
-
-extern struct op_x86_model_spec op_ppro_spec;
-extern struct op_x86_model_spec op_p4_spec;
-extern struct op_x86_model_spec op_p4_ht2_spec;
-extern struct op_x86_model_spec op_amd_spec;
-extern struct op_x86_model_spec op_arch_perfmon_spec;
-
-#endif /* OP_X86_MODEL_H */
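
The removed header above describes a per-model vtable: one op_x86_model_spec full of function pointers per CPU family, filled in at init time and called through generic driver code. The toy standalone sketch below only illustrates that dispatch pattern; the names and behaviour are invented.

#include <stdio.h>

struct model_ops {
	const char *name;
	void (*setup)(void);
	void (*start)(void);
};

static void p4_setup(void)   { puts("p4: program ESCRs/CCCRs"); }
static void p4_start(void)   { puts("p4: set CCCR enable bits"); }
static void ppro_setup(void) { puts("ppro: program EVNTSELs"); }
static void ppro_start(void) { puts("ppro: set EVNTSEL enable bit"); }

static const struct model_ops p4_ops   = { "p4",   p4_setup,   p4_start };
static const struct model_ops ppro_ops = { "ppro", ppro_setup, ppro_start };

int main(void)
{
	int is_p4 = 0; /* stand-in for CPU detection */
	const struct model_ops *model = is_p4 ? &p4_ops : &ppro_ops;

	model->setup();
	model->start();
	printf("using the %s model\n", model->name);
	return 0;
}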
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index fa855bbaebaf..f2f4a5d50b27 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -366,9 +366,9 @@ static int __init pcibios_assign_resources(void)
return 0;
}
-/**
- * called in fs_initcall (one below subsys_initcall),
- * give a chance for motherboard reserve resources
+/*
+ * This is an fs_initcall (one below subsys_initcall) in order to reserve
+ * resources properly.
*/
fs_initcall(pcibios_assign_resources);
diff --git a/arch/x86/pci/init.c b/arch/x86/pci/init.c
index 00bfa1ebad6c..0bb3b8b44e4e 100644
--- a/arch/x86/pci/init.c
+++ b/arch/x86/pci/init.c
@@ -9,16 +9,23 @@
in the right sequence from here. */
static __init int pci_arch_init(void)
{
- int type;
-
- x86_create_pci_msi_domain();
+ int type, pcbios = 1;
type = pci_direct_probe();
if (!(pci_probe & PCI_PROBE_NOEARLY))
pci_mmcfg_early_init();
- if (x86_init.pci.arch_init && !x86_init.pci.arch_init())
+ if (x86_init.pci.arch_init)
+ pcbios = x86_init.pci.arch_init();
+
+ /*
+ * Must happen after x86_init.pci.arch_init(). Xen sets up the
+ * x86_init.irqs.create_pci_msi_domain there.
+ */
+ x86_create_pci_msi_domain();
+
+ if (!pcbios)
return 0;
pci_pcbios_init();
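
The pci_arch_init() change above is about ordering only: x86_init.pci.arch_init() may install the hook that x86_create_pci_msi_domain() consumes (Xen does exactly that), so the domain creation has to run after it. A standalone sketch of that dependency follows; all names in it are invented for the illustration.

#include <stdio.h>

static void (*create_pci_msi_domain_hook)(void);

static void default_msi_domain(void) { puts("default MSI domain"); }
static void xen_msi_domain(void)     { puts("Xen MSI domain"); }

/* stand-in for x86_init.pci.arch_init(): may replace the hook */
static int arch_init(void)
{
	create_pci_msi_domain_hook = xen_msi_domain;
	return 0;
}

/* stand-in for x86_create_pci_msi_domain(): consumes the hook */
static void create_pci_msi_domain(void)
{
	create_pci_msi_domain_hook();
}

int main(void)
{
	create_pci_msi_domain_hook = default_msi_domain;

	arch_init();               /* must run first, may override the hook */
	create_pci_msi_domain();   /* now picks up the Xen variant */
	return 0;
}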
diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c
index 24ca4ee2802f..95e2e6bd8d8c 100644
--- a/arch/x86/pci/intel_mid_pci.c
+++ b/arch/x86/pci/intel_mid_pci.c
@@ -215,7 +215,7 @@ static int pci_write(struct pci_bus *bus, unsigned int devfn, int where,
static int intel_mid_pci_irq_enable(struct pci_dev *dev)
{
struct irq_alloc_info info;
- int polarity;
+ bool polarity_low;
int ret;
u8 gsi;
@@ -230,7 +230,7 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev)
switch (intel_mid_identify_cpu()) {
case INTEL_MID_CPU_CHIP_TANGIER:
- polarity = IOAPIC_POL_HIGH;
+ polarity_low = false;
/* Special treatment for IRQ0 */
if (gsi == 0) {
@@ -252,11 +252,11 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev)
}
break;
default:
- polarity = IOAPIC_POL_LOW;
+ polarity_low = true;
break;
}
- ioapic_set_alloc_attr(&info, dev_to_node(&dev->dev), 1, polarity);
+ ioapic_set_alloc_attr(&info, dev_to_node(&dev->dev), 1, polarity_low);
/*
* MRST only have IOAPIC, the PCI irq lines are 1:1 mapped to
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 6fa42e9c4e6f..234998f196d4 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -425,7 +425,7 @@ static acpi_status find_mboard_resource(acpi_handle handle, u32 lvl,
return AE_OK;
}
-static bool is_acpi_reserved(u64 start, u64 end, unsigned not_used)
+static bool is_acpi_reserved(u64 start, u64 end, enum e820_type not_used)
{
struct resource mcfg_res;
@@ -442,7 +442,7 @@ static bool is_acpi_reserved(u64 start, u64 end, unsigned not_used)
return mcfg_res.flags;
}
-typedef bool (*check_reserved_t)(u64 start, u64 end, unsigned type);
+typedef bool (*check_reserved_t)(u64 start, u64 end, enum e820_type type);
static bool __ref is_mmconf_reserved(check_reserved_t is_reserved,
struct pci_mmcfg_region *cfg,
diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c
index 5701d5ba3df4..7d2525691854 100644
--- a/arch/x86/pci/sta2x11-fixup.c
+++ b/arch/x86/pci/sta2x11-fixup.c
@@ -11,7 +11,8 @@
#include <linux/pci_ids.h>
#include <linux/export.h>
#include <linux/list.h>
-#include <linux/dma-direct.h>
+#include <linux/dma-map-ops.h>
+#include <linux/swiotlb.h>
#include <asm/iommu.h>
#define STA2X11_SWIOTLB_SIZE (4*1024*1024)
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index c552cd2d0632..3d41a09c2c14 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -152,7 +152,6 @@ static int acpi_register_gsi_xen(struct device *dev, u32 gsi,
#if defined(CONFIG_PCI_MSI)
#include <linux/msi.h>
-#include <asm/msidef.h>
struct xen_pci_frontend_ops *xen_pci_frontend;
EXPORT_SYMBOL_GPL(xen_pci_frontend);
@@ -210,23 +209,20 @@ free:
return ret;
}
-#define XEN_PIRQ_MSI_DATA (MSI_DATA_TRIGGER_EDGE | \
- MSI_DATA_LEVEL_ASSERT | (3 << 8) | MSI_DATA_VECTOR(0))
-
static void xen_msi_compose_msg(struct pci_dev *pdev, unsigned int pirq,
struct msi_msg *msg)
{
- /* We set vector == 0 to tell the hypervisor we don't care about it,
- * but we want a pirq setup instead.
- * We use the dest_id field to pass the pirq that we want. */
- msg->address_hi = MSI_ADDR_BASE_HI | MSI_ADDR_EXT_DEST_ID(pirq);
- msg->address_lo =
- MSI_ADDR_BASE_LO |
- MSI_ADDR_DEST_MODE_PHYSICAL |
- MSI_ADDR_REDIRECTION_CPU |
- MSI_ADDR_DEST_ID(pirq);
-
- msg->data = XEN_PIRQ_MSI_DATA;
+ /*
+ * We set vector == 0 to tell the hypervisor we don't care about
+ * it, but we want a pirq setup instead. We use the dest_id fields
+ * to pass the pirq that we want.
+ */
+ memset(msg, 0, sizeof(*msg));
+ msg->address_hi = X86_MSI_BASE_ADDRESS_HIGH;
+ msg->arch_addr_hi.destid_8_31 = pirq >> 8;
+ msg->arch_addr_lo.destid_0_7 = pirq & 0xFF;
+ msg->arch_addr_lo.base_address = X86_MSI_BASE_ADDRESS_LOW;
+ msg->arch_data.delivery_mode = APIC_DELIVERY_MODE_EXTINT;
}
static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
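
The rewritten xen_msi_compose_msg() above passes the pirq through the MSI destination-id fields: the low byte goes into arch_addr_lo.destid_0_7 and the remaining bits into arch_addr_hi.destid_8_31. The sketch below only shows that split and its inverse with plain integers; the field widths follow the code above, the values are made up.

#include <stdio.h>

int main(void)
{
	unsigned int pirq = 0x1234;

	unsigned int destid_0_7  = pirq & 0xFF; /* 8-bit field in arch_addr_lo */
	unsigned int destid_8_31 = pirq >> 8;   /* 24-bit field in arch_addr_hi */

	unsigned int recombined = (destid_8_31 << 8) | destid_0_7;

	printf("pirq 0x%x -> low 0x%02x, high 0x%x -> 0x%x\n",
	       pirq, destid_0_7, destid_8_31, recombined);
	return 0;
}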
diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile
index d0e835470d01..b2f90a1a89f1 100644
--- a/arch/x86/platform/Makefile
+++ b/arch/x86/platform/Makefile
@@ -4,7 +4,6 @@ obj-y += atom/
obj-y += ce4100/
obj-y += efi/
obj-y += geode/
-obj-y += goldfish/
obj-y += iris/
obj-y += intel/
obj-y += intel-mid/
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 8f5759df7776..1b82d77019b1 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -54,10 +54,7 @@
* 0xffff_ffff_0000_0000 and limit EFI VA mapping space to 64G.
*/
static u64 efi_va = EFI_VA_START;
-
-struct efi_scratch efi_scratch;
-
-EXPORT_SYMBOL_GPL(efi_mm);
+static struct mm_struct *efi_prev_mm;
/*
* We need our own copy of the higher levels of the page tables
@@ -78,28 +75,30 @@ int __init efi_alloc_page_tables(void)
gfp_mask = GFP_KERNEL | __GFP_ZERO;
efi_pgd = (pgd_t *)__get_free_pages(gfp_mask, PGD_ALLOCATION_ORDER);
if (!efi_pgd)
- return -ENOMEM;
+ goto fail;
pgd = efi_pgd + pgd_index(EFI_VA_END);
p4d = p4d_alloc(&init_mm, pgd, EFI_VA_END);
- if (!p4d) {
- free_page((unsigned long)efi_pgd);
- return -ENOMEM;
- }
+ if (!p4d)
+ goto free_pgd;
pud = pud_alloc(&init_mm, p4d, EFI_VA_END);
- if (!pud) {
- if (pgtable_l5_enabled())
- free_page((unsigned long) pgd_page_vaddr(*pgd));
- free_pages((unsigned long)efi_pgd, PGD_ALLOCATION_ORDER);
- return -ENOMEM;
- }
+ if (!pud)
+ goto free_p4d;
efi_mm.pgd = efi_pgd;
mm_init_cpumask(&efi_mm);
init_new_context(NULL, &efi_mm);
return 0;
+
+free_p4d:
+ if (pgtable_l5_enabled())
+ free_page((unsigned long)pgd_page_vaddr(*pgd));
+free_pgd:
+ free_pages((unsigned long)efi_pgd, PGD_ALLOCATION_ORDER);
+fail:
+ return -ENOMEM;
}
/*
@@ -113,31 +112,12 @@ void efi_sync_low_kernel_mappings(void)
pud_t *pud_k, *pud_efi;
pgd_t *efi_pgd = efi_mm.pgd;
- /*
- * We can share all PGD entries apart from the one entry that
- * covers the EFI runtime mapping space.
- *
- * Make sure the EFI runtime region mappings are guaranteed to
- * only span a single PGD entry and that the entry also maps
- * other important kernel regions.
- */
- MAYBE_BUILD_BUG_ON(pgd_index(EFI_VA_END) != pgd_index(MODULES_END));
- MAYBE_BUILD_BUG_ON((EFI_VA_START & PGDIR_MASK) !=
- (EFI_VA_END & PGDIR_MASK));
-
pgd_efi = efi_pgd + pgd_index(PAGE_OFFSET);
pgd_k = pgd_offset_k(PAGE_OFFSET);
num_entries = pgd_index(EFI_VA_END) - pgd_index(PAGE_OFFSET);
memcpy(pgd_efi, pgd_k, sizeof(pgd_t) * num_entries);
- /*
- * As with PGDs, we share all P4D entries apart from the one entry
- * that covers the EFI runtime mapping space.
- */
- BUILD_BUG_ON(p4d_index(EFI_VA_END) != p4d_index(MODULES_END));
- BUILD_BUG_ON((EFI_VA_START & P4D_MASK) != (EFI_VA_END & P4D_MASK));
-
pgd_efi = efi_pgd + pgd_index(EFI_VA_END);
pgd_k = pgd_offset_k(EFI_VA_END);
p4d_efi = p4d_offset(pgd_efi, 0);
@@ -254,7 +234,7 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
return 1;
}
- efi_scratch.phys_stack = page_to_phys(page + 1); /* stack grows down */
+ efi_mixed_mode_stack_pa = page_to_phys(page + 1); /* stack grows down */
npages = (_etext - _text) >> PAGE_SHIFT;
text = __pa(_text);
@@ -479,11 +459,17 @@ void __init efi_dump_pagetable(void)
* cannot change under us.
* It should be ensured that there are no concurrent calls to this function.
*/
-void efi_switch_mm(struct mm_struct *mm)
+void efi_enter_mm(void)
+{
+ efi_prev_mm = current->active_mm;
+ current->active_mm = &efi_mm;
+ switch_mm(efi_prev_mm, &efi_mm, NULL);
+}
+
+void efi_leave_mm(void)
{
- efi_scratch.prev_mm = current->active_mm;
- current->active_mm = mm;
- switch_mm(efi_scratch.prev_mm, mm, NULL);
+ current->active_mm = efi_prev_mm;
+ switch_mm(&efi_mm, efi_prev_mm, NULL);
}
static DEFINE_SPINLOCK(efi_runtime_lock);
@@ -547,12 +533,12 @@ efi_thunk_set_virtual_address_map(unsigned long memory_map_size,
efi_sync_low_kernel_mappings();
local_irq_save(flags);
- efi_switch_mm(&efi_mm);
+ efi_enter_mm();
status = __efi_thunk(set_virtual_address_map, memory_map_size,
descriptor_size, descriptor_version, virtual_map);
- efi_switch_mm(efi_scratch.prev_mm);
+ efi_leave_mm();
local_irq_restore(flags);
return status;
@@ -846,9 +832,9 @@ efi_set_virtual_address_map(unsigned long memory_map_size,
descriptor_size,
descriptor_version,
virtual_map);
- efi_switch_mm(&efi_mm);
+ efi_enter_mm();
- kernel_fpu_begin();
+ efi_fpu_begin();
/* Disable interrupts around EFI calls: */
local_irq_save(flags);
@@ -857,12 +843,12 @@ efi_set_virtual_address_map(unsigned long memory_map_size,
descriptor_version, virtual_map);
local_irq_restore(flags);
- kernel_fpu_end();
+ efi_fpu_end();
/* grab the virtually remapped EFI runtime services table pointer */
efi.runtime = READ_ONCE(systab->runtime);
- efi_switch_mm(efi_scratch.prev_mm);
+ efi_leave_mm();
return status;
}
diff --git a/arch/x86/platform/efi/efi_thunk_64.S b/arch/x86/platform/efi/efi_thunk_64.S
index 26f0da238c1c..fd3dd1708eba 100644
--- a/arch/x86/platform/efi/efi_thunk_64.S
+++ b/arch/x86/platform/efi/efi_thunk_64.S
@@ -33,7 +33,7 @@ SYM_CODE_START(__efi64_thunk)
* Switch to 1:1 mapped 32-bit stack pointer.
*/
movq %rsp, %rax
- movq efi_scratch(%rip), %rsp
+ movq efi_mixed_mode_stack_pa(%rip), %rsp
push %rax
/*
@@ -70,3 +70,7 @@ SYM_CODE_START(__efi64_thunk)
pushl %ebp
lret
SYM_CODE_END(__efi64_thunk)
+
+ .bss
+ .balign 8
+SYM_DATA(efi_mixed_mode_stack_pa, .quad 0)
diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
index 5a40fe411ebd..67d93a243c35 100644
--- a/arch/x86/platform/efi/quirks.c
+++ b/arch/x86/platform/efi/quirks.c
@@ -687,15 +687,25 @@ int efi_capsule_setup_info(struct capsule_info *cap_info, void *kbuff,
* @return: Returns if the page fault is not handled; this function
* will never return if the page fault is handled successfully.
*/
-void efi_recover_from_page_fault(unsigned long phys_addr)
+void efi_crash_gracefully_on_page_fault(unsigned long phys_addr)
{
if (!IS_ENABLED(CONFIG_X86_64))
return;
/*
+ * If we get an interrupt/NMI while processing an EFI runtime service
+ * then this is a regular OOPS, not an EFI failure.
+ */
+ if (in_interrupt())
+ return;
+
+ /*
* Make sure that an efi runtime service caused the page fault.
+ * READ_ONCE() because we might be OOPSing in a different thread,
+ * and we don't want to trip KTSAN while trying to OOPS.
*/
- if (efi_rts_work.efi_rts_id == EFI_NONE)
+ if (READ_ONCE(efi_rts_work.efi_rts_id) == EFI_NONE ||
+ current_work() != &efi_rts_work.work)
return;
/*
@@ -747,6 +757,4 @@ void efi_recover_from_page_fault(unsigned long phys_addr)
set_current_state(TASK_IDLE);
schedule();
}
-
- return;
}
diff --git a/arch/x86/platform/geode/alix.c b/arch/x86/platform/geode/alix.c
index c33f744b5388..b39bf3b5e108 100644
--- a/arch/x86/platform/geode/alix.c
+++ b/arch/x86/platform/geode/alix.c
@@ -22,6 +22,7 @@
#include <linux/platform_device.h>
#include <linux/input.h>
#include <linux/gpio_keys.h>
+#include <linux/gpio/machine.h>
#include <linux/dmi.h>
#include <asm/geode.h>
@@ -69,21 +70,15 @@ static struct platform_device alix_buttons_dev = {
static struct gpio_led alix_leds[] = {
{
.name = "alix:1",
- .gpio = 6,
.default_trigger = "default-on",
- .active_low = 1,
},
{
.name = "alix:2",
- .gpio = 25,
.default_trigger = "default-off",
- .active_low = 1,
},
{
.name = "alix:3",
- .gpio = 27,
.default_trigger = "default-off",
- .active_low = 1,
},
};
@@ -92,6 +87,17 @@ static struct gpio_led_platform_data alix_leds_data = {
.leds = alix_leds,
};
+static struct gpiod_lookup_table alix_leds_gpio_table = {
+ .dev_id = "leds-gpio",
+ .table = {
+ /* The Geode GPIOs should be on the CS5535 companion chip */
+ GPIO_LOOKUP_IDX("cs5535-gpio", 6, NULL, 0, GPIO_ACTIVE_LOW),
+ GPIO_LOOKUP_IDX("cs5535-gpio", 25, NULL, 1, GPIO_ACTIVE_LOW),
+ GPIO_LOOKUP_IDX("cs5535-gpio", 27, NULL, 2, GPIO_ACTIVE_LOW),
+ { }
+ },
+};
+
static struct platform_device alix_leds_dev = {
.name = "leds-gpio",
.id = -1,
@@ -106,6 +112,7 @@ static struct platform_device *alix_devs[] __initdata = {
static void __init register_alix(void)
{
/* Setup LED control through leds-gpio driver */
+ gpiod_add_lookup_table(&alix_leds_gpio_table);
platform_add_devices(alix_devs, ARRAY_SIZE(alix_devs));
}
diff --git a/arch/x86/platform/geode/geos.c b/arch/x86/platform/geode/geos.c
index 73a3f49b4eb6..d263528c90bb 100644
--- a/arch/x86/platform/geode/geos.c
+++ b/arch/x86/platform/geode/geos.c
@@ -20,6 +20,7 @@
#include <linux/platform_device.h>
#include <linux/input.h>
#include <linux/gpio_keys.h>
+#include <linux/gpio/machine.h>
#include <linux/dmi.h>
#include <asm/geode.h>
@@ -53,21 +54,15 @@ static struct platform_device geos_buttons_dev = {
static struct gpio_led geos_leds[] = {
{
.name = "geos:1",
- .gpio = 6,
.default_trigger = "default-on",
- .active_low = 1,
},
{
.name = "geos:2",
- .gpio = 25,
.default_trigger = "default-off",
- .active_low = 1,
},
{
.name = "geos:3",
- .gpio = 27,
.default_trigger = "default-off",
- .active_low = 1,
},
};
@@ -76,6 +71,17 @@ static struct gpio_led_platform_data geos_leds_data = {
.leds = geos_leds,
};
+static struct gpiod_lookup_table geos_leds_gpio_table = {
+ .dev_id = "leds-gpio",
+ .table = {
+ /* The Geode GPIOs should be on the CS5535 companion chip */
+ GPIO_LOOKUP_IDX("cs5535-gpio", 6, NULL, 0, GPIO_ACTIVE_LOW),
+ GPIO_LOOKUP_IDX("cs5535-gpio", 25, NULL, 1, GPIO_ACTIVE_LOW),
+ GPIO_LOOKUP_IDX("cs5535-gpio", 27, NULL, 2, GPIO_ACTIVE_LOW),
+ { }
+ },
+};
+
static struct platform_device geos_leds_dev = {
.name = "leds-gpio",
.id = -1,
@@ -90,6 +96,7 @@ static struct platform_device *geos_devs[] __initdata = {
static void __init register_geos(void)
{
/* Setup LED control through leds-gpio driver */
+ gpiod_add_lookup_table(&geos_leds_gpio_table);
platform_add_devices(geos_devs, ARRAY_SIZE(geos_devs));
}
diff --git a/arch/x86/platform/geode/net5501.c b/arch/x86/platform/geode/net5501.c
index 163e1b545517..558384acd777 100644
--- a/arch/x86/platform/geode/net5501.c
+++ b/arch/x86/platform/geode/net5501.c
@@ -20,6 +20,7 @@
#include <linux/platform_device.h>
#include <linux/input.h>
#include <linux/gpio_keys.h>
+#include <linux/gpio/machine.h>
#include <asm/geode.h>
@@ -55,9 +56,7 @@ static struct platform_device net5501_buttons_dev = {
static struct gpio_led net5501_leds[] = {
{
.name = "net5501:1",
- .gpio = 6,
.default_trigger = "default-on",
- .active_low = 0,
},
};
@@ -66,6 +65,15 @@ static struct gpio_led_platform_data net5501_leds_data = {
.leds = net5501_leds,
};
+static struct gpiod_lookup_table net5501_leds_gpio_table = {
+ .dev_id = "leds-gpio",
+ .table = {
+ /* The Geode GPIOs should be on the CS5535 companion chip */
+ GPIO_LOOKUP_IDX("cs5535-gpio", 6, NULL, 0, GPIO_ACTIVE_HIGH),
+ { }
+ },
+};
+
static struct platform_device net5501_leds_dev = {
.name = "leds-gpio",
.id = -1,
@@ -80,6 +88,7 @@ static struct platform_device *net5501_devs[] __initdata = {
static void __init register_net5501(void)
{
/* Setup LED control through leds-gpio driver */
+ gpiod_add_lookup_table(&net5501_leds_gpio_table);
platform_add_devices(net5501_devs, ARRAY_SIZE(net5501_devs));
}
diff --git a/arch/x86/platform/goldfish/Makefile b/arch/x86/platform/goldfish/Makefile
deleted file mode 100644
index 072c395379ac..000000000000
--- a/arch/x86/platform/goldfish/Makefile
+++ /dev/null
@@ -1,2 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-obj-$(CONFIG_GOLDFISH) += goldfish.o
diff --git a/arch/x86/platform/goldfish/goldfish.c b/arch/x86/platform/goldfish/goldfish.c
deleted file mode 100644
index 6b6f8b4360dd..000000000000
--- a/arch/x86/platform/goldfish/goldfish.c
+++ /dev/null
@@ -1,54 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2007 Google, Inc.
- * Copyright (C) 2011 Intel, Inc.
- * Copyright (C) 2013 Intel, Inc.
- */
-
-#include <linux/kernel.h>
-#include <linux/irq.h>
-#include <linux/platform_device.h>
-
-/*
- * Where in virtual device memory the I/O devices (timers, system
- * controllers and so on) live
- */
-
-#define GOLDFISH_PDEV_BUS_BASE (0xff001000)
-#define GOLDFISH_PDEV_BUS_END (0xff7fffff)
-#define GOLDFISH_PDEV_BUS_IRQ (4)
-
-#define GOLDFISH_TTY_BASE (0x2000)
-
-static struct resource goldfish_pdev_bus_resources[] = {
- {
- .start = GOLDFISH_PDEV_BUS_BASE,
- .end = GOLDFISH_PDEV_BUS_END,
- .flags = IORESOURCE_MEM,
- },
- {
- .start = GOLDFISH_PDEV_BUS_IRQ,
- .end = GOLDFISH_PDEV_BUS_IRQ,
- .flags = IORESOURCE_IRQ,
- }
-};
-
-static bool goldfish_enable __initdata;
-
-static int __init goldfish_setup(char *str)
-{
- goldfish_enable = true;
- return 0;
-}
-__setup("goldfish", goldfish_setup);
-
-static int __init goldfish_init(void)
-{
- if (!goldfish_enable)
- return -ENODEV;
-
- platform_device_register_simple("goldfish_pdev_bus", -1,
- goldfish_pdev_bus_resources, 2);
- return 0;
-}
-device_initcall(goldfish_init);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bt.c b/arch/x86/platform/intel-mid/device_libs/platform_bt.c
index 31dda18bb370..2930b6e9473e 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_bt.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_bt.c
@@ -88,8 +88,8 @@ static int __init bt_sfi_init(void)
memset(&info, 0, sizeof(info));
info.fwnode = ddata->dev->fwnode;
info.parent = ddata->dev;
- info.name = ddata->name,
- info.id = PLATFORM_DEVID_NONE,
+ info.name = ddata->name;
+ info.id = PLATFORM_DEVID_NONE;
pdev = platform_device_register_full(&info);
if (IS_ERR(pdev))
diff --git a/arch/x86/platform/uv/Makefile b/arch/x86/platform/uv/Makefile
index 224ff0504890..1441dda8edf7 100644
--- a/arch/x86/platform/uv/Makefile
+++ b/arch/x86/platform/uv/Makefile
@@ -1,2 +1,2 @@
# SPDX-License-Identifier: GPL-2.0-only
-obj-$(CONFIG_X86_UV) += bios_uv.o uv_irq.o uv_sysfs.o uv_time.o uv_nmi.o
+obj-$(CONFIG_X86_UV) += bios_uv.o uv_irq.o uv_time.o uv_nmi.o
diff --git a/arch/x86/platform/uv/bios_uv.c b/arch/x86/platform/uv/bios_uv.c
index 54511eaccf4d..bf31af3d32d6 100644
--- a/arch/x86/platform/uv/bios_uv.c
+++ b/arch/x86/platform/uv/bios_uv.c
@@ -72,6 +72,7 @@ static s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
long sn_partition_id;
EXPORT_SYMBOL_GPL(sn_partition_id);
long sn_coherency_id;
+EXPORT_SYMBOL_GPL(sn_coherency_id);
long sn_region_size;
EXPORT_SYMBOL_GPL(sn_region_size);
long system_serial_number;
@@ -171,6 +172,60 @@ int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus)
(u64)decode, (u64)domain, (u64)bus, 0, 0);
}
+extern s64 uv_bios_get_master_nasid(u64 size, u64 *master_nasid)
+{
+ return uv_bios_call(UV_BIOS_EXTRA, 0, UV_BIOS_EXTRA_MASTER_NASID, 0,
+ size, (u64)master_nasid);
+}
+EXPORT_SYMBOL_GPL(uv_bios_get_master_nasid);
+
+extern s64 uv_bios_get_heapsize(u64 nasid, u64 size, u64 *heap_size)
+{
+ return uv_bios_call(UV_BIOS_EXTRA, nasid, UV_BIOS_EXTRA_GET_HEAPSIZE,
+ 0, size, (u64)heap_size);
+}
+EXPORT_SYMBOL_GPL(uv_bios_get_heapsize);
+
+extern s64 uv_bios_install_heap(u64 nasid, u64 heap_size, u64 *bios_heap)
+{
+ return uv_bios_call(UV_BIOS_EXTRA, nasid, UV_BIOS_EXTRA_INSTALL_HEAP,
+ 0, heap_size, (u64)bios_heap);
+}
+EXPORT_SYMBOL_GPL(uv_bios_install_heap);
+
+extern s64 uv_bios_obj_count(u64 nasid, u64 size, u64 *objcnt)
+{
+ return uv_bios_call(UV_BIOS_EXTRA, nasid, UV_BIOS_EXTRA_OBJECT_COUNT,
+ 0, size, (u64)objcnt);
+}
+EXPORT_SYMBOL_GPL(uv_bios_obj_count);
+
+extern s64 uv_bios_enum_objs(u64 nasid, u64 size, u64 *objbuf)
+{
+ return uv_bios_call(UV_BIOS_EXTRA, nasid, UV_BIOS_EXTRA_ENUM_OBJECTS,
+ 0, size, (u64)objbuf);
+}
+EXPORT_SYMBOL_GPL(uv_bios_enum_objs);
+
+extern s64 uv_bios_enum_ports(u64 nasid, u64 obj_id, u64 size, u64 *portbuf)
+{
+ return uv_bios_call(UV_BIOS_EXTRA, nasid, UV_BIOS_EXTRA_ENUM_PORTS,
+ obj_id, size, (u64)portbuf);
+}
+EXPORT_SYMBOL_GPL(uv_bios_enum_ports);
+
+extern s64 uv_bios_get_geoinfo(u64 nasid, u64 size, u64 *buf)
+{
+ return uv_bios_call(UV_BIOS_GET_GEOINFO, nasid, (u64)buf, size, 0, 0);
+}
+EXPORT_SYMBOL_GPL(uv_bios_get_geoinfo);
+
+extern s64 uv_bios_get_pci_topology(u64 size, u64 *buf)
+{
+ return uv_bios_call(UV_BIOS_GET_PCI_TOPOLOGY, (u64)buf, size, 0, 0, 0);
+}
+EXPORT_SYMBOL_GPL(uv_bios_get_pci_topology);
+
unsigned long get_uv_systab_phys(bool msg)
{
if ((uv_systab_phys == EFI_INVALID_TABLE_ADDR) ||
diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c
index 18ca2261cc9a..1a536a187d74 100644
--- a/arch/x86/platform/uv/uv_irq.c
+++ b/arch/x86/platform/uv/uv_irq.c
@@ -35,8 +35,8 @@ static void uv_program_mmr(struct irq_cfg *cfg, struct uv_irq_2_mmr_pnode *info)
mmr_value = 0;
entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
entry->vector = cfg->vector;
- entry->delivery_mode = apic->irq_delivery_mode;
- entry->dest_mode = apic->irq_dest_mode;
+ entry->delivery_mode = apic->delivery_mode;
+ entry->dest_mode = apic->dest_mode_logical;
entry->polarity = 0;
entry->trigger = 0;
entry->mask = 0;
diff --git a/arch/x86/platform/uv/uv_sysfs.c b/arch/x86/platform/uv/uv_sysfs.c
deleted file mode 100644
index 266773e2fb37..000000000000
--- a/arch/x86/platform/uv/uv_sysfs.c
+++ /dev/null
@@ -1,63 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * This file supports the /sys/firmware/sgi_uv interfaces for SGI UV.
- *
- * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
- * Copyright (c) Russ Anderson
- */
-
-#include <linux/device.h>
-#include <asm/uv/bios.h>
-#include <asm/uv/uv.h>
-
-struct kobject *sgi_uv_kobj;
-
-static ssize_t partition_id_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- return snprintf(buf, PAGE_SIZE, "%ld\n", sn_partition_id);
-}
-
-static ssize_t coherence_id_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- return snprintf(buf, PAGE_SIZE, "%ld\n", sn_coherency_id);
-}
-
-static struct kobj_attribute partition_id_attr =
- __ATTR(partition_id, S_IRUGO, partition_id_show, NULL);
-
-static struct kobj_attribute coherence_id_attr =
- __ATTR(coherence_id, S_IRUGO, coherence_id_show, NULL);
-
-
-static int __init sgi_uv_sysfs_init(void)
-{
- unsigned long ret;
-
- if (!is_uv_system())
- return -ENODEV;
-
- if (!sgi_uv_kobj)
- sgi_uv_kobj = kobject_create_and_add("sgi_uv", firmware_kobj);
- if (!sgi_uv_kobj) {
- printk(KERN_WARNING "kobject_create_and_add sgi_uv failed\n");
- return -EINVAL;
- }
-
- ret = sysfs_create_file(sgi_uv_kobj, &partition_id_attr.attr);
- if (ret) {
- printk(KERN_WARNING "sysfs_create_file partition_id failed\n");
- return ret;
- }
-
- ret = sysfs_create_file(sgi_uv_kobj, &coherence_id_attr.attr);
- if (ret) {
- printk(KERN_WARNING "sysfs_create_file coherence_id failed\n");
- return ret;
- }
-
- return 0;
-}
-
-device_initcall(sgi_uv_sysfs_init);
diff --git a/arch/x86/purgatory/purgatory.c b/arch/x86/purgatory/purgatory.c
index 7b37a412f829..f03b64d9cb51 100644
--- a/arch/x86/purgatory/purgatory.c
+++ b/arch/x86/purgatory/purgatory.c
@@ -9,7 +9,7 @@
*/
#include <linux/bug.h>
-#include <crypto/sha.h>
+#include <crypto/sha2.h>
#include <asm/purgatory.h>
#include "../boot/string.h"
diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index ce7188cbdae5..1c3a1962cade 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -867,9 +867,11 @@ static int do_reloc32(struct section *sec, Elf_Rel *rel, Elf_Sym *sym,
case R_386_PC32:
case R_386_PC16:
case R_386_PC8:
+ case R_386_PLT32:
/*
- * NONE can be ignored and PC relative relocations don't
- * need to be adjusted.
+ * NONE can be ignored and PC relative relocations don't need
+ * to be adjusted. Because sym must be defined, R_386_PLT32 can
+ * be treated the same way as R_386_PC32.
*/
break;
@@ -910,9 +912,11 @@ static int do_reloc_real(struct section *sec, Elf_Rel *rel, Elf_Sym *sym,
case R_386_PC32:
case R_386_PC16:
case R_386_PC8:
+ case R_386_PLT32:
/*
- * NONE can be ignored and PC relative relocations don't
- * need to be adjusted.
+ * NONE can be ignored and PC relative relocations don't need
+ * to be adjusted. Because sym must be defined, R_386_PLT32 can
+ * be treated the same way as R_386_PC32.
*/
break;
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 218acbd5c7a0..afc1da68b06d 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -26,6 +26,19 @@ config XEN_PV
help
Support running as a Xen PV guest.
+config XEN_512GB
+ bool "Limit Xen pv-domain memory to 512GB"
+ depends on XEN_PV
+ default y
+ help
+ Limit paravirtualized user domains to 512GB of RAM.
+
+ The Xen tools and crash dump analysis tools might not support
+ pv-domains with more than 512 GB of RAM. This option controls
+ whether the kernel defaults to using only up to 512 GB.
+ It is always possible to change the default by specifying the
+ boot parameter "xen_512gb_limit".
+
config XEN_PV_SMP
def_bool y
depends on XEN_PV && SMP
@@ -39,28 +52,19 @@ config XEN_DOM0
Support running as a Xen PV Dom0 guest.
config XEN_PVHVM
- bool "Xen PVHVM guest support"
- default y
- depends on XEN && PCI && X86_LOCAL_APIC
- help
- Support running as a Xen PVHVM guest.
+ def_bool y
+ depends on XEN && X86_LOCAL_APIC
config XEN_PVHVM_SMP
def_bool y
depends on XEN_PVHVM && SMP
-config XEN_512GB
- bool "Limit Xen pv-domain memory to 512GB"
- depends on XEN_PV
+config XEN_PVHVM_GUEST
+ bool "Xen PVHVM guest support"
default y
+ depends on XEN_PVHVM && PCI
help
- Limit paravirtualized user domains to 512GB of RAM.
-
- The Xen tools and crash dump analysis tools might not support
- pv-domains with more than 512 GB of RAM. This option controls the
- default setting of the kernel to use only up to 512 GB or more.
- It is always possible to change the default via specifying the
- boot parameter "xen_512gb_limit".
+ Support running as a Xen PVHVM guest.
config XEN_SAVE_RESTORE
bool
@@ -76,7 +80,9 @@ config XEN_DEBUG_FS
Enabling this option may incur a significant performance overhead.
config XEN_PVH
- bool "Support for running as a Xen PVH guest"
+ bool "Xen PVH guest support"
depends on XEN && XEN_PVHVM && ACPI
select PVH
def_bool n
+ help
+ Support for running as a Xen PVH guest.
diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c
index e82fd1910dae..0d46cc283cf5 100644
--- a/arch/x86/xen/apic.c
+++ b/arch/x86/xen/apic.c
@@ -148,15 +148,12 @@ static struct apic xen_pv_apic = {
.apic_id_valid = xen_id_always_valid,
.apic_id_registered = xen_id_always_registered,
- /* .irq_delivery_mode - used in native_compose_msi_msg only */
- /* .irq_dest_mode - used in native_compose_msi_msg only */
+ /* .delivery_mode and .dest_mode_logical not used by XENPV */
.disable_esr = 0,
- /* .dest_logical - default_send_IPI_ use it but we use our own. */
- .check_apicid_used = default_check_apicid_used, /* Used on 32-bit */
+ .check_apicid_used = default_check_apicid_used, /* Used on 32-bit */
.init_apic_ldr = xen_noop, /* setup_local_APIC calls it */
-
.ioapic_phys_id_map = default_ioapic_phys_id_map, /* Used on 32-bit */
.setup_apic_routing = NULL,
.cpu_present_to_apicid = xen_cpu_present_to_apicid,
diff --git a/arch/x86/xen/efi.c b/arch/x86/xen/efi.c
index 205a9bc981b0..7d7ffb9c826a 100644
--- a/arch/x86/xen/efi.c
+++ b/arch/x86/xen/efi.c
@@ -93,37 +93,22 @@ static efi_system_table_t __init *xen_efi_probe(void)
/*
* Determine whether we're in secure boot mode.
- *
- * Please keep the logic in sync with
- * drivers/firmware/efi/libstub/secureboot.c:efi_get_secureboot().
*/
static enum efi_secureboot_mode xen_efi_get_secureboot(void)
{
- static efi_guid_t efi_variable_guid = EFI_GLOBAL_VARIABLE_GUID;
static efi_guid_t shim_guid = EFI_SHIM_LOCK_GUID;
+ enum efi_secureboot_mode mode;
efi_status_t status;
- u8 moksbstate, secboot, setupmode;
+ u8 moksbstate;
unsigned long size;
- size = sizeof(secboot);
- status = efi.get_variable(L"SecureBoot", &efi_variable_guid,
- NULL, &size, &secboot);
-
- if (status == EFI_NOT_FOUND)
- return efi_secureboot_mode_disabled;
-
- if (status != EFI_SUCCESS)
- goto out_efi_err;
-
- size = sizeof(setupmode);
- status = efi.get_variable(L"SetupMode", &efi_variable_guid,
- NULL, &size, &setupmode);
-
- if (status != EFI_SUCCESS)
- goto out_efi_err;
-
- if (secboot == 0 || setupmode == 1)
- return efi_secureboot_mode_disabled;
+ mode = efi_get_secureboot_mode(efi.get_variable);
+ if (mode == efi_secureboot_mode_unknown) {
+ pr_err("Could not determine UEFI Secure Boot status.\n");
+ return efi_secureboot_mode_unknown;
+ }
+ if (mode != efi_secureboot_mode_enabled)
+ return mode;
/* See if a user has put the shim into insecure mode. */
size = sizeof(moksbstate);
@@ -140,10 +125,6 @@ static enum efi_secureboot_mode xen_efi_get_secureboot(void)
secure_boot_enabled:
pr_info("UEFI Secure Boot is enabled.\n");
return efi_secureboot_mode_enabled;
-
- out_efi_err:
- pr_err("Could not determine UEFI Secure Boot status.\n");
- return efi_secureboot_mode_unknown;
}
void __init xen_efi_init(struct boot_params *boot_params)
diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c
index 9e87ab010c82..e68ea5f4ad1c 100644
--- a/arch/x86/xen/enlighten_hvm.c
+++ b/arch/x86/xen/enlighten_hvm.c
@@ -164,10 +164,10 @@ static int xen_cpu_up_prepare_hvm(unsigned int cpu)
else
per_cpu(xen_vcpu_id, cpu) = cpu;
rc = xen_vcpu_setup(cpu);
- if (rc)
+ if (rc || !xen_have_vector_callback)
return rc;
- if (xen_have_vector_callback && xen_feature(XENFEAT_hvm_safe_pvclock))
+ if (xen_feature(XENFEAT_hvm_safe_pvclock))
xen_setup_timer(cpu);
rc = xen_smp_intr_init(cpu);
@@ -188,6 +188,8 @@ static int xen_cpu_dead_hvm(unsigned int cpu)
return 0;
}
+static bool no_vector_callback __initdata;
+
static void __init xen_hvm_guest_init(void)
{
if (xen_pv_domain())
@@ -207,7 +209,7 @@ static void __init xen_hvm_guest_init(void)
xen_panic_handler_init();
- if (xen_feature(XENFEAT_hvm_callback_vector))
+ if (!no_vector_callback && xen_feature(XENFEAT_hvm_callback_vector))
xen_have_vector_callback = 1;
xen_hvm_smp_init();
@@ -233,6 +235,13 @@ static __init int xen_parse_nopv(char *arg)
}
early_param("xen_nopv", xen_parse_nopv);
+static __init int xen_parse_no_vector_callback(char *arg)
+{
+ no_vector_callback = true;
+ return 0;
+}
+early_param("xen_no_vector_callback", xen_parse_no_vector_callback);
+
bool __init xen_hvm_need_lapic(void)
{
if (xen_pv_domain())
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 4409306364dc..dc0a337f985b 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -567,10 +567,16 @@ void noist_exc_debug(struct pt_regs *regs);
DEFINE_IDTENTRY_RAW(xenpv_exc_nmi)
{
- /* On Xen PV, NMI doesn't use IST. The C part is the sane as native. */
+ /* On Xen PV, NMI doesn't use IST. The C part is the same as native. */
exc_nmi(regs);
}
+DEFINE_IDTENTRY_RAW_ERRORCODE(xenpv_exc_double_fault)
+{
+ /* On Xen PV, DF doesn't use IST. The C part is the same as native. */
+ exc_double_fault(regs, error_code);
+}
+
DEFINE_IDTENTRY_RAW(xenpv_exc_debug)
{
/*
@@ -583,6 +589,27 @@ DEFINE_IDTENTRY_RAW(xenpv_exc_debug)
exc_debug(regs);
}
+DEFINE_IDTENTRY_RAW(exc_xen_unknown_trap)
+{
+ /* This should never happen and there is no way to handle it. */
+ pr_err("Unknown trap in Xen PV mode.\n");
+ BUG();
+}
+
+#ifdef CONFIG_X86_MCE
+DEFINE_IDTENTRY_RAW(xenpv_exc_machine_check)
+{
+ /*
+ * There's no IST on Xen PV, but we still need to dispatch
+ * to the correct handler.
+ */
+ if (user_mode(regs))
+ noist_exc_machine_check(regs);
+ else
+ exc_machine_check(regs);
+}
+#endif
+
struct trap_array_entry {
void (*orig)(void);
void (*xen)(void);
@@ -601,9 +628,9 @@ struct trap_array_entry {
static struct trap_array_entry trap_array[] = {
TRAP_ENTRY_REDIR(exc_debug, true ),
- TRAP_ENTRY(exc_double_fault, true ),
+ TRAP_ENTRY_REDIR(exc_double_fault, true ),
#ifdef CONFIG_X86_MCE
- TRAP_ENTRY(exc_machine_check, true ),
+ TRAP_ENTRY_REDIR(exc_machine_check, true ),
#endif
TRAP_ENTRY_REDIR(exc_nmi, true ),
TRAP_ENTRY(exc_int3, false ),
@@ -631,6 +658,7 @@ static bool __ref get_trap_addr(void **addr, unsigned int ist)
{
unsigned int nr;
bool ist_okay = false;
+ bool found = false;
/*
* Replace trap handler addresses by Xen specific ones.
@@ -645,6 +673,7 @@ static bool __ref get_trap_addr(void **addr, unsigned int ist)
if (*addr == entry->orig) {
*addr = entry->xen;
ist_okay = entry->ist_okay;
+ found = true;
break;
}
}
@@ -655,9 +684,13 @@ static bool __ref get_trap_addr(void **addr, unsigned int ist)
nr = (*addr - (void *)early_idt_handler_array[0]) /
EARLY_IDT_HANDLER_SIZE;
*addr = (void *)xen_early_idt_handler_array[nr];
+ found = true;
}
- if (WARN_ON(ist != 0 && !ist_okay))
+ if (!found)
+ *addr = (void *)xen_asm_exc_xen_unknown_trap;
+
+ if (WARN_ON(found && ist != 0 && !ist_okay))
return false;
return true;
@@ -1002,8 +1035,6 @@ void __init xen_setup_vcpu_info_placement(void)
*/
if (xen_have_vcpu_info_placement) {
pv_ops.irq.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
- pv_ops.irq.restore_fl =
- __PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
pv_ops.irq.irq_disable =
__PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
pv_ops.irq.irq_enable =
@@ -1040,7 +1071,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
.read_pmc = xen_read_pmc,
.iret = xen_iret,
- .usergs_sysret64 = xen_sysret64,
.load_tr_desc = paravirt_nop,
.set_ldt = xen_set_ldt,
@@ -1065,9 +1095,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
#endif
.io_delay = xen_io_delay,
- /* Xen takes care of %gs when switching to usermode for us */
- .swapgs = paravirt_nop,
-
.start_context_switch = paravirt_start_context_switch,
.end_context_switch = xen_end_context_switch,
};
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index 850c93f346c7..dfa091d79c2e 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -42,28 +42,6 @@ asmlinkage __visible unsigned long xen_save_fl(void)
}
PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl);
-__visible void xen_restore_fl(unsigned long flags)
-{
- struct vcpu_info *vcpu;
-
- /* convert from IF type flag */
- flags = !(flags & X86_EFLAGS_IF);
-
- /* See xen_irq_enable() for why preemption must be disabled. */
- preempt_disable();
- vcpu = this_cpu_read(xen_vcpu);
- vcpu->evtchn_upcall_mask = flags;
-
- if (flags == 0) {
- barrier(); /* unmask then check (avoid races) */
- if (unlikely(vcpu->evtchn_upcall_pending))
- xen_force_evtchn_callback();
- preempt_enable();
- } else
- preempt_enable_no_resched();
-}
-PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl);
-
asmlinkage __visible void xen_irq_disable(void)
{
/* There's a one instruction preempt window here. We need to
@@ -118,7 +96,6 @@ static void xen_halt(void)
static const struct pv_irq_ops xen_irq_ops __initconst = {
.save_fl = PV_CALLEE_SAVE(xen_save_fl),
- .restore_fl = PV_CALLEE_SAVE(xen_restore_fl),
.irq_disable = PV_CALLEE_SAVE(xen_irq_disable),
.irq_enable = PV_CALLEE_SAVE(xen_irq_enable),
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index be4151f42611..3301875dd196 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -795,17 +795,7 @@ static int p2m_dump_show(struct seq_file *m, void *v)
return 0;
}
-static int p2m_dump_open(struct inode *inode, struct file *filp)
-{
- return single_open(filp, p2m_dump_show, NULL);
-}
-
-static const struct file_operations p2m_dump_fops = {
- .open = p2m_dump_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(p2m_dump);
static struct dentry *d_mmu_debug;
diff --git a/arch/x86/xen/smp_hvm.c b/arch/x86/xen/smp_hvm.c
index f5e7db4f82ab..6ff3c887e0b9 100644
--- a/arch/x86/xen/smp_hvm.c
+++ b/arch/x86/xen/smp_hvm.c
@@ -33,9 +33,11 @@ static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
int cpu;
native_smp_prepare_cpus(max_cpus);
- WARN_ON(xen_smp_intr_init(0));
- xen_init_lock_cpu(0);
+ if (xen_have_vector_callback) {
+ WARN_ON(xen_smp_intr_init(0));
+ xen_init_lock_cpu(0);
+ }
for_each_possible_cpu(cpu) {
if (cpu == 0)
@@ -50,9 +52,11 @@ static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
static void xen_hvm_cpu_die(unsigned int cpu)
{
if (common_cpu_die(cpu) == 0) {
- xen_smp_intr_free(cpu);
- xen_uninit_lock_cpu(cpu);
- xen_teardown_timer(cpu);
+ if (xen_have_vector_callback) {
+ xen_smp_intr_free(cpu);
+ xen_uninit_lock_cpu(cpu);
+ xen_teardown_timer(cpu);
+ }
}
}
#else
@@ -64,14 +68,19 @@ static void xen_hvm_cpu_die(unsigned int cpu)
void __init xen_hvm_smp_init(void)
{
- if (!xen_have_vector_callback)
+ smp_ops.smp_prepare_boot_cpu = xen_hvm_smp_prepare_boot_cpu;
+ smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus;
+ smp_ops.smp_cpus_done = xen_smp_cpus_done;
+ smp_ops.cpu_die = xen_hvm_cpu_die;
+
+ if (!xen_have_vector_callback) {
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+ nopvspin = true;
+#endif
return;
+ }
- smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus;
smp_ops.smp_send_reschedule = xen_smp_send_reschedule;
- smp_ops.cpu_die = xen_hvm_cpu_die;
smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi;
smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi;
- smp_ops.smp_prepare_boot_cpu = xen_hvm_smp_prepare_boot_cpu;
- smp_ops.smp_cpus_done = xen_smp_cpus_done;
}
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index 799f4eba0a62..043c73dfd2c9 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -93,10 +93,20 @@ void xen_init_lock_cpu(int cpu)
void xen_uninit_lock_cpu(int cpu)
{
+ int irq;
+
if (!xen_pvspin)
return;
- unbind_from_irqhandler(per_cpu(lock_kicker_irq, cpu), NULL);
+ /*
+ * When booting the kernel with 'mitigations=auto,nosmt', the secondary
+ * CPUs are not activated, and lock_kicker_irq is not initialized.
+ */
+ irq = per_cpu(lock_kicker_irq, cpu);
+ if (irq == -1)
+ return;
+
+ unbind_from_irqhandler(irq, NULL);
per_cpu(lock_kicker_irq, cpu) = -1;
kfree(per_cpu(irq_name, cpu));
per_cpu(irq_name, cpu) = NULL;
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
index 1cb0e84b9161..02f31341e435 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm.S
@@ -72,34 +72,6 @@ SYM_FUNC_START(xen_save_fl_direct)
ret
SYM_FUNC_END(xen_save_fl_direct)
-
-/*
- * In principle the caller should be passing us a value returned from
- * xen_save_fl_direct, but for robustness' sake we test only the
- * X86_EFLAGS_IF flag rather than the whole byte. After setting the
- * interrupt mask state, it checks for unmasked pending events and
- * enters the hypervisor to get them delivered if so.
- */
-SYM_FUNC_START(xen_restore_fl_direct)
- FRAME_BEGIN
- testw $X86_EFLAGS_IF, %di
- setz PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
- /*
- * Preempt here doesn't matter because that will deal with any
- * pending interrupts. The pending check may end up being run
- * on the wrong CPU, but that doesn't hurt.
- */
-
- /* check for unmasked and pending */
- cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
- jnz 1f
- call check_events
-1:
- FRAME_END
- ret
-SYM_FUNC_END(xen_restore_fl_direct)
-
-
/*
* Force an event check by making a hypercall, but preserve regs
* before making the call.
@@ -161,7 +133,7 @@ xen_pv_trap asm_exc_overflow
xen_pv_trap asm_exc_bounds
xen_pv_trap asm_exc_invalid_op
xen_pv_trap asm_exc_device_not_available
-xen_pv_trap asm_exc_double_fault
+xen_pv_trap asm_xenpv_exc_double_fault
xen_pv_trap asm_exc_coproc_segment_overrun
xen_pv_trap asm_exc_invalid_tss
xen_pv_trap asm_exc_segment_not_present
@@ -172,12 +144,13 @@ xen_pv_trap asm_exc_spurious_interrupt_bug
xen_pv_trap asm_exc_coprocessor_error
xen_pv_trap asm_exc_alignment_check
#ifdef CONFIG_X86_MCE
-xen_pv_trap asm_exc_machine_check
+xen_pv_trap asm_xenpv_exc_machine_check
#endif /* CONFIG_X86_MCE */
xen_pv_trap asm_exc_simd_coprocessor_error
#ifdef CONFIG_IA32_EMULATION
xen_pv_trap entry_INT80_compat
#endif
+xen_pv_trap asm_exc_xen_unknown_trap
xen_pv_trap asm_exc_xen_hypervisor_callback
__INIT
@@ -214,26 +187,6 @@ SYM_CODE_START(xen_iret)
jmp hypercall_iret
SYM_CODE_END(xen_iret)
-SYM_CODE_START(xen_sysret64)
- /*
- * We're already on the usermode stack at this point, but
- * still with the kernel gs, so we can easily switch back.
- *
- * tss.sp2 is scratch space.
- */
- movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
- movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
-
- pushq $__USER_DS
- pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
- pushq %r11
- pushq $__USER_CS
- pushq %rcx
-
- pushq $VGCF_in_syscall
- jmp hypercall_iret
-SYM_CODE_END(xen_sysret64)
-
/*
* Xen handles syscall callbacks much like ordinary exceptions, which
* means we have:
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 9546c3384c75..8d7ec49a35fb 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -131,15 +131,12 @@ static inline void __init xen_efi_init(struct boot_params *boot_params)
__visible void xen_irq_enable_direct(void);
__visible void xen_irq_disable_direct(void);
__visible unsigned long xen_save_fl_direct(void);
-__visible void xen_restore_fl_direct(unsigned long);
__visible unsigned long xen_read_cr2(void);
__visible unsigned long xen_read_cr2_direct(void);
/* These are not functions, and cannot be called normally */
__visible void xen_iret(void);
-__visible void xen_sysret32(void);
-__visible void xen_sysret64(void);
extern int xen_panic_handler_init(void);