summaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig28
-rw-r--r--arch/x86/Kconfig.cpu14
-rw-r--r--arch/x86/Kconfig.debug1
-rw-r--r--arch/x86/Makefile31
-rw-r--r--arch/x86/boot/compressed/Makefile1
-rw-r--r--arch/x86/boot/compressed/eboot.c10
-rw-r--r--arch/x86/boot/compressed/kaslr.c18
-rw-r--r--arch/x86/boot/compressed/mem_encrypt.S19
-rw-r--r--arch/x86/boot/compressed/misc.h1
-rw-r--r--arch/x86/boot/header.S6
-rw-r--r--arch/x86/boot/tools/build.c7
-rw-r--r--arch/x86/configs/i386_defconfig1
-rw-r--r--arch/x86/configs/x86_64_defconfig1
-rw-r--r--arch/x86/crypto/aegis128-aesni-glue.c1
-rw-r--r--arch/x86/crypto/aegis128l-aesni-glue.c1
-rw-r--r--arch/x86/crypto/aegis256-aesni-glue.c1
-rw-r--r--arch/x86/crypto/aesni-intel_asm.S66
-rw-r--r--arch/x86/crypto/morus1280-sse2-glue.c1
-rw-r--r--arch/x86/crypto/morus640-sse2-glue.c1
-rw-r--r--arch/x86/entry/calling.h2
-rw-r--r--arch/x86/entry/entry_32.S21
-rw-r--r--arch/x86/entry/entry_64.S138
-rw-r--r--arch/x86/entry/vdso/Makefile16
-rw-r--r--arch/x86/entry/vdso/vclock_gettime.c220
-rw-r--r--arch/x86/entry/vdso/vgetcpu.c8
-rw-r--r--arch/x86/entry/vdso/vma.c38
-rw-r--r--arch/x86/entry/vsyscall/vsyscall_64.c9
-rw-r--r--arch/x86/entry/vsyscall/vsyscall_gtod.c51
-rw-r--r--arch/x86/events/amd/core.c4
-rw-r--r--arch/x86/events/amd/uncore.c30
-rw-r--r--arch/x86/events/core.c43
-rw-r--r--arch/x86/events/intel/core.c346
-rw-r--r--arch/x86/events/intel/cstate.c8
-rw-r--r--arch/x86/events/intel/lbr.c4
-rw-r--r--arch/x86/events/intel/pt.c2
-rw-r--r--arch/x86/events/intel/rapl.c4
-rw-r--r--arch/x86/events/intel/uncore_snbep.c14
-rw-r--r--arch/x86/events/msr.c8
-rw-r--r--arch/x86/events/perf_event.h4
-rw-r--r--arch/x86/hyperv/Makefile4
-rw-r--r--arch/x86/hyperv/hv_apic.c9
-rw-r--r--arch/x86/hyperv/hv_init.c19
-rw-r--r--arch/x86/hyperv/hv_spinlock.c88
-rw-r--r--arch/x86/hyperv/mmu.c4
-rw-r--r--arch/x86/include/asm/acpi.h7
-rw-r--r--arch/x86/include/asm/alternative-asm.h20
-rw-r--r--arch/x86/include/asm/alternative.h11
-rw-r--r--arch/x86/include/asm/amd_nb.h3
-rw-r--r--arch/x86/include/asm/asm.h57
-rw-r--r--arch/x86/include/asm/atomic.h20
-rw-r--r--arch/x86/include/asm/atomic64_32.h8
-rw-r--r--arch/x86/include/asm/atomic64_64.h20
-rw-r--r--arch/x86/include/asm/bitops.h9
-rw-r--r--arch/x86/include/asm/bug.h98
-rw-r--r--arch/x86/include/asm/cacheinfo.h1
-rw-r--r--arch/x86/include/asm/cmpxchg.h10
-rw-r--r--arch/x86/include/asm/compat.h2
-rw-r--r--arch/x86/include/asm/cpu_entry_area.h2
-rw-r--r--arch/x86/include/asm/cpufeature.h82
-rw-r--r--arch/x86/include/asm/debugreg.h2
-rw-r--r--arch/x86/include/asm/desc.h4
-rw-r--r--arch/x86/include/asm/efi.h1
-rw-r--r--arch/x86/include/asm/elf.h9
-rw-r--r--arch/x86/include/asm/extable.h3
-rw-r--r--arch/x86/include/asm/fixmap.h12
-rw-r--r--arch/x86/include/asm/fpu/internal.h4
-rw-r--r--arch/x86/include/asm/fsgsbase.h49
-rw-r--r--arch/x86/include/asm/futex.h6
-rw-r--r--arch/x86/include/asm/hyperv-tlfs.h21
-rw-r--r--arch/x86/include/asm/intel-family.h33
-rw-r--r--arch/x86/include/asm/io.h15
-rw-r--r--arch/x86/include/asm/irqflags.h19
-rw-r--r--arch/x86/include/asm/jump_label.h80
-rw-r--r--arch/x86/include/asm/kdebug.h12
-rw-r--r--arch/x86/include/asm/kexec.h2
-rw-r--r--arch/x86/include/asm/kvm_emulate.h4
-rw-r--r--arch/x86/include/asm/kvm_host.h27
-rw-r--r--arch/x86/include/asm/local.h8
-rw-r--r--arch/x86/include/asm/mce.h55
-rw-r--r--arch/x86/include/asm/mem_encrypt.h7
-rw-r--r--arch/x86/include/asm/mmu_context.h4
-rw-r--r--arch/x86/include/asm/mpx.h12
-rw-r--r--arch/x86/include/asm/mshyperv.h2
-rw-r--r--arch/x86/include/asm/msr-index.h1
-rw-r--r--arch/x86/include/asm/msr.h4
-rw-r--r--arch/x86/include/asm/nospec-branch.h17
-rw-r--r--arch/x86/include/asm/page_64_types.h15
-rw-r--r--arch/x86/include/asm/paravirt.h415
-rw-r--r--arch/x86/include/asm/paravirt_types.h138
-rw-r--r--arch/x86/include/asm/percpu.h8
-rw-r--r--arch/x86/include/asm/perf_event.h9
-rw-r--r--arch/x86/include/asm/pgalloc.h2
-rw-r--r--arch/x86/include/asm/pgtable-2level.h9
-rw-r--r--arch/x86/include/asm/pgtable-3level.h7
-rw-r--r--arch/x86/include/asm/pgtable-3level_types.h2
-rw-r--r--arch/x86/include/asm/pgtable.h9
-rw-r--r--arch/x86/include/asm/pgtable_64.h23
-rw-r--r--arch/x86/include/asm/pgtable_types.h2
-rw-r--r--arch/x86/include/asm/preempt.h2
-rw-r--r--arch/x86/include/asm/processor.h17
-rw-r--r--arch/x86/include/asm/ptrace.h48
-rw-r--r--arch/x86/include/asm/qspinlock.h15
-rw-r--r--arch/x86/include/asm/refcount.h79
-rw-r--r--arch/x86/include/asm/rmwcc.h69
-rw-r--r--arch/x86/include/asm/sections.h1
-rw-r--r--arch/x86/include/asm/segment.h48
-rw-r--r--arch/x86/include/asm/signal.h7
-rw-r--r--arch/x86/include/asm/special_insns.h4
-rw-r--r--arch/x86/include/asm/stacktrace.h2
-rw-r--r--arch/x86/include/asm/string_64.h20
-rw-r--r--arch/x86/include/asm/suspend.h8
-rw-r--r--arch/x86/include/asm/suspend_32.h4
-rw-r--r--arch/x86/include/asm/tlb.h21
-rw-r--r--arch/x86/include/asm/tlbflush.h73
-rw-r--r--arch/x86/include/asm/trace/mpx.h4
-rw-r--r--arch/x86/include/asm/uaccess.h22
-rw-r--r--arch/x86/include/asm/uv/uv.h6
-rw-r--r--arch/x86/include/asm/vgtod.h79
-rw-r--r--arch/x86/include/asm/virtext.h5
-rw-r--r--arch/x86/include/asm/x86_init.h2
-rw-r--r--arch/x86/include/asm/xen/events.h2
-rw-r--r--arch/x86/include/uapi/asm/bootparam.h4
-rw-r--r--arch/x86/include/uapi/asm/kvm.h1
-rw-r--r--arch/x86/include/uapi/asm/siginfo.h2
-rw-r--r--arch/x86/kernel/acpi/boot.c6
-rw-r--r--arch/x86/kernel/alternative.c15
-rw-r--r--arch/x86/kernel/amd_gart_64.c6
-rw-r--r--arch/x86/kernel/amd_nb.c49
-rw-r--r--arch/x86/kernel/apic/apic.c7
-rw-r--r--arch/x86/kernel/apic/probe_32.c1
-rw-r--r--arch/x86/kernel/apic/vector.c11
-rw-r--r--arch/x86/kernel/apm_32.c2
-rw-r--r--arch/x86/kernel/asm-offsets.c18
-rw-r--r--arch/x86/kernel/asm-offsets_64.c9
-rw-r--r--arch/x86/kernel/check.c28
-rw-r--r--arch/x86/kernel/cpu/Makefile1
-rw-r--r--arch/x86/kernel/cpu/amd.c2
-rw-r--r--arch/x86/kernel/cpu/bugs.c108
-rw-r--r--arch/x86/kernel/cpu/cacheinfo.c31
-rw-r--r--arch/x86/kernel/cpu/common.c78
-rw-r--r--arch/x86/kernel/cpu/cpu.h1
-rw-r--r--arch/x86/kernel/cpu/cyrix.c2
-rw-r--r--arch/x86/kernel/cpu/hygon.c408
-rw-r--r--arch/x86/kernel/cpu/intel.c3
-rw-r--r--arch/x86/kernel/cpu/intel_rdt.c17
-rw-r--r--arch/x86/kernel/cpu/intel_rdt.h23
-rw-r--r--arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c39
-rw-r--r--arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c405
-rw-r--r--arch/x86/kernel/cpu/intel_rdt_rdtgroup.c263
-rw-r--r--arch/x86/kernel/cpu/mcheck/dev-mcelog.c3
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-inject.c6
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-severity.c3
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c22
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c24
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c17
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c14
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c3
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.c2
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c2
-rw-r--r--arch/x86/kernel/cpu/vmware.c4
-rw-r--r--arch/x86/kernel/crash_dump_64.c60
-rw-r--r--arch/x86/kernel/dumpstack.c31
-rw-r--r--arch/x86/kernel/eisa.c10
-rw-r--r--arch/x86/kernel/fpu/signal.c1
-rw-r--r--arch/x86/kernel/head32.c1
-rw-r--r--arch/x86/kernel/head64.c22
-rw-r--r--arch/x86/kernel/head_64.S18
-rw-r--r--arch/x86/kernel/jump_label.c62
-rw-r--r--arch/x86/kernel/kprobes/core.c48
-rw-r--r--arch/x86/kernel/kprobes/opt.c2
-rw-r--r--arch/x86/kernel/kvm.c19
-rw-r--r--arch/x86/kernel/kvmclock.c56
-rw-r--r--arch/x86/kernel/ldt.c2
-rw-r--r--arch/x86/kernel/macros.S16
-rw-r--r--arch/x86/kernel/module.c6
-rw-r--r--arch/x86/kernel/paravirt-spinlocks.c15
-rw-r--r--arch/x86/kernel/paravirt.c320
-rw-r--r--arch/x86/kernel/paravirt_patch_32.c87
-rw-r--r--arch/x86/kernel/paravirt_patch_64.c97
-rw-r--r--arch/x86/kernel/pci-swiotlb.c2
-rw-r--r--arch/x86/kernel/process_32.c4
-rw-r--r--arch/x86/kernel/process_64.c197
-rw-r--r--arch/x86/kernel/ptrace.c57
-rw-r--r--arch/x86/kernel/setup.c19
-rw-r--r--arch/x86/kernel/smpboot.c4
-rw-r--r--arch/x86/kernel/time.c24
-rw-r--r--arch/x86/kernel/topology.c4
-rw-r--r--arch/x86/kernel/traps.c196
-rw-r--r--arch/x86/kernel/tsc.c16
-rw-r--r--arch/x86/kernel/tsc_msr.c10
-rw-r--r--arch/x86/kernel/umip.c8
-rw-r--r--arch/x86/kernel/uprobes.c2
-rw-r--r--arch/x86/kernel/vm86_32.c2
-rw-r--r--arch/x86/kernel/vmlinux.lds.S29
-rw-r--r--arch/x86/kernel/vsmp_64.c26
-rw-r--r--arch/x86/kernel/x86_init.c3
-rw-r--r--arch/x86/kvm/emulate.c11
-rw-r--r--arch/x86/kvm/lapic.c49
-rw-r--r--arch/x86/kvm/mmu.c70
-rw-r--r--arch/x86/kvm/svm.c32
-rw-r--r--arch/x86/kvm/vmx.c300
-rw-r--r--arch/x86/kvm/x86.c131
-rw-r--r--arch/x86/kvm/x86.h2
-rw-r--r--arch/x86/lib/checksum_32.S4
-rw-r--r--arch/x86/lib/copy_user_64.S90
-rw-r--r--arch/x86/lib/csum-copy_64.S8
-rw-r--r--arch/x86/lib/getuser.S12
-rw-r--r--arch/x86/lib/putuser.S10
-rw-r--r--arch/x86/lib/usercopy.c5
-rw-r--r--arch/x86/lib/usercopy_32.c126
-rw-r--r--arch/x86/lib/usercopy_64.c8
-rw-r--r--arch/x86/mm/cpu_entry_area.c36
-rw-r--r--arch/x86/mm/dump_pagetables.c35
-rw-r--r--arch/x86/mm/extable.c114
-rw-r--r--arch/x86/mm/fault.c480
-rw-r--r--arch/x86/mm/init.c4
-rw-r--r--arch/x86/mm/init_32.c23
-rw-r--r--arch/x86/mm/ioremap.c24
-rw-r--r--arch/x86/mm/mem_encrypt.c24
-rw-r--r--arch/x86/mm/mem_encrypt_identity.c1
-rw-r--r--arch/x86/mm/mpx.c30
-rw-r--r--arch/x86/mm/pageattr.c652
-rw-r--r--arch/x86/mm/pgtable.c27
-rw-r--r--arch/x86/mm/pti.c35
-rw-r--r--arch/x86/mm/tlb.c197
-rw-r--r--arch/x86/pci/acpi.c2
-rw-r--r--arch/x86/pci/amd_bus.c6
-rw-r--r--arch/x86/pci/fixup.c12
-rw-r--r--arch/x86/platform/atom/punit_atom_debug.c6
-rw-r--r--arch/x86/platform/efi/early_printk.c8
-rw-r--r--arch/x86/platform/efi/efi_32.c7
-rw-r--r--arch/x86/platform/efi/efi_64.c10
-rw-r--r--arch/x86/platform/efi/quirks.c78
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c17
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_bt.c2
-rw-r--r--arch/x86/platform/olpc/olpc-xo1-rtc.c3
-rw-r--r--arch/x86/platform/ts5500/ts5500.c1
-rw-r--r--arch/x86/power/Makefile2
-rw-r--r--arch/x86/power/hibernate.c248
-rw-r--r--arch/x86/power/hibernate_32.c52
-rw-r--r--arch/x86/power/hibernate_64.c224
-rw-r--r--arch/x86/power/hibernate_asm_32.S37
-rw-r--r--arch/x86/power/hibernate_asm_64.S2
-rw-r--r--arch/x86/tools/relocs.c10
-rw-r--r--arch/x86/um/asm/elf.h3
-rw-r--r--arch/x86/xen/Kconfig2
-rw-r--r--arch/x86/xen/Makefile41
-rw-r--r--arch/x86/xen/efi.c14
-rw-r--r--arch/x86/xen/enlighten.c3
-rw-r--r--arch/x86/xen/enlighten_hvm.c2
-rw-r--r--arch/x86/xen/enlighten_pv.c31
-rw-r--r--arch/x86/xen/enlighten_pvh.c3
-rw-r--r--arch/x86/xen/grant-table.c25
-rw-r--r--arch/x86/xen/irq.c2
-rw-r--r--arch/x86/xen/mmu.c188
-rw-r--r--arch/x86/xen/mmu_hvm.c2
-rw-r--r--arch/x86/xen/mmu_pv.c185
-rw-r--r--arch/x86/xen/p2m.c2
-rw-r--r--arch/x86/xen/pci-swiotlb-xen.c2
-rw-r--r--arch/x86/xen/platform-pci-unplug.c19
-rw-r--r--arch/x86/xen/pmu.c15
-rw-r--r--arch/x86/xen/smp_pv.c2
-rw-r--r--arch/x86/xen/spinlock.c11
-rw-r--r--arch/x86/xen/time.c4
-rw-r--r--arch/x86/xen/vdso.h2
-rw-r--r--arch/x86/xen/xen-asm_64.S8
-rw-r--r--arch/x86/xen/xen-pvh.S15
267 files changed, 6302 insertions, 3905 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index c5ff296bc5d1..cbd5f28ea8e2 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -48,6 +48,7 @@ config X86
select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI
select ANON_INODES
select ARCH_CLOCKSOURCE_DATA
+ select ARCH_CLOCKSOURCE_INIT
select ARCH_DISCARD_MEMBLOCK
select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI
select ARCH_HAS_DEBUG_VIRTUAL
@@ -119,6 +120,7 @@ config X86
select HAVE_ARCH_AUDITSYSCALL
select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE
select HAVE_ARCH_JUMP_LABEL
+ select HAVE_ARCH_JUMP_LABEL_RELATIVE
select HAVE_ARCH_KASAN if X86_64
select HAVE_ARCH_KGDB
select HAVE_ARCH_MMAP_RND_BITS if MMU
@@ -447,7 +449,6 @@ config RETPOLINE
config INTEL_RDT
bool "Intel Resource Director Technology support"
- default n
depends on X86 && CPU_SUP_INTEL
select KERNFS
help
@@ -523,6 +524,7 @@ config X86_VSMP
bool "ScaleMP vSMP"
select HYPERVISOR_GUEST
select PARAVIRT
+ select PARAVIRT_XXL
depends on X86_64 && PCI
depends on X86_EXTENDED_PLATFORM
depends on SMP
@@ -701,7 +703,6 @@ config STA2X11
select SWIOTLB
select MFD_STA2X11
select GPIOLIB
- default n
---help---
This adds support for boards based on the STA2X11 IO-Hub,
a.k.a. "ConneXt". The chip is used in place of the standard
@@ -754,6 +755,9 @@ config PARAVIRT
over full virtualization. However, when run without a hypervisor
the kernel is theoretically slower and slightly larger.
+config PARAVIRT_XXL
+ bool
+
config PARAVIRT_DEBUG
bool "paravirt-ops debugging"
depends on PARAVIRT && DEBUG_KERNEL
@@ -799,7 +803,6 @@ config KVM_GUEST
config KVM_DEBUG_FS
bool "Enable debug information for KVM Guests in debugfs"
depends on KVM_GUEST && DEBUG_FS
- default n
---help---
This option enables collection of various statistics for KVM guest.
Statistics are displayed in debugfs filesystem. Enabling this option
@@ -808,7 +811,6 @@ config KVM_DEBUG_FS
config PARAVIRT_TIME_ACCOUNTING
bool "Paravirtual steal time accounting"
depends on PARAVIRT
- default n
---help---
Select this option to enable fine granularity task steal time
accounting. Time spent executing other tasks in parallel with
@@ -1168,7 +1170,6 @@ source "arch/x86/events/Kconfig"
config X86_LEGACY_VM86
bool "Legacy VM86 support"
- default n
depends on X86_32
---help---
This option allows user programs to put the CPU into V8086
@@ -1491,6 +1492,14 @@ config X86_DIRECT_GBPAGES
supports them), so don't confuse the user by printing
that we have them enabled.
+config X86_CPA_STATISTICS
+ bool "Enable statistic for Change Page Attribute"
+ depends on DEBUG_FS
+ ---help---
+ Expose statistics about the Change Page Attribute mechanims, which
+ helps to determine the effectivness of preserving large and huge
+ page mappings when mapping protections are changed.
+
config ARCH_HAS_MEM_ENCRYPT
def_bool y
@@ -2220,7 +2229,6 @@ config HOTPLUG_CPU
config BOOTPARAM_HOTPLUG_CPU0
bool "Set default setting of cpu0_hotpluggable"
- default n
depends on HOTPLUG_CPU
---help---
Set whether default state of cpu0_hotpluggable is on or off.
@@ -2422,7 +2430,7 @@ menu "Power management and ACPI options"
config ARCH_HIBERNATION_HEADER
def_bool y
- depends on X86_64 && HIBERNATION
+ depends on HIBERNATION
source "kernel/power/Kconfig"
@@ -2742,8 +2750,7 @@ config OLPC
config OLPC_XO1_PM
bool "OLPC XO-1 Power Management"
- depends on OLPC && MFD_CS5535 && PM_SLEEP
- select MFD_CORE
+ depends on OLPC && MFD_CS5535=y && PM_SLEEP
---help---
Add support for poweroff and suspend of the OLPC XO-1 laptop.
@@ -2825,7 +2832,6 @@ source "drivers/pcmcia/Kconfig"
config RAPIDIO
tristate "RapidIO support"
depends on PCI
- default n
help
If enabled this option will include drivers and the core
infrastructure code to support RapidIO interconnect devices.
@@ -2843,7 +2849,7 @@ config X86_SYSFB
This option, if enabled, marks VGA/VBE/EFI framebuffers as generic
framebuffers so the new generic system-framebuffer drivers can be
used on x86. If the framebuffer is not compatible with the generic
- modes, it is adverticed as fallback platform framebuffer so legacy
+ modes, it is advertised as fallback platform framebuffer so legacy
drivers like efifb, vesafb and uvesafb can pick it up.
If this option is not selected, all system framebuffers are always
marked as fallback platform framebuffers as usual.
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 638411f22267..6adce15268bd 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -426,6 +426,20 @@ config CPU_SUP_AMD
If unsure, say N.
+config CPU_SUP_HYGON
+ default y
+ bool "Support Hygon processors" if PROCESSOR_SELECT
+ select CPU_SUP_AMD
+ help
+ This enables detection, tunings and quirks for Hygon processors
+
+ You need this enabled if you want your kernel to run on an
+ Hygon CPU. Disabling this option on other types of CPUs
+ makes the kernel a tiny bit smaller. Disabling it on an Hygon
+ CPU might render the kernel unbootable.
+
+ If unsure, say N.
+
config CPU_SUP_CENTAUR
default y
bool "Support Centaur processors" if PROCESSOR_SELECT
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 7d68f0c7cfb1..0723dff17e6c 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -314,7 +314,6 @@ config DEBUG_NMI_SELFTEST
config DEBUG_IMR_SELFTEST
bool "Isolated Memory Region self test"
- default n
depends on INTEL_IMR
---help---
This option enables automated sanity testing of the IMR code.
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 94859241bc3e..5b562e464009 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -175,22 +175,6 @@ ifdef CONFIG_FUNCTION_GRAPH_TRACER
endif
endif
-ifndef CC_HAVE_ASM_GOTO
- $(error Compiler lacks asm-goto support.)
-endif
-
-#
-# Jump labels need '-maccumulate-outgoing-args' for gcc < 4.5.2 to prevent a
-# GCC bug (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=46226). There's no way
-# to test for this bug at compile-time because the test case needs to execute,
-# which is a no-go for cross compilers. So check the GCC version instead.
-#
-ifdef CONFIG_JUMP_LABEL
- ifneq ($(ACCUMULATE_OUTGOING_ARGS), 1)
- ACCUMULATE_OUTGOING_ARGS = $(call cc-if-fullversion, -lt, 040502, 1)
- endif
-endif
-
ifeq ($(ACCUMULATE_OUTGOING_ARGS), 1)
# This compiler flag is not supported by Clang:
KBUILD_CFLAGS += $(call cc-option,-maccumulate-outgoing-args,)
@@ -209,7 +193,6 @@ cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTI
# does binutils support specific instructions?
asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1)
asinstr += $(call as-instr,pshufb %xmm0$(comma)%xmm0,-DCONFIG_AS_SSSE3=1)
-asinstr += $(call as-instr,crc32l %eax$(comma)%eax,-DCONFIG_AS_CRC32=1)
avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1)
avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1)
avx512_instr :=$(call as-instr,vpmovm2b %k1$(comma)%zmm5,-DCONFIG_AS_AVX512=1)
@@ -253,6 +236,13 @@ archscripts: scripts_basic
archheaders:
$(Q)$(MAKE) $(build)=arch/x86/entry/syscalls all
+archmacros:
+ $(Q)$(MAKE) $(build)=arch/x86/kernel arch/x86/kernel/macros.s
+
+ASM_MACRO_FLAGS = -Wa,arch/x86/kernel/macros.s -Wa,-
+export ASM_MACRO_FLAGS
+KBUILD_CFLAGS += $(ASM_MACRO_FLAGS)
+
###
# Kernel objects
@@ -312,6 +302,13 @@ PHONY += vdso_install
vdso_install:
$(Q)$(MAKE) $(build)=arch/x86/entry/vdso $@
+archprepare: checkbin
+checkbin:
+ifndef CC_HAVE_ASM_GOTO
+ @echo Compiler lacks asm-goto support.
+ @exit 1
+endif
+
archclean:
$(Q)rm -rf $(objtree)/arch/i386
$(Q)rm -rf $(objtree)/arch/x86_64
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 28764dacf018..466f66c8a7f8 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -37,6 +37,7 @@ KBUILD_CFLAGS += $(call cc-option,-ffreestanding)
KBUILD_CFLAGS += $(call cc-option,-fno-stack-protector)
KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member)
KBUILD_CFLAGS += $(call cc-disable-warning, gnu)
+KBUILD_CFLAGS += -Wno-pointer-sign
KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
GCOV_PROFILE := n
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index 1458b1700fc7..8b4c5e001157 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -738,6 +738,7 @@ efi_main(struct efi_config *c, struct boot_params *boot_params)
struct desc_struct *desc;
void *handle;
efi_system_table_t *_table;
+ unsigned long cmdline_paddr;
efi_early = c;
@@ -756,6 +757,15 @@ efi_main(struct efi_config *c, struct boot_params *boot_params)
setup_boot_services32(efi_early);
/*
+ * make_boot_params() may have been called before efi_main(), in which
+ * case this is the second time we parse the cmdline. This is ok,
+ * parsing the cmdline multiple times does not have side-effects.
+ */
+ cmdline_paddr = ((u64)hdr->cmd_line_ptr |
+ ((u64)boot_params->ext_cmd_line_ptr << 32));
+ efi_parse_options((char *)cmdline_paddr);
+
+ /*
* If the boot loader gave us a value for secure_boot then we use that,
* otherwise we ask the BIOS.
*/
diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index d1e19f358b6e..9ed9709d9947 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -241,7 +241,7 @@ static void parse_gb_huge_pages(char *param, char *val)
}
-static int handle_mem_options(void)
+static void handle_mem_options(void)
{
char *args = (char *)get_cmd_line_ptr();
size_t len = strlen((char *)args);
@@ -251,7 +251,7 @@ static int handle_mem_options(void)
if (!strstr(args, "memmap=") && !strstr(args, "mem=") &&
!strstr(args, "hugepages"))
- return 0;
+ return;
tmp_cmdline = malloc(len + 1);
if (!tmp_cmdline)
@@ -269,8 +269,7 @@ static int handle_mem_options(void)
/* Stop at -- */
if (!val && strcmp(param, "--") == 0) {
warn("Only '--' specified in cmdline");
- free(tmp_cmdline);
- return -1;
+ goto out;
}
if (!strcmp(param, "memmap")) {
@@ -283,16 +282,16 @@ static int handle_mem_options(void)
if (!strcmp(p, "nopentium"))
continue;
mem_size = memparse(p, &p);
- if (mem_size == 0) {
- free(tmp_cmdline);
- return -EINVAL;
- }
+ if (mem_size == 0)
+ goto out;
+
mem_limit = mem_size;
}
}
+out:
free(tmp_cmdline);
- return 0;
+ return;
}
/*
@@ -578,7 +577,6 @@ static void process_mem_region(struct mem_vector *entry,
unsigned long image_size)
{
struct mem_vector region, overlap;
- struct slot_area slot_area;
unsigned long start_orig, end;
struct mem_vector cur_entry;
diff --git a/arch/x86/boot/compressed/mem_encrypt.S b/arch/x86/boot/compressed/mem_encrypt.S
index eaa843a52907..a480356e0ed8 100644
--- a/arch/x86/boot/compressed/mem_encrypt.S
+++ b/arch/x86/boot/compressed/mem_encrypt.S
@@ -25,20 +25,6 @@ ENTRY(get_sev_encryption_bit)
push %ebx
push %ecx
push %edx
- push %edi
-
- /*
- * RIP-relative addressing is needed to access the encryption bit
- * variable. Since we are running in 32-bit mode we need this call/pop
- * sequence to get the proper relative addressing.
- */
- call 1f
-1: popl %edi
- subl $1b, %edi
-
- movl enc_bit(%edi), %eax
- cmpl $0, %eax
- jge .Lsev_exit
/* Check if running under a hypervisor */
movl $1, %eax
@@ -69,15 +55,12 @@ ENTRY(get_sev_encryption_bit)
movl %ebx, %eax
andl $0x3f, %eax /* Return the encryption bit location */
- movl %eax, enc_bit(%edi)
jmp .Lsev_exit
.Lno_sev:
xor %eax, %eax
- movl %eax, enc_bit(%edi)
.Lsev_exit:
- pop %edi
pop %edx
pop %ecx
pop %ebx
@@ -113,8 +96,6 @@ ENTRY(set_sev_encryption_mask)
ENDPROC(set_sev_encryption_mask)
.data
-enc_bit:
- .int 0xffffffff
#ifdef CONFIG_AMD_MEM_ENCRYPT
.balign 8
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index a423bdb42686..a1d5918765f3 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -9,6 +9,7 @@
* paravirt and debugging variants are added.)
*/
#undef CONFIG_PARAVIRT
+#undef CONFIG_PARAVIRT_XXL
#undef CONFIG_PARAVIRT_SPINLOCKS
#undef CONFIG_KASAN
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 850b8762e889..4c881c850125 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -300,7 +300,7 @@ _start:
# Part 2 of the header, from the old setup.S
.ascii "HdrS" # header signature
- .word 0x020d # header version number (>= 0x0105)
+ .word 0x020e # header version number (>= 0x0105)
# or else old loadlin-1.5 will fail)
.globl realmode_swtch
realmode_swtch: .word 0, 0 # default_switch, SETUPSEG
@@ -558,6 +558,10 @@ pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr
init_size: .long INIT_SIZE # kernel initialization size
handover_offset: .long 0 # Filled in by build.c
+acpi_rsdp_addr: .quad 0 # 64-bit physical pointer to the
+ # ACPI RSDP table, added with
+ # version 2.14
+
# End of setup header #####################################################
.section ".entrytext", "ax"
diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c
index d4e6cd4577e5..bf0e82400358 100644
--- a/arch/x86/boot/tools/build.c
+++ b/arch/x86/boot/tools/build.c
@@ -391,6 +391,13 @@ int main(int argc, char ** argv)
die("Unable to mmap '%s': %m", argv[2]);
/* Number of 16-byte paragraphs, including space for a 4-byte CRC */
sys_size = (sz + 15 + 4) / 16;
+#ifdef CONFIG_EFI_STUB
+ /*
+ * COFF requires minimum 32-byte alignment of sections, and
+ * adding a signature is problematic without that alignment.
+ */
+ sys_size = (sys_size + 1) & ~1;
+#endif
/* Patch the setup code with the appropriate size parameters */
buf[0x1f1] = setup_sectors-1;
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 0eb9f92f3717..6c3ab05c231d 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -247,6 +247,7 @@ CONFIG_USB_HIDDEV=y
CONFIG_USB=y
CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
CONFIG_USB_MON=y
+CONFIG_USB_XHCI_HCD=y
CONFIG_USB_EHCI_HCD=y
CONFIG_USB_EHCI_TT_NEWSCHED=y
CONFIG_USB_OHCI_HCD=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index e32fc1f274d8..ac9ae487cfeb 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -243,6 +243,7 @@ CONFIG_USB_HIDDEV=y
CONFIG_USB=y
CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
CONFIG_USB_MON=y
+CONFIG_USB_XHCI_HCD=y
CONFIG_USB_EHCI_HCD=y
CONFIG_USB_EHCI_TT_NEWSCHED=y
CONFIG_USB_OHCI_HCD=y
diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c
index acd11b3bf639..2a356b948720 100644
--- a/arch/x86/crypto/aegis128-aesni-glue.c
+++ b/arch/x86/crypto/aegis128-aesni-glue.c
@@ -379,7 +379,6 @@ static int __init crypto_aegis128_aesni_module_init(void)
{
if (!boot_cpu_has(X86_FEATURE_XMM2) ||
!boot_cpu_has(X86_FEATURE_AES) ||
- !boot_cpu_has(X86_FEATURE_OSXSAVE) ||
!cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL))
return -ENODEV;
diff --git a/arch/x86/crypto/aegis128l-aesni-glue.c b/arch/x86/crypto/aegis128l-aesni-glue.c
index 2071c3d1ae07..dbe8bb980da1 100644
--- a/arch/x86/crypto/aegis128l-aesni-glue.c
+++ b/arch/x86/crypto/aegis128l-aesni-glue.c
@@ -379,7 +379,6 @@ static int __init crypto_aegis128l_aesni_module_init(void)
{
if (!boot_cpu_has(X86_FEATURE_XMM2) ||
!boot_cpu_has(X86_FEATURE_AES) ||
- !boot_cpu_has(X86_FEATURE_OSXSAVE) ||
!cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL))
return -ENODEV;
diff --git a/arch/x86/crypto/aegis256-aesni-glue.c b/arch/x86/crypto/aegis256-aesni-glue.c
index b5f2a8fd5a71..8bebda2de92f 100644
--- a/arch/x86/crypto/aegis256-aesni-glue.c
+++ b/arch/x86/crypto/aegis256-aesni-glue.c
@@ -379,7 +379,6 @@ static int __init crypto_aegis256_aesni_module_init(void)
{
if (!boot_cpu_has(X86_FEATURE_XMM2) ||
!boot_cpu_has(X86_FEATURE_AES) ||
- !boot_cpu_has(X86_FEATURE_OSXSAVE) ||
!cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL))
return -ENODEV;
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 9bd139569b41..cb2deb61c5d9 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -223,34 +223,34 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff
pcmpeqd TWOONE(%rip), \TMP2
pand POLY(%rip), \TMP2
pxor \TMP2, \TMP3
- movdqa \TMP3, HashKey(%arg2)
+ movdqu \TMP3, HashKey(%arg2)
movdqa \TMP3, \TMP5
pshufd $78, \TMP3, \TMP1
pxor \TMP3, \TMP1
- movdqa \TMP1, HashKey_k(%arg2)
+ movdqu \TMP1, HashKey_k(%arg2)
GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
- movdqa \TMP5, HashKey_2(%arg2)
+ movdqu \TMP5, HashKey_2(%arg2)
# HashKey_2 = HashKey^2<<1 (mod poly)
pshufd $78, \TMP5, \TMP1
pxor \TMP5, \TMP1
- movdqa \TMP1, HashKey_2_k(%arg2)
+ movdqu \TMP1, HashKey_2_k(%arg2)
GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
- movdqa \TMP5, HashKey_3(%arg2)
+ movdqu \TMP5, HashKey_3(%arg2)
pshufd $78, \TMP5, \TMP1
pxor \TMP5, \TMP1
- movdqa \TMP1, HashKey_3_k(%arg2)
+ movdqu \TMP1, HashKey_3_k(%arg2)
GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
- movdqa \TMP5, HashKey_4(%arg2)
+ movdqu \TMP5, HashKey_4(%arg2)
pshufd $78, \TMP5, \TMP1
pxor \TMP5, \TMP1
- movdqa \TMP1, HashKey_4_k(%arg2)
+ movdqu \TMP1, HashKey_4_k(%arg2)
.endm
# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
@@ -271,7 +271,7 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff
movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv
PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
- movdqa HashKey(%arg2), %xmm13
+ movdqu HashKey(%arg2), %xmm13
CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
%xmm4, %xmm5, %xmm6
@@ -997,7 +997,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
pshufd $78, \XMM5, \TMP6
pxor \XMM5, \TMP6
paddd ONE(%rip), \XMM0 # INCR CNT
- movdqa HashKey_4(%arg2), \TMP5
+ movdqu HashKey_4(%arg2), \TMP5
PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
movdqa \XMM0, \XMM1
paddd ONE(%rip), \XMM0 # INCR CNT
@@ -1016,7 +1016,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
pxor (%arg1), \XMM2
pxor (%arg1), \XMM3
pxor (%arg1), \XMM4
- movdqa HashKey_4_k(%arg2), \TMP5
+ movdqu HashKey_4_k(%arg2), \TMP5
PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
movaps 0x10(%arg1), \TMP1
AESENC \TMP1, \XMM1 # Round 1
@@ -1031,7 +1031,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
movdqa \XMM6, \TMP1
pshufd $78, \XMM6, \TMP2
pxor \XMM6, \TMP2
- movdqa HashKey_3(%arg2), \TMP5
+ movdqu HashKey_3(%arg2), \TMP5
PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
movaps 0x30(%arg1), \TMP3
AESENC \TMP3, \XMM1 # Round 3
@@ -1044,7 +1044,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
AESENC \TMP3, \XMM2
AESENC \TMP3, \XMM3
AESENC \TMP3, \XMM4
- movdqa HashKey_3_k(%arg2), \TMP5
+ movdqu HashKey_3_k(%arg2), \TMP5
PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
movaps 0x50(%arg1), \TMP3
AESENC \TMP3, \XMM1 # Round 5
@@ -1058,7 +1058,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
movdqa \XMM7, \TMP1
pshufd $78, \XMM7, \TMP2
pxor \XMM7, \TMP2
- movdqa HashKey_2(%arg2), \TMP5
+ movdqu HashKey_2(%arg2), \TMP5
# Multiply TMP5 * HashKey using karatsuba
@@ -1074,7 +1074,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
AESENC \TMP3, \XMM2
AESENC \TMP3, \XMM3
AESENC \TMP3, \XMM4
- movdqa HashKey_2_k(%arg2), \TMP5
+ movdqu HashKey_2_k(%arg2), \TMP5
PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
movaps 0x80(%arg1), \TMP3
AESENC \TMP3, \XMM1 # Round 8
@@ -1092,7 +1092,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
movdqa \XMM8, \TMP1
pshufd $78, \XMM8, \TMP2
pxor \XMM8, \TMP2
- movdqa HashKey(%arg2), \TMP5
+ movdqu HashKey(%arg2), \TMP5
PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
movaps 0x90(%arg1), \TMP3
AESENC \TMP3, \XMM1 # Round 9
@@ -1121,7 +1121,7 @@ aes_loop_par_enc_done\@:
AESENCLAST \TMP3, \XMM2
AESENCLAST \TMP3, \XMM3
AESENCLAST \TMP3, \XMM4
- movdqa HashKey_k(%arg2), \TMP5
+ movdqu HashKey_k(%arg2), \TMP5
PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
movdqu (%arg4,%r11,1), \TMP3
pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
@@ -1205,7 +1205,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
pshufd $78, \XMM5, \TMP6
pxor \XMM5, \TMP6
paddd ONE(%rip), \XMM0 # INCR CNT
- movdqa HashKey_4(%arg2), \TMP5
+ movdqu HashKey_4(%arg2), \TMP5
PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
movdqa \XMM0, \XMM1
paddd ONE(%rip), \XMM0 # INCR CNT
@@ -1224,7 +1224,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
pxor (%arg1), \XMM2
pxor (%arg1), \XMM3
pxor (%arg1), \XMM4
- movdqa HashKey_4_k(%arg2), \TMP5
+ movdqu HashKey_4_k(%arg2), \TMP5
PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
movaps 0x10(%arg1), \TMP1
AESENC \TMP1, \XMM1 # Round 1
@@ -1239,7 +1239,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
movdqa \XMM6, \TMP1
pshufd $78, \XMM6, \TMP2
pxor \XMM6, \TMP2
- movdqa HashKey_3(%arg2), \TMP5
+ movdqu HashKey_3(%arg2), \TMP5
PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
movaps 0x30(%arg1), \TMP3
AESENC \TMP3, \XMM1 # Round 3
@@ -1252,7 +1252,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
AESENC \TMP3, \XMM2
AESENC \TMP3, \XMM3
AESENC \TMP3, \XMM4
- movdqa HashKey_3_k(%arg2), \TMP5
+ movdqu HashKey_3_k(%arg2), \TMP5
PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
movaps 0x50(%arg1), \TMP3
AESENC \TMP3, \XMM1 # Round 5
@@ -1266,7 +1266,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
movdqa \XMM7, \TMP1
pshufd $78, \XMM7, \TMP2
pxor \XMM7, \TMP2
- movdqa HashKey_2(%arg2), \TMP5
+ movdqu HashKey_2(%arg2), \TMP5
# Multiply TMP5 * HashKey using karatsuba
@@ -1282,7 +1282,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
AESENC \TMP3, \XMM2
AESENC \TMP3, \XMM3
AESENC \TMP3, \XMM4
- movdqa HashKey_2_k(%arg2), \TMP5
+ movdqu HashKey_2_k(%arg2), \TMP5
PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
movaps 0x80(%arg1), \TMP3
AESENC \TMP3, \XMM1 # Round 8
@@ -1300,7 +1300,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
movdqa \XMM8, \TMP1
pshufd $78, \XMM8, \TMP2
pxor \XMM8, \TMP2
- movdqa HashKey(%arg2), \TMP5
+ movdqu HashKey(%arg2), \TMP5
PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
movaps 0x90(%arg1), \TMP3
AESENC \TMP3, \XMM1 # Round 9
@@ -1329,7 +1329,7 @@ aes_loop_par_dec_done\@:
AESENCLAST \TMP3, \XMM2
AESENCLAST \TMP3, \XMM3
AESENCLAST \TMP3, \XMM4
- movdqa HashKey_k(%arg2), \TMP5
+ movdqu HashKey_k(%arg2), \TMP5
PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
movdqu (%arg4,%r11,1), \TMP3
pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
@@ -1405,10 +1405,10 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
movdqa \XMM1, \TMP6
pshufd $78, \XMM1, \TMP2
pxor \XMM1, \TMP2
- movdqa HashKey_4(%arg2), \TMP5
+ movdqu HashKey_4(%arg2), \TMP5
PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1
PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0
- movdqa HashKey_4_k(%arg2), \TMP4
+ movdqu HashKey_4_k(%arg2), \TMP4
PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
movdqa \XMM1, \XMMDst
movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
@@ -1418,10 +1418,10 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
movdqa \XMM2, \TMP1
pshufd $78, \XMM2, \TMP2
pxor \XMM2, \TMP2
- movdqa HashKey_3(%arg2), \TMP5
+ movdqu HashKey_3(%arg2), \TMP5
PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0
- movdqa HashKey_3_k(%arg2), \TMP4
+ movdqu HashKey_3_k(%arg2), \TMP4
PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
pxor \TMP1, \TMP6
pxor \XMM2, \XMMDst
@@ -1433,10 +1433,10 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
movdqa \XMM3, \TMP1
pshufd $78, \XMM3, \TMP2
pxor \XMM3, \TMP2
- movdqa HashKey_2(%arg2), \TMP5
+ movdqu HashKey_2(%arg2), \TMP5
PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0
- movdqa HashKey_2_k(%arg2), \TMP4
+ movdqu HashKey_2_k(%arg2), \TMP4
PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
pxor \TMP1, \TMP6
pxor \XMM3, \XMMDst
@@ -1446,10 +1446,10 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
movdqa \XMM4, \TMP1
pshufd $78, \XMM4, \TMP2
pxor \XMM4, \TMP2
- movdqa HashKey(%arg2), \TMP5
+ movdqu HashKey(%arg2), \TMP5
PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0
- movdqa HashKey_k(%arg2), \TMP4
+ movdqu HashKey_k(%arg2), \TMP4
PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
pxor \TMP1, \TMP6
pxor \XMM4, \XMMDst
diff --git a/arch/x86/crypto/morus1280-sse2-glue.c b/arch/x86/crypto/morus1280-sse2-glue.c
index 95cf857d2cbb..f40244eaf14d 100644
--- a/arch/x86/crypto/morus1280-sse2-glue.c
+++ b/arch/x86/crypto/morus1280-sse2-glue.c
@@ -40,7 +40,6 @@ MORUS1280_DECLARE_ALGS(sse2, "morus1280-sse2", 350);
static int __init crypto_morus1280_sse2_module_init(void)
{
if (!boot_cpu_has(X86_FEATURE_XMM2) ||
- !boot_cpu_has(X86_FEATURE_OSXSAVE) ||
!cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL))
return -ENODEV;
diff --git a/arch/x86/crypto/morus640-sse2-glue.c b/arch/x86/crypto/morus640-sse2-glue.c
index 615fb7bc9a32..9afaf8f8565a 100644
--- a/arch/x86/crypto/morus640-sse2-glue.c
+++ b/arch/x86/crypto/morus640-sse2-glue.c
@@ -40,7 +40,6 @@ MORUS640_DECLARE_ALGS(sse2, "morus640-sse2", 400);
static int __init crypto_morus640_sse2_module_init(void)
{
if (!boot_cpu_has(X86_FEATURE_XMM2) ||
- !boot_cpu_has(X86_FEATURE_OSXSAVE) ||
!cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL))
return -ENODEV;
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 352e70cd33e8..708b46a54578 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -338,7 +338,7 @@ For 32-bit we have the following conventions - kernel is built with
.macro CALL_enter_from_user_mode
#ifdef CONFIG_CONTEXT_TRACKING
#ifdef HAVE_JUMP_LABEL
- STATIC_JUMP_IF_FALSE .Lafter_call_\@, context_tracking_enabled, def=0
+ STATIC_BRANCH_JMP l_yes=.Lafter_call_\@, key=context_tracking_enabled, branch=1
#endif
call enter_from_user_mode
.Lafter_call_\@:
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 2767c625a52c..687e47f8a796 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -389,6 +389,13 @@
* that register for the time this macro runs
*/
+ /*
+ * The high bits of the CS dword (__csh) are used for
+ * CS_FROM_ENTRY_STACK and CS_FROM_USER_CR3. Clear them in case
+ * hardware didn't do this for us.
+ */
+ andl $(0x0000ffff), PT_CS(%esp)
+
/* Are we on the entry stack? Bail out if not! */
movl PER_CPU_VAR(cpu_entry_area), %ecx
addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx
@@ -407,12 +414,6 @@
/* Load top of task-stack into %edi */
movl TSS_entry2task_stack(%edi), %edi
- /*
- * Clear unused upper bits of the dword containing the word-sized CS
- * slot in pt_regs in case hardware didn't clear it for us.
- */
- andl $(0x0000ffff), PT_CS(%esp)
-
/* Special case - entry from kernel mode via entry stack */
#ifdef CONFIG_VM86
movl PT_EFLAGS(%esp), %ecx # mix EFLAGS and CS
@@ -782,7 +783,7 @@ GLOBAL(__begin_SYSENTER_singlestep_region)
* will ignore all of the single-step traps generated in this range.
*/
-#ifdef CONFIG_XEN
+#ifdef CONFIG_XEN_PV
/*
* Xen doesn't set %esp to be precisely what the normal SYSENTER
* entry point expects, so fix it up before using the normal path.
@@ -1240,7 +1241,7 @@ ENTRY(spurious_interrupt_bug)
jmp common_exception
END(spurious_interrupt_bug)
-#ifdef CONFIG_XEN
+#ifdef CONFIG_XEN_PV
ENTRY(xen_hypervisor_callback)
pushl $-1 /* orig_ax = -1 => not a system call */
SAVE_ALL
@@ -1321,11 +1322,13 @@ ENTRY(xen_failsafe_callback)
_ASM_EXTABLE(3b, 8b)
_ASM_EXTABLE(4b, 9b)
ENDPROC(xen_failsafe_callback)
+#endif /* CONFIG_XEN_PV */
+#ifdef CONFIG_XEN_PVHVM
BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
xen_evtchn_do_upcall)
+#endif
-#endif /* CONFIG_XEN */
#if IS_ENABLED(CONFIG_HYPERV)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 957dfb693ecc..4d7a2d9d44cf 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -142,67 +142,6 @@ END(native_usergs_sysret64)
* with them due to bugs in both AMD and Intel CPUs.
*/
- .pushsection .entry_trampoline, "ax"
-
-/*
- * The code in here gets remapped into cpu_entry_area's trampoline. This means
- * that the assembler and linker have the wrong idea as to where this code
- * lives (and, in fact, it's mapped more than once, so it's not even at a
- * fixed address). So we can't reference any symbols outside the entry
- * trampoline and expect it to work.
- *
- * Instead, we carefully abuse %rip-relative addressing.
- * _entry_trampoline(%rip) refers to the start of the remapped) entry
- * trampoline. We can thus find cpu_entry_area with this macro:
- */
-
-#define CPU_ENTRY_AREA \
- _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip)
-
-/* The top word of the SYSENTER stack is hot and is usable as scratch space. */
-#define RSP_SCRATCH CPU_ENTRY_AREA_entry_stack + \
- SIZEOF_entry_stack - 8 + CPU_ENTRY_AREA
-
-ENTRY(entry_SYSCALL_64_trampoline)
- UNWIND_HINT_EMPTY
- swapgs
-
- /* Stash the user RSP. */
- movq %rsp, RSP_SCRATCH
-
- /* Note: using %rsp as a scratch reg. */
- SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
-
- /* Load the top of the task stack into RSP */
- movq CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp
-
- /* Start building the simulated IRET frame. */
- pushq $__USER_DS /* pt_regs->ss */
- pushq RSP_SCRATCH /* pt_regs->sp */
- pushq %r11 /* pt_regs->flags */
- pushq $__USER_CS /* pt_regs->cs */
- pushq %rcx /* pt_regs->ip */
-
- /*
- * x86 lacks a near absolute jump, and we can't jump to the real
- * entry text with a relative jump. We could push the target
- * address and then use retq, but this destroys the pipeline on
- * many CPUs (wasting over 20 cycles on Sandy Bridge). Instead,
- * spill RDI and restore it in a second-stage trampoline.
- */
- pushq %rdi
- movq $entry_SYSCALL_64_stage2, %rdi
- JMP_NOSPEC %rdi
-END(entry_SYSCALL_64_trampoline)
-
- .popsection
-
-ENTRY(entry_SYSCALL_64_stage2)
- UNWIND_HINT_EMPTY
- popq %rdi
- jmp entry_SYSCALL_64_after_hwframe
-END(entry_SYSCALL_64_stage2)
-
ENTRY(entry_SYSCALL_64)
UNWIND_HINT_EMPTY
/*
@@ -212,21 +151,19 @@ ENTRY(entry_SYSCALL_64)
*/
swapgs
- /*
- * This path is only taken when PAGE_TABLE_ISOLATION is disabled so it
- * is not required to switch CR3.
- */
- movq %rsp, PER_CPU_VAR(rsp_scratch)
+ /* tss.sp2 is scratch space. */
+ movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
/* Construct struct pt_regs on stack */
- pushq $__USER_DS /* pt_regs->ss */
- pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */
- pushq %r11 /* pt_regs->flags */
- pushq $__USER_CS /* pt_regs->cs */
- pushq %rcx /* pt_regs->ip */
+ pushq $__USER_DS /* pt_regs->ss */
+ pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2) /* pt_regs->sp */
+ pushq %r11 /* pt_regs->flags */
+ pushq $__USER_CS /* pt_regs->cs */
+ pushq %rcx /* pt_regs->ip */
GLOBAL(entry_SYSCALL_64_after_hwframe)
- pushq %rax /* pt_regs->orig_ax */
+ pushq %rax /* pt_regs->orig_ax */
PUSH_AND_CLEAR_REGS rax=$-ENOSYS
@@ -900,6 +837,42 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt
*/
#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8)
+/**
+ * idtentry - Generate an IDT entry stub
+ * @sym: Name of the generated entry point
+ * @do_sym: C function to be called
+ * @has_error_code: True if this IDT vector has an error code on the stack
+ * @paranoid: non-zero means that this vector may be invoked from
+ * kernel mode with user GSBASE and/or user CR3.
+ * 2 is special -- see below.
+ * @shift_ist: Set to an IST index if entries from kernel mode should
+ * decrement the IST stack so that nested entries get a
+ * fresh stack. (This is for #DB, which has a nasty habit
+ * of recursing.)
+ *
+ * idtentry generates an IDT stub that sets up a usable kernel context,
+ * creates struct pt_regs, and calls @do_sym. The stub has the following
+ * special behaviors:
+ *
+ * On an entry from user mode, the stub switches from the trampoline or
+ * IST stack to the normal thread stack. On an exit to user mode, the
+ * normal exit-to-usermode path is invoked.
+ *
+ * On an exit to kernel mode, if @paranoid == 0, we check for preemption,
+ * whereas we omit the preemption check if @paranoid != 0. This is purely
+ * because the implementation is simpler this way. The kernel only needs
+ * to check for asynchronous kernel preemption when IRQ handlers return.
+ *
+ * If @paranoid == 0, then the stub will handle IRET faults by pretending
+ * that the fault came from user mode. It will handle gs_change faults by
+ * pretending that the fault happened with kernel GSBASE. Since this handling
+ * is omitted for @paranoid != 0, the #GP, #SS, and #NP stubs must have
+ * @paranoid == 0. This special handling will do the wrong thing for
+ * espfix-induced #DF on IRET, so #DF must not use @paranoid == 0.
+ *
+ * @paranoid == 2 is special: the stub will never switch stacks. This is for
+ * #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS.
+ */
.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
ENTRY(\sym)
UNWIND_HINT_IRET_REGS offset=\has_error_code*8
@@ -1050,7 +1023,7 @@ ENTRY(do_softirq_own_stack)
ret
ENDPROC(do_softirq_own_stack)
-#ifdef CONFIG_XEN
+#ifdef CONFIG_XEN_PV
idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0
/*
@@ -1130,11 +1103,13 @@ ENTRY(xen_failsafe_callback)
ENCODE_FRAME_POINTER
jmp error_exit
END(xen_failsafe_callback)
+#endif /* CONFIG_XEN_PV */
+#ifdef CONFIG_XEN_PVHVM
apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
xen_hvm_callback_vector xen_evtchn_do_upcall
+#endif
-#endif /* CONFIG_XEN */
#if IS_ENABLED(CONFIG_HYPERV)
apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
@@ -1151,7 +1126,7 @@ idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK
idtentry int3 do_int3 has_error_code=0
idtentry stack_segment do_stack_segment has_error_code=1
-#ifdef CONFIG_XEN
+#ifdef CONFIG_XEN_PV
idtentry xennmi do_nmi has_error_code=0
idtentry xendebug do_debug has_error_code=0
idtentry xenint3 do_int3 has_error_code=0
@@ -1187,6 +1162,16 @@ ENTRY(paranoid_entry)
xorl %ebx, %ebx
1:
+ /*
+ * Always stash CR3 in %r14. This value will be restored,
+ * verbatim, at exit. Needed if paranoid_entry interrupted
+ * another entry that already switched to the user CR3 value
+ * but has not yet returned to userspace.
+ *
+ * This is also why CS (stashed in the "iret frame" by the
+ * hardware at entry) can not be used: this may be a return
+ * to kernel code, but with a user CR3 value.
+ */
SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
ret
@@ -1211,11 +1196,13 @@ ENTRY(paranoid_exit)
testl %ebx, %ebx /* swapgs needed? */
jnz .Lparanoid_exit_no_swapgs
TRACE_IRQS_IRETQ
+ /* Always restore stashed CR3 value (see paranoid_entry) */
RESTORE_CR3 scratch_reg=%rbx save_reg=%r14
SWAPGS_UNSAFE_STACK
jmp .Lparanoid_exit_restore
.Lparanoid_exit_no_swapgs:
TRACE_IRQS_IRETQ_DEBUG
+ /* Always restore stashed CR3 value (see paranoid_entry) */
RESTORE_CR3 scratch_reg=%rbx save_reg=%r14
.Lparanoid_exit_restore:
jmp restore_regs_and_return_to_kernel
@@ -1626,6 +1613,7 @@ end_repeat_nmi:
movq $-1, %rsi
call do_nmi
+ /* Always restore stashed CR3 value (see paranoid_entry) */
RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
testl %ebx, %ebx /* swapgs needed? */
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index fa3f439f0a92..141d415a8c80 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -68,7 +68,13 @@ $(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso%.so $(obj)/vdso2c FORCE
CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \
$(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) \
-fno-omit-frame-pointer -foptimize-sibling-calls \
- -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO $(RETPOLINE_VDSO_CFLAGS)
+ -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO
+
+ifdef CONFIG_RETPOLINE
+ifneq ($(RETPOLINE_VDSO_CFLAGS),)
+ CFL += $(RETPOLINE_VDSO_CFLAGS)
+endif
+endif
$(vobjs): KBUILD_CFLAGS := $(filter-out $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL)
@@ -138,7 +144,13 @@ KBUILD_CFLAGS_32 += $(call cc-option, -fno-stack-protector)
KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls)
KBUILD_CFLAGS_32 += -fno-omit-frame-pointer
KBUILD_CFLAGS_32 += -DDISABLE_BRANCH_PROFILING
-KBUILD_CFLAGS_32 += $(RETPOLINE_VDSO_CFLAGS)
+
+ifdef CONFIG_RETPOLINE
+ifneq ($(RETPOLINE_VDSO_CFLAGS),)
+ KBUILD_CFLAGS_32 += $(RETPOLINE_VDSO_CFLAGS)
+endif
+endif
+
$(obj)/vdso32.so.dbg: KBUILD_CFLAGS = $(KBUILD_CFLAGS_32)
$(obj)/vdso32.so.dbg: FORCE \
diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index f19856d95c60..007b3fe9d727 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -43,50 +43,26 @@ extern u8 hvclock_page
notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
{
long ret;
- asm("syscall" : "=a" (ret) :
- "0" (__NR_clock_gettime), "D" (clock), "S" (ts) : "memory");
+ asm ("syscall" : "=a" (ret), "=m" (*ts) :
+ "0" (__NR_clock_gettime), "D" (clock), "S" (ts) :
+ "rcx", "r11");
return ret;
}
-notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
-{
- long ret;
-
- asm("syscall" : "=a" (ret) :
- "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory");
- return ret;
-}
-
-
#else
notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
{
long ret;
- asm(
+ asm (
"mov %%ebx, %%edx \n"
- "mov %2, %%ebx \n"
+ "mov %[clock], %%ebx \n"
"call __kernel_vsyscall \n"
"mov %%edx, %%ebx \n"
- : "=a" (ret)
- : "0" (__NR_clock_gettime), "g" (clock), "c" (ts)
- : "memory", "edx");
- return ret;
-}
-
-notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
-{
- long ret;
-
- asm(
- "mov %%ebx, %%edx \n"
- "mov %2, %%ebx \n"
- "call __kernel_vsyscall \n"
- "mov %%edx, %%ebx \n"
- : "=a" (ret)
- : "0" (__NR_gettimeofday), "g" (tv), "c" (tz)
- : "memory", "edx");
+ : "=a" (ret), "=m" (*ts)
+ : "0" (__NR_clock_gettime), [clock] "g" (clock), "c" (ts)
+ : "edx");
return ret;
}
@@ -98,12 +74,11 @@ static notrace const struct pvclock_vsyscall_time_info *get_pvti0(void)
return (const struct pvclock_vsyscall_time_info *)&pvclock_page;
}
-static notrace u64 vread_pvclock(int *mode)
+static notrace u64 vread_pvclock(void)
{
const struct pvclock_vcpu_time_info *pvti = &get_pvti0()->pvti;
- u64 ret;
- u64 last;
u32 version;
+ u64 ret;
/*
* Note: The kernel and hypervisor must guarantee that cpu ID
@@ -130,175 +105,112 @@ static notrace u64 vread_pvclock(int *mode)
do {
version = pvclock_read_begin(pvti);
- if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) {
- *mode = VCLOCK_NONE;
- return 0;
- }
+ if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT)))
+ return U64_MAX;
ret = __pvclock_read_cycles(pvti, rdtsc_ordered());
} while (pvclock_read_retry(pvti, version));
- /* refer to vread_tsc() comment for rationale */
- last = gtod->cycle_last;
-
- if (likely(ret >= last))
- return ret;
-
- return last;
+ return ret;
}
#endif
#ifdef CONFIG_HYPERV_TSCPAGE
-static notrace u64 vread_hvclock(int *mode)
+static notrace u64 vread_hvclock(void)
{
const struct ms_hyperv_tsc_page *tsc_pg =
(const struct ms_hyperv_tsc_page *)&hvclock_page;
- u64 current_tick = hv_read_tsc_page(tsc_pg);
-
- if (current_tick != U64_MAX)
- return current_tick;
- *mode = VCLOCK_NONE;
- return 0;
+ return hv_read_tsc_page(tsc_pg);
}
#endif
-notrace static u64 vread_tsc(void)
-{
- u64 ret = (u64)rdtsc_ordered();
- u64 last = gtod->cycle_last;
-
- if (likely(ret >= last))
- return ret;
-
- /*
- * GCC likes to generate cmov here, but this branch is extremely
- * predictable (it's just a function of time and the likely is
- * very likely) and there's a data dependence, so force GCC
- * to generate a branch instead. I don't barrier() because
- * we don't actually need a barrier, and if this function
- * ever gets inlined it will generate worse code.
- */
- asm volatile ("");
- return last;
-}
-
-notrace static inline u64 vgetsns(int *mode)
+notrace static inline u64 vgetcyc(int mode)
{
- u64 v;
- cycles_t cycles;
-
- if (gtod->vclock_mode == VCLOCK_TSC)
- cycles = vread_tsc();
+ if (mode == VCLOCK_TSC)
+ return (u64)rdtsc_ordered();
#ifdef CONFIG_PARAVIRT_CLOCK
- else if (gtod->vclock_mode == VCLOCK_PVCLOCK)
- cycles = vread_pvclock(mode);
+ else if (mode == VCLOCK_PVCLOCK)
+ return vread_pvclock();
#endif
#ifdef CONFIG_HYPERV_TSCPAGE
- else if (gtod->vclock_mode == VCLOCK_HVCLOCK)
- cycles = vread_hvclock(mode);
+ else if (mode == VCLOCK_HVCLOCK)
+ return vread_hvclock();
#endif
- else
- return 0;
- v = (cycles - gtod->cycle_last) & gtod->mask;
- return v * gtod->mult;
+ return U64_MAX;
}
-/* Code size doesn't matter (vdso is 4k anyway) and this is faster. */
-notrace static int __always_inline do_realtime(struct timespec *ts)
+notrace static int do_hres(clockid_t clk, struct timespec *ts)
{
- unsigned long seq;
- u64 ns;
- int mode;
+ struct vgtod_ts *base = &gtod->basetime[clk];
+ u64 cycles, last, sec, ns;
+ unsigned int seq;
do {
seq = gtod_read_begin(gtod);
- mode = gtod->vclock_mode;
- ts->tv_sec = gtod->wall_time_sec;
- ns = gtod->wall_time_snsec;
- ns += vgetsns(&mode);
+ cycles = vgetcyc(gtod->vclock_mode);
+ ns = base->nsec;
+ last = gtod->cycle_last;
+ if (unlikely((s64)cycles < 0))
+ return vdso_fallback_gettime(clk, ts);
+ if (cycles > last)
+ ns += (cycles - last) * gtod->mult;
ns >>= gtod->shift;
+ sec = base->sec;
} while (unlikely(gtod_read_retry(gtod, seq)));
- ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
+ /*
+ * Do this outside the loop: a race inside the loop could result
+ * in __iter_div_u64_rem() being extremely slow.
+ */
+ ts->tv_sec = sec + __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
ts->tv_nsec = ns;
- return mode;
+ return 0;
}
-notrace static int __always_inline do_monotonic(struct timespec *ts)
+notrace static void do_coarse(clockid_t clk, struct timespec *ts)
{
- unsigned long seq;
- u64 ns;
- int mode;
+ struct vgtod_ts *base = &gtod->basetime[clk];
+ unsigned int seq;
do {
seq = gtod_read_begin(gtod);
- mode = gtod->vclock_mode;
- ts->tv_sec = gtod->monotonic_time_sec;
- ns = gtod->monotonic_time_snsec;
- ns += vgetsns(&mode);
- ns >>= gtod->shift;
+ ts->tv_sec = base->sec;
+ ts->tv_nsec = base->nsec;
} while (unlikely(gtod_read_retry(gtod, seq)));
-
- ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
- ts->tv_nsec = ns;
-
- return mode;
}
-notrace static void do_realtime_coarse(struct timespec *ts)
+notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
{
- unsigned long seq;
- do {
- seq = gtod_read_begin(gtod);
- ts->tv_sec = gtod->wall_time_coarse_sec;
- ts->tv_nsec = gtod->wall_time_coarse_nsec;
- } while (unlikely(gtod_read_retry(gtod, seq)));
-}
+ unsigned int msk;
-notrace static void do_monotonic_coarse(struct timespec *ts)
-{
- unsigned long seq;
- do {
- seq = gtod_read_begin(gtod);
- ts->tv_sec = gtod->monotonic_time_coarse_sec;
- ts->tv_nsec = gtod->monotonic_time_coarse_nsec;
- } while (unlikely(gtod_read_retry(gtod, seq)));
-}
+ /* Sort out negative (CPU/FD) and invalid clocks */
+ if (unlikely((unsigned int) clock >= MAX_CLOCKS))
+ return vdso_fallback_gettime(clock, ts);
-notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
-{
- switch (clock) {
- case CLOCK_REALTIME:
- if (do_realtime(ts) == VCLOCK_NONE)
- goto fallback;
- break;
- case CLOCK_MONOTONIC:
- if (do_monotonic(ts) == VCLOCK_NONE)
- goto fallback;
- break;
- case CLOCK_REALTIME_COARSE:
- do_realtime_coarse(ts);
- break;
- case CLOCK_MONOTONIC_COARSE:
- do_monotonic_coarse(ts);
- break;
- default:
- goto fallback;
+ /*
+ * Convert the clockid to a bitmask and use it to check which
+ * clocks are handled in the VDSO directly.
+ */
+ msk = 1U << clock;
+ if (likely(msk & VGTOD_HRES)) {
+ return do_hres(clock, ts);
+ } else if (msk & VGTOD_COARSE) {
+ do_coarse(clock, ts);
+ return 0;
}
-
- return 0;
-fallback:
return vdso_fallback_gettime(clock, ts);
}
+
int clock_gettime(clockid_t, struct timespec *)
__attribute__((weak, alias("__vdso_clock_gettime")));
notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
{
if (likely(tv != NULL)) {
- if (unlikely(do_realtime((struct timespec *)tv) == VCLOCK_NONE))
- return vdso_fallback_gtod(tv, tz);
+ struct timespec *ts = (struct timespec *) tv;
+
+ do_hres(CLOCK_REALTIME, ts);
tv->tv_usec /= 1000;
}
if (unlikely(tz != NULL)) {
@@ -318,7 +230,7 @@ int gettimeofday(struct timeval *, struct timezone *)
notrace time_t __vdso_time(time_t *t)
{
/* This is atomic on x86 so we don't need any locks. */
- time_t result = READ_ONCE(gtod->wall_time_sec);
+ time_t result = READ_ONCE(gtod->basetime[CLOCK_REALTIME].sec);
if (t)
*t = result;
diff --git a/arch/x86/entry/vdso/vgetcpu.c b/arch/x86/entry/vdso/vgetcpu.c
index 8ec3d1f4ce9a..f86ab0ae1777 100644
--- a/arch/x86/entry/vdso/vgetcpu.c
+++ b/arch/x86/entry/vdso/vgetcpu.c
@@ -13,14 +13,8 @@
notrace long
__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
{
- unsigned int p;
+ vdso_read_cpunode(cpu, node);
- p = __getcpu();
-
- if (cpu)
- *cpu = p & VGETCPU_CPU_MASK;
- if (node)
- *node = p >> 12;
return 0;
}
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 5b8b556dbb12..3f9d43f26f63 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -332,40 +332,6 @@ static __init int vdso_setup(char *s)
return 0;
}
__setup("vdso=", vdso_setup);
-#endif
-
-#ifdef CONFIG_X86_64
-static void vgetcpu_cpu_init(void *arg)
-{
- int cpu = smp_processor_id();
- struct desc_struct d = { };
- unsigned long node = 0;
-#ifdef CONFIG_NUMA
- node = cpu_to_node(cpu);
-#endif
- if (static_cpu_has(X86_FEATURE_RDTSCP))
- write_rdtscp_aux((node << 12) | cpu);
-
- /*
- * Store cpu number in limit so that it can be loaded
- * quickly in user space in vgetcpu. (12 bits for the CPU
- * and 8 bits for the node)
- */
- d.limit0 = cpu | ((node & 0xf) << 12);
- d.limit1 = node >> 4;
- d.type = 5; /* RO data, expand down, accessed */
- d.dpl = 3; /* Visible to user code */
- d.s = 1; /* Not a system segment */
- d.p = 1; /* Present */
- d.d = 1; /* 32-bit */
-
- write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
-}
-
-static int vgetcpu_online(unsigned int cpu)
-{
- return smp_call_function_single(cpu, vgetcpu_cpu_init, NULL, 1);
-}
static int __init init_vdso(void)
{
@@ -375,9 +341,7 @@ static int __init init_vdso(void)
init_vdso_image(&vdso_image_x32);
#endif
- /* notifier priority > KVM */
- return cpuhp_setup_state(CPUHP_AP_X86_VDSO_VMA_ONLINE,
- "x86/vdso/vma:online", vgetcpu_online, NULL);
+ return 0;
}
subsys_initcall(init_vdso);
#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index 82ed001e8909..85fd85d52ffd 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -100,20 +100,13 @@ static bool write_ok_or_segv(unsigned long ptr, size_t size)
*/
if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) {
- siginfo_t info;
struct thread_struct *thread = &current->thread;
thread->error_code = 6; /* user fault, no page, write */
thread->cr2 = ptr;
thread->trap_nr = X86_TRAP_PF;
- clear_siginfo(&info);
- info.si_signo = SIGSEGV;
- info.si_errno = 0;
- info.si_code = SEGV_MAPERR;
- info.si_addr = (void __user *)ptr;
-
- force_sig_info(SIGSEGV, &info, current);
+ force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *)ptr, current);
return false;
} else {
return true;
diff --git a/arch/x86/entry/vsyscall/vsyscall_gtod.c b/arch/x86/entry/vsyscall/vsyscall_gtod.c
index e1216dd95c04..cfcdba082feb 100644
--- a/arch/x86/entry/vsyscall/vsyscall_gtod.c
+++ b/arch/x86/entry/vsyscall/vsyscall_gtod.c
@@ -31,6 +31,8 @@ void update_vsyscall(struct timekeeper *tk)
{
int vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode;
struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data;
+ struct vgtod_ts *base;
+ u64 nsec;
/* Mark the new vclock used. */
BUILD_BUG_ON(VCLOCK_MAX >= 32);
@@ -45,34 +47,37 @@ void update_vsyscall(struct timekeeper *tk)
vdata->mult = tk->tkr_mono.mult;
vdata->shift = tk->tkr_mono.shift;
- vdata->wall_time_sec = tk->xtime_sec;
- vdata->wall_time_snsec = tk->tkr_mono.xtime_nsec;
+ base = &vdata->basetime[CLOCK_REALTIME];
+ base->sec = tk->xtime_sec;
+ base->nsec = tk->tkr_mono.xtime_nsec;
- vdata->monotonic_time_sec = tk->xtime_sec
- + tk->wall_to_monotonic.tv_sec;
- vdata->monotonic_time_snsec = tk->tkr_mono.xtime_nsec
- + ((u64)tk->wall_to_monotonic.tv_nsec
- << tk->tkr_mono.shift);
- while (vdata->monotonic_time_snsec >=
- (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) {
- vdata->monotonic_time_snsec -=
- ((u64)NSEC_PER_SEC) << tk->tkr_mono.shift;
- vdata->monotonic_time_sec++;
- }
+ base = &vdata->basetime[CLOCK_TAI];
+ base->sec = tk->xtime_sec + (s64)tk->tai_offset;
+ base->nsec = tk->tkr_mono.xtime_nsec;
- vdata->wall_time_coarse_sec = tk->xtime_sec;
- vdata->wall_time_coarse_nsec = (long)(tk->tkr_mono.xtime_nsec >>
- tk->tkr_mono.shift);
+ base = &vdata->basetime[CLOCK_MONOTONIC];
+ base->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
+ nsec = tk->tkr_mono.xtime_nsec;
+ nsec += ((u64)tk->wall_to_monotonic.tv_nsec << tk->tkr_mono.shift);
+ while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) {
+ nsec -= ((u64)NSEC_PER_SEC) << tk->tkr_mono.shift;
+ base->sec++;
+ }
+ base->nsec = nsec;
- vdata->monotonic_time_coarse_sec =
- vdata->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec;
- vdata->monotonic_time_coarse_nsec =
- vdata->wall_time_coarse_nsec + tk->wall_to_monotonic.tv_nsec;
+ base = &vdata->basetime[CLOCK_REALTIME_COARSE];
+ base->sec = tk->xtime_sec;
+ base->nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
- while (vdata->monotonic_time_coarse_nsec >= NSEC_PER_SEC) {
- vdata->monotonic_time_coarse_nsec -= NSEC_PER_SEC;
- vdata->monotonic_time_coarse_sec++;
+ base = &vdata->basetime[CLOCK_MONOTONIC_COARSE];
+ base->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
+ nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
+ nsec += tk->wall_to_monotonic.tv_nsec;
+ while (nsec >= NSEC_PER_SEC) {
+ nsec -= NSEC_PER_SEC;
+ base->sec++;
}
+ base->nsec = nsec;
gtod_write_end(vdata);
}
diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c
index c84584bb9402..7d2d7c801dba 100644
--- a/arch/x86/events/amd/core.c
+++ b/arch/x86/events/amd/core.c
@@ -669,6 +669,10 @@ static int __init amd_core_pmu_init(void)
* We fallback to using default amd_get_event_constraints.
*/
break;
+ case 0x18:
+ pr_cont("Fam18h ");
+ /* Using default amd_get_event_constraints. */
+ break;
default:
pr_err("core perfctr but no constraints; unknown hardware!\n");
return -ENODEV;
diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c
index 981ba5e8241b..398df6eaa109 100644
--- a/arch/x86/events/amd/uncore.c
+++ b/arch/x86/events/amd/uncore.c
@@ -36,6 +36,7 @@
static int num_counters_llc;
static int num_counters_nb;
+static bool l3_mask;
static HLIST_HEAD(uncore_unused_list);
@@ -209,6 +210,13 @@ static int amd_uncore_event_init(struct perf_event *event)
hwc->config = event->attr.config & AMD64_RAW_EVENT_MASK_NB;
hwc->idx = -1;
+ /*
+ * SliceMask and ThreadMask need to be set for certain L3 events in
+ * Family 17h. For other events, the two fields do not affect the count.
+ */
+ if (l3_mask)
+ hwc->config |= (AMD64_L3_SLICE_MASK | AMD64_L3_THREAD_MASK);
+
if (event->cpu < 0)
return -EINVAL;
@@ -507,17 +515,19 @@ static int __init amd_uncore_init(void)
{
int ret = -ENODEV;
- if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
return -ENODEV;
if (!boot_cpu_has(X86_FEATURE_TOPOEXT))
return -ENODEV;
- if (boot_cpu_data.x86 == 0x17) {
+ if (boot_cpu_data.x86 == 0x17 || boot_cpu_data.x86 == 0x18) {
/*
- * For F17h, the Northbridge counters are repurposed as Data
- * Fabric counters. Also, L3 counters are supported too. The PMUs
- * are exported based on family as either L2 or L3 and NB or DF.
+ * For F17h or F18h, the Northbridge counters are
+ * repurposed as Data Fabric counters. Also, L3
+ * counters are supported too. The PMUs are exported
+ * based on family as either L2 or L3 and NB or DF.
*/
num_counters_nb = NUM_COUNTERS_NB;
num_counters_llc = NUM_COUNTERS_L3;
@@ -525,6 +535,7 @@ static int __init amd_uncore_init(void)
amd_llc_pmu.name = "amd_l3";
format_attr_event_df.show = &event_show_df;
format_attr_event_l3.show = &event_show_l3;
+ l3_mask = true;
} else {
num_counters_nb = NUM_COUNTERS_NB;
num_counters_llc = NUM_COUNTERS_L2;
@@ -532,6 +543,7 @@ static int __init amd_uncore_init(void)
amd_llc_pmu.name = "amd_l2";
format_attr_event_df = format_attr_event;
format_attr_event_l3 = format_attr_event;
+ l3_mask = false;
}
amd_nb_pmu.attr_groups = amd_uncore_attr_groups_df;
@@ -547,7 +559,9 @@ static int __init amd_uncore_init(void)
if (ret)
goto fail_nb;
- pr_info("AMD NB counters detected\n");
+ pr_info("%s NB counters detected\n",
+ boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ?
+ "HYGON" : "AMD");
ret = 0;
}
@@ -561,7 +575,9 @@ static int __init amd_uncore_init(void)
if (ret)
goto fail_llc;
- pr_info("AMD LLC counters detected\n");
+ pr_info("%s LLC counters detected\n",
+ boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ?
+ "HYGON" : "AMD");
ret = 0;
}
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 5f4829f10129..106911b603bd 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1033,6 +1033,27 @@ static inline void x86_assign_hw_event(struct perf_event *event,
}
}
+/**
+ * x86_perf_rdpmc_index - Return PMC counter used for event
+ * @event: the perf_event to which the PMC counter was assigned
+ *
+ * The counter assigned to this performance event may change if interrupts
+ * are enabled. This counter should thus never be used while interrupts are
+ * enabled. Before this function is used to obtain the assigned counter the
+ * event should be checked for validity using, for example,
+ * perf_event_read_local(), within the same interrupt disabled section in
+ * which this counter is planned to be used.
+ *
+ * Return: The index of the performance monitoring counter assigned to
+ * @perf_event.
+ */
+int x86_perf_rdpmc_index(struct perf_event *event)
+{
+ lockdep_assert_irqs_disabled();
+
+ return event->hw.event_base_rdpmc;
+}
+
static inline int match_prev_assignment(struct hw_perf_event *hwc,
struct cpu_hw_events *cpuc,
int i)
@@ -1584,7 +1605,7 @@ static void __init pmu_check_apic(void)
}
-static struct attribute_group x86_pmu_format_group = {
+static struct attribute_group x86_pmu_format_group __ro_after_init = {
.name = "format",
.attrs = NULL,
};
@@ -1631,9 +1652,9 @@ __init struct attribute **merge_attr(struct attribute **a, struct attribute **b)
struct attribute **new;
int j, i;
- for (j = 0; a[j]; j++)
+ for (j = 0; a && a[j]; j++)
;
- for (i = 0; b[i]; i++)
+ for (i = 0; b && b[i]; i++)
j++;
j++;
@@ -1642,9 +1663,9 @@ __init struct attribute **merge_attr(struct attribute **a, struct attribute **b)
return NULL;
j = 0;
- for (i = 0; a[i]; i++)
+ for (i = 0; a && a[i]; i++)
new[j++] = a[i];
- for (i = 0; b[i]; i++)
+ for (i = 0; b && b[i]; i++)
new[j++] = b[i];
new[j] = NULL;
@@ -1715,7 +1736,7 @@ static struct attribute *events_attr[] = {
NULL,
};
-static struct attribute_group x86_pmu_events_group = {
+static struct attribute_group x86_pmu_events_group __ro_after_init = {
.name = "events",
.attrs = events_attr,
};
@@ -1776,6 +1797,10 @@ static int __init init_hw_perf_events(void)
case X86_VENDOR_AMD:
err = amd_pmu_init();
break;
+ case X86_VENDOR_HYGON:
+ err = amd_pmu_init();
+ x86_pmu.name = "HYGON";
+ break;
default:
err = -ENOTSUPP;
}
@@ -2230,7 +2255,7 @@ static struct attribute *x86_pmu_attrs[] = {
NULL,
};
-static struct attribute_group x86_pmu_attr_group = {
+static struct attribute_group x86_pmu_attr_group __ro_after_init = {
.attrs = x86_pmu_attrs,
};
@@ -2248,7 +2273,7 @@ static struct attribute *x86_pmu_caps_attrs[] = {
NULL
};
-static struct attribute_group x86_pmu_caps_group = {
+static struct attribute_group x86_pmu_caps_group __ro_after_init = {
.name = "caps",
.attrs = x86_pmu_caps_attrs,
};
@@ -2465,7 +2490,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs
perf_callchain_store(entry, regs->ip);
- if (!current->mm)
+ if (!nmi_uaccess_okay())
return;
if (perf_callchain_user32(regs, entry))
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 035c37481f57..0fb8659b20d8 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -242,7 +242,7 @@ EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3");
EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3");
EVENT_ATTR_STR(mem-stores, mem_st_snb, "event=0xcd,umask=0x2");
-static struct attribute *nhm_events_attrs[] = {
+static struct attribute *nhm_mem_events_attrs[] = {
EVENT_PTR(mem_ld_nhm),
NULL,
};
@@ -278,8 +278,6 @@ EVENT_ATTR_STR_HT(topdown-recovery-bubbles.scale, td_recovery_bubbles_scale,
"4", "2");
static struct attribute *snb_events_attrs[] = {
- EVENT_PTR(mem_ld_snb),
- EVENT_PTR(mem_st_snb),
EVENT_PTR(td_slots_issued),
EVENT_PTR(td_slots_retired),
EVENT_PTR(td_fetch_bubbles),
@@ -290,6 +288,12 @@ static struct attribute *snb_events_attrs[] = {
NULL,
};
+static struct attribute *snb_mem_events_attrs[] = {
+ EVENT_PTR(mem_ld_snb),
+ EVENT_PTR(mem_st_snb),
+ NULL,
+};
+
static struct event_constraint intel_hsw_event_constraints[] = {
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
@@ -1995,6 +1999,18 @@ static void intel_pmu_nhm_enable_all(int added)
intel_pmu_enable_all(added);
}
+static void enable_counter_freeze(void)
+{
+ update_debugctlmsr(get_debugctlmsr() |
+ DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI);
+}
+
+static void disable_counter_freeze(void)
+{
+ update_debugctlmsr(get_debugctlmsr() &
+ ~DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI);
+}
+
static inline u64 intel_pmu_get_status(void)
{
u64 status;
@@ -2200,59 +2216,15 @@ static void intel_pmu_reset(void)
local_irq_restore(flags);
}
-/*
- * This handler is triggered by the local APIC, so the APIC IRQ handling
- * rules apply:
- */
-static int intel_pmu_handle_irq(struct pt_regs *regs)
+static int handle_pmi_common(struct pt_regs *regs, u64 status)
{
struct perf_sample_data data;
- struct cpu_hw_events *cpuc;
- int bit, loops;
- u64 status;
- int handled;
- int pmu_enabled;
-
- cpuc = this_cpu_ptr(&cpu_hw_events);
-
- /*
- * Save the PMU state.
- * It needs to be restored when leaving the handler.
- */
- pmu_enabled = cpuc->enabled;
- /*
- * No known reason to not always do late ACK,
- * but just in case do it opt-in.
- */
- if (!x86_pmu.late_ack)
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- intel_bts_disable_local();
- cpuc->enabled = 0;
- __intel_pmu_disable_all();
- handled = intel_pmu_drain_bts_buffer();
- handled += intel_bts_interrupt();
- status = intel_pmu_get_status();
- if (!status)
- goto done;
-
- loops = 0;
-again:
- intel_pmu_lbr_read();
- intel_pmu_ack_status(status);
- if (++loops > 100) {
- static bool warned = false;
- if (!warned) {
- WARN(1, "perfevents: irq loop stuck!\n");
- perf_event_print_debug();
- warned = true;
- }
- intel_pmu_reset();
- goto done;
- }
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ int bit;
+ int handled = 0;
inc_irq_stat(apic_perf_irqs);
-
/*
* Ignore a range of extra bits in status that do not indicate
* overflow by themselves.
@@ -2261,7 +2233,7 @@ again:
GLOBAL_STATUS_ASIF |
GLOBAL_STATUS_LBRS_FROZEN);
if (!status)
- goto done;
+ return 0;
/*
* In case multiple PEBS events are sampled at the same time,
* it is possible to have GLOBAL_STATUS bit 62 set indicating
@@ -2331,6 +2303,146 @@ again:
x86_pmu_stop(event, 0);
}
+ return handled;
+}
+
+static bool disable_counter_freezing;
+static int __init intel_perf_counter_freezing_setup(char *s)
+{
+ disable_counter_freezing = true;
+ pr_info("Intel PMU Counter freezing feature disabled\n");
+ return 1;
+}
+__setup("disable_counter_freezing", intel_perf_counter_freezing_setup);
+
+/*
+ * Simplified handler for Arch Perfmon v4:
+ * - We rely on counter freezing/unfreezing to enable/disable the PMU.
+ * This is done automatically on PMU ack.
+ * - Ack the PMU only after the APIC.
+ */
+
+static int intel_pmu_handle_irq_v4(struct pt_regs *regs)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ int handled = 0;
+ bool bts = false;
+ u64 status;
+ int pmu_enabled = cpuc->enabled;
+ int loops = 0;
+
+ /* PMU has been disabled because of counter freezing */
+ cpuc->enabled = 0;
+ if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
+ bts = true;
+ intel_bts_disable_local();
+ handled = intel_pmu_drain_bts_buffer();
+ handled += intel_bts_interrupt();
+ }
+ status = intel_pmu_get_status();
+ if (!status)
+ goto done;
+again:
+ intel_pmu_lbr_read();
+ if (++loops > 100) {
+ static bool warned;
+
+ if (!warned) {
+ WARN(1, "perfevents: irq loop stuck!\n");
+ perf_event_print_debug();
+ warned = true;
+ }
+ intel_pmu_reset();
+ goto done;
+ }
+
+
+ handled += handle_pmi_common(regs, status);
+done:
+ /* Ack the PMI in the APIC */
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+
+ /*
+ * The counters start counting immediately while ack the status.
+ * Make it as close as possible to IRET. This avoids bogus
+ * freezing on Skylake CPUs.
+ */
+ if (status) {
+ intel_pmu_ack_status(status);
+ } else {
+ /*
+ * CPU may issues two PMIs very close to each other.
+ * When the PMI handler services the first one, the
+ * GLOBAL_STATUS is already updated to reflect both.
+ * When it IRETs, the second PMI is immediately
+ * handled and it sees clear status. At the meantime,
+ * there may be a third PMI, because the freezing bit
+ * isn't set since the ack in first PMI handlers.
+ * Double check if there is more work to be done.
+ */
+ status = intel_pmu_get_status();
+ if (status)
+ goto again;
+ }
+
+ if (bts)
+ intel_bts_enable_local();
+ cpuc->enabled = pmu_enabled;
+ return handled;
+}
+
+/*
+ * This handler is triggered by the local APIC, so the APIC IRQ handling
+ * rules apply:
+ */
+static int intel_pmu_handle_irq(struct pt_regs *regs)
+{
+ struct cpu_hw_events *cpuc;
+ int loops;
+ u64 status;
+ int handled;
+ int pmu_enabled;
+
+ cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ /*
+ * Save the PMU state.
+ * It needs to be restored when leaving the handler.
+ */
+ pmu_enabled = cpuc->enabled;
+ /*
+ * No known reason to not always do late ACK,
+ * but just in case do it opt-in.
+ */
+ if (!x86_pmu.late_ack)
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ intel_bts_disable_local();
+ cpuc->enabled = 0;
+ __intel_pmu_disable_all();
+ handled = intel_pmu_drain_bts_buffer();
+ handled += intel_bts_interrupt();
+ status = intel_pmu_get_status();
+ if (!status)
+ goto done;
+
+ loops = 0;
+again:
+ intel_pmu_lbr_read();
+ intel_pmu_ack_status(status);
+ if (++loops > 100) {
+ static bool warned;
+
+ if (!warned) {
+ WARN(1, "perfevents: irq loop stuck!\n");
+ perf_event_print_debug();
+ warned = true;
+ }
+ intel_pmu_reset();
+ goto done;
+ }
+
+ handled += handle_pmi_common(regs, status);
+
/*
* Repeat if there is more work to be done:
*/
@@ -3350,6 +3462,9 @@ static void intel_pmu_cpu_starting(int cpu)
if (x86_pmu.version > 1)
flip_smm_bit(&x86_pmu.attr_freeze_on_smi);
+ if (x86_pmu.counter_freezing)
+ enable_counter_freeze();
+
if (!cpuc->shared_regs)
return;
@@ -3421,6 +3536,9 @@ static void intel_pmu_cpu_dying(int cpu)
free_excl_cntrs(cpu);
fini_debug_store_on_cpu(cpu);
+
+ if (x86_pmu.counter_freezing)
+ disable_counter_freeze();
}
static void intel_pmu_sched_task(struct perf_event_context *ctx,
@@ -3725,6 +3843,40 @@ static __init void intel_nehalem_quirk(void)
}
}
+static bool intel_glp_counter_freezing_broken(int cpu)
+{
+ u32 rev = UINT_MAX; /* default to broken for unknown stepping */
+
+ switch (cpu_data(cpu).x86_stepping) {
+ case 1:
+ rev = 0x28;
+ break;
+ case 8:
+ rev = 0x6;
+ break;
+ }
+
+ return (cpu_data(cpu).microcode < rev);
+}
+
+static __init void intel_glp_counter_freezing_quirk(void)
+{
+ /* Check if it's already disabled */
+ if (disable_counter_freezing)
+ return;
+
+ /*
+ * If the system starts with the wrong ucode, leave the
+ * counter-freezing feature permanently disabled.
+ */
+ if (intel_glp_counter_freezing_broken(raw_smp_processor_id())) {
+ pr_info("PMU counter freezing disabled due to CPU errata,"
+ "please upgrade microcode\n");
+ x86_pmu.counter_freezing = false;
+ x86_pmu.handle_irq = intel_pmu_handle_irq;
+ }
+}
+
/*
* enable software workaround for errata:
* SNB: BJ122
@@ -3764,8 +3916,6 @@ EVENT_ATTR_STR(cycles-t, cycles_t, "event=0x3c,in_tx=1");
EVENT_ATTR_STR(cycles-ct, cycles_ct, "event=0x3c,in_tx=1,in_tx_cp=1");
static struct attribute *hsw_events_attrs[] = {
- EVENT_PTR(mem_ld_hsw),
- EVENT_PTR(mem_st_hsw),
EVENT_PTR(td_slots_issued),
EVENT_PTR(td_slots_retired),
EVENT_PTR(td_fetch_bubbles),
@@ -3776,6 +3926,12 @@ static struct attribute *hsw_events_attrs[] = {
NULL
};
+static struct attribute *hsw_mem_events_attrs[] = {
+ EVENT_PTR(mem_ld_hsw),
+ EVENT_PTR(mem_st_hsw),
+ NULL,
+};
+
static struct attribute *hsw_tsx_events_attrs[] = {
EVENT_PTR(tx_start),
EVENT_PTR(tx_commit),
@@ -3792,13 +3948,6 @@ static struct attribute *hsw_tsx_events_attrs[] = {
NULL
};
-static __init struct attribute **get_hsw_events_attrs(void)
-{
- return boot_cpu_has(X86_FEATURE_RTM) ?
- merge_attr(hsw_events_attrs, hsw_tsx_events_attrs) :
- hsw_events_attrs;
-}
-
static ssize_t freeze_on_smi_show(struct device *cdev,
struct device_attribute *attr,
char *buf)
@@ -3875,9 +4024,32 @@ static struct attribute *intel_pmu_attrs[] = {
NULL,
};
+static __init struct attribute **
+get_events_attrs(struct attribute **base,
+ struct attribute **mem,
+ struct attribute **tsx)
+{
+ struct attribute **attrs = base;
+ struct attribute **old;
+
+ if (mem && x86_pmu.pebs)
+ attrs = merge_attr(attrs, mem);
+
+ if (tsx && boot_cpu_has(X86_FEATURE_RTM)) {
+ old = attrs;
+ attrs = merge_attr(attrs, tsx);
+ if (old != base)
+ kfree(old);
+ }
+
+ return attrs;
+}
+
__init int intel_pmu_init(void)
{
struct attribute **extra_attr = NULL;
+ struct attribute **mem_attr = NULL;
+ struct attribute **tsx_attr = NULL;
struct attribute **to_free = NULL;
union cpuid10_edx edx;
union cpuid10_eax eax;
@@ -3935,6 +4107,9 @@ __init int intel_pmu_init(void)
max((int)edx.split.num_counters_fixed, assume);
}
+ if (version >= 4)
+ x86_pmu.counter_freezing = !disable_counter_freezing;
+
if (boot_cpu_has(X86_FEATURE_PDCM)) {
u64 capabilities;
@@ -3986,7 +4161,7 @@ __init int intel_pmu_init(void)
x86_pmu.enable_all = intel_pmu_nhm_enable_all;
x86_pmu.extra_regs = intel_nehalem_extra_regs;
- x86_pmu.cpu_events = nhm_events_attrs;
+ mem_attr = nhm_mem_events_attrs;
/* UOPS_ISSUED.STALLED_CYCLES */
intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
@@ -4004,11 +4179,11 @@ __init int intel_pmu_init(void)
name = "nehalem";
break;
- case INTEL_FAM6_ATOM_PINEVIEW:
- case INTEL_FAM6_ATOM_LINCROFT:
- case INTEL_FAM6_ATOM_PENWELL:
- case INTEL_FAM6_ATOM_CLOVERVIEW:
- case INTEL_FAM6_ATOM_CEDARVIEW:
+ case INTEL_FAM6_ATOM_BONNELL:
+ case INTEL_FAM6_ATOM_BONNELL_MID:
+ case INTEL_FAM6_ATOM_SALTWELL:
+ case INTEL_FAM6_ATOM_SALTWELL_MID:
+ case INTEL_FAM6_ATOM_SALTWELL_TABLET:
memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
@@ -4021,9 +4196,11 @@ __init int intel_pmu_init(void)
name = "bonnell";
break;
- case INTEL_FAM6_ATOM_SILVERMONT1:
- case INTEL_FAM6_ATOM_SILVERMONT2:
+ case INTEL_FAM6_ATOM_SILVERMONT:
+ case INTEL_FAM6_ATOM_SILVERMONT_X:
+ case INTEL_FAM6_ATOM_SILVERMONT_MID:
case INTEL_FAM6_ATOM_AIRMONT:
+ case INTEL_FAM6_ATOM_AIRMONT_MID:
memcpy(hw_cache_event_ids, slm_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs,
@@ -4042,7 +4219,7 @@ __init int intel_pmu_init(void)
break;
case INTEL_FAM6_ATOM_GOLDMONT:
- case INTEL_FAM6_ATOM_DENVERTON:
+ case INTEL_FAM6_ATOM_GOLDMONT_X:
memcpy(hw_cache_event_ids, glm_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, glm_hw_cache_extra_regs,
@@ -4068,7 +4245,8 @@ __init int intel_pmu_init(void)
name = "goldmont";
break;
- case INTEL_FAM6_ATOM_GEMINI_LAKE:
+ case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
+ x86_add_quirk(intel_glp_counter_freezing_quirk);
memcpy(hw_cache_event_ids, glp_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, glp_hw_cache_extra_regs,
@@ -4112,7 +4290,7 @@ __init int intel_pmu_init(void)
x86_pmu.extra_regs = intel_westmere_extra_regs;
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
- x86_pmu.cpu_events = nhm_events_attrs;
+ mem_attr = nhm_mem_events_attrs;
/* UOPS_ISSUED.STALLED_CYCLES */
intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
@@ -4152,6 +4330,7 @@ __init int intel_pmu_init(void)
x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
x86_pmu.cpu_events = snb_events_attrs;
+ mem_attr = snb_mem_events_attrs;
/* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
@@ -4192,6 +4371,7 @@ __init int intel_pmu_init(void)
x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
x86_pmu.cpu_events = snb_events_attrs;
+ mem_attr = snb_mem_events_attrs;
/* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
@@ -4226,10 +4406,12 @@ __init int intel_pmu_init(void)
x86_pmu.hw_config = hsw_hw_config;
x86_pmu.get_event_constraints = hsw_get_event_constraints;
- x86_pmu.cpu_events = get_hsw_events_attrs();
+ x86_pmu.cpu_events = hsw_events_attrs;
x86_pmu.lbr_double_abort = true;
extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
hsw_format_attr : nhm_format_attr;
+ mem_attr = hsw_mem_events_attrs;
+ tsx_attr = hsw_tsx_events_attrs;
pr_cont("Haswell events, ");
name = "haswell";
break;
@@ -4265,10 +4447,12 @@ __init int intel_pmu_init(void)
x86_pmu.hw_config = hsw_hw_config;
x86_pmu.get_event_constraints = hsw_get_event_constraints;
- x86_pmu.cpu_events = get_hsw_events_attrs();
+ x86_pmu.cpu_events = hsw_events_attrs;
x86_pmu.limit_period = bdw_limit_period;
extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
hsw_format_attr : nhm_format_attr;
+ mem_attr = hsw_mem_events_attrs;
+ tsx_attr = hsw_tsx_events_attrs;
pr_cont("Broadwell events, ");
name = "broadwell";
break;
@@ -4324,7 +4508,9 @@ __init int intel_pmu_init(void)
hsw_format_attr : nhm_format_attr;
extra_attr = merge_attr(extra_attr, skl_format_attr);
to_free = extra_attr;
- x86_pmu.cpu_events = get_hsw_events_attrs();
+ x86_pmu.cpu_events = hsw_events_attrs;
+ mem_attr = hsw_mem_events_attrs;
+ tsx_attr = hsw_tsx_events_attrs;
intel_pmu_pebs_data_source_skl(
boot_cpu_data.x86_model == INTEL_FAM6_SKYLAKE_X);
pr_cont("Skylake events, ");
@@ -4357,6 +4543,9 @@ __init int intel_pmu_init(void)
WARN_ON(!x86_pmu.format_attrs);
}
+ x86_pmu.cpu_events = get_events_attrs(x86_pmu.cpu_events,
+ mem_attr, tsx_attr);
+
if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) {
WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC);
@@ -4431,6 +4620,13 @@ __init int intel_pmu_init(void)
pr_cont("full-width counters, ");
}
+ /*
+ * For arch perfmon 4 use counter freezing to avoid
+ * several MSR accesses in the PMI.
+ */
+ if (x86_pmu.counter_freezing)
+ x86_pmu.handle_irq = intel_pmu_handle_irq_v4;
+
kfree(to_free);
return 0;
}
diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c
index 9f8084f18d58..d2e780705c5a 100644
--- a/arch/x86/events/intel/cstate.c
+++ b/arch/x86/events/intel/cstate.c
@@ -559,8 +559,8 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = {
X86_CSTATES_MODEL(INTEL_FAM6_HASWELL_ULT, hswult_cstates),
- X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT1, slm_cstates),
- X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT2, slm_cstates),
+ X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT, slm_cstates),
+ X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT_X, slm_cstates),
X86_CSTATES_MODEL(INTEL_FAM6_ATOM_AIRMONT, slm_cstates),
X86_CSTATES_MODEL(INTEL_FAM6_BROADWELL_CORE, snb_cstates),
@@ -581,9 +581,9 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = {
X86_CSTATES_MODEL(INTEL_FAM6_XEON_PHI_KNM, knl_cstates),
X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT, glm_cstates),
- X86_CSTATES_MODEL(INTEL_FAM6_ATOM_DENVERTON, glm_cstates),
+ X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT_X, glm_cstates),
- X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GEMINI_LAKE, glm_cstates),
+ X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT_PLUS, glm_cstates),
{ },
};
MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match);
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index f3e006bed9a7..c88ed39582a1 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -1272,4 +1272,8 @@ void intel_pmu_lbr_init_knl(void)
x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
x86_pmu.lbr_sel_map = snb_lbr_sel_map;
+
+ /* Knights Landing does have MISPREDICT bit */
+ if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_LIP)
+ x86_pmu.intel_cap.lbr_format = LBR_FORMAT_EIP_FLAGS;
}
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index 8d016ce5b80d..3a0aa83cbd07 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -95,7 +95,7 @@ static ssize_t pt_cap_show(struct device *cdev,
return snprintf(buf, PAGE_SIZE, "%x\n", pt_cap_get(cap));
}
-static struct attribute_group pt_cap_group = {
+static struct attribute_group pt_cap_group __ro_after_init = {
.name = "caps",
};
diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c
index 32f3e9423e99..91039ffed633 100644
--- a/arch/x86/events/intel/rapl.c
+++ b/arch/x86/events/intel/rapl.c
@@ -777,9 +777,9 @@ static const struct x86_cpu_id rapl_cpu_match[] __initconst = {
X86_RAPL_MODEL_MATCH(INTEL_FAM6_CANNONLAKE_MOBILE, skl_rapl_init),
X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT, hsw_rapl_init),
- X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_DENVERTON, hsw_rapl_init),
+ X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_X, hsw_rapl_init),
- X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GEMINI_LAKE, hsw_rapl_init),
+ X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_PLUS, hsw_rapl_init),
{},
};
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 51d7c117e3c7..c07bee31abe8 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -3061,7 +3061,7 @@ static struct event_constraint bdx_uncore_pcu_constraints[] = {
void bdx_uncore_cpu_init(void)
{
- int pkg = topology_phys_to_logical_pkg(0);
+ int pkg = topology_phys_to_logical_pkg(boot_cpu_data.phys_proc_id);
if (bdx_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
bdx_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
@@ -3931,16 +3931,16 @@ static const struct pci_device_id skx_uncore_pci_ids[] = {
.driver_data = UNCORE_PCI_DEV_FULL_DATA(21, 5, SKX_PCI_UNCORE_M2PCIE, 3),
},
{ /* M3UPI0 Link 0 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204C),
- .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 0, SKX_PCI_UNCORE_M3UPI, 0),
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204D),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 1, SKX_PCI_UNCORE_M3UPI, 0),
},
{ /* M3UPI0 Link 1 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204D),
- .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 1, SKX_PCI_UNCORE_M3UPI, 1),
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204E),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 2, SKX_PCI_UNCORE_M3UPI, 1),
},
{ /* M3UPI1 Link 2 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204C),
- .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 4, SKX_PCI_UNCORE_M3UPI, 2),
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204D),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 5, SKX_PCI_UNCORE_M3UPI, 2),
},
{ /* end: all zeroes */ }
};
diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c
index b4771a6ddbc1..1b9f85abf9bc 100644
--- a/arch/x86/events/msr.c
+++ b/arch/x86/events/msr.c
@@ -69,14 +69,14 @@ static bool test_intel(int idx)
case INTEL_FAM6_BROADWELL_GT3E:
case INTEL_FAM6_BROADWELL_X:
- case INTEL_FAM6_ATOM_SILVERMONT1:
- case INTEL_FAM6_ATOM_SILVERMONT2:
+ case INTEL_FAM6_ATOM_SILVERMONT:
+ case INTEL_FAM6_ATOM_SILVERMONT_X:
case INTEL_FAM6_ATOM_AIRMONT:
case INTEL_FAM6_ATOM_GOLDMONT:
- case INTEL_FAM6_ATOM_DENVERTON:
+ case INTEL_FAM6_ATOM_GOLDMONT_X:
- case INTEL_FAM6_ATOM_GEMINI_LAKE:
+ case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
case INTEL_FAM6_XEON_PHI_KNL:
case INTEL_FAM6_XEON_PHI_KNM:
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 156286335351..adae087cecdd 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -560,9 +560,11 @@ struct x86_pmu {
struct event_constraint *event_constraints;
struct x86_pmu_quirk *quirks;
int perfctr_second_write;
- bool late_ack;
u64 (*limit_period)(struct perf_event *event, u64 l);
+ /* PMI handler bits */
+ unsigned int late_ack :1,
+ counter_freezing :1;
/*
* sysfs attrs
*/
diff --git a/arch/x86/hyperv/Makefile b/arch/x86/hyperv/Makefile
index b21ee65c4101..1c11f9420a82 100644
--- a/arch/x86/hyperv/Makefile
+++ b/arch/x86/hyperv/Makefile
@@ -1,2 +1,6 @@
obj-y := hv_init.o mmu.o nested.o
obj-$(CONFIG_X86_64) += hv_apic.o
+
+ifdef CONFIG_X86_64
+obj-$(CONFIG_PARAVIRT_SPINLOCKS) += hv_spinlock.o
+endif
diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c
index 5b0f613428c2..8eb6fbee8e13 100644
--- a/arch/x86/hyperv/hv_apic.c
+++ b/arch/x86/hyperv/hv_apic.c
@@ -20,7 +20,6 @@
*/
#include <linux/types.h>
-#include <linux/version.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/clockchips.h>
@@ -95,8 +94,8 @@ static void hv_apic_eoi_write(u32 reg, u32 val)
*/
static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector)
{
- struct ipi_arg_ex **arg;
- struct ipi_arg_ex *ipi_arg;
+ struct hv_send_ipi_ex **arg;
+ struct hv_send_ipi_ex *ipi_arg;
unsigned long flags;
int nr_bank = 0;
int ret = 1;
@@ -105,7 +104,7 @@ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector)
return false;
local_irq_save(flags);
- arg = (struct ipi_arg_ex **)this_cpu_ptr(hyperv_pcpu_input_arg);
+ arg = (struct hv_send_ipi_ex **)this_cpu_ptr(hyperv_pcpu_input_arg);
ipi_arg = *arg;
if (unlikely(!ipi_arg))
@@ -135,7 +134,7 @@ ipi_mask_ex_done:
static bool __send_ipi_mask(const struct cpumask *mask, int vector)
{
int cur_cpu, vcpu;
- struct ipi_arg_non_ex ipi_arg;
+ struct hv_send_ipi ipi_arg;
int ret = 1;
trace_hyperv_send_ipi_mask(mask, vector);
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 20c876c7c5bf..7abb09e2eeb8 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -17,6 +17,7 @@
*
*/
+#include <linux/efi.h>
#include <linux/types.h>
#include <asm/apic.h>
#include <asm/desc.h>
@@ -253,6 +254,22 @@ static int hv_cpu_die(unsigned int cpu)
return 0;
}
+static int __init hv_pci_init(void)
+{
+ int gen2vm = efi_enabled(EFI_BOOT);
+
+ /*
+ * For Generation-2 VM, we exit from pci_arch_init() by returning 0.
+ * The purpose is to suppress the harmless warning:
+ * "PCI: Fatal: No config space access function found"
+ */
+ if (gen2vm)
+ return 0;
+
+ /* For Generation-1 VM, we'll proceed in pci_arch_init(). */
+ return 1;
+}
+
/*
* This function is to be invoked early in the boot sequence after the
* hypervisor has been detected.
@@ -329,6 +346,8 @@ void __init hyperv_init(void)
hv_apic_init();
+ x86_init.pci.arch_init = hv_pci_init;
+
/*
* Register Hyper-V specific clocksource.
*/
diff --git a/arch/x86/hyperv/hv_spinlock.c b/arch/x86/hyperv/hv_spinlock.c
new file mode 100644
index 000000000000..a861b0456b1a
--- /dev/null
+++ b/arch/x86/hyperv/hv_spinlock.c
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Hyper-V specific spinlock code.
+ *
+ * Copyright (C) 2018, Intel, Inc.
+ *
+ * Author : Yi Sun <yi.y.sun@intel.com>
+ */
+
+#define pr_fmt(fmt) "Hyper-V: " fmt
+
+#include <linux/spinlock.h>
+
+#include <asm/mshyperv.h>
+#include <asm/paravirt.h>
+#include <asm/apic.h>
+
+static bool __initdata hv_pvspin = true;
+
+static void hv_qlock_kick(int cpu)
+{
+ apic->send_IPI(cpu, X86_PLATFORM_IPI_VECTOR);
+}
+
+static void hv_qlock_wait(u8 *byte, u8 val)
+{
+ unsigned long msr_val;
+ unsigned long flags;
+
+ if (in_nmi())
+ return;
+
+ /*
+ * Reading HV_X64_MSR_GUEST_IDLE MSR tells the hypervisor that the
+ * vCPU can be put into 'idle' state. This 'idle' state is
+ * terminated by an IPI, usually from hv_qlock_kick(), even if
+ * interrupts are disabled on the vCPU.
+ *
+ * To prevent a race against the unlock path it is required to
+ * disable interrupts before accessing the HV_X64_MSR_GUEST_IDLE
+ * MSR. Otherwise, if the IPI from hv_qlock_kick() arrives between
+ * the lock value check and the rdmsrl() then the vCPU might be put
+ * into 'idle' state by the hypervisor and kept in that state for
+ * an unspecified amount of time.
+ */
+ local_irq_save(flags);
+ /*
+ * Only issue the rdmsrl() when the lock state has not changed.
+ */
+ if (READ_ONCE(*byte) == val)
+ rdmsrl(HV_X64_MSR_GUEST_IDLE, msr_val);
+ local_irq_restore(flags);
+}
+
+/*
+ * Hyper-V does not support this so far.
+ */
+bool hv_vcpu_is_preempted(int vcpu)
+{
+ return false;
+}
+PV_CALLEE_SAVE_REGS_THUNK(hv_vcpu_is_preempted);
+
+void __init hv_init_spinlocks(void)
+{
+ if (!hv_pvspin || !apic ||
+ !(ms_hyperv.hints & HV_X64_CLUSTER_IPI_RECOMMENDED) ||
+ !(ms_hyperv.features & HV_X64_MSR_GUEST_IDLE_AVAILABLE)) {
+ pr_info("PV spinlocks disabled\n");
+ return;
+ }
+ pr_info("PV spinlocks enabled\n");
+
+ __pv_init_lock_hash();
+ pv_ops.lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
+ pv_ops.lock.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock);
+ pv_ops.lock.wait = hv_qlock_wait;
+ pv_ops.lock.kick = hv_qlock_kick;
+ pv_ops.lock.vcpu_is_preempted = PV_CALLEE_SAVE(hv_vcpu_is_preempted);
+}
+
+static __init int hv_parse_nopvspin(char *arg)
+{
+ hv_pvspin = false;
+ return 0;
+}
+early_param("hv_nopvspin", hv_parse_nopvspin);
diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c
index ef5f29f913d7..e65d7fe6489f 100644
--- a/arch/x86/hyperv/mmu.c
+++ b/arch/x86/hyperv/mmu.c
@@ -231,6 +231,6 @@ void hyperv_setup_mmu_ops(void)
return;
pr_info("Using hypercall for remote TLB flush\n");
- pv_mmu_ops.flush_tlb_others = hyperv_flush_tlb_others;
- pv_mmu_ops.tlb_remove_table = tlb_remove_table;
+ pv_ops.mmu.flush_tlb_others = hyperv_flush_tlb_others;
+ pv_ops.mmu.tlb_remove_table = tlb_remove_table;
}
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index a303d7b7d763..2f01eb4d6208 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -142,6 +142,8 @@ static inline u64 acpi_arch_get_root_pointer(void)
void acpi_generic_reduced_hw_init(void);
+u64 x86_default_get_root_pointer(void);
+
#else /* !CONFIG_ACPI */
#define acpi_lapic 0
@@ -153,6 +155,11 @@ static inline void disable_acpi(void) { }
static inline void acpi_generic_reduced_hw_init(void) { }
+static inline u64 x86_default_get_root_pointer(void)
+{
+ return 0;
+}
+
#endif /* !CONFIG_ACPI */
#define ARCH_HAS_POWER_INIT 1
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index 31b627b43a8e..8e4ea39e55d0 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -7,16 +7,24 @@
#include <asm/asm.h>
#ifdef CONFIG_SMP
- .macro LOCK_PREFIX
-672: lock
+.macro LOCK_PREFIX_HERE
.pushsection .smp_locks,"a"
.balign 4
- .long 672b - .
+ .long 671f - . # offset
.popsection
- .endm
+671:
+.endm
+
+.macro LOCK_PREFIX insn:vararg
+ LOCK_PREFIX_HERE
+ lock \insn
+.endm
#else
- .macro LOCK_PREFIX
- .endm
+.macro LOCK_PREFIX_HERE
+.endm
+
+.macro LOCK_PREFIX insn:vararg
+.endm
#endif
/*
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 4cd6a3b71824..d7faa16622d8 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -31,15 +31,8 @@
*/
#ifdef CONFIG_SMP
-#define LOCK_PREFIX_HERE \
- ".pushsection .smp_locks,\"a\"\n" \
- ".balign 4\n" \
- ".long 671f - .\n" /* offset */ \
- ".popsection\n" \
- "671:"
-
-#define LOCK_PREFIX LOCK_PREFIX_HERE "\n\tlock; "
-
+#define LOCK_PREFIX_HERE "LOCK_PREFIX_HERE\n\t"
+#define LOCK_PREFIX "LOCK_PREFIX "
#else /* ! CONFIG_SMP */
#define LOCK_PREFIX_HERE ""
#define LOCK_PREFIX ""
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index fddb6d26239f..1ae4e5791afa 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -103,6 +103,9 @@ static inline u16 amd_pci_dev_to_node_id(struct pci_dev *pdev)
static inline bool amd_gart_present(void)
{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+ return false;
+
/* GART present only on Fam15h, upto model 0fh */
if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 ||
(boot_cpu_data.x86 == 0x15 && boot_cpu_data.x86_model < 0x10))
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index 990770f9e76b..21b086786404 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -120,16 +120,32 @@
/* Exception table entry */
#ifdef __ASSEMBLY__
# define _ASM_EXTABLE_HANDLE(from, to, handler) \
- .pushsection "__ex_table","a" ; \
- .balign 4 ; \
- .long (from) - . ; \
- .long (to) - . ; \
- .long (handler) - . ; \
+ ASM_EXTABLE_HANDLE from to handler
+
+.macro ASM_EXTABLE_HANDLE from:req to:req handler:req
+ .pushsection "__ex_table","a"
+ .balign 4
+ .long (\from) - .
+ .long (\to) - .
+ .long (\handler) - .
.popsection
+.endm
+#else /* __ASSEMBLY__ */
+
+# define _ASM_EXTABLE_HANDLE(from, to, handler) \
+ "ASM_EXTABLE_HANDLE from=" #from " to=" #to \
+ " handler=\"" #handler "\"\n\t"
+
+/* For C file, we already have NOKPROBE_SYMBOL macro */
+
+#endif /* __ASSEMBLY__ */
# define _ASM_EXTABLE(from, to) \
_ASM_EXTABLE_HANDLE(from, to, ex_handler_default)
+# define _ASM_EXTABLE_UA(from, to) \
+ _ASM_EXTABLE_HANDLE(from, to, ex_handler_uaccess)
+
# define _ASM_EXTABLE_FAULT(from, to) \
_ASM_EXTABLE_HANDLE(from, to, ex_handler_fault)
@@ -145,6 +161,7 @@
_ASM_PTR (entry); \
.popsection
+#ifdef __ASSEMBLY__
.macro ALIGN_DESTINATION
/* check for bad alignment of destination */
movl %edi,%ecx
@@ -165,34 +182,10 @@
jmp copy_user_handle_tail
.previous
- _ASM_EXTABLE(100b,103b)
- _ASM_EXTABLE(101b,103b)
+ _ASM_EXTABLE_UA(100b, 103b)
+ _ASM_EXTABLE_UA(101b, 103b)
.endm
-
-#else
-# define _EXPAND_EXTABLE_HANDLE(x) #x
-# define _ASM_EXTABLE_HANDLE(from, to, handler) \
- " .pushsection \"__ex_table\",\"a\"\n" \
- " .balign 4\n" \
- " .long (" #from ") - .\n" \
- " .long (" #to ") - .\n" \
- " .long (" _EXPAND_EXTABLE_HANDLE(handler) ") - .\n" \
- " .popsection\n"
-
-# define _ASM_EXTABLE(from, to) \
- _ASM_EXTABLE_HANDLE(from, to, ex_handler_default)
-
-# define _ASM_EXTABLE_FAULT(from, to) \
- _ASM_EXTABLE_HANDLE(from, to, ex_handler_fault)
-
-# define _ASM_EXTABLE_EX(from, to) \
- _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext)
-
-# define _ASM_EXTABLE_REFCOUNT(from, to) \
- _ASM_EXTABLE_HANDLE(from, to, ex_handler_refcount)
-
-/* For C file, we already have NOKPROBE_SYMBOL macro */
-#endif
+#endif /* __ASSEMBLY__ */
#ifndef __ASSEMBLY__
/*
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index b143717b92b3..ea3d95275b43 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -80,11 +80,11 @@ static __always_inline void arch_atomic_sub(int i, atomic_t *v)
* true if the result is zero, or false for all
* other cases.
*/
-#define arch_atomic_sub_and_test arch_atomic_sub_and_test
static __always_inline bool arch_atomic_sub_and_test(int i, atomic_t *v)
{
- GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, "er", i, "%0", e);
+ return GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, e, "er", i);
}
+#define arch_atomic_sub_and_test arch_atomic_sub_and_test
/**
* arch_atomic_inc - increment atomic variable
@@ -92,12 +92,12 @@ static __always_inline bool arch_atomic_sub_and_test(int i, atomic_t *v)
*
* Atomically increments @v by 1.
*/
-#define arch_atomic_inc arch_atomic_inc
static __always_inline void arch_atomic_inc(atomic_t *v)
{
asm volatile(LOCK_PREFIX "incl %0"
: "+m" (v->counter));
}
+#define arch_atomic_inc arch_atomic_inc
/**
* arch_atomic_dec - decrement atomic variable
@@ -105,12 +105,12 @@ static __always_inline void arch_atomic_inc(atomic_t *v)
*
* Atomically decrements @v by 1.
*/
-#define arch_atomic_dec arch_atomic_dec
static __always_inline void arch_atomic_dec(atomic_t *v)
{
asm volatile(LOCK_PREFIX "decl %0"
: "+m" (v->counter));
}
+#define arch_atomic_dec arch_atomic_dec
/**
* arch_atomic_dec_and_test - decrement and test
@@ -120,11 +120,11 @@ static __always_inline void arch_atomic_dec(atomic_t *v)
* returns true if the result is 0, or false for all other
* cases.
*/
-#define arch_atomic_dec_and_test arch_atomic_dec_and_test
static __always_inline bool arch_atomic_dec_and_test(atomic_t *v)
{
- GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, "%0", e);
+ return GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, e);
}
+#define arch_atomic_dec_and_test arch_atomic_dec_and_test
/**
* arch_atomic_inc_and_test - increment and test
@@ -134,11 +134,11 @@ static __always_inline bool arch_atomic_dec_and_test(atomic_t *v)
* and returns true if the result is zero, or false for all
* other cases.
*/
-#define arch_atomic_inc_and_test arch_atomic_inc_and_test
static __always_inline bool arch_atomic_inc_and_test(atomic_t *v)
{
- GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, "%0", e);
+ return GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, e);
}
+#define arch_atomic_inc_and_test arch_atomic_inc_and_test
/**
* arch_atomic_add_negative - add and test if negative
@@ -149,11 +149,11 @@ static __always_inline bool arch_atomic_inc_and_test(atomic_t *v)
* if the result is negative, or false when
* result is greater than or equal to zero.
*/
-#define arch_atomic_add_negative arch_atomic_add_negative
static __always_inline bool arch_atomic_add_negative(int i, atomic_t *v)
{
- GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, "er", i, "%0", s);
+ return GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, s, "er", i);
}
+#define arch_atomic_add_negative arch_atomic_add_negative
/**
* arch_atomic_add_return - add integer and return
diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
index ef959f02d070..6a5b0ec460da 100644
--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -205,12 +205,12 @@ static inline long long arch_atomic64_sub(long long i, atomic64_t *v)
*
* Atomically increments @v by 1.
*/
-#define arch_atomic64_inc arch_atomic64_inc
static inline void arch_atomic64_inc(atomic64_t *v)
{
__alternative_atomic64(inc, inc_return, /* no output */,
"S" (v) : "memory", "eax", "ecx", "edx");
}
+#define arch_atomic64_inc arch_atomic64_inc
/**
* arch_atomic64_dec - decrement atomic64 variable
@@ -218,12 +218,12 @@ static inline void arch_atomic64_inc(atomic64_t *v)
*
* Atomically decrements @v by 1.
*/
-#define arch_atomic64_dec arch_atomic64_dec
static inline void arch_atomic64_dec(atomic64_t *v)
{
__alternative_atomic64(dec, dec_return, /* no output */,
"S" (v) : "memory", "eax", "ecx", "edx");
}
+#define arch_atomic64_dec arch_atomic64_dec
/**
* arch_atomic64_add_unless - add unless the number is a given value
@@ -245,7 +245,6 @@ static inline int arch_atomic64_add_unless(atomic64_t *v, long long a,
return (int)a;
}
-#define arch_atomic64_inc_not_zero arch_atomic64_inc_not_zero
static inline int arch_atomic64_inc_not_zero(atomic64_t *v)
{
int r;
@@ -253,8 +252,8 @@ static inline int arch_atomic64_inc_not_zero(atomic64_t *v)
"S" (v) : "ecx", "edx", "memory");
return r;
}
+#define arch_atomic64_inc_not_zero arch_atomic64_inc_not_zero
-#define arch_atomic64_dec_if_positive arch_atomic64_dec_if_positive
static inline long long arch_atomic64_dec_if_positive(atomic64_t *v)
{
long long r;
@@ -262,6 +261,7 @@ static inline long long arch_atomic64_dec_if_positive(atomic64_t *v)
"S" (v) : "ecx", "memory");
return r;
}
+#define arch_atomic64_dec_if_positive arch_atomic64_dec_if_positive
#undef alternative_atomic64
#undef __alternative_atomic64
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index 4343d9b4f30e..dadc20adba21 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -71,11 +71,11 @@ static inline void arch_atomic64_sub(long i, atomic64_t *v)
* true if the result is zero, or false for all
* other cases.
*/
-#define arch_atomic64_sub_and_test arch_atomic64_sub_and_test
static inline bool arch_atomic64_sub_and_test(long i, atomic64_t *v)
{
- GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, "er", i, "%0", e);
+ return GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, e, "er", i);
}
+#define arch_atomic64_sub_and_test arch_atomic64_sub_and_test
/**
* arch_atomic64_inc - increment atomic64 variable
@@ -83,13 +83,13 @@ static inline bool arch_atomic64_sub_and_test(long i, atomic64_t *v)
*
* Atomically increments @v by 1.
*/
-#define arch_atomic64_inc arch_atomic64_inc
static __always_inline void arch_atomic64_inc(atomic64_t *v)
{
asm volatile(LOCK_PREFIX "incq %0"
: "=m" (v->counter)
: "m" (v->counter));
}
+#define arch_atomic64_inc arch_atomic64_inc
/**
* arch_atomic64_dec - decrement atomic64 variable
@@ -97,13 +97,13 @@ static __always_inline void arch_atomic64_inc(atomic64_t *v)
*
* Atomically decrements @v by 1.
*/
-#define arch_atomic64_dec arch_atomic64_dec
static __always_inline void arch_atomic64_dec(atomic64_t *v)
{
asm volatile(LOCK_PREFIX "decq %0"
: "=m" (v->counter)
: "m" (v->counter));
}
+#define arch_atomic64_dec arch_atomic64_dec
/**
* arch_atomic64_dec_and_test - decrement and test
@@ -113,11 +113,11 @@ static __always_inline void arch_atomic64_dec(atomic64_t *v)
* returns true if the result is 0, or false for all other
* cases.
*/
-#define arch_atomic64_dec_and_test arch_atomic64_dec_and_test
static inline bool arch_atomic64_dec_and_test(atomic64_t *v)
{
- GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, "%0", e);
+ return GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, e);
}
+#define arch_atomic64_dec_and_test arch_atomic64_dec_and_test
/**
* arch_atomic64_inc_and_test - increment and test
@@ -127,11 +127,11 @@ static inline bool arch_atomic64_dec_and_test(atomic64_t *v)
* and returns true if the result is zero, or false for all
* other cases.
*/
-#define arch_atomic64_inc_and_test arch_atomic64_inc_and_test
static inline bool arch_atomic64_inc_and_test(atomic64_t *v)
{
- GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, "%0", e);
+ return GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, e);
}
+#define arch_atomic64_inc_and_test arch_atomic64_inc_and_test
/**
* arch_atomic64_add_negative - add and test if negative
@@ -142,11 +142,11 @@ static inline bool arch_atomic64_inc_and_test(atomic64_t *v)
* if the result is negative, or false when
* result is greater than or equal to zero.
*/
-#define arch_atomic64_add_negative arch_atomic64_add_negative
static inline bool arch_atomic64_add_negative(long i, atomic64_t *v)
{
- GEN_BINARY_RMWcc(LOCK_PREFIX "addq", v->counter, "er", i, "%0", s);
+ return GEN_BINARY_RMWcc(LOCK_PREFIX "addq", v->counter, s, "er", i);
}
+#define arch_atomic64_add_negative arch_atomic64_add_negative
/**
* arch_atomic64_add_return - add and return
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 9f645ba57dbb..124f9195eb3e 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -217,8 +217,7 @@ static __always_inline void change_bit(long nr, volatile unsigned long *addr)
*/
static __always_inline bool test_and_set_bit(long nr, volatile unsigned long *addr)
{
- GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(bts),
- *addr, "Ir", nr, "%0", c);
+ return GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(bts), *addr, c, "Ir", nr);
}
/**
@@ -264,8 +263,7 @@ static __always_inline bool __test_and_set_bit(long nr, volatile unsigned long *
*/
static __always_inline bool test_and_clear_bit(long nr, volatile unsigned long *addr)
{
- GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(btr),
- *addr, "Ir", nr, "%0", c);
+ return GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(btr), *addr, c, "Ir", nr);
}
/**
@@ -318,8 +316,7 @@ static __always_inline bool __test_and_change_bit(long nr, volatile unsigned lon
*/
static __always_inline bool test_and_change_bit(long nr, volatile unsigned long *addr)
{
- GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(btc),
- *addr, "Ir", nr, "%0", c);
+ return GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(btc), *addr, c, "Ir", nr);
}
static __always_inline bool constant_test_bit(long nr, const volatile unsigned long *addr)
diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h
index 6804d6642767..5090035e6d16 100644
--- a/arch/x86/include/asm/bug.h
+++ b/arch/x86/include/asm/bug.h
@@ -4,6 +4,8 @@
#include <linux/stringify.h>
+#ifndef __ASSEMBLY__
+
/*
* Despite that some emulators terminate on UD2, we use it for WARN().
*
@@ -20,53 +22,15 @@
#define LEN_UD2 2
-#ifdef CONFIG_GENERIC_BUG
-
-#ifdef CONFIG_X86_32
-# define __BUG_REL(val) ".long " __stringify(val)
-#else
-# define __BUG_REL(val) ".long " __stringify(val) " - 2b"
-#endif
-
-#ifdef CONFIG_DEBUG_BUGVERBOSE
-
-#define _BUG_FLAGS(ins, flags) \
-do { \
- asm volatile("1:\t" ins "\n" \
- ".pushsection __bug_table,\"aw\"\n" \
- "2:\t" __BUG_REL(1b) "\t# bug_entry::bug_addr\n" \
- "\t" __BUG_REL(%c0) "\t# bug_entry::file\n" \
- "\t.word %c1" "\t# bug_entry::line\n" \
- "\t.word %c2" "\t# bug_entry::flags\n" \
- "\t.org 2b+%c3\n" \
- ".popsection" \
- : : "i" (__FILE__), "i" (__LINE__), \
- "i" (flags), \
- "i" (sizeof(struct bug_entry))); \
-} while (0)
-
-#else /* !CONFIG_DEBUG_BUGVERBOSE */
-
#define _BUG_FLAGS(ins, flags) \
do { \
- asm volatile("1:\t" ins "\n" \
- ".pushsection __bug_table,\"aw\"\n" \
- "2:\t" __BUG_REL(1b) "\t# bug_entry::bug_addr\n" \
- "\t.word %c0" "\t# bug_entry::flags\n" \
- "\t.org 2b+%c1\n" \
- ".popsection" \
- : : "i" (flags), \
+ asm volatile("ASM_BUG ins=\"" ins "\" file=%c0 line=%c1 " \
+ "flags=%c2 size=%c3" \
+ : : "i" (__FILE__), "i" (__LINE__), \
+ "i" (flags), \
"i" (sizeof(struct bug_entry))); \
} while (0)
-#endif /* CONFIG_DEBUG_BUGVERBOSE */
-
-#else
-
-#define _BUG_FLAGS(ins, flags) asm volatile(ins)
-
-#endif /* CONFIG_GENERIC_BUG */
-
#define HAVE_ARCH_BUG
#define BUG() \
do { \
@@ -82,4 +46,54 @@ do { \
#include <asm-generic/bug.h>
+#else /* __ASSEMBLY__ */
+
+#ifdef CONFIG_GENERIC_BUG
+
+#ifdef CONFIG_X86_32
+.macro __BUG_REL val:req
+ .long \val
+.endm
+#else
+.macro __BUG_REL val:req
+ .long \val - 2b
+.endm
+#endif
+
+#ifdef CONFIG_DEBUG_BUGVERBOSE
+
+.macro ASM_BUG ins:req file:req line:req flags:req size:req
+1: \ins
+ .pushsection __bug_table,"aw"
+2: __BUG_REL val=1b # bug_entry::bug_addr
+ __BUG_REL val=\file # bug_entry::file
+ .word \line # bug_entry::line
+ .word \flags # bug_entry::flags
+ .org 2b+\size
+ .popsection
+.endm
+
+#else /* !CONFIG_DEBUG_BUGVERBOSE */
+
+.macro ASM_BUG ins:req file:req line:req flags:req size:req
+1: \ins
+ .pushsection __bug_table,"aw"
+2: __BUG_REL val=1b # bug_entry::bug_addr
+ .word \flags # bug_entry::flags
+ .org 2b+\size
+ .popsection
+.endm
+
+#endif /* CONFIG_DEBUG_BUGVERBOSE */
+
+#else /* CONFIG_GENERIC_BUG */
+
+.macro ASM_BUG ins:req file:req line:req flags:req size:req
+ \ins
+.endm
+
+#endif /* CONFIG_GENERIC_BUG */
+
+#endif /* __ASSEMBLY__ */
+
#endif /* _ASM_X86_BUG_H */
diff --git a/arch/x86/include/asm/cacheinfo.h b/arch/x86/include/asm/cacheinfo.h
index e958e28f7ab5..86b63c7feab7 100644
--- a/arch/x86/include/asm/cacheinfo.h
+++ b/arch/x86/include/asm/cacheinfo.h
@@ -3,5 +3,6 @@
#define _ASM_X86_CACHEINFO_H
void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id);
+void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id);
#endif /* _ASM_X86_CACHEINFO_H */
diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h
index a55d79b233d3..bfb85e5844ab 100644
--- a/arch/x86/include/asm/cmpxchg.h
+++ b/arch/x86/include/asm/cmpxchg.h
@@ -242,10 +242,12 @@ extern void __add_wrong_size(void)
BUILD_BUG_ON(sizeof(*(p2)) != sizeof(long)); \
VM_BUG_ON((unsigned long)(p1) % (2 * sizeof(long))); \
VM_BUG_ON((unsigned long)((p1) + 1) != (unsigned long)(p2)); \
- asm volatile(pfx "cmpxchg%c4b %2; sete %0" \
- : "=a" (__ret), "+d" (__old2), \
- "+m" (*(p1)), "+m" (*(p2)) \
- : "i" (2 * sizeof(long)), "a" (__old1), \
+ asm volatile(pfx "cmpxchg%c5b %1" \
+ CC_SET(e) \
+ : CC_OUT(e) (__ret), \
+ "+m" (*(p1)), "+m" (*(p2)), \
+ "+a" (__old1), "+d" (__old2) \
+ : "i" (2 * sizeof(long)), \
"b" (__new1), "c" (__new2)); \
__ret; \
})
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index 0ce6f452d334..fab4df16a3c4 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -225,6 +225,6 @@ static inline bool in_compat_syscall(void)
struct compat_siginfo;
int __copy_siginfo_to_user32(struct compat_siginfo __user *to,
- const siginfo_t *from, bool x32_ABI);
+ const kernel_siginfo_t *from, bool x32_ABI);
#endif /* _ASM_X86_COMPAT_H */
diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h
index 4a7884b8dca5..29c706415443 100644
--- a/arch/x86/include/asm/cpu_entry_area.h
+++ b/arch/x86/include/asm/cpu_entry_area.h
@@ -30,8 +30,6 @@ struct cpu_entry_area {
*/
struct tss_struct tss;
- char entry_trampoline[PAGE_SIZE];
-
#ifdef CONFIG_X86_64
/*
* Exception stacks used for IST entries.
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index aced6c9290d6..7d442722ef24 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -2,10 +2,10 @@
#ifndef _ASM_X86_CPUFEATURE_H
#define _ASM_X86_CPUFEATURE_H
-#include <asm/processor.h>
-
-#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
+#ifdef __KERNEL__
+#ifndef __ASSEMBLY__
+#include <asm/processor.h>
#include <asm/asm.h>
#include <linux/bitops.h>
@@ -161,37 +161,10 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);
*/
static __always_inline __pure bool _static_cpu_has(u16 bit)
{
- asm_volatile_goto("1: jmp 6f\n"
- "2:\n"
- ".skip -(((5f-4f) - (2b-1b)) > 0) * "
- "((5f-4f) - (2b-1b)),0x90\n"
- "3:\n"
- ".section .altinstructions,\"a\"\n"
- " .long 1b - .\n" /* src offset */
- " .long 4f - .\n" /* repl offset */
- " .word %P[always]\n" /* always replace */
- " .byte 3b - 1b\n" /* src len */
- " .byte 5f - 4f\n" /* repl len */
- " .byte 3b - 2b\n" /* pad len */
- ".previous\n"
- ".section .altinstr_replacement,\"ax\"\n"
- "4: jmp %l[t_no]\n"
- "5:\n"
- ".previous\n"
- ".section .altinstructions,\"a\"\n"
- " .long 1b - .\n" /* src offset */
- " .long 0\n" /* no replacement */
- " .word %P[feature]\n" /* feature bit */
- " .byte 3b - 1b\n" /* src len */
- " .byte 0\n" /* repl len */
- " .byte 0\n" /* pad len */
- ".previous\n"
- ".section .altinstr_aux,\"ax\"\n"
- "6:\n"
- " testb %[bitnum],%[cap_byte]\n"
- " jnz %l[t_yes]\n"
- " jmp %l[t_no]\n"
- ".previous\n"
+ asm_volatile_goto("STATIC_CPU_HAS bitnum=%[bitnum] "
+ "cap_byte=\"%[cap_byte]\" "
+ "feature=%P[feature] t_yes=%l[t_yes] "
+ "t_no=%l[t_no] always=%P[always]"
: : [feature] "i" (bit),
[always] "i" (X86_FEATURE_ALWAYS),
[bitnum] "i" (1 << (bit & 7)),
@@ -226,5 +199,44 @@ t_no:
#define CPU_FEATURE_TYPEVAL boot_cpu_data.x86_vendor, boot_cpu_data.x86, \
boot_cpu_data.x86_model
-#endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */
+#else /* __ASSEMBLY__ */
+
+.macro STATIC_CPU_HAS bitnum:req cap_byte:req feature:req t_yes:req t_no:req always:req
+1:
+ jmp 6f
+2:
+ .skip -(((5f-4f) - (2b-1b)) > 0) * ((5f-4f) - (2b-1b)),0x90
+3:
+ .section .altinstructions,"a"
+ .long 1b - . /* src offset */
+ .long 4f - . /* repl offset */
+ .word \always /* always replace */
+ .byte 3b - 1b /* src len */
+ .byte 5f - 4f /* repl len */
+ .byte 3b - 2b /* pad len */
+ .previous
+ .section .altinstr_replacement,"ax"
+4:
+ jmp \t_no
+5:
+ .previous
+ .section .altinstructions,"a"
+ .long 1b - . /* src offset */
+ .long 0 /* no replacement */
+ .word \feature /* feature bit */
+ .byte 3b - 1b /* src len */
+ .byte 0 /* repl len */
+ .byte 0 /* pad len */
+ .previous
+ .section .altinstr_aux,"ax"
+6:
+ testb \bitnum,\cap_byte
+ jnz \t_yes
+ jmp \t_no
+ .previous
+.endm
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* __KERNEL__ */
#endif /* _ASM_X86_CPUFEATURE_H */
diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h
index 4505ac2735ad..9e5ca30738e5 100644
--- a/arch/x86/include/asm/debugreg.h
+++ b/arch/x86/include/asm/debugreg.h
@@ -8,7 +8,7 @@
DECLARE_PER_CPU(unsigned long, cpu_dr7);
-#ifndef CONFIG_PARAVIRT
+#ifndef CONFIG_PARAVIRT_XXL
/*
* These special macros can be used to get or set a debugging register
*/
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 13c5ee878a47..68a99d2a5f33 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -108,7 +108,7 @@ static inline int desc_empty(const void *ptr)
return !(desc[0] | desc[1]);
}
-#ifdef CONFIG_PARAVIRT
+#ifdef CONFIG_PARAVIRT_XXL
#include <asm/paravirt.h>
#else
#define load_TR_desc() native_load_tr_desc()
@@ -134,7 +134,7 @@ static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries)
static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries)
{
}
-#endif /* CONFIG_PARAVIRT */
+#endif /* CONFIG_PARAVIRT_XXL */
#define store_ldt(ldt) asm("sldt %0" : "=m"(ldt))
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index cec5fae23eb3..eea40d52ca78 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -140,6 +140,7 @@ extern void __init efi_apply_memmap_quirks(void);
extern int __init efi_reuse_config(u64 tables, int nr_tables);
extern void efi_delete_dummy_variable(void);
extern void efi_switch_mm(struct mm_struct *mm);
+extern void efi_recover_from_page_fault(unsigned long phys_addr);
struct efi_setup_data {
u64 fw_vendor;
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 0d157d2a1e2a..69c0f892e310 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -10,6 +10,7 @@
#include <asm/ptrace.h>
#include <asm/user.h>
#include <asm/auxvec.h>
+#include <asm/fsgsbase.h>
typedef unsigned long elf_greg_t;
@@ -62,8 +63,7 @@ typedef struct user_fxsr_struct elf_fpxregset_t;
#define R_X86_64_PC16 13 /* 16 bit sign extended pc relative */
#define R_X86_64_8 14 /* Direct 8 bit sign extended */
#define R_X86_64_PC8 15 /* 8 bit sign extended pc relative */
-
-#define R_X86_64_NUM 16
+#define R_X86_64_PC64 24 /* Place relative 64-bit signed */
/*
* These are used to set parameters in the core dumps.
@@ -205,7 +205,6 @@ void set_personality_ia32(bool);
#define ELF_CORE_COPY_REGS(pr_reg, regs) \
do { \
- unsigned long base; \
unsigned v; \
(pr_reg)[0] = (regs)->r15; \
(pr_reg)[1] = (regs)->r14; \
@@ -228,8 +227,8 @@ do { \
(pr_reg)[18] = (regs)->flags; \
(pr_reg)[19] = (regs)->sp; \
(pr_reg)[20] = (regs)->ss; \
- rdmsrl(MSR_FS_BASE, base); (pr_reg)[21] = base; \
- rdmsrl(MSR_KERNEL_GS_BASE, base); (pr_reg)[22] = base; \
+ (pr_reg)[21] = x86_fsbase_read_cpu(); \
+ (pr_reg)[22] = x86_gsbase_read_cpu_inactive(); \
asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v; \
asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v; \
asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v; \
diff --git a/arch/x86/include/asm/extable.h b/arch/x86/include/asm/extable.h
index f9c3a5d502f4..d8c2198d543b 100644
--- a/arch/x86/include/asm/extable.h
+++ b/arch/x86/include/asm/extable.h
@@ -29,7 +29,8 @@ struct pt_regs;
(b)->handler = (tmp).handler - (delta); \
} while (0)
-extern int fixup_exception(struct pt_regs *regs, int trapnr);
+extern int fixup_exception(struct pt_regs *regs, int trapnr,
+ unsigned long error_code, unsigned long fault_addr);
extern int fixup_bug(struct pt_regs *regs, int trapnr);
extern bool ex_has_fault_handler(unsigned long ip);
extern void early_fixup_exception(struct pt_regs *regs, int trapnr);
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index e203169931c7..50ba74a34a37 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -14,6 +14,16 @@
#ifndef _ASM_X86_FIXMAP_H
#define _ASM_X86_FIXMAP_H
+/*
+ * Exposed to assembly code for setting up initial page tables. Cannot be
+ * calculated in assembly code (fixmap entries are an enum), but is sanity
+ * checked in the actual fixmap C code to make sure that the fixmap is
+ * covered fully.
+ */
+#define FIXMAP_PMD_NUM 2
+/* fixmap starts downwards from the 507th entry in level2_fixmap_pgt */
+#define FIXMAP_PMD_TOP 507
+
#ifndef __ASSEMBLY__
#include <linux/kernel.h>
#include <asm/acpi.h>
@@ -152,7 +162,7 @@ void __native_set_fixmap(enum fixed_addresses idx, pte_t pte);
void native_set_fixmap(enum fixed_addresses idx,
phys_addr_t phys, pgprot_t flags);
-#ifndef CONFIG_PARAVIRT
+#ifndef CONFIG_PARAVIRT_XXL
static inline void __set_fixmap(enum fixed_addresses idx,
phys_addr_t phys, pgprot_t flags)
{
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index a38bf5a1e37a..5f7290e6e954 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -226,7 +226,7 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu)
"3: movl $-2,%[err]\n\t" \
"jmp 2b\n\t" \
".popsection\n\t" \
- _ASM_EXTABLE(1b, 3b) \
+ _ASM_EXTABLE_UA(1b, 3b) \
: [err] "=r" (err) \
: "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \
: "memory")
@@ -528,7 +528,7 @@ static inline void fpregs_activate(struct fpu *fpu)
static inline void
switch_fpu_prepare(struct fpu *old_fpu, int cpu)
{
- if (old_fpu->initialized) {
+ if (static_cpu_has(X86_FEATURE_FPU) && old_fpu->initialized) {
if (!copy_fpregs_to_fpstate(old_fpu))
old_fpu->last_cpu = -1;
else
diff --git a/arch/x86/include/asm/fsgsbase.h b/arch/x86/include/asm/fsgsbase.h
new file mode 100644
index 000000000000..eb377b6e9eed
--- /dev/null
+++ b/arch/x86/include/asm/fsgsbase.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_FSGSBASE_H
+#define _ASM_FSGSBASE_H
+
+#ifndef __ASSEMBLY__
+
+#ifdef CONFIG_X86_64
+
+#include <asm/msr-index.h>
+
+/*
+ * Read/write a task's FSBASE or GSBASE. This returns the value that
+ * the FS/GS base would have (if the task were to be resumed). These
+ * work on the current task or on a non-running (typically stopped
+ * ptrace child) task.
+ */
+extern unsigned long x86_fsbase_read_task(struct task_struct *task);
+extern unsigned long x86_gsbase_read_task(struct task_struct *task);
+extern int x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase);
+extern int x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase);
+
+/* Helper functions for reading/writing FS/GS base */
+
+static inline unsigned long x86_fsbase_read_cpu(void)
+{
+ unsigned long fsbase;
+
+ rdmsrl(MSR_FS_BASE, fsbase);
+
+ return fsbase;
+}
+
+static inline unsigned long x86_gsbase_read_cpu_inactive(void)
+{
+ unsigned long gsbase;
+
+ rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
+
+ return gsbase;
+}
+
+extern void x86_fsbase_write_cpu(unsigned long fsbase);
+extern void x86_gsbase_write_cpu_inactive(unsigned long gsbase);
+
+#endif /* CONFIG_X86_64 */
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_FSGSBASE_H */
diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h
index de4d68852d3a..13c83fe97988 100644
--- a/arch/x86/include/asm/futex.h
+++ b/arch/x86/include/asm/futex.h
@@ -20,7 +20,7 @@
"3:\tmov\t%3, %1\n" \
"\tjmp\t2b\n" \
"\t.previous\n" \
- _ASM_EXTABLE(1b, 3b) \
+ _ASM_EXTABLE_UA(1b, 3b) \
: "=r" (oldval), "=r" (ret), "+m" (*uaddr) \
: "i" (-EFAULT), "0" (oparg), "1" (0))
@@ -36,8 +36,8 @@
"4:\tmov\t%5, %1\n" \
"\tjmp\t3b\n" \
"\t.previous\n" \
- _ASM_EXTABLE(1b, 4b) \
- _ASM_EXTABLE(2b, 4b) \
+ _ASM_EXTABLE_UA(1b, 4b) \
+ _ASM_EXTABLE_UA(2b, 4b) \
: "=&a" (oldval), "=&r" (ret), \
"+m" (*uaddr), "=&r" (tem) \
: "r" (oparg), "i" (-EFAULT), "1" (0))
diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
index e977b6b3a538..4139f7650fe5 100644
--- a/arch/x86/include/asm/hyperv-tlfs.h
+++ b/arch/x86/include/asm/hyperv-tlfs.h
@@ -38,6 +38,8 @@
#define HV_MSR_TIME_REF_COUNT_AVAILABLE (1 << 1)
/* Partition reference TSC MSR is available */
#define HV_MSR_REFERENCE_TSC_AVAILABLE (1 << 9)
+/* Partition Guest IDLE MSR is available */
+#define HV_X64_MSR_GUEST_IDLE_AVAILABLE (1 << 10)
/* A partition's reference time stamp counter (TSC) page */
#define HV_X64_MSR_REFERENCE_TSC 0x40000021
@@ -246,6 +248,9 @@
#define HV_X64_MSR_STIMER3_CONFIG 0x400000B6
#define HV_X64_MSR_STIMER3_COUNT 0x400000B7
+/* Hyper-V guest idle MSR */
+#define HV_X64_MSR_GUEST_IDLE 0x400000F0
+
/* Hyper-V guest crash notification MSR's */
#define HV_X64_MSR_CRASH_P0 0x40000100
#define HV_X64_MSR_CRASH_P1 0x40000101
@@ -726,19 +731,21 @@ struct hv_enlightened_vmcs {
#define HV_STIMER_AUTOENABLE (1ULL << 3)
#define HV_STIMER_SINT(config) (__u8)(((config) >> 16) & 0x0F)
-struct ipi_arg_non_ex {
- u32 vector;
- u32 reserved;
- u64 cpu_mask;
-};
-
struct hv_vpset {
u64 format;
u64 valid_bank_mask;
u64 bank_contents[];
};
-struct ipi_arg_ex {
+/* HvCallSendSyntheticClusterIpi hypercall */
+struct hv_send_ipi {
+ u32 vector;
+ u32 reserved;
+ u64 cpu_mask;
+};
+
+/* HvCallSendSyntheticClusterIpiEx hypercall */
+struct hv_send_ipi_ex {
u32 vector;
u32 reserved;
struct hv_vpset vp_set;
diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h
index 7ed08a7c3398..0dd6b0f4000e 100644
--- a/arch/x86/include/asm/intel-family.h
+++ b/arch/x86/include/asm/intel-family.h
@@ -8,9 +8,6 @@
* The "_X" parts are generally the EP and EX Xeons, or the
* "Extreme" ones, like Broadwell-E.
*
- * Things ending in "2" are usually because we have no better
- * name for them. There's no processor called "SILVERMONT2".
- *
* While adding a new CPUID for a new microarchitecture, add a new
* group to keep logically sorted out in chronological order. Within
* that group keep the CPUID for the variants sorted by model number.
@@ -57,19 +54,23 @@
/* "Small Core" Processors (Atom) */
-#define INTEL_FAM6_ATOM_PINEVIEW 0x1C
-#define INTEL_FAM6_ATOM_LINCROFT 0x26
-#define INTEL_FAM6_ATOM_PENWELL 0x27
-#define INTEL_FAM6_ATOM_CLOVERVIEW 0x35
-#define INTEL_FAM6_ATOM_CEDARVIEW 0x36
-#define INTEL_FAM6_ATOM_SILVERMONT1 0x37 /* BayTrail/BYT / Valleyview */
-#define INTEL_FAM6_ATOM_SILVERMONT2 0x4D /* Avaton/Rangely */
-#define INTEL_FAM6_ATOM_AIRMONT 0x4C /* CherryTrail / Braswell */
-#define INTEL_FAM6_ATOM_MERRIFIELD 0x4A /* Tangier */
-#define INTEL_FAM6_ATOM_MOOREFIELD 0x5A /* Anniedale */
-#define INTEL_FAM6_ATOM_GOLDMONT 0x5C
-#define INTEL_FAM6_ATOM_DENVERTON 0x5F /* Goldmont Microserver */
-#define INTEL_FAM6_ATOM_GEMINI_LAKE 0x7A
+#define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */
+#define INTEL_FAM6_ATOM_BONNELL_MID 0x26 /* Silverthorne, Lincroft */
+
+#define INTEL_FAM6_ATOM_SALTWELL 0x36 /* Cedarview */
+#define INTEL_FAM6_ATOM_SALTWELL_MID 0x27 /* Penwell */
+#define INTEL_FAM6_ATOM_SALTWELL_TABLET 0x35 /* Cloverview */
+
+#define INTEL_FAM6_ATOM_SILVERMONT 0x37 /* Bay Trail, Valleyview */
+#define INTEL_FAM6_ATOM_SILVERMONT_X 0x4D /* Avaton, Rangely */
+#define INTEL_FAM6_ATOM_SILVERMONT_MID 0x4A /* Merriefield */
+
+#define INTEL_FAM6_ATOM_AIRMONT 0x4C /* Cherry Trail, Braswell */
+#define INTEL_FAM6_ATOM_AIRMONT_MID 0x5A /* Moorefield */
+
+#define INTEL_FAM6_ATOM_GOLDMONT 0x5C /* Apollo Lake */
+#define INTEL_FAM6_ATOM_GOLDMONT_X 0x5F /* Denverton */
+#define INTEL_FAM6_ATOM_GOLDMONT_PLUS 0x7A /* Gemini Lake */
/* Xeon Phi */
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 6de64840dd22..832da8229cc7 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -187,11 +187,12 @@ extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size)
#define ioremap_nocache ioremap_nocache
extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size);
#define ioremap_uc ioremap_uc
-
extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
#define ioremap_cache ioremap_cache
extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, unsigned long prot_val);
#define ioremap_prot ioremap_prot
+extern void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size);
+#define ioremap_encrypted ioremap_encrypted
/**
* ioremap - map bus memory into CPU space
@@ -369,18 +370,6 @@ extern void __iomem *ioremap_wt(resource_size_t offset, unsigned long size);
extern bool is_early_ioremap_ptep(pte_t *ptep);
-#ifdef CONFIG_XEN
-#include <xen/xen.h>
-struct bio_vec;
-
-extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
- const struct bio_vec *vec2);
-
-#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \
- (__BIOVEC_PHYS_MERGEABLE(vec1, vec2) && \
- (!xen_domain() || xen_biovec_phys_mergeable(vec1, vec2)))
-#endif /* CONFIG_XEN */
-
#define IO_SPACE_LIMIT 0xffff
#include <asm-generic/io.h>
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index c14f2a74b2be..058e40fed167 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -33,7 +33,8 @@ extern inline unsigned long native_save_fl(void)
return flags;
}
-static inline void native_restore_fl(unsigned long flags)
+extern inline void native_restore_fl(unsigned long flags);
+extern inline void native_restore_fl(unsigned long flags)
{
asm volatile("push %0 ; popf"
: /* no output */
@@ -63,7 +64,7 @@ static inline __cpuidle void native_halt(void)
#endif
-#ifdef CONFIG_PARAVIRT
+#ifdef CONFIG_PARAVIRT_XXL
#include <asm/paravirt.h>
#else
#ifndef __ASSEMBLY__
@@ -122,6 +123,10 @@ static inline notrace unsigned long arch_local_irq_save(void)
#define DISABLE_INTERRUPTS(x) cli
#ifdef CONFIG_X86_64
+#ifdef CONFIG_DEBUG_ENTRY
+#define SAVE_FLAGS(x) pushfq; popq %rax
+#endif
+
#define SWAPGS swapgs
/*
* Currently paravirt can't handle swapgs nicely when we
@@ -134,8 +139,6 @@ static inline notrace unsigned long arch_local_irq_save(void)
*/
#define SWAPGS_UNSAFE_STACK swapgs
-#define PARAVIRT_ADJUST_EXCEPTION_FRAME /* */
-
#define INTERRUPT_RETURN jmp native_iret
#define USERGS_SYSRET64 \
swapgs; \
@@ -144,18 +147,12 @@ static inline notrace unsigned long arch_local_irq_save(void)
swapgs; \
sysretl
-#ifdef CONFIG_DEBUG_ENTRY
-#define SAVE_FLAGS(x) pushfq; popq %rax
-#endif
#else
#define INTERRUPT_RETURN iret
-#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit
-#define GET_CR0_INTO_EAX movl %cr0, %eax
#endif
-
#endif /* __ASSEMBLY__ */
-#endif /* CONFIG_PARAVIRT */
+#endif /* CONFIG_PARAVIRT_XXL */
#ifndef __ASSEMBLY__
static inline int arch_irqs_disabled_flags(unsigned long flags)
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h
index 8c0de4282659..a5fb34fe56a4 100644
--- a/arch/x86/include/asm/jump_label.h
+++ b/arch/x86/include/asm/jump_label.h
@@ -2,19 +2,6 @@
#ifndef _ASM_X86_JUMP_LABEL_H
#define _ASM_X86_JUMP_LABEL_H
-#ifndef HAVE_JUMP_LABEL
-/*
- * For better or for worse, if jump labels (the gcc extension) are missing,
- * then the entire static branch patching infrastructure is compiled out.
- * If that happens, the code in here will malfunction. Raise a compiler
- * error instead.
- *
- * In theory, jump labels and the static branch patching infrastructure
- * could be decoupled to fix this.
- */
-#error asm/jump_label.h included on a non-jump-label kernel
-#endif
-
#define JUMP_LABEL_NOP_SIZE 5
#ifdef CONFIG_X86_64
@@ -33,14 +20,9 @@
static __always_inline bool arch_static_branch(struct static_key *key, bool branch)
{
- asm_volatile_goto("1:"
- ".byte " __stringify(STATIC_KEY_INIT_NOP) "\n\t"
- ".pushsection __jump_table, \"aw\" \n\t"
- _ASM_ALIGN "\n\t"
- _ASM_PTR "1b, %l[l_yes], %c0 + %c1 \n\t"
- ".popsection \n\t"
- : : "i" (key), "i" (branch) : : l_yes);
-
+ asm_volatile_goto("STATIC_BRANCH_NOP l_yes=\"%l[l_yes]\" key=\"%c0\" "
+ "branch=\"%c1\""
+ : : "i" (key), "i" (branch) : : l_yes);
return false;
l_yes:
return true;
@@ -48,13 +30,8 @@ l_yes:
static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch)
{
- asm_volatile_goto("1:"
- ".byte 0xe9\n\t .long %l[l_yes] - 2f\n\t"
- "2:\n\t"
- ".pushsection __jump_table, \"aw\" \n\t"
- _ASM_ALIGN "\n\t"
- _ASM_PTR "1b, %l[l_yes], %c0 + %c1 \n\t"
- ".popsection \n\t"
+ asm_volatile_goto("STATIC_BRANCH_JMP l_yes=\"%l[l_yes]\" key=\"%c0\" "
+ "branch=\"%c1\""
: : "i" (key), "i" (branch) : : l_yes);
return false;
@@ -62,49 +39,28 @@ l_yes:
return true;
}
-#ifdef CONFIG_X86_64
-typedef u64 jump_label_t;
-#else
-typedef u32 jump_label_t;
-#endif
-
-struct jump_entry {
- jump_label_t code;
- jump_label_t target;
- jump_label_t key;
-};
-
#else /* __ASSEMBLY__ */
-.macro STATIC_JUMP_IF_TRUE target, key, def
-.Lstatic_jump_\@:
- .if \def
- /* Equivalent to "jmp.d32 \target" */
- .byte 0xe9
- .long \target - .Lstatic_jump_after_\@
-.Lstatic_jump_after_\@:
- .else
- .byte STATIC_KEY_INIT_NOP
- .endif
+.macro STATIC_BRANCH_NOP l_yes:req key:req branch:req
+.Lstatic_branch_nop_\@:
+ .byte STATIC_KEY_INIT_NOP
+.Lstatic_branch_no_after_\@:
.pushsection __jump_table, "aw"
_ASM_ALIGN
- _ASM_PTR .Lstatic_jump_\@, \target, \key
+ .long .Lstatic_branch_nop_\@ - ., \l_yes - .
+ _ASM_PTR \key + \branch - .
.popsection
.endm
-.macro STATIC_JUMP_IF_FALSE target, key, def
-.Lstatic_jump_\@:
- .if \def
- .byte STATIC_KEY_INIT_NOP
- .else
- /* Equivalent to "jmp.d32 \target" */
- .byte 0xe9
- .long \target - .Lstatic_jump_after_\@
-.Lstatic_jump_after_\@:
- .endif
+.macro STATIC_BRANCH_JMP l_yes:req key:req branch:req
+.Lstatic_branch_jmp_\@:
+ .byte 0xe9
+ .long \l_yes - .Lstatic_branch_jmp_after_\@
+.Lstatic_branch_jmp_after_\@:
.pushsection __jump_table, "aw"
_ASM_ALIGN
- _ASM_PTR .Lstatic_jump_\@, \target, \key + 1
+ .long .Lstatic_branch_jmp_\@ - ., \l_yes - .
+ _ASM_PTR \key + \branch - .
.popsection
.endm
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
index 395c9631e000..75f1e35e7c15 100644
--- a/arch/x86/include/asm/kdebug.h
+++ b/arch/x86/include/asm/kdebug.h
@@ -22,10 +22,20 @@ enum die_val {
DIE_NMIUNKNOWN,
};
+enum show_regs_mode {
+ SHOW_REGS_SHORT,
+ /*
+ * For when userspace crashed, but we don't think it's our fault, and
+ * therefore don't print kernel registers.
+ */
+ SHOW_REGS_USER,
+ SHOW_REGS_ALL
+};
+
extern void die(const char *, struct pt_regs *,long);
extern int __must_check __die(const char *, struct pt_regs *, long);
extern void show_stack_regs(struct pt_regs *regs);
-extern void __show_regs(struct pt_regs *regs, int all);
+extern void __show_regs(struct pt_regs *regs, enum show_regs_mode);
extern void show_iret_regs(struct pt_regs *regs);
extern unsigned long oops_begin(void);
extern void oops_end(unsigned long, struct pt_regs *, int signr);
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index f327236f0fa7..5125fca472bb 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -67,7 +67,7 @@ struct kimage;
/* Memory to backup during crash kdump */
#define KEXEC_BACKUP_SRC_START (0UL)
-#define KEXEC_BACKUP_SRC_END (640 * 1024UL) /* 640K */
+#define KEXEC_BACKUP_SRC_END (640 * 1024UL - 1) /* 640K */
/*
* CPU does not save ss and sp on stack if execution is already
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 0f82cd91cd3c..93c4bf598fb0 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -364,6 +364,10 @@ struct x86_emulate_ctxt {
#define X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx 0x21726574
#define X86EMUL_CPUID_VENDOR_AMDisbetterI_edx 0x74656273
+#define X86EMUL_CPUID_VENDOR_HygonGenuine_ebx 0x6f677948
+#define X86EMUL_CPUID_VENDOR_HygonGenuine_ecx 0x656e6975
+#define X86EMUL_CPUID_VENDOR_HygonGenuine_edx 0x6e65476e
+
#define X86EMUL_CPUID_VENDOR_GenuineIntel_ebx 0x756e6547
#define X86EMUL_CPUID_VENDOR_GenuineIntel_ecx 0x6c65746e
#define X86EMUL_CPUID_VENDOR_GenuineIntel_edx 0x49656e69
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 00ddb0c9e612..09b2e3e2cf1b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -869,6 +869,8 @@ struct kvm_arch {
bool x2apic_format;
bool x2apic_broadcast_quirk_disabled;
+
+ bool guest_can_read_msr_platform_info;
};
struct kvm_vm_stat {
@@ -1022,6 +1024,7 @@ struct kvm_x86_ops {
void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu);
void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr);
+ bool (*guest_apic_has_interrupt)(struct kvm_vcpu *vcpu);
void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu);
void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa);
@@ -1055,6 +1058,7 @@ struct kvm_x86_ops {
bool (*umip_emulated)(void);
int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr);
+ void (*request_immediate_exit)(struct kvm_vcpu *vcpu);
void (*sched_in)(struct kvm_vcpu *kvm, int cpu);
@@ -1237,19 +1241,12 @@ enum emulation_result {
#define EMULTYPE_NO_DECODE (1 << 0)
#define EMULTYPE_TRAP_UD (1 << 1)
#define EMULTYPE_SKIP (1 << 2)
-#define EMULTYPE_RETRY (1 << 3)
-#define EMULTYPE_NO_REEXECUTE (1 << 4)
-#define EMULTYPE_NO_UD_ON_FAIL (1 << 5)
-#define EMULTYPE_VMWARE (1 << 6)
-int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2,
- int emulation_type, void *insn, int insn_len);
-
-static inline int emulate_instruction(struct kvm_vcpu *vcpu,
- int emulation_type)
-{
- return x86_emulate_instruction(vcpu, 0,
- emulation_type | EMULTYPE_NO_REEXECUTE, NULL, 0);
-}
+#define EMULTYPE_ALLOW_RETRY (1 << 3)
+#define EMULTYPE_NO_UD_ON_FAIL (1 << 4)
+#define EMULTYPE_VMWARE (1 << 5)
+int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type);
+int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu,
+ void *insn, int insn_len);
void kvm_enable_efer_bits(u64);
bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer);
@@ -1450,7 +1447,6 @@ asmlinkage void kvm_spurious_fault(void);
____kvm_handle_fault_on_reboot(insn, "")
#define KVM_ARCH_WANT_MMU_NOTIFIER
-int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end);
int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
@@ -1463,7 +1459,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu);
int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
- unsigned long ipi_bitmap_high, int min,
+ unsigned long ipi_bitmap_high, u32 min,
unsigned long icr, int op_64_bit);
u64 kvm_get_arch_capabilities(void);
@@ -1490,6 +1486,7 @@ extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu);
int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err);
+void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu);
int kvm_is_in_guest(void);
diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h
index c91083c59845..349a47acaa4a 100644
--- a/arch/x86/include/asm/local.h
+++ b/arch/x86/include/asm/local.h
@@ -53,7 +53,7 @@ static inline void local_sub(long i, local_t *l)
*/
static inline bool local_sub_and_test(long i, local_t *l)
{
- GEN_BINARY_RMWcc(_ASM_SUB, l->a.counter, "er", i, "%0", e);
+ return GEN_BINARY_RMWcc(_ASM_SUB, l->a.counter, e, "er", i);
}
/**
@@ -66,7 +66,7 @@ static inline bool local_sub_and_test(long i, local_t *l)
*/
static inline bool local_dec_and_test(local_t *l)
{
- GEN_UNARY_RMWcc(_ASM_DEC, l->a.counter, "%0", e);
+ return GEN_UNARY_RMWcc(_ASM_DEC, l->a.counter, e);
}
/**
@@ -79,7 +79,7 @@ static inline bool local_dec_and_test(local_t *l)
*/
static inline bool local_inc_and_test(local_t *l)
{
- GEN_UNARY_RMWcc(_ASM_INC, l->a.counter, "%0", e);
+ return GEN_UNARY_RMWcc(_ASM_INC, l->a.counter, e);
}
/**
@@ -93,7 +93,7 @@ static inline bool local_inc_and_test(local_t *l)
*/
static inline bool local_add_negative(long i, local_t *l)
{
- GEN_BINARY_RMWcc(_ASM_ADD, l->a.counter, "er", i, "%0", s);
+ return GEN_BINARY_RMWcc(_ASM_ADD, l->a.counter, s, "er", i);
}
/**
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 3a17107594c8..4da9b1c58d28 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -10,41 +10,44 @@
/* MCG_CAP register defines */
#define MCG_BANKCNT_MASK 0xff /* Number of Banks */
-#define MCG_CTL_P (1ULL<<8) /* MCG_CTL register available */
-#define MCG_EXT_P (1ULL<<9) /* Extended registers available */
-#define MCG_CMCI_P (1ULL<<10) /* CMCI supported */
+#define MCG_CTL_P BIT_ULL(8) /* MCG_CTL register available */
+#define MCG_EXT_P BIT_ULL(9) /* Extended registers available */
+#define MCG_CMCI_P BIT_ULL(10) /* CMCI supported */
#define MCG_EXT_CNT_MASK 0xff0000 /* Number of Extended registers */
#define MCG_EXT_CNT_SHIFT 16
#define MCG_EXT_CNT(c) (((c) & MCG_EXT_CNT_MASK) >> MCG_EXT_CNT_SHIFT)
-#define MCG_SER_P (1ULL<<24) /* MCA recovery/new status bits */
-#define MCG_ELOG_P (1ULL<<26) /* Extended error log supported */
-#define MCG_LMCE_P (1ULL<<27) /* Local machine check supported */
+#define MCG_SER_P BIT_ULL(24) /* MCA recovery/new status bits */
+#define MCG_ELOG_P BIT_ULL(26) /* Extended error log supported */
+#define MCG_LMCE_P BIT_ULL(27) /* Local machine check supported */
/* MCG_STATUS register defines */
-#define MCG_STATUS_RIPV (1ULL<<0) /* restart ip valid */
-#define MCG_STATUS_EIPV (1ULL<<1) /* ip points to correct instruction */
-#define MCG_STATUS_MCIP (1ULL<<2) /* machine check in progress */
-#define MCG_STATUS_LMCES (1ULL<<3) /* LMCE signaled */
+#define MCG_STATUS_RIPV BIT_ULL(0) /* restart ip valid */
+#define MCG_STATUS_EIPV BIT_ULL(1) /* ip points to correct instruction */
+#define MCG_STATUS_MCIP BIT_ULL(2) /* machine check in progress */
+#define MCG_STATUS_LMCES BIT_ULL(3) /* LMCE signaled */
/* MCG_EXT_CTL register defines */
-#define MCG_EXT_CTL_LMCE_EN (1ULL<<0) /* Enable LMCE */
+#define MCG_EXT_CTL_LMCE_EN BIT_ULL(0) /* Enable LMCE */
/* MCi_STATUS register defines */
-#define MCI_STATUS_VAL (1ULL<<63) /* valid error */
-#define MCI_STATUS_OVER (1ULL<<62) /* previous errors lost */
-#define MCI_STATUS_UC (1ULL<<61) /* uncorrected error */
-#define MCI_STATUS_EN (1ULL<<60) /* error enabled */
-#define MCI_STATUS_MISCV (1ULL<<59) /* misc error reg. valid */
-#define MCI_STATUS_ADDRV (1ULL<<58) /* addr reg. valid */
-#define MCI_STATUS_PCC (1ULL<<57) /* processor context corrupt */
-#define MCI_STATUS_S (1ULL<<56) /* Signaled machine check */
-#define MCI_STATUS_AR (1ULL<<55) /* Action required */
+#define MCI_STATUS_VAL BIT_ULL(63) /* valid error */
+#define MCI_STATUS_OVER BIT_ULL(62) /* previous errors lost */
+#define MCI_STATUS_UC BIT_ULL(61) /* uncorrected error */
+#define MCI_STATUS_EN BIT_ULL(60) /* error enabled */
+#define MCI_STATUS_MISCV BIT_ULL(59) /* misc error reg. valid */
+#define MCI_STATUS_ADDRV BIT_ULL(58) /* addr reg. valid */
+#define MCI_STATUS_PCC BIT_ULL(57) /* processor context corrupt */
+#define MCI_STATUS_S BIT_ULL(56) /* Signaled machine check */
+#define MCI_STATUS_AR BIT_ULL(55) /* Action required */
+#define MCI_STATUS_CEC_SHIFT 38 /* Corrected Error Count */
+#define MCI_STATUS_CEC_MASK GENMASK_ULL(52,38)
+#define MCI_STATUS_CEC(c) (((c) & MCI_STATUS_CEC_MASK) >> MCI_STATUS_CEC_SHIFT)
/* AMD-specific bits */
-#define MCI_STATUS_TCC (1ULL<<55) /* Task context corrupt */
-#define MCI_STATUS_SYNDV (1ULL<<53) /* synd reg. valid */
-#define MCI_STATUS_DEFERRED (1ULL<<44) /* uncorrected error, deferred exception */
-#define MCI_STATUS_POISON (1ULL<<43) /* access poisonous data */
+#define MCI_STATUS_TCC BIT_ULL(55) /* Task context corrupt */
+#define MCI_STATUS_SYNDV BIT_ULL(53) /* synd reg. valid */
+#define MCI_STATUS_DEFERRED BIT_ULL(44) /* uncorrected error, deferred exception */
+#define MCI_STATUS_POISON BIT_ULL(43) /* access poisonous data */
/*
* McaX field if set indicates a given bank supports MCA extensions:
@@ -84,7 +87,7 @@
#define MCI_MISC_ADDR_GENERIC 7 /* generic */
/* CTL2 register defines */
-#define MCI_CTL2_CMCI_EN (1ULL << 30)
+#define MCI_CTL2_CMCI_EN BIT_ULL(30)
#define MCI_CTL2_CMCI_THRESHOLD_MASK 0x7fffULL
#define MCJ_CTX_MASK 3
@@ -214,6 +217,8 @@ static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { }
static inline int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr) { return -EINVAL; };
#endif
+static inline void mce_hygon_feature_init(struct cpuinfo_x86 *c) { return mce_amd_feature_init(c); }
+
int mce_available(struct cpuinfo_x86 *c);
bool mce_is_memory_error(struct mce *m);
diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
index c0643831706e..616f8e637bc3 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -48,10 +48,13 @@ int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size);
/* Architecture __weak replacement functions */
void __init mem_encrypt_init(void);
+void __init mem_encrypt_free_decrypted_mem(void);
bool sme_active(void);
bool sev_active(void);
+#define __bss_decrypted __attribute__((__section__(".bss..decrypted")))
+
#else /* !CONFIG_AMD_MEM_ENCRYPT */
#define sme_me_mask 0ULL
@@ -77,6 +80,8 @@ early_set_memory_decrypted(unsigned long vaddr, unsigned long size) { return 0;
static inline int __init
early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; }
+#define __bss_decrypted
+
#endif /* CONFIG_AMD_MEM_ENCRYPT */
/*
@@ -88,6 +93,8 @@ early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0;
#define __sme_pa(x) (__pa(x) | sme_me_mask)
#define __sme_pa_nodebug(x) (__pa_nodebug(x) | sme_me_mask)
+extern char __start_bss_decrypted[], __end_bss_decrypted[], __start_bss_decrypted_unused[];
+
#endif /* __ASSEMBLY__ */
#endif /* __X86_MEM_ENCRYPT_H__ */
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index eeeb9289c764..0ca50611e8ce 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -16,12 +16,12 @@
extern atomic64_t last_mm_ctx_id;
-#ifndef CONFIG_PARAVIRT
+#ifndef CONFIG_PARAVIRT_XXL
static inline void paravirt_activate_mm(struct mm_struct *prev,
struct mm_struct *next)
{
}
-#endif /* !CONFIG_PARAVIRT */
+#endif /* !CONFIG_PARAVIRT_XXL */
#ifdef CONFIG_PERF_EVENTS
diff --git a/arch/x86/include/asm/mpx.h b/arch/x86/include/asm/mpx.h
index 61eb4b63c5ec..d0b1434fb0b6 100644
--- a/arch/x86/include/asm/mpx.h
+++ b/arch/x86/include/asm/mpx.h
@@ -57,8 +57,14 @@
#define MPX_BNDCFG_ADDR_MASK (~((1UL<<MPX_BNDCFG_TAIL)-1))
#define MPX_BNDSTA_ERROR_CODE 0x3
+struct mpx_fault_info {
+ void __user *addr;
+ void __user *lower;
+ void __user *upper;
+};
+
#ifdef CONFIG_X86_INTEL_MPX
-siginfo_t *mpx_generate_siginfo(struct pt_regs *regs);
+int mpx_fault_info(struct mpx_fault_info *info, struct pt_regs *regs);
int mpx_handle_bd_fault(void);
static inline int kernel_managing_mpx_tables(struct mm_struct *mm)
{
@@ -78,9 +84,9 @@ void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long mpx_unmapped_area_check(unsigned long addr, unsigned long len,
unsigned long flags);
#else
-static inline siginfo_t *mpx_generate_siginfo(struct pt_regs *regs)
+static inline int mpx_fault_info(struct mpx_fault_info *info, struct pt_regs *regs)
{
- return NULL;
+ return -EINVAL;
}
static inline int mpx_handle_bd_fault(void)
{
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index f37704497d8f..0d6271cce198 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -351,6 +351,8 @@ int hyperv_flush_guest_mapping(u64 as);
#ifdef CONFIG_X86_64
void hv_apic_init(void);
+void __init hv_init_spinlocks(void);
+bool hv_vcpu_is_preempted(int vcpu);
#else
static inline void hv_apic_init(void) {}
#endif
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 4731f0cf97c5..80f4a4f38c79 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -164,6 +164,7 @@
#define DEBUGCTLMSR_BTS_OFF_OS (1UL << 9)
#define DEBUGCTLMSR_BTS_OFF_USR (1UL << 10)
#define DEBUGCTLMSR_FREEZE_LBRS_ON_PMI (1UL << 11)
+#define DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI (1UL << 12)
#define DEBUGCTLMSR_FREEZE_IN_SMM_BIT 14
#define DEBUGCTLMSR_FREEZE_IN_SMM (1UL << DEBUGCTLMSR_FREEZE_IN_SMM_BIT)
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 04addd6e0a4a..91e4cf189914 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -242,7 +242,7 @@ static inline unsigned long long native_read_pmc(int counter)
return EAX_EDX_VAL(val, low, high);
}
-#ifdef CONFIG_PARAVIRT
+#ifdef CONFIG_PARAVIRT_XXL
#include <asm/paravirt.h>
#else
#include <linux/errno.h>
@@ -305,7 +305,7 @@ do { \
#define rdpmcl(counter, val) ((val) = native_read_pmc(counter))
-#endif /* !CONFIG_PARAVIRT */
+#endif /* !CONFIG_PARAVIRT_XXL */
/*
* 64-bit version of wrmsr_safe():
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index fd2a8c1b88bc..80dc14422495 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -170,11 +170,15 @@
*/
# define CALL_NOSPEC \
ANNOTATE_NOSPEC_ALTERNATIVE \
- ALTERNATIVE( \
+ ALTERNATIVE_2( \
ANNOTATE_RETPOLINE_SAFE \
"call *%[thunk_target]\n", \
"call __x86_indirect_thunk_%V[thunk_target]\n", \
- X86_FEATURE_RETPOLINE)
+ X86_FEATURE_RETPOLINE, \
+ "lfence;\n" \
+ ANNOTATE_RETPOLINE_SAFE \
+ "call *%[thunk_target]\n", \
+ X86_FEATURE_RETPOLINE_AMD)
# define THUNK_TARGET(addr) [thunk_target] "r" (addr)
#elif defined(CONFIG_X86_32) && defined(CONFIG_RETPOLINE)
@@ -184,7 +188,8 @@
* here, anyway.
*/
# define CALL_NOSPEC \
- ALTERNATIVE( \
+ ANNOTATE_NOSPEC_ALTERNATIVE \
+ ALTERNATIVE_2( \
ANNOTATE_RETPOLINE_SAFE \
"call *%[thunk_target]\n", \
" jmp 904f;\n" \
@@ -199,7 +204,11 @@
" ret;\n" \
" .align 16\n" \
"904: call 901b;\n", \
- X86_FEATURE_RETPOLINE)
+ X86_FEATURE_RETPOLINE, \
+ "lfence;\n" \
+ ANNOTATE_RETPOLINE_SAFE \
+ "call *%[thunk_target]\n", \
+ X86_FEATURE_RETPOLINE_AMD)
# define THUNK_TARGET(addr) [thunk_target] "rm" (addr)
#else /* No retpoline for C / inline asm */
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 6afac386a434..cd0cf1c568b4 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -59,13 +59,16 @@
#endif
/*
- * Kernel image size is limited to 1GiB due to the fixmap living in the
- * next 1GiB (see level2_kernel_pgt in arch/x86/kernel/head_64.S). Use
- * 512MiB by default, leaving 1.5GiB for modules once the page tables
- * are fully set up. If kernel ASLR is configured, it can extend the
- * kernel page table mapping, reducing the size of the modules area.
+ * Maximum kernel image size is limited to 1 GiB, due to the fixmap living
+ * in the next 1 GiB (see level2_kernel_pgt in arch/x86/kernel/head_64.S).
+ *
+ * On KASLR use 1 GiB by default, leaving 1 GiB for modules once the
+ * page tables are fully set up.
+ *
+ * If KASLR is disabled we can shrink it to 0.5 GiB and increase the size
+ * of the modules area to 1.5 GiB.
*/
-#if defined(CONFIG_RANDOMIZE_BASE)
+#ifdef CONFIG_RANDOMIZE_BASE
#define KERNEL_IMAGE_SIZE (1024 * 1024 * 1024)
#else
#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024)
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index e375d4266b53..4bf42f9e4eea 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -17,16 +17,73 @@
#include <linux/cpumask.h>
#include <asm/frame.h>
+static inline unsigned long long paravirt_sched_clock(void)
+{
+ return PVOP_CALL0(unsigned long long, time.sched_clock);
+}
+
+struct static_key;
+extern struct static_key paravirt_steal_enabled;
+extern struct static_key paravirt_steal_rq_enabled;
+
+static inline u64 paravirt_steal_clock(int cpu)
+{
+ return PVOP_CALL1(u64, time.steal_clock, cpu);
+}
+
+/* The paravirtualized I/O functions */
+static inline void slow_down_io(void)
+{
+ pv_ops.cpu.io_delay();
+#ifdef REALLY_SLOW_IO
+ pv_ops.cpu.io_delay();
+ pv_ops.cpu.io_delay();
+ pv_ops.cpu.io_delay();
+#endif
+}
+
+static inline void __flush_tlb(void)
+{
+ PVOP_VCALL0(mmu.flush_tlb_user);
+}
+
+static inline void __flush_tlb_global(void)
+{
+ PVOP_VCALL0(mmu.flush_tlb_kernel);
+}
+
+static inline void __flush_tlb_one_user(unsigned long addr)
+{
+ PVOP_VCALL1(mmu.flush_tlb_one_user, addr);
+}
+
+static inline void flush_tlb_others(const struct cpumask *cpumask,
+ const struct flush_tlb_info *info)
+{
+ PVOP_VCALL2(mmu.flush_tlb_others, cpumask, info);
+}
+
+static inline void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
+{
+ PVOP_VCALL2(mmu.tlb_remove_table, tlb, table);
+}
+
+static inline void paravirt_arch_exit_mmap(struct mm_struct *mm)
+{
+ PVOP_VCALL1(mmu.exit_mmap, mm);
+}
+
+#ifdef CONFIG_PARAVIRT_XXL
static inline void load_sp0(unsigned long sp0)
{
- PVOP_VCALL1(pv_cpu_ops.load_sp0, sp0);
+ PVOP_VCALL1(cpu.load_sp0, sp0);
}
/* The paravirtualized CPUID instruction. */
static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
unsigned int *ecx, unsigned int *edx)
{
- PVOP_VCALL4(pv_cpu_ops.cpuid, eax, ebx, ecx, edx);
+ PVOP_VCALL4(cpu.cpuid, eax, ebx, ecx, edx);
}
/*
@@ -34,98 +91,98 @@ static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
*/
static inline unsigned long paravirt_get_debugreg(int reg)
{
- return PVOP_CALL1(unsigned long, pv_cpu_ops.get_debugreg, reg);
+ return PVOP_CALL1(unsigned long, cpu.get_debugreg, reg);
}
#define get_debugreg(var, reg) var = paravirt_get_debugreg(reg)
static inline void set_debugreg(unsigned long val, int reg)
{
- PVOP_VCALL2(pv_cpu_ops.set_debugreg, reg, val);
+ PVOP_VCALL2(cpu.set_debugreg, reg, val);
}
static inline unsigned long read_cr0(void)
{
- return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr0);
+ return PVOP_CALL0(unsigned long, cpu.read_cr0);
}
static inline void write_cr0(unsigned long x)
{
- PVOP_VCALL1(pv_cpu_ops.write_cr0, x);
+ PVOP_VCALL1(cpu.write_cr0, x);
}
static inline unsigned long read_cr2(void)
{
- return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr2);
+ return PVOP_CALL0(unsigned long, mmu.read_cr2);
}
static inline void write_cr2(unsigned long x)
{
- PVOP_VCALL1(pv_mmu_ops.write_cr2, x);
+ PVOP_VCALL1(mmu.write_cr2, x);
}
static inline unsigned long __read_cr3(void)
{
- return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr3);
+ return PVOP_CALL0(unsigned long, mmu.read_cr3);
}
static inline void write_cr3(unsigned long x)
{
- PVOP_VCALL1(pv_mmu_ops.write_cr3, x);
+ PVOP_VCALL1(mmu.write_cr3, x);
}
static inline void __write_cr4(unsigned long x)
{
- PVOP_VCALL1(pv_cpu_ops.write_cr4, x);
+ PVOP_VCALL1(cpu.write_cr4, x);
}
#ifdef CONFIG_X86_64
static inline unsigned long read_cr8(void)
{
- return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr8);
+ return PVOP_CALL0(unsigned long, cpu.read_cr8);
}
static inline void write_cr8(unsigned long x)
{
- PVOP_VCALL1(pv_cpu_ops.write_cr8, x);
+ PVOP_VCALL1(cpu.write_cr8, x);
}
#endif
static inline void arch_safe_halt(void)
{
- PVOP_VCALL0(pv_irq_ops.safe_halt);
+ PVOP_VCALL0(irq.safe_halt);
}
static inline void halt(void)
{
- PVOP_VCALL0(pv_irq_ops.halt);
+ PVOP_VCALL0(irq.halt);
}
static inline void wbinvd(void)
{
- PVOP_VCALL0(pv_cpu_ops.wbinvd);
+ PVOP_VCALL0(cpu.wbinvd);
}
#define get_kernel_rpl() (pv_info.kernel_rpl)
static inline u64 paravirt_read_msr(unsigned msr)
{
- return PVOP_CALL1(u64, pv_cpu_ops.read_msr, msr);
+ return PVOP_CALL1(u64, cpu.read_msr, msr);
}
static inline void paravirt_write_msr(unsigned msr,
unsigned low, unsigned high)
{
- PVOP_VCALL3(pv_cpu_ops.write_msr, msr, low, high);
+ PVOP_VCALL3(cpu.write_msr, msr, low, high);
}
static inline u64 paravirt_read_msr_safe(unsigned msr, int *err)
{
- return PVOP_CALL2(u64, pv_cpu_ops.read_msr_safe, msr, err);
+ return PVOP_CALL2(u64, cpu.read_msr_safe, msr, err);
}
static inline int paravirt_write_msr_safe(unsigned msr,
unsigned low, unsigned high)
{
- return PVOP_CALL3(int, pv_cpu_ops.write_msr_safe, msr, low, high);
+ return PVOP_CALL3(int, cpu.write_msr_safe, msr, low, high);
}
#define rdmsr(msr, val1, val2) \
@@ -170,23 +227,9 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
return err;
}
-static inline unsigned long long paravirt_sched_clock(void)
-{
- return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock);
-}
-
-struct static_key;
-extern struct static_key paravirt_steal_enabled;
-extern struct static_key paravirt_steal_rq_enabled;
-
-static inline u64 paravirt_steal_clock(int cpu)
-{
- return PVOP_CALL1(u64, pv_time_ops.steal_clock, cpu);
-}
-
static inline unsigned long long paravirt_read_pmc(int counter)
{
- return PVOP_CALL1(u64, pv_cpu_ops.read_pmc, counter);
+ return PVOP_CALL1(u64, cpu.read_pmc, counter);
}
#define rdpmc(counter, low, high) \
@@ -200,166 +243,127 @@ do { \
static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries)
{
- PVOP_VCALL2(pv_cpu_ops.alloc_ldt, ldt, entries);
+ PVOP_VCALL2(cpu.alloc_ldt, ldt, entries);
}
static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries)
{
- PVOP_VCALL2(pv_cpu_ops.free_ldt, ldt, entries);
+ PVOP_VCALL2(cpu.free_ldt, ldt, entries);
}
static inline void load_TR_desc(void)
{
- PVOP_VCALL0(pv_cpu_ops.load_tr_desc);
+ PVOP_VCALL0(cpu.load_tr_desc);
}
static inline void load_gdt(const struct desc_ptr *dtr)
{
- PVOP_VCALL1(pv_cpu_ops.load_gdt, dtr);
+ PVOP_VCALL1(cpu.load_gdt, dtr);
}
static inline void load_idt(const struct desc_ptr *dtr)
{
- PVOP_VCALL1(pv_cpu_ops.load_idt, dtr);
+ PVOP_VCALL1(cpu.load_idt, dtr);
}
static inline void set_ldt(const void *addr, unsigned entries)
{
- PVOP_VCALL2(pv_cpu_ops.set_ldt, addr, entries);
+ PVOP_VCALL2(cpu.set_ldt, addr, entries);
}
static inline unsigned long paravirt_store_tr(void)
{
- return PVOP_CALL0(unsigned long, pv_cpu_ops.store_tr);
+ return PVOP_CALL0(unsigned long, cpu.store_tr);
}
+
#define store_tr(tr) ((tr) = paravirt_store_tr())
static inline void load_TLS(struct thread_struct *t, unsigned cpu)
{
- PVOP_VCALL2(pv_cpu_ops.load_tls, t, cpu);
+ PVOP_VCALL2(cpu.load_tls, t, cpu);
}
#ifdef CONFIG_X86_64
static inline void load_gs_index(unsigned int gs)
{
- PVOP_VCALL1(pv_cpu_ops.load_gs_index, gs);
+ PVOP_VCALL1(cpu.load_gs_index, gs);
}
#endif
static inline void write_ldt_entry(struct desc_struct *dt, int entry,
const void *desc)
{
- PVOP_VCALL3(pv_cpu_ops.write_ldt_entry, dt, entry, desc);
+ PVOP_VCALL3(cpu.write_ldt_entry, dt, entry, desc);
}
static inline void write_gdt_entry(struct desc_struct *dt, int entry,
void *desc, int type)
{
- PVOP_VCALL4(pv_cpu_ops.write_gdt_entry, dt, entry, desc, type);
+ PVOP_VCALL4(cpu.write_gdt_entry, dt, entry, desc, type);
}
static inline void write_idt_entry(gate_desc *dt, int entry, const gate_desc *g)
{
- PVOP_VCALL3(pv_cpu_ops.write_idt_entry, dt, entry, g);
+ PVOP_VCALL3(cpu.write_idt_entry, dt, entry, g);
}
static inline void set_iopl_mask(unsigned mask)
{
- PVOP_VCALL1(pv_cpu_ops.set_iopl_mask, mask);
-}
-
-/* The paravirtualized I/O functions */
-static inline void slow_down_io(void)
-{
- pv_cpu_ops.io_delay();
-#ifdef REALLY_SLOW_IO
- pv_cpu_ops.io_delay();
- pv_cpu_ops.io_delay();
- pv_cpu_ops.io_delay();
-#endif
+ PVOP_VCALL1(cpu.set_iopl_mask, mask);
}
static inline void paravirt_activate_mm(struct mm_struct *prev,
struct mm_struct *next)
{
- PVOP_VCALL2(pv_mmu_ops.activate_mm, prev, next);
+ PVOP_VCALL2(mmu.activate_mm, prev, next);
}
static inline void paravirt_arch_dup_mmap(struct mm_struct *oldmm,
struct mm_struct *mm)
{
- PVOP_VCALL2(pv_mmu_ops.dup_mmap, oldmm, mm);
-}
-
-static inline void paravirt_arch_exit_mmap(struct mm_struct *mm)
-{
- PVOP_VCALL1(pv_mmu_ops.exit_mmap, mm);
-}
-
-static inline void __flush_tlb(void)
-{
- PVOP_VCALL0(pv_mmu_ops.flush_tlb_user);
-}
-static inline void __flush_tlb_global(void)
-{
- PVOP_VCALL0(pv_mmu_ops.flush_tlb_kernel);
-}
-static inline void __flush_tlb_one_user(unsigned long addr)
-{
- PVOP_VCALL1(pv_mmu_ops.flush_tlb_one_user, addr);
-}
-
-static inline void flush_tlb_others(const struct cpumask *cpumask,
- const struct flush_tlb_info *info)
-{
- PVOP_VCALL2(pv_mmu_ops.flush_tlb_others, cpumask, info);
-}
-
-static inline void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
-{
- PVOP_VCALL2(pv_mmu_ops.tlb_remove_table, tlb, table);
+ PVOP_VCALL2(mmu.dup_mmap, oldmm, mm);
}
static inline int paravirt_pgd_alloc(struct mm_struct *mm)
{
- return PVOP_CALL1(int, pv_mmu_ops.pgd_alloc, mm);
+ return PVOP_CALL1(int, mmu.pgd_alloc, mm);
}
static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
- PVOP_VCALL2(pv_mmu_ops.pgd_free, mm, pgd);
+ PVOP_VCALL2(mmu.pgd_free, mm, pgd);
}
static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn)
{
- PVOP_VCALL2(pv_mmu_ops.alloc_pte, mm, pfn);
+ PVOP_VCALL2(mmu.alloc_pte, mm, pfn);
}
static inline void paravirt_release_pte(unsigned long pfn)
{
- PVOP_VCALL1(pv_mmu_ops.release_pte, pfn);
+ PVOP_VCALL1(mmu.release_pte, pfn);
}
static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
{
- PVOP_VCALL2(pv_mmu_ops.alloc_pmd, mm, pfn);
+ PVOP_VCALL2(mmu.alloc_pmd, mm, pfn);
}
static inline void paravirt_release_pmd(unsigned long pfn)
{
- PVOP_VCALL1(pv_mmu_ops.release_pmd, pfn);
+ PVOP_VCALL1(mmu.release_pmd, pfn);
}
static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn)
{
- PVOP_VCALL2(pv_mmu_ops.alloc_pud, mm, pfn);
+ PVOP_VCALL2(mmu.alloc_pud, mm, pfn);
}
static inline void paravirt_release_pud(unsigned long pfn)
{
- PVOP_VCALL1(pv_mmu_ops.release_pud, pfn);
+ PVOP_VCALL1(mmu.release_pud, pfn);
}
static inline void paravirt_alloc_p4d(struct mm_struct *mm, unsigned long pfn)
{
- PVOP_VCALL2(pv_mmu_ops.alloc_p4d, mm, pfn);
+ PVOP_VCALL2(mmu.alloc_p4d, mm, pfn);
}
static inline void paravirt_release_p4d(unsigned long pfn)
{
- PVOP_VCALL1(pv_mmu_ops.release_p4d, pfn);
+ PVOP_VCALL1(mmu.release_p4d, pfn);
}
static inline pte_t __pte(pteval_t val)
@@ -367,13 +371,9 @@ static inline pte_t __pte(pteval_t val)
pteval_t ret;
if (sizeof(pteval_t) > sizeof(long))
- ret = PVOP_CALLEE2(pteval_t,
- pv_mmu_ops.make_pte,
- val, (u64)val >> 32);
+ ret = PVOP_CALLEE2(pteval_t, mmu.make_pte, val, (u64)val >> 32);
else
- ret = PVOP_CALLEE1(pteval_t,
- pv_mmu_ops.make_pte,
- val);
+ ret = PVOP_CALLEE1(pteval_t, mmu.make_pte, val);
return (pte_t) { .pte = ret };
}
@@ -383,11 +383,10 @@ static inline pteval_t pte_val(pte_t pte)
pteval_t ret;
if (sizeof(pteval_t) > sizeof(long))
- ret = PVOP_CALLEE2(pteval_t, pv_mmu_ops.pte_val,
+ ret = PVOP_CALLEE2(pteval_t, mmu.pte_val,
pte.pte, (u64)pte.pte >> 32);
else
- ret = PVOP_CALLEE1(pteval_t, pv_mmu_ops.pte_val,
- pte.pte);
+ ret = PVOP_CALLEE1(pteval_t, mmu.pte_val, pte.pte);
return ret;
}
@@ -397,11 +396,9 @@ static inline pgd_t __pgd(pgdval_t val)
pgdval_t ret;
if (sizeof(pgdval_t) > sizeof(long))
- ret = PVOP_CALLEE2(pgdval_t, pv_mmu_ops.make_pgd,
- val, (u64)val >> 32);
+ ret = PVOP_CALLEE2(pgdval_t, mmu.make_pgd, val, (u64)val >> 32);
else
- ret = PVOP_CALLEE1(pgdval_t, pv_mmu_ops.make_pgd,
- val);
+ ret = PVOP_CALLEE1(pgdval_t, mmu.make_pgd, val);
return (pgd_t) { ret };
}
@@ -411,11 +408,10 @@ static inline pgdval_t pgd_val(pgd_t pgd)
pgdval_t ret;
if (sizeof(pgdval_t) > sizeof(long))
- ret = PVOP_CALLEE2(pgdval_t, pv_mmu_ops.pgd_val,
+ ret = PVOP_CALLEE2(pgdval_t, mmu.pgd_val,
pgd.pgd, (u64)pgd.pgd >> 32);
else
- ret = PVOP_CALLEE1(pgdval_t, pv_mmu_ops.pgd_val,
- pgd.pgd);
+ ret = PVOP_CALLEE1(pgdval_t, mmu.pgd_val, pgd.pgd);
return ret;
}
@@ -426,8 +422,7 @@ static inline pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long a
{
pteval_t ret;
- ret = PVOP_CALL3(pteval_t, pv_mmu_ops.ptep_modify_prot_start,
- mm, addr, ptep);
+ ret = PVOP_CALL3(pteval_t, mmu.ptep_modify_prot_start, mm, addr, ptep);
return (pte_t) { .pte = ret };
}
@@ -437,20 +432,18 @@ static inline void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long a
{
if (sizeof(pteval_t) > sizeof(long))
/* 5 arg words */
- pv_mmu_ops.ptep_modify_prot_commit(mm, addr, ptep, pte);
+ pv_ops.mmu.ptep_modify_prot_commit(mm, addr, ptep, pte);
else
- PVOP_VCALL4(pv_mmu_ops.ptep_modify_prot_commit,
+ PVOP_VCALL4(mmu.ptep_modify_prot_commit,
mm, addr, ptep, pte.pte);
}
static inline void set_pte(pte_t *ptep, pte_t pte)
{
if (sizeof(pteval_t) > sizeof(long))
- PVOP_VCALL3(pv_mmu_ops.set_pte, ptep,
- pte.pte, (u64)pte.pte >> 32);
+ PVOP_VCALL3(mmu.set_pte, ptep, pte.pte, (u64)pte.pte >> 32);
else
- PVOP_VCALL2(pv_mmu_ops.set_pte, ptep,
- pte.pte);
+ PVOP_VCALL2(mmu.set_pte, ptep, pte.pte);
}
static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
@@ -458,9 +451,9 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
{
if (sizeof(pteval_t) > sizeof(long))
/* 5 arg words */
- pv_mmu_ops.set_pte_at(mm, addr, ptep, pte);
+ pv_ops.mmu.set_pte_at(mm, addr, ptep, pte);
else
- PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pte.pte);
+ PVOP_VCALL4(mmu.set_pte_at, mm, addr, ptep, pte.pte);
}
static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
@@ -468,9 +461,9 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
pmdval_t val = native_pmd_val(pmd);
if (sizeof(pmdval_t) > sizeof(long))
- PVOP_VCALL3(pv_mmu_ops.set_pmd, pmdp, val, (u64)val >> 32);
+ PVOP_VCALL3(mmu.set_pmd, pmdp, val, (u64)val >> 32);
else
- PVOP_VCALL2(pv_mmu_ops.set_pmd, pmdp, val);
+ PVOP_VCALL2(mmu.set_pmd, pmdp, val);
}
#if CONFIG_PGTABLE_LEVELS >= 3
@@ -479,11 +472,9 @@ static inline pmd_t __pmd(pmdval_t val)
pmdval_t ret;
if (sizeof(pmdval_t) > sizeof(long))
- ret = PVOP_CALLEE2(pmdval_t, pv_mmu_ops.make_pmd,
- val, (u64)val >> 32);
+ ret = PVOP_CALLEE2(pmdval_t, mmu.make_pmd, val, (u64)val >> 32);
else
- ret = PVOP_CALLEE1(pmdval_t, pv_mmu_ops.make_pmd,
- val);
+ ret = PVOP_CALLEE1(pmdval_t, mmu.make_pmd, val);
return (pmd_t) { ret };
}
@@ -493,11 +484,10 @@ static inline pmdval_t pmd_val(pmd_t pmd)
pmdval_t ret;
if (sizeof(pmdval_t) > sizeof(long))
- ret = PVOP_CALLEE2(pmdval_t, pv_mmu_ops.pmd_val,
+ ret = PVOP_CALLEE2(pmdval_t, mmu.pmd_val,
pmd.pmd, (u64)pmd.pmd >> 32);
else
- ret = PVOP_CALLEE1(pmdval_t, pv_mmu_ops.pmd_val,
- pmd.pmd);
+ ret = PVOP_CALLEE1(pmdval_t, mmu.pmd_val, pmd.pmd);
return ret;
}
@@ -507,39 +497,23 @@ static inline void set_pud(pud_t *pudp, pud_t pud)
pudval_t val = native_pud_val(pud);
if (sizeof(pudval_t) > sizeof(long))
- PVOP_VCALL3(pv_mmu_ops.set_pud, pudp,
- val, (u64)val >> 32);
+ PVOP_VCALL3(mmu.set_pud, pudp, val, (u64)val >> 32);
else
- PVOP_VCALL2(pv_mmu_ops.set_pud, pudp,
- val);
+ PVOP_VCALL2(mmu.set_pud, pudp, val);
}
#if CONFIG_PGTABLE_LEVELS >= 4
static inline pud_t __pud(pudval_t val)
{
pudval_t ret;
- if (sizeof(pudval_t) > sizeof(long))
- ret = PVOP_CALLEE2(pudval_t, pv_mmu_ops.make_pud,
- val, (u64)val >> 32);
- else
- ret = PVOP_CALLEE1(pudval_t, pv_mmu_ops.make_pud,
- val);
+ ret = PVOP_CALLEE1(pudval_t, mmu.make_pud, val);
return (pud_t) { ret };
}
static inline pudval_t pud_val(pud_t pud)
{
- pudval_t ret;
-
- if (sizeof(pudval_t) > sizeof(long))
- ret = PVOP_CALLEE2(pudval_t, pv_mmu_ops.pud_val,
- pud.pud, (u64)pud.pud >> 32);
- else
- ret = PVOP_CALLEE1(pudval_t, pv_mmu_ops.pud_val,
- pud.pud);
-
- return ret;
+ return PVOP_CALLEE1(pudval_t, mmu.pud_val, pud.pud);
}
static inline void pud_clear(pud_t *pudp)
@@ -551,31 +525,26 @@ static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
{
p4dval_t val = native_p4d_val(p4d);
- if (sizeof(p4dval_t) > sizeof(long))
- PVOP_VCALL3(pv_mmu_ops.set_p4d, p4dp,
- val, (u64)val >> 32);
- else
- PVOP_VCALL2(pv_mmu_ops.set_p4d, p4dp,
- val);
+ PVOP_VCALL2(mmu.set_p4d, p4dp, val);
}
#if CONFIG_PGTABLE_LEVELS >= 5
static inline p4d_t __p4d(p4dval_t val)
{
- p4dval_t ret = PVOP_CALLEE1(p4dval_t, pv_mmu_ops.make_p4d, val);
+ p4dval_t ret = PVOP_CALLEE1(p4dval_t, mmu.make_p4d, val);
return (p4d_t) { ret };
}
static inline p4dval_t p4d_val(p4d_t p4d)
{
- return PVOP_CALLEE1(p4dval_t, pv_mmu_ops.p4d_val, p4d.p4d);
+ return PVOP_CALLEE1(p4dval_t, mmu.p4d_val, p4d.p4d);
}
static inline void __set_pgd(pgd_t *pgdp, pgd_t pgd)
{
- PVOP_VCALL2(pv_mmu_ops.set_pgd, pgdp, native_pgd_val(pgd));
+ PVOP_VCALL2(mmu.set_pgd, pgdp, native_pgd_val(pgd));
}
#define set_pgd(pgdp, pgdval) do { \
@@ -606,19 +575,18 @@ static inline void p4d_clear(p4d_t *p4dp)
64-bit pte atomically */
static inline void set_pte_atomic(pte_t *ptep, pte_t pte)
{
- PVOP_VCALL3(pv_mmu_ops.set_pte_atomic, ptep,
- pte.pte, pte.pte >> 32);
+ PVOP_VCALL3(mmu.set_pte_atomic, ptep, pte.pte, pte.pte >> 32);
}
static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
{
- PVOP_VCALL3(pv_mmu_ops.pte_clear, mm, addr, ptep);
+ PVOP_VCALL3(mmu.pte_clear, mm, addr, ptep);
}
static inline void pmd_clear(pmd_t *pmdp)
{
- PVOP_VCALL1(pv_mmu_ops.pmd_clear, pmdp);
+ PVOP_VCALL1(mmu.pmd_clear, pmdp);
}
#else /* !CONFIG_X86_PAE */
static inline void set_pte_atomic(pte_t *ptep, pte_t pte)
@@ -641,64 +609,68 @@ static inline void pmd_clear(pmd_t *pmdp)
#define __HAVE_ARCH_START_CONTEXT_SWITCH
static inline void arch_start_context_switch(struct task_struct *prev)
{
- PVOP_VCALL1(pv_cpu_ops.start_context_switch, prev);
+ PVOP_VCALL1(cpu.start_context_switch, prev);
}
static inline void arch_end_context_switch(struct task_struct *next)
{
- PVOP_VCALL1(pv_cpu_ops.end_context_switch, next);
+ PVOP_VCALL1(cpu.end_context_switch, next);
}
#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE
static inline void arch_enter_lazy_mmu_mode(void)
{
- PVOP_VCALL0(pv_mmu_ops.lazy_mode.enter);
+ PVOP_VCALL0(mmu.lazy_mode.enter);
}
static inline void arch_leave_lazy_mmu_mode(void)
{
- PVOP_VCALL0(pv_mmu_ops.lazy_mode.leave);
+ PVOP_VCALL0(mmu.lazy_mode.leave);
}
static inline void arch_flush_lazy_mmu_mode(void)
{
- PVOP_VCALL0(pv_mmu_ops.lazy_mode.flush);
+ PVOP_VCALL0(mmu.lazy_mode.flush);
}
static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx,
phys_addr_t phys, pgprot_t flags)
{
- pv_mmu_ops.set_fixmap(idx, phys, flags);
+ pv_ops.mmu.set_fixmap(idx, phys, flags);
}
+#endif
#if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS)
static __always_inline void pv_queued_spin_lock_slowpath(struct qspinlock *lock,
u32 val)
{
- PVOP_VCALL2(pv_lock_ops.queued_spin_lock_slowpath, lock, val);
+ PVOP_VCALL2(lock.queued_spin_lock_slowpath, lock, val);
}
static __always_inline void pv_queued_spin_unlock(struct qspinlock *lock)
{
- PVOP_VCALLEE1(pv_lock_ops.queued_spin_unlock, lock);
+ PVOP_VCALLEE1(lock.queued_spin_unlock, lock);
}
static __always_inline void pv_wait(u8 *ptr, u8 val)
{
- PVOP_VCALL2(pv_lock_ops.wait, ptr, val);
+ PVOP_VCALL2(lock.wait, ptr, val);
}
static __always_inline void pv_kick(int cpu)
{
- PVOP_VCALL1(pv_lock_ops.kick, cpu);
+ PVOP_VCALL1(lock.kick, cpu);
}
static __always_inline bool pv_vcpu_is_preempted(long cpu)
{
- return PVOP_CALLEE1(bool, pv_lock_ops.vcpu_is_preempted, cpu);
+ return PVOP_CALLEE1(bool, lock.vcpu_is_preempted, cpu);
}
+void __raw_callee_save___native_queued_spin_unlock(struct qspinlock *lock);
+bool __raw_callee_save___native_vcpu_is_preempted(long cpu);
+
#endif /* SMP && PARAVIRT_SPINLOCKS */
#ifdef CONFIG_X86_32
@@ -778,24 +750,25 @@ static __always_inline bool pv_vcpu_is_preempted(long cpu)
#define __PV_IS_CALLEE_SAVE(func) \
((struct paravirt_callee_save) { func })
+#ifdef CONFIG_PARAVIRT_XXL
static inline notrace unsigned long arch_local_save_flags(void)
{
- return PVOP_CALLEE0(unsigned long, pv_irq_ops.save_fl);
+ return PVOP_CALLEE0(unsigned long, irq.save_fl);
}
static inline notrace void arch_local_irq_restore(unsigned long f)
{
- PVOP_VCALLEE1(pv_irq_ops.restore_fl, f);
+ PVOP_VCALLEE1(irq.restore_fl, f);
}
static inline notrace void arch_local_irq_disable(void)
{
- PVOP_VCALLEE0(pv_irq_ops.irq_disable);
+ PVOP_VCALLEE0(irq.irq_disable);
}
static inline notrace void arch_local_irq_enable(void)
{
- PVOP_VCALLEE0(pv_irq_ops.irq_enable);
+ PVOP_VCALLEE0(irq.irq_enable);
}
static inline notrace unsigned long arch_local_irq_save(void)
@@ -806,6 +779,7 @@ static inline notrace unsigned long arch_local_irq_save(void)
arch_local_irq_disable();
return f;
}
+#endif
/* Make sure as little as possible of this mess escapes. */
@@ -827,7 +801,7 @@ extern void default_banner(void);
#else /* __ASSEMBLY__ */
-#define _PVSITE(ptype, clobbers, ops, word, algn) \
+#define _PVSITE(ptype, ops, word, algn) \
771:; \
ops; \
772:; \
@@ -836,7 +810,6 @@ extern void default_banner(void);
word 771b; \
.byte ptype; \
.byte 772b-771b; \
- .short clobbers; \
.popsection
@@ -868,8 +841,8 @@ extern void default_banner(void);
COND_POP(set, CLBR_RCX, rcx); \
COND_POP(set, CLBR_RAX, rax)
-#define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 8)
-#define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .quad, 8)
+#define PARA_PATCH(off) ((off) / 8)
+#define PARA_SITE(ptype, ops) _PVSITE(ptype, ops, .quad, 8)
#define PARA_INDIRECT(addr) *addr(%rip)
#else
#define PV_SAVE_REGS(set) \
@@ -883,46 +856,41 @@ extern void default_banner(void);
COND_POP(set, CLBR_EDI, edi); \
COND_POP(set, CLBR_EAX, eax)
-#define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 4)
-#define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .long, 4)
+#define PARA_PATCH(off) ((off) / 4)
+#define PARA_SITE(ptype, ops) _PVSITE(ptype, ops, .long, 4)
#define PARA_INDIRECT(addr) *%cs:addr
#endif
+#ifdef CONFIG_PARAVIRT_XXL
#define INTERRUPT_RETURN \
- PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_iret), CLBR_NONE, \
- ANNOTATE_RETPOLINE_SAFE; \
- jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_iret);)
+ PARA_SITE(PARA_PATCH(PV_CPU_iret), \
+ ANNOTATE_RETPOLINE_SAFE; \
+ jmp PARA_INDIRECT(pv_ops+PV_CPU_iret);)
#define DISABLE_INTERRUPTS(clobbers) \
- PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_disable), clobbers, \
+ PARA_SITE(PARA_PATCH(PV_IRQ_irq_disable), \
PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \
- ANNOTATE_RETPOLINE_SAFE; \
- call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_disable); \
+ ANNOTATE_RETPOLINE_SAFE; \
+ call PARA_INDIRECT(pv_ops+PV_IRQ_irq_disable); \
PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
#define ENABLE_INTERRUPTS(clobbers) \
- PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_enable), clobbers, \
+ PARA_SITE(PARA_PATCH(PV_IRQ_irq_enable), \
PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \
- ANNOTATE_RETPOLINE_SAFE; \
- call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_enable); \
+ ANNOTATE_RETPOLINE_SAFE; \
+ call PARA_INDIRECT(pv_ops+PV_IRQ_irq_enable); \
PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
+#endif
-#ifdef CONFIG_X86_32
-#define GET_CR0_INTO_EAX \
- push %ecx; push %edx; \
- ANNOTATE_RETPOLINE_SAFE; \
- call PARA_INDIRECT(pv_cpu_ops+PV_CPU_read_cr0); \
- pop %edx; pop %ecx
-#else /* !CONFIG_X86_32 */
-
+#ifdef CONFIG_X86_64
+#ifdef CONFIG_PARAVIRT_XXL
/*
* If swapgs is used while the userspace stack is still current,
* there's no way to call a pvop. The PV replacement *must* be
* inlined, or the swapgs instruction must be trapped and emulated.
*/
#define SWAPGS_UNSAFE_STACK \
- PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE, \
- swapgs)
+ PARA_SITE(PARA_PATCH(PV_CPU_swapgs), swapgs)
/*
* Note: swapgs is very special, and in practise is either going to be
@@ -931,44 +899,51 @@ extern void default_banner(void);
* it.
*/
#define SWAPGS \
- PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE, \
- ANNOTATE_RETPOLINE_SAFE; \
- call PARA_INDIRECT(pv_cpu_ops+PV_CPU_swapgs); \
+ PARA_SITE(PARA_PATCH(PV_CPU_swapgs), \
+ ANNOTATE_RETPOLINE_SAFE; \
+ call PARA_INDIRECT(pv_ops+PV_CPU_swapgs); \
)
+#endif
#define GET_CR2_INTO_RAX \
ANNOTATE_RETPOLINE_SAFE; \
- call PARA_INDIRECT(pv_mmu_ops+PV_MMU_read_cr2);
+ call PARA_INDIRECT(pv_ops+PV_MMU_read_cr2);
+#ifdef CONFIG_PARAVIRT_XXL
#define USERGS_SYSRET64 \
- PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \
- CLBR_NONE, \
- ANNOTATE_RETPOLINE_SAFE; \
- jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64);)
+ PARA_SITE(PARA_PATCH(PV_CPU_usergs_sysret64), \
+ ANNOTATE_RETPOLINE_SAFE; \
+ jmp PARA_INDIRECT(pv_ops+PV_CPU_usergs_sysret64);)
#ifdef CONFIG_DEBUG_ENTRY
#define SAVE_FLAGS(clobbers) \
- PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_save_fl), clobbers, \
+ PARA_SITE(PARA_PATCH(PV_IRQ_save_fl), \
PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \
- ANNOTATE_RETPOLINE_SAFE; \
- call PARA_INDIRECT(pv_irq_ops+PV_IRQ_save_fl); \
+ ANNOTATE_RETPOLINE_SAFE; \
+ call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl); \
PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
#endif
+#endif
#endif /* CONFIG_X86_32 */
#endif /* __ASSEMBLY__ */
#else /* CONFIG_PARAVIRT */
# define default_banner x86_init_noop
+#endif /* !CONFIG_PARAVIRT */
+
#ifndef __ASSEMBLY__
+#ifndef CONFIG_PARAVIRT_XXL
static inline void paravirt_arch_dup_mmap(struct mm_struct *oldmm,
struct mm_struct *mm)
{
}
+#endif
+#ifndef CONFIG_PARAVIRT
static inline void paravirt_arch_exit_mmap(struct mm_struct *mm)
{
}
+#endif
#endif /* __ASSEMBLY__ */
-#endif /* !CONFIG_PARAVIRT */
#endif /* _ASM_X86_PARAVIRT_H */
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 4b75acc23b30..fba54ca23b2a 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -66,12 +66,14 @@ struct paravirt_callee_save {
/* general info */
struct pv_info {
+#ifdef CONFIG_PARAVIRT_XXL
unsigned int kernel_rpl;
int shared_kernel_pmd;
#ifdef CONFIG_X86_64
u16 extra_user_64bit_cs; /* __USER_CS if none */
#endif
+#endif
const char *name;
};
@@ -85,17 +87,18 @@ struct pv_init_ops {
* the number of bytes of code generated, as we nop pad the
* rest in generic code.
*/
- unsigned (*patch)(u8 type, u16 clobber, void *insnbuf,
+ unsigned (*patch)(u8 type, void *insnbuf,
unsigned long addr, unsigned len);
} __no_randomize_layout;
-
+#ifdef CONFIG_PARAVIRT_XXL
struct pv_lazy_ops {
/* Set deferred update mode, used for batching operations. */
void (*enter)(void);
void (*leave)(void);
void (*flush)(void);
} __no_randomize_layout;
+#endif
struct pv_time_ops {
unsigned long long (*sched_clock)(void);
@@ -104,6 +107,9 @@ struct pv_time_ops {
struct pv_cpu_ops {
/* hooks for various privileged instructions */
+ void (*io_delay)(void);
+
+#ifdef CONFIG_PARAVIRT_XXL
unsigned long (*get_debugreg)(int regno);
void (*set_debugreg)(int regno, unsigned long value);
@@ -141,7 +147,6 @@ struct pv_cpu_ops {
void (*set_iopl_mask)(unsigned mask);
void (*wbinvd)(void);
- void (*io_delay)(void);
/* cpuid emulation, mostly so that caps bits can be disabled */
void (*cpuid)(unsigned int *eax, unsigned int *ebx,
@@ -176,9 +181,11 @@ struct pv_cpu_ops {
void (*start_context_switch)(struct task_struct *prev);
void (*end_context_switch)(struct task_struct *next);
+#endif
} __no_randomize_layout;
struct pv_irq_ops {
+#ifdef CONFIG_PARAVIRT_XXL
/*
* Get/set interrupt state. save_fl and restore_fl are only
* expected to use X86_EFLAGS_IF; all other bits
@@ -195,35 +202,34 @@ struct pv_irq_ops {
void (*safe_halt)(void);
void (*halt)(void);
-
+#endif
} __no_randomize_layout;
struct pv_mmu_ops {
+ /* TLB operations */
+ void (*flush_tlb_user)(void);
+ void (*flush_tlb_kernel)(void);
+ void (*flush_tlb_one_user)(unsigned long addr);
+ void (*flush_tlb_others)(const struct cpumask *cpus,
+ const struct flush_tlb_info *info);
+
+ void (*tlb_remove_table)(struct mmu_gather *tlb, void *table);
+
+ /* Hook for intercepting the destruction of an mm_struct. */
+ void (*exit_mmap)(struct mm_struct *mm);
+
+#ifdef CONFIG_PARAVIRT_XXL
unsigned long (*read_cr2)(void);
void (*write_cr2)(unsigned long);
unsigned long (*read_cr3)(void);
void (*write_cr3)(unsigned long);
- /*
- * Hooks for intercepting the creation/use/destruction of an
- * mm_struct.
- */
+ /* Hooks for intercepting the creation/use of an mm_struct. */
void (*activate_mm)(struct mm_struct *prev,
struct mm_struct *next);
void (*dup_mmap)(struct mm_struct *oldmm,
struct mm_struct *mm);
- void (*exit_mmap)(struct mm_struct *mm);
-
-
- /* TLB operations */
- void (*flush_tlb_user)(void);
- void (*flush_tlb_kernel)(void);
- void (*flush_tlb_one_user)(unsigned long addr);
- void (*flush_tlb_others)(const struct cpumask *cpus,
- const struct flush_tlb_info *info);
-
- void (*tlb_remove_table)(struct mmu_gather *tlb, void *table);
/* Hooks for allocating and freeing a pagetable top-level */
int (*pgd_alloc)(struct mm_struct *mm);
@@ -298,6 +304,7 @@ struct pv_mmu_ops {
an mfn. We can tell which is which from the index. */
void (*set_fixmap)(unsigned /* enum fixed_addresses */ idx,
phys_addr_t phys, pgprot_t flags);
+#endif
} __no_randomize_layout;
struct arch_spinlock;
@@ -321,48 +328,31 @@ struct pv_lock_ops {
* number for each function using the offset which we use to indicate
* what to patch. */
struct paravirt_patch_template {
- struct pv_init_ops pv_init_ops;
- struct pv_time_ops pv_time_ops;
- struct pv_cpu_ops pv_cpu_ops;
- struct pv_irq_ops pv_irq_ops;
- struct pv_mmu_ops pv_mmu_ops;
- struct pv_lock_ops pv_lock_ops;
+ struct pv_init_ops init;
+ struct pv_time_ops time;
+ struct pv_cpu_ops cpu;
+ struct pv_irq_ops irq;
+ struct pv_mmu_ops mmu;
+ struct pv_lock_ops lock;
} __no_randomize_layout;
extern struct pv_info pv_info;
-extern struct pv_init_ops pv_init_ops;
-extern struct pv_time_ops pv_time_ops;
-extern struct pv_cpu_ops pv_cpu_ops;
-extern struct pv_irq_ops pv_irq_ops;
-extern struct pv_mmu_ops pv_mmu_ops;
-extern struct pv_lock_ops pv_lock_ops;
+extern struct paravirt_patch_template pv_ops;
#define PARAVIRT_PATCH(x) \
(offsetof(struct paravirt_patch_template, x) / sizeof(void *))
#define paravirt_type(op) \
[paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \
- [paravirt_opptr] "i" (&(op))
+ [paravirt_opptr] "i" (&(pv_ops.op))
#define paravirt_clobber(clobber) \
[paravirt_clobber] "i" (clobber)
-/*
- * Generate some code, and mark it as patchable by the
- * apply_paravirt() alternate instruction patcher.
- */
-#define _paravirt_alt(insn_string, type, clobber) \
- "771:\n\t" insn_string "\n" "772:\n" \
- ".pushsection .parainstructions,\"a\"\n" \
- _ASM_ALIGN "\n" \
- _ASM_PTR " 771b\n" \
- " .byte " type "\n" \
- " .byte 772b-771b\n" \
- " .short " clobber "\n" \
- ".popsection\n"
-
/* Generate patchable code, with the default asm parameters. */
-#define paravirt_alt(insn_string) \
- _paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]")
+#define paravirt_call \
+ "PARAVIRT_CALL type=\"%c[paravirt_typenum]\"" \
+ " clobber=\"%c[paravirt_clobber]\"" \
+ " pv_opptr=\"%c[paravirt_opptr]\";"
/* Simple instruction patching code. */
#define NATIVE_LABEL(a,x,b) "\n\t.globl " a #x "_" #b "\n" a #x "_" #b ":\n\t"
@@ -373,34 +363,17 @@ extern struct pv_lock_ops pv_lock_ops;
unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len);
unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len);
-unsigned paravirt_patch_call(void *insnbuf,
- const void *target, u16 tgt_clobbers,
- unsigned long addr, u16 site_clobbers,
- unsigned len);
-unsigned paravirt_patch_jmp(void *insnbuf, const void *target,
- unsigned long addr, unsigned len);
-unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
+unsigned paravirt_patch_default(u8 type, void *insnbuf,
unsigned long addr, unsigned len);
unsigned paravirt_patch_insns(void *insnbuf, unsigned len,
const char *start, const char *end);
-unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
- unsigned long addr, unsigned len);
+unsigned native_patch(u8 type, void *ibuf, unsigned long addr, unsigned len);
int paravirt_disable_iospace(void);
/*
- * This generates an indirect call based on the operation type number.
- * The type number, computed in PARAVIRT_PATCH, is derived from the
- * offset into the paravirt_patch_template structure, and can therefore be
- * freely converted back into a structure offset.
- */
-#define PARAVIRT_CALL \
- ANNOTATE_RETPOLINE_SAFE \
- "call *%c[paravirt_opptr];"
-
-/*
* These macros are intended to wrap calls through one of the paravirt
* ops structs, so that they can be later identified and patched at
* runtime.
@@ -510,9 +483,9 @@ int paravirt_disable_iospace(void);
#endif /* CONFIG_X86_32 */
#ifdef CONFIG_PARAVIRT_DEBUG
-#define PVOP_TEST_NULL(op) BUG_ON(op == NULL)
+#define PVOP_TEST_NULL(op) BUG_ON(pv_ops.op == NULL)
#else
-#define PVOP_TEST_NULL(op) ((void)op)
+#define PVOP_TEST_NULL(op) ((void)pv_ops.op)
#endif
#define PVOP_RETMASK(rettype) \
@@ -537,7 +510,7 @@ int paravirt_disable_iospace(void);
/* since this condition will never hold */ \
if (sizeof(rettype) > sizeof(unsigned long)) { \
asm volatile(pre \
- paravirt_alt(PARAVIRT_CALL) \
+ paravirt_call \
post \
: call_clbr, ASM_CALL_CONSTRAINT \
: paravirt_type(op), \
@@ -547,7 +520,7 @@ int paravirt_disable_iospace(void);
__ret = (rettype)((((u64)__edx) << 32) | __eax); \
} else { \
asm volatile(pre \
- paravirt_alt(PARAVIRT_CALL) \
+ paravirt_call \
post \
: call_clbr, ASM_CALL_CONSTRAINT \
: paravirt_type(op), \
@@ -574,7 +547,7 @@ int paravirt_disable_iospace(void);
PVOP_VCALL_ARGS; \
PVOP_TEST_NULL(op); \
asm volatile(pre \
- paravirt_alt(PARAVIRT_CALL) \
+ paravirt_call \
post \
: call_clbr, ASM_CALL_CONSTRAINT \
: paravirt_type(op), \
@@ -688,12 +661,31 @@ struct paravirt_patch_site {
u8 *instr; /* original instructions */
u8 instrtype; /* type of this instruction */
u8 len; /* length of original instruction */
- u16 clobbers; /* what registers you may clobber */
};
extern struct paravirt_patch_site __parainstructions[],
__parainstructions_end[];
+#else /* __ASSEMBLY__ */
+
+/*
+ * This generates an indirect call based on the operation type number.
+ * The type number, computed in PARAVIRT_PATCH, is derived from the
+ * offset into the paravirt_patch_template structure, and can therefore be
+ * freely converted back into a structure offset.
+ */
+.macro PARAVIRT_CALL type:req clobber:req pv_opptr:req
+771: ANNOTATE_RETPOLINE_SAFE
+ call *\pv_opptr
+772: .pushsection .parainstructions,"a"
+ _ASM_ALIGN
+ _ASM_PTR 771b
+ .byte \type
+ .byte 772b-771b
+ .short \clobber
+ .popsection
+.endm
+
#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_PARAVIRT_TYPES_H */
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index e9202a0de8f0..1a19d11cfbbd 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -185,22 +185,22 @@ do { \
typeof(var) pfo_ret__; \
switch (sizeof(var)) { \
case 1: \
- asm(op "b "__percpu_arg(1)",%0" \
+ asm volatile(op "b "__percpu_arg(1)",%0"\
: "=q" (pfo_ret__) \
: "m" (var)); \
break; \
case 2: \
- asm(op "w "__percpu_arg(1)",%0" \
+ asm volatile(op "w "__percpu_arg(1)",%0"\
: "=r" (pfo_ret__) \
: "m" (var)); \
break; \
case 4: \
- asm(op "l "__percpu_arg(1)",%0" \
+ asm volatile(op "l "__percpu_arg(1)",%0"\
: "=r" (pfo_ret__) \
: "m" (var)); \
break; \
case 8: \
- asm(op "q "__percpu_arg(1)",%0" \
+ asm volatile(op "q "__percpu_arg(1)",%0"\
: "=r" (pfo_ret__) \
: "m" (var)); \
break; \
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 12f54082f4c8..8bdf74902293 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -46,6 +46,14 @@
#define INTEL_ARCH_EVENT_MASK \
(ARCH_PERFMON_EVENTSEL_UMASK | ARCH_PERFMON_EVENTSEL_EVENT)
+#define AMD64_L3_SLICE_SHIFT 48
+#define AMD64_L3_SLICE_MASK \
+ ((0xFULL) << AMD64_L3_SLICE_SHIFT)
+
+#define AMD64_L3_THREAD_SHIFT 56
+#define AMD64_L3_THREAD_MASK \
+ ((0xFFULL) << AMD64_L3_THREAD_SHIFT)
+
#define X86_RAW_EVENT_MASK \
(ARCH_PERFMON_EVENTSEL_EVENT | \
ARCH_PERFMON_EVENTSEL_UMASK | \
@@ -270,6 +278,7 @@ struct perf_guest_switch_msr {
extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr);
extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap);
extern void perf_check_microcode(void);
+extern int x86_perf_rdpmc_index(struct perf_event *event);
#else
static inline struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
{
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index fbd578daa66e..ec7f43327033 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -8,7 +8,7 @@
static inline int __paravirt_pgd_alloc(struct mm_struct *mm) { return 0; }
-#ifdef CONFIG_PARAVIRT
+#ifdef CONFIG_PARAVIRT_XXL
#include <asm/paravirt.h>
#else
#define paravirt_pgd_alloc(mm) __paravirt_pgd_alloc(mm)
diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h
index 24c6cf5f16b7..60d0f9015317 100644
--- a/arch/x86/include/asm/pgtable-2level.h
+++ b/arch/x86/include/asm/pgtable-2level.h
@@ -19,9 +19,6 @@ static inline void native_set_pte(pte_t *ptep , pte_t pte)
static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
{
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
- pmd.pud.p4d.pgd = pti_set_user_pgtbl(&pmdp->pud.p4d.pgd, pmd.pud.p4d.pgd);
-#endif
*pmdp = pmd;
}
@@ -61,9 +58,6 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp)
#ifdef CONFIG_SMP
static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
{
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
- pti_set_user_pgtbl(&xp->pud.p4d.pgd, __pgd(0));
-#endif
return __pmd(xchg((pmdval_t *)xp, 0));
}
#else
@@ -73,9 +67,6 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
#ifdef CONFIG_SMP
static inline pud_t native_pudp_get_and_clear(pud_t *xp)
{
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
- pti_set_user_pgtbl(&xp->p4d.pgd, __pgd(0));
-#endif
return __pud(xchg((pudval_t *)xp, 0));
}
#else
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index a564084c6141..f8b1ad2c3828 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -2,6 +2,8 @@
#ifndef _ASM_X86_PGTABLE_3LEVEL_H
#define _ASM_X86_PGTABLE_3LEVEL_H
+#include <asm/atomic64_32.h>
+
/*
* Intel Physical Address Extension (PAE) Mode - three-level page
* tables on PPro+ CPUs.
@@ -150,10 +152,7 @@ static inline pte_t native_ptep_get_and_clear(pte_t *ptep)
{
pte_t res;
- /* xchg acts as a barrier before the setting of the high bits */
- res.pte_low = xchg(&ptep->pte_low, 0);
- res.pte_high = ptep->pte_high;
- ptep->pte_high = 0;
+ res.pte = (pteval_t)arch_atomic64_xchg((atomic64_t *)ptep, 0);
return res;
}
diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h
index 858358a82b14..33845d36897c 100644
--- a/arch/x86/include/asm/pgtable-3level_types.h
+++ b/arch/x86/include/asm/pgtable-3level_types.h
@@ -20,7 +20,7 @@ typedef union {
} pte_t;
#endif /* !__ASSEMBLY__ */
-#ifdef CONFIG_PARAVIRT
+#ifdef CONFIG_PARAVIRT_XXL
#define SHARED_KERNEL_PMD ((!static_cpu_has(X86_FEATURE_PTI) && \
(pv_info.shared_kernel_pmd)))
#else
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index e4ffa565a69f..40616e805292 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -55,9 +55,9 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page);
extern pmdval_t early_pmd_flags;
-#ifdef CONFIG_PARAVIRT
+#ifdef CONFIG_PARAVIRT_XXL
#include <asm/paravirt.h>
-#else /* !CONFIG_PARAVIRT */
+#else /* !CONFIG_PARAVIRT_XXL */
#define set_pte(ptep, pte) native_set_pte(ptep, pte)
#define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte)
@@ -112,8 +112,7 @@ extern pmdval_t early_pmd_flags;
#define __pte(x) native_make_pte(x)
#define arch_end_context_switch(prev) do {} while(0)
-
-#endif /* CONFIG_PARAVIRT */
+#endif /* CONFIG_PARAVIRT_XXL */
/*
* The following only work if pte_present() is true.
@@ -1195,7 +1194,7 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
return xchg(pmdp, pmd);
} else {
pmd_t old = *pmdp;
- *pmdp = pmd;
+ WRITE_ONCE(*pmdp, pmd);
return old;
}
}
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index f773d5e6c8cc..9c85b54bf03c 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -14,6 +14,7 @@
#include <asm/processor.h>
#include <linux/bitops.h>
#include <linux/threads.h>
+#include <asm/fixmap.h>
extern p4d_t level4_kernel_pgt[512];
extern p4d_t level4_ident_pgt[512];
@@ -22,7 +23,7 @@ extern pud_t level3_ident_pgt[512];
extern pmd_t level2_kernel_pgt[512];
extern pmd_t level2_fixmap_pgt[512];
extern pmd_t level2_ident_pgt[512];
-extern pte_t level1_fixmap_pgt[512];
+extern pte_t level1_fixmap_pgt[512 * FIXMAP_PMD_NUM];
extern pgd_t init_top_pgt[];
#define swapper_pg_dir init_top_pgt
@@ -55,15 +56,15 @@ struct mm_struct;
void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte);
void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte);
-static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep)
+static inline void native_set_pte(pte_t *ptep, pte_t pte)
{
- *ptep = native_make_pte(0);
+ WRITE_ONCE(*ptep, pte);
}
-static inline void native_set_pte(pte_t *ptep, pte_t pte)
+static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep)
{
- *ptep = pte;
+ native_set_pte(ptep, native_make_pte(0));
}
static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
@@ -73,7 +74,7 @@ static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
{
- *pmdp = pmd;
+ WRITE_ONCE(*pmdp, pmd);
}
static inline void native_pmd_clear(pmd_t *pmd)
@@ -109,7 +110,7 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
static inline void native_set_pud(pud_t *pudp, pud_t pud)
{
- *pudp = pud;
+ WRITE_ONCE(*pudp, pud);
}
static inline void native_pud_clear(pud_t *pud)
@@ -137,13 +138,13 @@ static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
pgd_t pgd;
if (pgtable_l5_enabled() || !IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) {
- *p4dp = p4d;
+ WRITE_ONCE(*p4dp, p4d);
return;
}
pgd = native_make_pgd(native_p4d_val(p4d));
pgd = pti_set_user_pgtbl((pgd_t *)p4dp, pgd);
- *p4dp = native_make_p4d(native_pgd_val(pgd));
+ WRITE_ONCE(*p4dp, native_make_p4d(native_pgd_val(pgd)));
}
static inline void native_p4d_clear(p4d_t *p4d)
@@ -153,7 +154,7 @@ static inline void native_p4d_clear(p4d_t *p4d)
static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
{
- *pgdp = pti_set_user_pgtbl(pgdp, pgd);
+ WRITE_ONCE(*pgdp, pti_set_user_pgtbl(pgdp, pgd));
}
static inline void native_pgd_clear(pgd_t *pgd)
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index b64acb08a62b..106b7d0e2dae 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -124,7 +124,7 @@
*/
#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
_PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \
- _PAGE_SOFT_DIRTY)
+ _PAGE_SOFT_DIRTY | _PAGE_DEVMAP)
#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
/*
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index 7f2dbd91fc74..90cb2f36c042 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -88,7 +88,7 @@ static __always_inline void __preempt_count_sub(int val)
*/
static __always_inline bool __preempt_count_dec_and_test(void)
{
- GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), e);
+ return GEN_UNARY_RMWcc("decl", __preempt_count, e, __percpu_arg([var]));
}
/*
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index c24297268ebc..617805981cce 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -132,6 +132,8 @@ struct cpuinfo_x86 {
/* Index into per_cpu list: */
u16 cpu_index;
u32 microcode;
+ /* Address space bits used by the cache internally */
+ u8 x86_cache_bits;
unsigned initialized : 1;
} __randomize_layout;
@@ -153,7 +155,8 @@ enum cpuid_regs_idx {
#define X86_VENDOR_CENTAUR 5
#define X86_VENDOR_TRANSMETA 7
#define X86_VENDOR_NSC 8
-#define X86_VENDOR_NUM 9
+#define X86_VENDOR_HYGON 9
+#define X86_VENDOR_NUM 10
#define X86_VENDOR_UNKNOWN 0xff
@@ -183,7 +186,7 @@ extern void cpu_detect(struct cpuinfo_x86 *c);
static inline unsigned long long l1tf_pfn_limit(void)
{
- return BIT_ULL(boot_cpu_data.x86_phys_bits - 1 - PAGE_SHIFT);
+ return BIT_ULL(boot_cpu_data.x86_cache_bits - 1 - PAGE_SHIFT);
}
extern void early_cpu_init(void);
@@ -313,7 +316,13 @@ struct x86_hw_tss {
*/
u64 sp1;
+ /*
+ * Since Linux does not use ring 2, the 'sp2' slot is unused by
+ * hardware. entry_SYSCALL_64 uses it as scratch space to stash
+ * the user RSP value.
+ */
u64 sp2;
+
u64 reserved2;
u64 ist[7];
u32 reserved3;
@@ -576,7 +585,7 @@ static inline bool on_thread_stack(void)
current_stack_pointer) < THREAD_SIZE;
}
-#ifdef CONFIG_PARAVIRT
+#ifdef CONFIG_PARAVIRT_XXL
#include <asm/paravirt.h>
#else
#define __cpuid native_cpuid
@@ -587,7 +596,7 @@ static inline void load_sp0(unsigned long sp0)
}
#define set_iopl_mask native_set_iopl_mask
-#endif /* CONFIG_PARAVIRT */
+#endif /* CONFIG_PARAVIRT_XXL */
/* Free all resources held by a thread. */
extern void release_thread(struct task_struct *);
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 6de1fd3d0097..143c99499531 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -37,8 +37,10 @@ struct pt_regs {
unsigned short __esh;
unsigned short fs;
unsigned short __fsh;
+ /* On interrupt, gs and __gsh store the vector number. */
unsigned short gs;
unsigned short __gsh;
+ /* On interrupt, this is the error code. */
unsigned long orig_ax;
unsigned long ip;
unsigned short cs;
@@ -144,7 +146,7 @@ static inline int v8086_mode(struct pt_regs *regs)
static inline bool user_64bit_mode(struct pt_regs *regs)
{
#ifdef CONFIG_X86_64
-#ifndef CONFIG_PARAVIRT
+#ifndef CONFIG_PARAVIRT_XXL
/*
* On non-paravirt systems, this is the only long mode CPL 3
* selector. We do not allow long mode selectors in the LDT.
@@ -237,23 +239,51 @@ static inline int regs_within_kernel_stack(struct pt_regs *regs,
}
/**
+ * regs_get_kernel_stack_nth_addr() - get the address of the Nth entry on stack
+ * @regs: pt_regs which contains kernel stack pointer.
+ * @n: stack entry number.
+ *
+ * regs_get_kernel_stack_nth() returns the address of the @n th entry of the
+ * kernel stack which is specified by @regs. If the @n th entry is NOT in
+ * the kernel stack, this returns NULL.
+ */
+static inline unsigned long *regs_get_kernel_stack_nth_addr(struct pt_regs *regs, unsigned int n)
+{
+ unsigned long *addr = (unsigned long *)kernel_stack_pointer(regs);
+
+ addr += n;
+ if (regs_within_kernel_stack(regs, (unsigned long)addr))
+ return addr;
+ else
+ return NULL;
+}
+
+/* To avoid include hell, we can't include uaccess.h */
+extern long probe_kernel_read(void *dst, const void *src, size_t size);
+
+/**
* regs_get_kernel_stack_nth() - get Nth entry of the stack
* @regs: pt_regs which contains kernel stack pointer.
* @n: stack entry number.
*
* regs_get_kernel_stack_nth() returns @n th entry of the kernel stack which
- * is specified by @regs. If the @n th entry is NOT in the kernel stack,
+ * is specified by @regs. If the @n th entry is NOT in the kernel stack
* this returns 0.
*/
static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs,
unsigned int n)
{
- unsigned long *addr = (unsigned long *)kernel_stack_pointer(regs);
- addr += n;
- if (regs_within_kernel_stack(regs, (unsigned long)addr))
- return *addr;
- else
- return 0;
+ unsigned long *addr;
+ unsigned long val;
+ long ret;
+
+ addr = regs_get_kernel_stack_nth_addr(regs, n);
+ if (addr) {
+ ret = probe_kernel_read(&val, addr, sizeof(val));
+ if (!ret)
+ return val;
+ }
+ return 0;
}
#define arch_has_single_step() (1)
@@ -263,7 +293,7 @@ static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs,
#define arch_has_block_step() (boot_cpu_data.x86 >= 6)
#endif
-#define ARCH_HAS_USER_SINGLE_STEP_INFO
+#define ARCH_HAS_USER_SINGLE_STEP_REPORT
/*
* When hitting ptrace_stop(), we cannot return using SYSRET because
diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h
index 3e70bed8a978..87623c6b13db 100644
--- a/arch/x86/include/asm/qspinlock.h
+++ b/arch/x86/include/asm/qspinlock.h
@@ -6,9 +6,24 @@
#include <asm/cpufeature.h>
#include <asm-generic/qspinlock_types.h>
#include <asm/paravirt.h>
+#include <asm/rmwcc.h>
#define _Q_PENDING_LOOPS (1 << 9)
+#define queued_fetch_set_pending_acquire queued_fetch_set_pending_acquire
+static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lock)
+{
+ u32 val = 0;
+
+ if (GEN_BINARY_RMWcc(LOCK_PREFIX "btsl", lock->val.counter, c,
+ "I", _Q_PENDING_OFFSET))
+ val |= _Q_PENDING_VAL;
+
+ val |= atomic_read(&lock->val) & ~_Q_PENDING_MASK;
+
+ return val;
+}
+
#ifdef CONFIG_PARAVIRT_SPINLOCKS
extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
extern void __pv_init_lock_hash(void);
diff --git a/arch/x86/include/asm/refcount.h b/arch/x86/include/asm/refcount.h
index 19b90521954c..a8b5e1e13319 100644
--- a/arch/x86/include/asm/refcount.h
+++ b/arch/x86/include/asm/refcount.h
@@ -4,6 +4,41 @@
* x86-specific implementation of refcount_t. Based on PAX_REFCOUNT from
* PaX/grsecurity.
*/
+
+#ifdef __ASSEMBLY__
+
+#include <asm/asm.h>
+#include <asm/bug.h>
+
+.macro REFCOUNT_EXCEPTION counter:req
+ .pushsection .text..refcount
+111: lea \counter, %_ASM_CX
+112: ud2
+ ASM_UNREACHABLE
+ .popsection
+113: _ASM_EXTABLE_REFCOUNT(112b, 113b)
+.endm
+
+/* Trigger refcount exception if refcount result is negative. */
+.macro REFCOUNT_CHECK_LT_ZERO counter:req
+ js 111f
+ REFCOUNT_EXCEPTION counter="\counter"
+.endm
+
+/* Trigger refcount exception if refcount result is zero or negative. */
+.macro REFCOUNT_CHECK_LE_ZERO counter:req
+ jz 111f
+ REFCOUNT_CHECK_LT_ZERO counter="\counter"
+.endm
+
+/* Trigger refcount exception unconditionally. */
+.macro REFCOUNT_ERROR counter:req
+ jmp 111f
+ REFCOUNT_EXCEPTION counter="\counter"
+.endm
+
+#else /* __ASSEMBLY__ */
+
#include <linux/refcount.h>
#include <asm/bug.h>
@@ -15,34 +50,11 @@
* central refcount exception. The fixup address for the exception points
* back to the regular execution flow in .text.
*/
-#define _REFCOUNT_EXCEPTION \
- ".pushsection .text..refcount\n" \
- "111:\tlea %[counter], %%" _ASM_CX "\n" \
- "112:\t" ASM_UD2 "\n" \
- ASM_UNREACHABLE \
- ".popsection\n" \
- "113:\n" \
- _ASM_EXTABLE_REFCOUNT(112b, 113b)
-
-/* Trigger refcount exception if refcount result is negative. */
-#define REFCOUNT_CHECK_LT_ZERO \
- "js 111f\n\t" \
- _REFCOUNT_EXCEPTION
-
-/* Trigger refcount exception if refcount result is zero or negative. */
-#define REFCOUNT_CHECK_LE_ZERO \
- "jz 111f\n\t" \
- REFCOUNT_CHECK_LT_ZERO
-
-/* Trigger refcount exception unconditionally. */
-#define REFCOUNT_ERROR \
- "jmp 111f\n\t" \
- _REFCOUNT_EXCEPTION
static __always_inline void refcount_add(unsigned int i, refcount_t *r)
{
asm volatile(LOCK_PREFIX "addl %1,%0\n\t"
- REFCOUNT_CHECK_LT_ZERO
+ "REFCOUNT_CHECK_LT_ZERO counter=\"%[counter]\""
: [counter] "+m" (r->refs.counter)
: "ir" (i)
: "cc", "cx");
@@ -51,7 +63,7 @@ static __always_inline void refcount_add(unsigned int i, refcount_t *r)
static __always_inline void refcount_inc(refcount_t *r)
{
asm volatile(LOCK_PREFIX "incl %0\n\t"
- REFCOUNT_CHECK_LT_ZERO
+ "REFCOUNT_CHECK_LT_ZERO counter=\"%[counter]\""
: [counter] "+m" (r->refs.counter)
: : "cc", "cx");
}
@@ -59,7 +71,7 @@ static __always_inline void refcount_inc(refcount_t *r)
static __always_inline void refcount_dec(refcount_t *r)
{
asm volatile(LOCK_PREFIX "decl %0\n\t"
- REFCOUNT_CHECK_LE_ZERO
+ "REFCOUNT_CHECK_LE_ZERO counter=\"%[counter]\""
: [counter] "+m" (r->refs.counter)
: : "cc", "cx");
}
@@ -67,14 +79,17 @@ static __always_inline void refcount_dec(refcount_t *r)
static __always_inline __must_check
bool refcount_sub_and_test(unsigned int i, refcount_t *r)
{
- GEN_BINARY_SUFFIXED_RMWcc(LOCK_PREFIX "subl", REFCOUNT_CHECK_LT_ZERO,
- r->refs.counter, "er", i, "%0", e, "cx");
+
+ return GEN_BINARY_SUFFIXED_RMWcc(LOCK_PREFIX "subl",
+ "REFCOUNT_CHECK_LT_ZERO counter=\"%[var]\"",
+ r->refs.counter, e, "er", i, "cx");
}
static __always_inline __must_check bool refcount_dec_and_test(refcount_t *r)
{
- GEN_UNARY_SUFFIXED_RMWcc(LOCK_PREFIX "decl", REFCOUNT_CHECK_LT_ZERO,
- r->refs.counter, "%0", e, "cx");
+ return GEN_UNARY_SUFFIXED_RMWcc(LOCK_PREFIX "decl",
+ "REFCOUNT_CHECK_LT_ZERO counter=\"%[var]\"",
+ r->refs.counter, e, "cx");
}
static __always_inline __must_check
@@ -91,7 +106,7 @@ bool refcount_add_not_zero(unsigned int i, refcount_t *r)
/* Did we try to increment from/to an undesirable state? */
if (unlikely(c < 0 || c == INT_MAX || result < c)) {
- asm volatile(REFCOUNT_ERROR
+ asm volatile("REFCOUNT_ERROR counter=\"%[counter]\""
: : [counter] "m" (r->refs.counter)
: "cc", "cx");
break;
@@ -107,4 +122,6 @@ static __always_inline __must_check bool refcount_inc_not_zero(refcount_t *r)
return refcount_add_not_zero(1, r);
}
+#endif /* __ASSEMBLY__ */
+
#endif
diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h
index 4914a3e7c803..46ac84b506f5 100644
--- a/arch/x86/include/asm/rmwcc.h
+++ b/arch/x86/include/asm/rmwcc.h
@@ -2,56 +2,69 @@
#ifndef _ASM_X86_RMWcc
#define _ASM_X86_RMWcc
+/* This counts to 12. Any more, it will return 13th argument. */
+#define __RMWcc_ARGS(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _n, X...) _n
+#define RMWcc_ARGS(X...) __RMWcc_ARGS(, ##X, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
+
+#define __RMWcc_CONCAT(a, b) a ## b
+#define RMWcc_CONCAT(a, b) __RMWcc_CONCAT(a, b)
+
#define __CLOBBERS_MEM(clb...) "memory", ## clb
#if !defined(__GCC_ASM_FLAG_OUTPUTS__) && defined(CC_HAVE_ASM_GOTO)
/* Use asm goto */
-#define __GEN_RMWcc(fullop, var, cc, clobbers, ...) \
-do { \
+#define __GEN_RMWcc(fullop, _var, cc, clobbers, ...) \
+({ \
+ bool c = false; \
asm_volatile_goto (fullop "; j" #cc " %l[cc_label]" \
- : : [counter] "m" (var), ## __VA_ARGS__ \
+ : : [var] "m" (_var), ## __VA_ARGS__ \
: clobbers : cc_label); \
- return 0; \
-cc_label: \
- return 1; \
-} while (0)
-
-#define __BINARY_RMWcc_ARG " %1, "
-
+ if (0) { \
+cc_label: c = true; \
+ } \
+ c; \
+})
#else /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */
/* Use flags output or a set instruction */
-#define __GEN_RMWcc(fullop, var, cc, clobbers, ...) \
-do { \
+#define __GEN_RMWcc(fullop, _var, cc, clobbers, ...) \
+({ \
bool c; \
asm volatile (fullop CC_SET(cc) \
- : [counter] "+m" (var), CC_OUT(cc) (c) \
+ : [var] "+m" (_var), CC_OUT(cc) (c) \
: __VA_ARGS__ : clobbers); \
- return c; \
-} while (0)
-
-#define __BINARY_RMWcc_ARG " %2, "
+ c; \
+})
#endif /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */
-#define GEN_UNARY_RMWcc(op, var, arg0, cc) \
+#define GEN_UNARY_RMWcc_4(op, var, cc, arg0) \
__GEN_RMWcc(op " " arg0, var, cc, __CLOBBERS_MEM())
-#define GEN_UNARY_SUFFIXED_RMWcc(op, suffix, var, arg0, cc, clobbers...)\
- __GEN_RMWcc(op " " arg0 "\n\t" suffix, var, cc, \
- __CLOBBERS_MEM(clobbers))
+#define GEN_UNARY_RMWcc_3(op, var, cc) \
+ GEN_UNARY_RMWcc_4(op, var, cc, "%[var]")
-#define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc) \
- __GEN_RMWcc(op __BINARY_RMWcc_ARG arg0, var, cc, \
- __CLOBBERS_MEM(), vcon (val))
+#define GEN_UNARY_RMWcc(X...) RMWcc_CONCAT(GEN_UNARY_RMWcc_, RMWcc_ARGS(X))(X)
+
+#define GEN_BINARY_RMWcc_6(op, var, cc, vcon, _val, arg0) \
+ __GEN_RMWcc(op " %[val], " arg0, var, cc, \
+ __CLOBBERS_MEM(), [val] vcon (_val))
+
+#define GEN_BINARY_RMWcc_5(op, var, cc, vcon, val) \
+ GEN_BINARY_RMWcc_6(op, var, cc, vcon, val, "%[var]")
+
+#define GEN_BINARY_RMWcc(X...) RMWcc_CONCAT(GEN_BINARY_RMWcc_, RMWcc_ARGS(X))(X)
+
+#define GEN_UNARY_SUFFIXED_RMWcc(op, suffix, var, cc, clobbers...) \
+ __GEN_RMWcc(op " %[var]\n\t" suffix, var, cc, \
+ __CLOBBERS_MEM(clobbers))
-#define GEN_BINARY_SUFFIXED_RMWcc(op, suffix, var, vcon, val, arg0, cc, \
- clobbers...) \
- __GEN_RMWcc(op __BINARY_RMWcc_ARG arg0 "\n\t" suffix, var, cc, \
- __CLOBBERS_MEM(clobbers), vcon (val))
+#define GEN_BINARY_SUFFIXED_RMWcc(op, suffix, var, cc, vcon, _val, clobbers...)\
+ __GEN_RMWcc(op " %[val], %[var]\n\t" suffix, var, cc, \
+ __CLOBBERS_MEM(clobbers), [val] vcon (_val))
#endif /* _ASM_X86_RMWcc */
diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h
index 4a911a382ade..8ea1cfdbeabc 100644
--- a/arch/x86/include/asm/sections.h
+++ b/arch/x86/include/asm/sections.h
@@ -11,7 +11,6 @@ extern char __end_rodata_aligned[];
#if defined(CONFIG_X86_64)
extern char __end_rodata_hpage_align[];
-extern char __entry_trampoline_start[], __entry_trampoline_end[];
#endif
#endif /* _ASM_X86_SECTIONS_H */
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index e293c122d0d5..ac3892920419 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -186,8 +186,7 @@
#define GDT_ENTRY_TLS_MIN 12
#define GDT_ENTRY_TLS_MAX 14
-/* Abused to load per CPU data from limit */
-#define GDT_ENTRY_PER_CPU 15
+#define GDT_ENTRY_CPUNODE 15
/*
* Number of entries in the GDT table:
@@ -207,11 +206,11 @@
#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3)
#define __USER32_DS __USER_DS
#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3)
-#define __PER_CPU_SEG (GDT_ENTRY_PER_CPU*8 + 3)
+#define __CPUNODE_SEG (GDT_ENTRY_CPUNODE*8 + 3)
#endif
-#ifndef CONFIG_PARAVIRT
+#ifndef CONFIG_PARAVIRT_XXL
# define get_kernel_rpl() 0
#endif
@@ -225,6 +224,47 @@
#define GDT_ENTRY_TLS_ENTRIES 3
#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES* 8)
+#ifdef CONFIG_X86_64
+
+/* Bit size and mask of CPU number stored in the per CPU data (and TSC_AUX) */
+#define VDSO_CPUNODE_BITS 12
+#define VDSO_CPUNODE_MASK 0xfff
+
+#ifndef __ASSEMBLY__
+
+/* Helper functions to store/load CPU and node numbers */
+
+static inline unsigned long vdso_encode_cpunode(int cpu, unsigned long node)
+{
+ return (node << VDSO_CPUNODE_BITS) | cpu;
+}
+
+static inline void vdso_read_cpunode(unsigned *cpu, unsigned *node)
+{
+ unsigned int p;
+
+ /*
+ * Load CPU and node number from the GDT. LSL is faster than RDTSCP
+ * and works on all CPUs. This is volatile so that it orders
+ * correctly with respect to barrier() and to keep GCC from cleverly
+ * hoisting it out of the calling function.
+ *
+ * If RDPID is available, use it.
+ */
+ alternative_io ("lsl %[seg],%[p]",
+ ".byte 0xf3,0x0f,0xc7,0xf8", /* RDPID %eax/rax */
+ X86_FEATURE_RDPID,
+ [p] "=a" (p), [seg] "r" (__CPUNODE_SEG));
+
+ if (cpu)
+ *cpu = (p & VDSO_CPUNODE_MASK);
+ if (node)
+ *node = (p >> VDSO_CPUNODE_BITS);
+}
+
+#endif /* !__ASSEMBLY__ */
+#endif /* CONFIG_X86_64 */
+
#ifdef __KERNEL__
/*
diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h
index 5f9012ff52ed..33d3c88a7225 100644
--- a/arch/x86/include/asm/signal.h
+++ b/arch/x86/include/asm/signal.h
@@ -39,6 +39,7 @@ extern void do_signal(struct pt_regs *regs);
#define __ARCH_HAS_SA_RESTORER
+#include <asm/asm.h>
#include <uapi/asm/sigcontext.h>
#ifdef __i386__
@@ -86,9 +87,9 @@ static inline int __const_sigismember(sigset_t *set, int _sig)
static inline int __gen_sigismember(sigset_t *set, int _sig)
{
- unsigned char ret;
- asm("btl %2,%1\n\tsetc %0"
- : "=qm"(ret) : "m"(*set), "Ir"(_sig-1) : "cc");
+ bool ret;
+ asm("btl %2,%1" CC_SET(c)
+ : CC_OUT(c) (ret) : "m"(*set), "Ir"(_sig-1));
return ret;
}
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 317fc59b512c..43c029cdc3fe 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -141,7 +141,7 @@ static inline unsigned long __read_cr4(void)
return native_read_cr4();
}
-#ifdef CONFIG_PARAVIRT
+#ifdef CONFIG_PARAVIRT_XXL
#include <asm/paravirt.h>
#else
@@ -208,7 +208,7 @@ static inline void load_gs_index(unsigned selector)
#endif
-#endif/* CONFIG_PARAVIRT */
+#endif /* CONFIG_PARAVIRT_XXL */
static inline void clflush(volatile void *__p)
{
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index b6dc698f992a..f335aad404a4 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -111,6 +111,6 @@ static inline unsigned long caller_frame_pointer(void)
return (unsigned long)frame;
}
-void show_opcodes(u8 *rip, const char *loglvl);
+void show_opcodes(struct pt_regs *regs, const char *loglvl);
void show_ip(struct pt_regs *regs, const char *loglvl);
#endif /* _ASM_X86_STACKTRACE_H */
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index d33f92b9fa22..7ad41bfcc16c 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -149,7 +149,25 @@ memcpy_mcsafe(void *dst, const void *src, size_t cnt)
#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
#define __HAVE_ARCH_MEMCPY_FLUSHCACHE 1
-void memcpy_flushcache(void *dst, const void *src, size_t cnt);
+void __memcpy_flushcache(void *dst, const void *src, size_t cnt);
+static __always_inline void memcpy_flushcache(void *dst, const void *src, size_t cnt)
+{
+ if (__builtin_constant_p(cnt)) {
+ switch (cnt) {
+ case 4:
+ asm ("movntil %1, %0" : "=m"(*(u32 *)dst) : "r"(*(u32 *)src));
+ return;
+ case 8:
+ asm ("movntiq %1, %0" : "=m"(*(u64 *)dst) : "r"(*(u64 *)src));
+ return;
+ case 16:
+ asm ("movntiq %1, %0" : "=m"(*(u64 *)dst) : "r"(*(u64 *)src));
+ asm ("movntiq %1, %0" : "=m"(*(u64 *)(dst + 8)) : "r"(*(u64 *)(src + 8)));
+ return;
+ }
+ }
+ __memcpy_flushcache(dst, src, cnt);
+}
#endif
#endif /* __KERNEL__ */
diff --git a/arch/x86/include/asm/suspend.h b/arch/x86/include/asm/suspend.h
index ecffe81ff65c..a892494ca5e4 100644
--- a/arch/x86/include/asm/suspend.h
+++ b/arch/x86/include/asm/suspend.h
@@ -4,3 +4,11 @@
#else
# include <asm/suspend_64.h>
#endif
+extern unsigned long restore_jump_address __visible;
+extern unsigned long jump_address_phys;
+extern unsigned long restore_cr3 __visible;
+extern unsigned long temp_pgt __visible;
+extern unsigned long relocated_restore_code __visible;
+extern int relocate_restore_code(void);
+/* Defined in hibernate_asm_32/64.S */
+extern asmlinkage __visible int restore_image(void);
diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h
index 8be6afb58471..fdbd9d7b7bca 100644
--- a/arch/x86/include/asm/suspend_32.h
+++ b/arch/x86/include/asm/suspend_32.h
@@ -32,4 +32,8 @@ struct saved_context {
unsigned long return_address;
} __attribute__((packed));
+/* routines for saving/restoring kernel state */
+extern char core_restore_code[];
+extern char restore_registers[];
+
#endif /* _ASM_X86_SUSPEND_32_H */
diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h
index cb0a1f470980..404b8b1d44f5 100644
--- a/arch/x86/include/asm/tlb.h
+++ b/arch/x86/include/asm/tlb.h
@@ -6,16 +6,23 @@
#define tlb_end_vma(tlb, vma) do { } while (0)
#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
-#define tlb_flush(tlb) \
-{ \
- if (!tlb->fullmm && !tlb->need_flush_all) \
- flush_tlb_mm_range(tlb->mm, tlb->start, tlb->end, 0UL); \
- else \
- flush_tlb_mm_range(tlb->mm, 0UL, TLB_FLUSH_ALL, 0UL); \
-}
+static inline void tlb_flush(struct mmu_gather *tlb);
#include <asm-generic/tlb.h>
+static inline void tlb_flush(struct mmu_gather *tlb)
+{
+ unsigned long start = 0UL, end = TLB_FLUSH_ALL;
+ unsigned int stride_shift = tlb_get_unmap_shift(tlb);
+
+ if (!tlb->fullmm && !tlb->need_flush_all) {
+ start = tlb->start;
+ end = tlb->end;
+ }
+
+ flush_tlb_mm_range(tlb->mm, start, end, stride_shift, tlb->freed_tables);
+}
+
/*
* While x86 architecture in general requires an IPI to perform TLB
* shootdown, enablement code for several hypervisors overrides
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 29c9da6c62fc..323a313947e0 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -148,22 +148,6 @@ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
#define __flush_tlb_one_user(addr) __native_flush_tlb_one_user(addr)
#endif
-static inline bool tlb_defer_switch_to_init_mm(void)
-{
- /*
- * If we have PCID, then switching to init_mm is reasonably
- * fast. If we don't have PCID, then switching to init_mm is
- * quite slow, so we try to defer it in the hopes that we can
- * avoid it entirely. The latter approach runs the risk of
- * receiving otherwise unnecessary IPIs.
- *
- * This choice is just a heuristic. The tlb code can handle this
- * function returning true or false regardless of whether we have
- * PCID.
- */
- return !static_cpu_has(X86_FEATURE_PCID);
-}
-
struct tlb_context {
u64 ctx_id;
u64 tlb_gen;
@@ -175,8 +159,16 @@ struct tlb_state {
* are on. This means that it may not match current->active_mm,
* which will contain the previous user mm when we're in lazy TLB
* mode even if we've already switched back to swapper_pg_dir.
+ *
+ * During switch_mm_irqs_off(), loaded_mm will be set to
+ * LOADED_MM_SWITCHING during the brief interrupts-off window
+ * when CR3 and loaded_mm would otherwise be inconsistent. This
+ * is for nmi_uaccess_okay()'s benefit.
*/
struct mm_struct *loaded_mm;
+
+#define LOADED_MM_SWITCHING ((struct mm_struct *)1)
+
u16 loaded_mm_asid;
u16 next_asid;
/* last user mm's ctx id */
@@ -246,6 +238,38 @@ struct tlb_state {
};
DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
+/*
+ * Blindly accessing user memory from NMI context can be dangerous
+ * if we're in the middle of switching the current user task or
+ * switching the loaded mm. It can also be dangerous if we
+ * interrupted some kernel code that was temporarily using a
+ * different mm.
+ */
+static inline bool nmi_uaccess_okay(void)
+{
+ struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+ struct mm_struct *current_mm = current->mm;
+
+ VM_WARN_ON_ONCE(!loaded_mm);
+
+ /*
+ * The condition we want to check is
+ * current_mm->pgd == __va(read_cr3_pa()). This may be slow, though,
+ * if we're running in a VM with shadow paging, and nmi_uaccess_okay()
+ * is supposed to be reasonably fast.
+ *
+ * Instead, we check the almost equivalent but somewhat conservative
+ * condition below, and we rely on the fact that switch_mm_irqs_off()
+ * sets loaded_mm to LOADED_MM_SWITCHING before writing to CR3.
+ */
+ if (loaded_mm != current_mm)
+ return false;
+
+ VM_WARN_ON_ONCE(current_mm->pgd != __va(read_cr3_pa()));
+
+ return true;
+}
+
/* Initialize cr4 shadow for this CPU. */
static inline void cr4_init_shadow(void)
{
@@ -507,23 +531,30 @@ struct flush_tlb_info {
unsigned long start;
unsigned long end;
u64 new_tlb_gen;
+ unsigned int stride_shift;
+ bool freed_tables;
};
#define local_flush_tlb() __flush_tlb()
-#define flush_tlb_mm(mm) flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL)
+#define flush_tlb_mm(mm) \
+ flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL, true)
-#define flush_tlb_range(vma, start, end) \
- flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags)
+#define flush_tlb_range(vma, start, end) \
+ flush_tlb_mm_range((vma)->vm_mm, start, end, \
+ ((vma)->vm_flags & VM_HUGETLB) \
+ ? huge_page_shift(hstate_vma(vma)) \
+ : PAGE_SHIFT, false)
extern void flush_tlb_all(void);
extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
- unsigned long end, unsigned long vmflag);
+ unsigned long end, unsigned int stride_shift,
+ bool freed_tables);
extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);
static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
{
- flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, VM_NONE);
+ flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, PAGE_SHIFT, false);
}
void native_flush_tlb_others(const struct cpumask *cpumask,
diff --git a/arch/x86/include/asm/trace/mpx.h b/arch/x86/include/asm/trace/mpx.h
index 7bd92db09e8d..54133017267c 100644
--- a/arch/x86/include/asm/trace/mpx.h
+++ b/arch/x86/include/asm/trace/mpx.h
@@ -11,12 +11,12 @@
TRACE_EVENT(mpx_bounds_register_exception,
- TP_PROTO(void *addr_referenced,
+ TP_PROTO(void __user *addr_referenced,
const struct mpx_bndreg *bndreg),
TP_ARGS(addr_referenced, bndreg),
TP_STRUCT__entry(
- __field(void *, addr_referenced)
+ __field(void __user *, addr_referenced)
__field(u64, lower_bound)
__field(u64, upper_bound)
),
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index aae77eb8491c..b5e58cc0c5e7 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -198,8 +198,8 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL))
"4: movl %3,%0\n" \
" jmp 3b\n" \
".previous\n" \
- _ASM_EXTABLE(1b, 4b) \
- _ASM_EXTABLE(2b, 4b) \
+ _ASM_EXTABLE_UA(1b, 4b) \
+ _ASM_EXTABLE_UA(2b, 4b) \
: "=r" (err) \
: "A" (x), "r" (addr), "i" (errret), "0" (err))
@@ -340,8 +340,8 @@ do { \
" xorl %%edx,%%edx\n" \
" jmp 3b\n" \
".previous\n" \
- _ASM_EXTABLE(1b, 4b) \
- _ASM_EXTABLE(2b, 4b) \
+ _ASM_EXTABLE_UA(1b, 4b) \
+ _ASM_EXTABLE_UA(2b, 4b) \
: "=r" (retval), "=&A"(x) \
: "m" (__m(__ptr)), "m" __m(((u32 __user *)(__ptr)) + 1), \
"i" (errret), "0" (retval)); \
@@ -386,7 +386,7 @@ do { \
" xor"itype" %"rtype"1,%"rtype"1\n" \
" jmp 2b\n" \
".previous\n" \
- _ASM_EXTABLE(1b, 3b) \
+ _ASM_EXTABLE_UA(1b, 3b) \
: "=r" (err), ltype(x) \
: "m" (__m(addr)), "i" (errret), "0" (err))
@@ -398,7 +398,7 @@ do { \
"3: mov %3,%0\n" \
" jmp 2b\n" \
".previous\n" \
- _ASM_EXTABLE(1b, 3b) \
+ _ASM_EXTABLE_UA(1b, 3b) \
: "=r" (err), ltype(x) \
: "m" (__m(addr)), "i" (errret), "0" (err))
@@ -474,7 +474,7 @@ struct __large_struct { unsigned long buf[100]; };
"3: mov %3,%0\n" \
" jmp 2b\n" \
".previous\n" \
- _ASM_EXTABLE(1b, 3b) \
+ _ASM_EXTABLE_UA(1b, 3b) \
: "=r"(err) \
: ltype(x), "m" (__m(addr)), "i" (errret), "0" (err))
@@ -602,7 +602,7 @@ extern void __cmpxchg_wrong_size(void)
"3:\tmov %3, %0\n" \
"\tjmp 2b\n" \
"\t.previous\n" \
- _ASM_EXTABLE(1b, 3b) \
+ _ASM_EXTABLE_UA(1b, 3b) \
: "+r" (__ret), "=a" (__old), "+m" (*(ptr)) \
: "i" (-EFAULT), "q" (__new), "1" (__old) \
: "memory" \
@@ -618,7 +618,7 @@ extern void __cmpxchg_wrong_size(void)
"3:\tmov %3, %0\n" \
"\tjmp 2b\n" \
"\t.previous\n" \
- _ASM_EXTABLE(1b, 3b) \
+ _ASM_EXTABLE_UA(1b, 3b) \
: "+r" (__ret), "=a" (__old), "+m" (*(ptr)) \
: "i" (-EFAULT), "r" (__new), "1" (__old) \
: "memory" \
@@ -634,7 +634,7 @@ extern void __cmpxchg_wrong_size(void)
"3:\tmov %3, %0\n" \
"\tjmp 2b\n" \
"\t.previous\n" \
- _ASM_EXTABLE(1b, 3b) \
+ _ASM_EXTABLE_UA(1b, 3b) \
: "+r" (__ret), "=a" (__old), "+m" (*(ptr)) \
: "i" (-EFAULT), "r" (__new), "1" (__old) \
: "memory" \
@@ -653,7 +653,7 @@ extern void __cmpxchg_wrong_size(void)
"3:\tmov %3, %0\n" \
"\tjmp 2b\n" \
"\t.previous\n" \
- _ASM_EXTABLE(1b, 3b) \
+ _ASM_EXTABLE_UA(1b, 3b) \
: "+r" (__ret), "=a" (__old), "+m" (*(ptr)) \
: "i" (-EFAULT), "r" (__new), "1" (__old) \
: "memory" \
diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h
index a80c0673798f..e60c45fd3679 100644
--- a/arch/x86/include/asm/uv/uv.h
+++ b/arch/x86/include/asm/uv/uv.h
@@ -10,8 +10,13 @@ struct cpumask;
struct mm_struct;
#ifdef CONFIG_X86_UV
+#include <linux/efi.h>
extern enum uv_system_type get_uv_system_type(void);
+static inline bool is_early_uv_system(void)
+{
+ return !((efi.uv_systab == EFI_INVALID_TABLE_ADDR) || !efi.uv_systab);
+}
extern int is_uv_system(void);
extern int is_uv_hubless(void);
extern void uv_cpu_init(void);
@@ -23,6 +28,7 @@ extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
#else /* X86_UV */
static inline enum uv_system_type get_uv_system_type(void) { return UV_NONE; }
+static inline bool is_early_uv_system(void) { return 0; }
static inline int is_uv_system(void) { return 0; }
static inline int is_uv_hubless(void) { return 0; }
static inline void uv_cpu_init(void) { }
diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
index fb856c9f0449..913a133f8e6f 100644
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -5,33 +5,46 @@
#include <linux/compiler.h>
#include <linux/clocksource.h>
+#include <uapi/linux/time.h>
+
#ifdef BUILD_VDSO32_64
typedef u64 gtod_long_t;
#else
typedef unsigned long gtod_long_t;
#endif
+
+/*
+ * There is one of these objects in the vvar page for each
+ * vDSO-accelerated clockid. For high-resolution clocks, this encodes
+ * the time corresponding to vsyscall_gtod_data.cycle_last. For coarse
+ * clocks, this encodes the actual time.
+ *
+ * To confuse the reader, for high-resolution clocks, nsec is left-shifted
+ * by vsyscall_gtod_data.shift.
+ */
+struct vgtod_ts {
+ u64 sec;
+ u64 nsec;
+};
+
+#define VGTOD_BASES (CLOCK_TAI + 1)
+#define VGTOD_HRES (BIT(CLOCK_REALTIME) | BIT(CLOCK_MONOTONIC) | BIT(CLOCK_TAI))
+#define VGTOD_COARSE (BIT(CLOCK_REALTIME_COARSE) | BIT(CLOCK_MONOTONIC_COARSE))
+
/*
* vsyscall_gtod_data will be accessed by 32 and 64 bit code at the same time
* so be carefull by modifying this structure.
*/
struct vsyscall_gtod_data {
- unsigned seq;
-
- int vclock_mode;
- u64 cycle_last;
- u64 mask;
- u32 mult;
- u32 shift;
-
- /* open coded 'struct timespec' */
- u64 wall_time_snsec;
- gtod_long_t wall_time_sec;
- gtod_long_t monotonic_time_sec;
- u64 monotonic_time_snsec;
- gtod_long_t wall_time_coarse_sec;
- gtod_long_t wall_time_coarse_nsec;
- gtod_long_t monotonic_time_coarse_sec;
- gtod_long_t monotonic_time_coarse_nsec;
+ unsigned int seq;
+
+ int vclock_mode;
+ u64 cycle_last;
+ u64 mask;
+ u32 mult;
+ u32 shift;
+
+ struct vgtod_ts basetime[VGTOD_BASES];
int tz_minuteswest;
int tz_dsttime;
@@ -44,9 +57,9 @@ static inline bool vclock_was_used(int vclock)
return READ_ONCE(vclocks_used) & (1 << vclock);
}
-static inline unsigned gtod_read_begin(const struct vsyscall_gtod_data *s)
+static inline unsigned int gtod_read_begin(const struct vsyscall_gtod_data *s)
{
- unsigned ret;
+ unsigned int ret;
repeat:
ret = READ_ONCE(s->seq);
@@ -59,7 +72,7 @@ repeat:
}
static inline int gtod_read_retry(const struct vsyscall_gtod_data *s,
- unsigned start)
+ unsigned int start)
{
smp_rmb();
return unlikely(s->seq != start);
@@ -77,30 +90,4 @@ static inline void gtod_write_end(struct vsyscall_gtod_data *s)
++s->seq;
}
-#ifdef CONFIG_X86_64
-
-#define VGETCPU_CPU_MASK 0xfff
-
-static inline unsigned int __getcpu(void)
-{
- unsigned int p;
-
- /*
- * Load per CPU data from GDT. LSL is faster than RDTSCP and
- * works on all CPUs. This is volatile so that it orders
- * correctly wrt barrier() and to keep gcc from cleverly
- * hoisting it out of the calling function.
- *
- * If RDPID is available, use it.
- */
- alternative_io ("lsl %[p],%[seg]",
- ".byte 0xf3,0x0f,0xc7,0xf8", /* RDPID %eax/rax */
- X86_FEATURE_RDPID,
- [p] "=a" (p), [seg] "r" (__PER_CPU_SEG));
-
- return p;
-}
-
-#endif /* CONFIG_X86_64 */
-
#endif /* _ASM_X86_VGTOD_H */
diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h
index 0116b2ee9e64..e05e0d309244 100644
--- a/arch/x86/include/asm/virtext.h
+++ b/arch/x86/include/asm/virtext.h
@@ -83,9 +83,10 @@ static inline void cpu_emergency_vmxoff(void)
*/
static inline int cpu_has_svm(const char **msg)
{
- if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) {
if (msg)
- *msg = "not amd";
+ *msg = "not amd or hygon";
return 0;
}
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index b85a7c54c6a1..0f842104862c 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -303,4 +303,6 @@ extern void x86_init_noop(void);
extern void x86_init_uint_noop(unsigned int unused);
extern bool x86_pnpbios_disabled(void);
+void x86_verify_bootdata_version(void);
+
#endif
diff --git a/arch/x86/include/asm/xen/events.h b/arch/x86/include/asm/xen/events.h
index d383140e1dc8..068d9b067c83 100644
--- a/arch/x86/include/asm/xen/events.h
+++ b/arch/x86/include/asm/xen/events.h
@@ -2,6 +2,8 @@
#ifndef _ASM_X86_XEN_EVENTS_H
#define _ASM_X86_XEN_EVENTS_H
+#include <xen/xen.h>
+
enum ipi_vector {
XEN_RESCHEDULE_VECTOR,
XEN_CALL_FUNCTION_VECTOR,
diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h
index a06cbf019744..22f89d040ddd 100644
--- a/arch/x86/include/uapi/asm/bootparam.h
+++ b/arch/x86/include/uapi/asm/bootparam.h
@@ -16,6 +16,9 @@
#define RAMDISK_PROMPT_FLAG 0x8000
#define RAMDISK_LOAD_FLAG 0x4000
+/* version flags */
+#define VERSION_WRITTEN 0x8000
+
/* loadflags */
#define LOADED_HIGH (1<<0)
#define KASLR_FLAG (1<<1)
@@ -86,6 +89,7 @@ struct setup_header {
__u64 pref_address;
__u32 init_size;
__u32 handover_offset;
+ __u64 acpi_rsdp_addr;
} __attribute__((packed));
struct sys_desc_table {
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 86299efa804a..fd23d5778ea1 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -377,6 +377,7 @@ struct kvm_sync_regs {
#define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0)
#define KVM_X86_QUIRK_CD_NW_CLEARED (1 << 1)
+#define KVM_X86_QUIRK_LAPIC_MMIO_HOLE (1 << 2)
#define KVM_STATE_NESTED_GUEST_MODE 0x00000001
#define KVM_STATE_NESTED_RUN_PENDING 0x00000002
diff --git a/arch/x86/include/uapi/asm/siginfo.h b/arch/x86/include/uapi/asm/siginfo.h
index b3d157957177..6642d8be40c4 100644
--- a/arch/x86/include/uapi/asm/siginfo.h
+++ b/arch/x86/include/uapi/asm/siginfo.h
@@ -7,8 +7,6 @@
typedef long long __kernel_si_clock_t __attribute__((aligned(4)));
# define __ARCH_SI_CLOCK_T __kernel_si_clock_t
# define __ARCH_SI_ATTRIBUTES __attribute__((aligned(8)))
-# else /* x86-64 */
-# define __ARCH_SI_PREAMBLE_SIZE (4 * sizeof(int))
# endif
#endif
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 3b20607d581b..e8fea7ffa306 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -48,6 +48,7 @@
#include <asm/mpspec.h>
#include <asm/smp.h>
#include <asm/i8259.h>
+#include <asm/setup.h>
#include "sleep.h" /* To include x86_acpi_suspend_lowlevel */
static int __initdata acpi_force = 0;
@@ -1771,3 +1772,8 @@ void __init arch_reserve_mem_area(acpi_physical_address addr, size_t size)
e820__range_add(addr, size, E820_TYPE_ACPI);
e820__update_table_print();
}
+
+u64 x86_default_get_root_pointer(void)
+{
+ return boot_params.hdr.acpi_rsdp_addr;
+}
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 014f214da581..ebeac487a20c 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -222,6 +222,10 @@ void __init arch_init_ideal_nops(void)
}
break;
+ case X86_VENDOR_HYGON:
+ ideal_nops = p6_nops;
+ return;
+
case X86_VENDOR_AMD:
if (boot_cpu_data.x86 > 0xf) {
ideal_nops = p6_nops;
@@ -594,7 +598,7 @@ void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
BUG_ON(p->len > MAX_PATCH_LEN);
/* prep the buffer with the original instructions */
memcpy(insnbuf, p->instr, p->len);
- used = pv_init_ops.patch(p->instrtype, p->clobbers, insnbuf,
+ used = pv_ops.init.patch(p->instrtype, insnbuf,
(unsigned long)p->instr, p->len);
BUG_ON(used > p->len);
@@ -684,8 +688,6 @@ void *__init_or_module text_poke_early(void *addr, const void *opcode,
* It means the size must be writable atomically and the address must be aligned
* in a way that permits an atomic write. It also makes sure we fit on a single
* page.
- *
- * Note: Must be called under text_mutex.
*/
void *text_poke(void *addr, const void *opcode, size_t len)
{
@@ -700,6 +702,8 @@ void *text_poke(void *addr, const void *opcode, size_t len)
*/
BUG_ON(!after_bootmem);
+ lockdep_assert_held(&text_mutex);
+
if (!core_kernel_text((unsigned long)addr)) {
pages[0] = vmalloc_to_page(addr);
pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
@@ -782,8 +786,6 @@ int poke_int3_handler(struct pt_regs *regs)
* - replace the first byte (int3) by the first byte of
* replacing opcode
* - sync cores
- *
- * Note: must be called under text_mutex.
*/
void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
{
@@ -792,6 +794,9 @@ void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
bp_int3_handler = handler;
bp_int3_addr = (u8 *)addr + sizeof(int3);
bp_patching_in_progress = true;
+
+ lockdep_assert_held(&text_mutex);
+
/*
* Corresponding read barrier in int3 notifier for making sure the
* in_progress and handler are correctly ordered wrt. patching.
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index f299d8a479bb..3f9d1b4019bb 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -482,7 +482,7 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr,
{
void *vaddr;
- vaddr = dma_direct_alloc(dev, size, dma_addr, flag, attrs);
+ vaddr = dma_direct_alloc_pages(dev, size, dma_addr, flag, attrs);
if (!vaddr ||
!force_iommu || dev->coherent_dma_mask <= DMA_BIT_MASK(24))
return vaddr;
@@ -494,7 +494,7 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr,
goto out_free;
return vaddr;
out_free:
- dma_direct_free(dev, size, vaddr, *dma_addr, attrs);
+ dma_direct_free_pages(dev, size, vaddr, *dma_addr, attrs);
return NULL;
}
@@ -504,7 +504,7 @@ gart_free_coherent(struct device *dev, size_t size, void *vaddr,
dma_addr_t dma_addr, unsigned long attrs)
{
gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, 0);
- dma_direct_free(dev, size, vaddr, dma_addr, attrs);
+ dma_direct_free_pages(dev, size, vaddr, dma_addr, attrs);
}
static int gart_mapping_error(struct device *dev, dma_addr_t dma_addr)
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index b481b95bd8f6..a6eca647bc76 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -61,6 +61,21 @@ static const struct pci_device_id amd_nb_link_ids[] = {
{}
};
+static const struct pci_device_id hygon_root_ids[] = {
+ { PCI_DEVICE(PCI_VENDOR_ID_HYGON, PCI_DEVICE_ID_AMD_17H_ROOT) },
+ {}
+};
+
+const struct pci_device_id hygon_nb_misc_ids[] = {
+ { PCI_DEVICE(PCI_VENDOR_ID_HYGON, PCI_DEVICE_ID_AMD_17H_DF_F3) },
+ {}
+};
+
+static const struct pci_device_id hygon_nb_link_ids[] = {
+ { PCI_DEVICE(PCI_VENDOR_ID_HYGON, PCI_DEVICE_ID_AMD_17H_DF_F4) },
+ {}
+};
+
const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[] __initconst = {
{ 0x00, 0x18, 0x20 },
{ 0xff, 0x00, 0x20 },
@@ -194,15 +209,24 @@ EXPORT_SYMBOL_GPL(amd_df_indirect_read);
int amd_cache_northbridges(void)
{
- u16 i = 0;
- struct amd_northbridge *nb;
+ const struct pci_device_id *misc_ids = amd_nb_misc_ids;
+ const struct pci_device_id *link_ids = amd_nb_link_ids;
+ const struct pci_device_id *root_ids = amd_root_ids;
struct pci_dev *root, *misc, *link;
+ struct amd_northbridge *nb;
+ u16 i = 0;
if (amd_northbridges.num)
return 0;
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) {
+ root_ids = hygon_root_ids;
+ misc_ids = hygon_nb_misc_ids;
+ link_ids = hygon_nb_link_ids;
+ }
+
misc = NULL;
- while ((misc = next_northbridge(misc, amd_nb_misc_ids)) != NULL)
+ while ((misc = next_northbridge(misc, misc_ids)) != NULL)
i++;
if (!i)
@@ -218,11 +242,11 @@ int amd_cache_northbridges(void)
link = misc = root = NULL;
for (i = 0; i != amd_northbridges.num; i++) {
node_to_amd_nb(i)->root = root =
- next_northbridge(root, amd_root_ids);
+ next_northbridge(root, root_ids);
node_to_amd_nb(i)->misc = misc =
- next_northbridge(misc, amd_nb_misc_ids);
+ next_northbridge(misc, misc_ids);
node_to_amd_nb(i)->link = link =
- next_northbridge(link, amd_nb_link_ids);
+ next_northbridge(link, link_ids);
}
if (amd_gart_present())
@@ -261,11 +285,19 @@ EXPORT_SYMBOL_GPL(amd_cache_northbridges);
*/
bool __init early_is_amd_nb(u32 device)
{
+ const struct pci_device_id *misc_ids = amd_nb_misc_ids;
const struct pci_device_id *id;
u32 vendor = device & 0xffff;
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
+ return false;
+
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
+ misc_ids = hygon_nb_misc_ids;
+
device >>= 16;
- for (id = amd_nb_misc_ids; id->vendor; id++)
+ for (id = misc_ids; id->vendor; id++)
if (vendor == id->vendor && device == id->device)
return true;
return false;
@@ -277,7 +309,8 @@ struct resource *amd_get_mmconfig_range(struct resource *res)
u64 base, msr;
unsigned int segn_busn_bits;
- if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
return NULL;
/* assume all cpus from fam10h have mmconfig */
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 84132eddb5a8..ab731ab09f06 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -224,6 +224,11 @@ static int modern_apic(void)
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
boot_cpu_data.x86 >= 0xf)
return 1;
+
+ /* Hygon systems use modern APIC */
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
+ return 1;
+
return lapic_get_version() >= 0x14;
}
@@ -1912,6 +1917,8 @@ static int __init detect_init_APIC(void)
(boot_cpu_data.x86 >= 15))
break;
goto no_apic;
+ case X86_VENDOR_HYGON:
+ break;
case X86_VENDOR_INTEL:
if (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15 ||
(boot_cpu_data.x86 == 5 && boot_cpu_has(X86_FEATURE_APIC)))
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 02e8acb134f8..47ff2976c292 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -185,6 +185,7 @@ void __init default_setup_apic_routing(void)
break;
}
/* If P4 and above fall through */
+ case X86_VENDOR_HYGON:
case X86_VENDOR_AMD:
def_to_bigsmp = 1;
}
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 9f148e3d45b4..652e7ffa9b9d 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -313,14 +313,13 @@ assign_managed_vector(struct irq_data *irqd, const struct cpumask *dest)
struct apic_chip_data *apicd = apic_chip_data(irqd);
int vector, cpu;
- cpumask_and(vector_searchmask, vector_searchmask, affmsk);
- cpu = cpumask_first(vector_searchmask);
- if (cpu >= nr_cpu_ids)
- return -EINVAL;
+ cpumask_and(vector_searchmask, dest, affmsk);
+
/* set_affinity might call here for nothing */
if (apicd->vector && cpumask_test_cpu(apicd->cpu, vector_searchmask))
return 0;
- vector = irq_matrix_alloc_managed(vector_matrix, cpu);
+ vector = irq_matrix_alloc_managed(vector_matrix, vector_searchmask,
+ &cpu);
trace_vector_alloc_managed(irqd->irq, vector, vector);
if (vector < 0)
return vector;
@@ -413,7 +412,7 @@ static int activate_managed(struct irq_data *irqd)
if (WARN_ON_ONCE(cpumask_empty(vector_searchmask))) {
/* Something in the core code broke! Survive gracefully */
pr_err("Managed startup for irq %u, but no CPU\n", irqd->irq);
- return EINVAL;
+ return -EINVAL;
}
ret = assign_managed_vector(irqd, vector_searchmask);
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index ec00d1ff5098..f7151cd03cb0 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -1640,6 +1640,7 @@ static int do_open(struct inode *inode, struct file *filp)
return 0;
}
+#ifdef CONFIG_PROC_FS
static int proc_apm_show(struct seq_file *m, void *v)
{
unsigned short bx;
@@ -1719,6 +1720,7 @@ static int proc_apm_show(struct seq_file *m, void *v)
units);
return 0;
}
+#endif
static int apm(void *unused)
{
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 01de31db300d..72adf6c335dc 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -64,15 +64,12 @@ void common(void) {
OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext);
#endif
-#ifdef CONFIG_PARAVIRT
+#ifdef CONFIG_PARAVIRT_XXL
BLANK();
- OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops);
- OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
- OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
- OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
- OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
- OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
- OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2);
+ OFFSET(PV_IRQ_irq_disable, paravirt_patch_template, irq.irq_disable);
+ OFFSET(PV_IRQ_irq_enable, paravirt_patch_template, irq.irq_enable);
+ OFFSET(PV_CPU_iret, paravirt_patch_template, cpu.iret);
+ OFFSET(PV_MMU_read_cr2, paravirt_patch_template, mmu.read_cr2);
#endif
#ifdef CONFIG_XEN
@@ -99,13 +96,12 @@ void common(void) {
OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask);
/* Layout info for cpu_entry_area */
- OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
- OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
OFFSET(CPU_ENTRY_AREA_entry_stack, cpu_entry_area, entry_stack_page);
DEFINE(SIZEOF_entry_stack, sizeof(struct entry_stack));
DEFINE(MASK_entry_stack, (~(sizeof(struct entry_stack) - 1)));
- /* Offset for sp0 and sp1 into the tss_struct */
+ /* Offset for fields in tss_struct */
OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);
+ OFFSET(TSS_sp2, tss_struct, x86_tss.sp2);
}
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 3b9405e7ba2b..ddced33184b5 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -21,10 +21,13 @@ static char syscalls_ia32[] = {
int main(void)
{
#ifdef CONFIG_PARAVIRT
- OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64);
- OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
+#ifdef CONFIG_PARAVIRT_XXL
+ OFFSET(PV_CPU_usergs_sysret64, paravirt_patch_template,
+ cpu.usergs_sysret64);
+ OFFSET(PV_CPU_swapgs, paravirt_patch_template, cpu.swapgs);
#ifdef CONFIG_DEBUG_ENTRY
- OFFSET(PV_IRQ_save_fl, pv_irq_ops, save_fl);
+ OFFSET(PV_IRQ_save_fl, paravirt_patch_template, irq.save_fl);
+#endif
#endif
BLANK();
#endif
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index 33399426793e..1979a76bfadd 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -1,4 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/kthread.h>
@@ -31,11 +34,17 @@ static __init int set_corruption_check(char *arg)
ssize_t ret;
unsigned long val;
+ if (!arg) {
+ pr_err("memory_corruption_check config string not provided\n");
+ return -EINVAL;
+ }
+
ret = kstrtoul(arg, 10, &val);
if (ret)
return ret;
memory_corruption_check = val;
+
return 0;
}
early_param("memory_corruption_check", set_corruption_check);
@@ -45,6 +54,11 @@ static __init int set_corruption_check_period(char *arg)
ssize_t ret;
unsigned long val;
+ if (!arg) {
+ pr_err("memory_corruption_check_period config string not provided\n");
+ return -EINVAL;
+ }
+
ret = kstrtoul(arg, 10, &val);
if (ret)
return ret;
@@ -59,6 +73,11 @@ static __init int set_corruption_check_size(char *arg)
char *end;
unsigned size;
+ if (!arg) {
+ pr_err("memory_corruption_check_size config string not provided\n");
+ return -EINVAL;
+ }
+
size = memparse(arg, &end);
if (*end == '\0')
@@ -113,7 +132,7 @@ void __init setup_bios_corruption_check(void)
}
if (num_scan_areas)
- printk(KERN_INFO "Scanning %d areas for low memory corruption\n", num_scan_areas);
+ pr_info("Scanning %d areas for low memory corruption\n", num_scan_areas);
}
@@ -132,8 +151,7 @@ void check_for_bios_corruption(void)
for (; size; addr++, size -= sizeof(unsigned long)) {
if (!*addr)
continue;
- printk(KERN_ERR "Corrupted low memory at %p (%lx phys) = %08lx\n",
- addr, __pa(addr), *addr);
+ pr_err("Corrupted low memory at %p (%lx phys) = %08lx\n", addr, __pa(addr), *addr);
corruption = 1;
*addr = 0;
}
@@ -157,11 +175,11 @@ static int start_periodic_check_for_corruption(void)
if (!num_scan_areas || !memory_corruption_check || corruption_check_period == 0)
return 0;
- printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n",
- corruption_check_period);
+ pr_info("Scanning for low memory corruption every %d seconds\n", corruption_check_period);
/* First time we run the checks right away */
schedule_delayed_work(&bios_check_work, 0);
+
return 0;
}
device_initcall(start_periodic_check_for_corruption);
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 347137e80bf5..1f5d2291c31e 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -30,6 +30,7 @@ obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
obj-$(CONFIG_CPU_SUP_INTEL) += intel.o intel_pconfig.o
obj-$(CONFIG_CPU_SUP_AMD) += amd.o
+obj-$(CONFIG_CPU_SUP_HYGON) += hygon.o
obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o
obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 22ab408177b2..eeea634bee0a 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -922,7 +922,7 @@ static void init_amd(struct cpuinfo_x86 *c)
static unsigned int amd_size_cache(struct cpuinfo_x86 *c, unsigned int size)
{
/* AMD errata T13 (order #21922) */
- if ((c->x86 == 6)) {
+ if (c->x86 == 6) {
/* Duron Rev A0 */
if (c->x86_model == 3 && c->x86_stepping == 0)
size = 64;
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 4c2313d0b9ca..c37e66e493bf 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -35,12 +35,10 @@ static void __init spectre_v2_select_mitigation(void);
static void __init ssb_select_mitigation(void);
static void __init l1tf_select_mitigation(void);
-/*
- * Our boot-time value of the SPEC_CTRL MSR. We read it once so that any
- * writes to SPEC_CTRL contain whatever reserved bits have been set.
- */
-u64 __ro_after_init x86_spec_ctrl_base;
+/* The base value of the SPEC_CTRL MSR that always has to be preserved. */
+u64 x86_spec_ctrl_base;
EXPORT_SYMBOL_GPL(x86_spec_ctrl_base);
+static DEFINE_MUTEX(spec_ctrl_mutex);
/*
* The vendor and possibly platform specific bits which can be modified in
@@ -312,6 +310,7 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
}
if (cmd == SPECTRE_V2_CMD_RETPOLINE_AMD &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_HYGON &&
boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
pr_err("retpoline,amd selected but CPU is not AMD. Switching to AUTO select\n");
return SPECTRE_V2_CMD_AUTO;
@@ -325,6 +324,46 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
return cmd;
}
+static bool stibp_needed(void)
+{
+ if (spectre_v2_enabled == SPECTRE_V2_NONE)
+ return false;
+
+ if (!boot_cpu_has(X86_FEATURE_STIBP))
+ return false;
+
+ return true;
+}
+
+static void update_stibp_msr(void *info)
+{
+ wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
+}
+
+void arch_smt_update(void)
+{
+ u64 mask;
+
+ if (!stibp_needed())
+ return;
+
+ mutex_lock(&spec_ctrl_mutex);
+ mask = x86_spec_ctrl_base;
+ if (cpu_smt_control == CPU_SMT_ENABLED)
+ mask |= SPEC_CTRL_STIBP;
+ else
+ mask &= ~SPEC_CTRL_STIBP;
+
+ if (mask != x86_spec_ctrl_base) {
+ pr_info("Spectre v2 cross-process SMT mitigation: %s STIBP\n",
+ cpu_smt_control == CPU_SMT_ENABLED ?
+ "Enabling" : "Disabling");
+ x86_spec_ctrl_base = mask;
+ on_each_cpu(update_stibp_msr, NULL, 1);
+ }
+ mutex_unlock(&spec_ctrl_mutex);
+}
+
static void __init spectre_v2_select_mitigation(void)
{
enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline();
@@ -371,7 +410,8 @@ static void __init spectre_v2_select_mitigation(void)
return;
retpoline_auto:
- if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
+ boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) {
retpoline_amd:
if (!boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) {
pr_err("Spectre mitigation: LFENCE not serializing, switching to generic retpoline\n");
@@ -424,6 +464,9 @@ specv2_set_mode:
setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW);
pr_info("Enabling Restricted Speculation for firmware calls\n");
}
+
+ /* Enable STIBP if appropriate */
+ arch_smt_update();
}
#undef pr_fmt
@@ -668,6 +711,45 @@ EXPORT_SYMBOL_GPL(l1tf_mitigation);
enum vmx_l1d_flush_state l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
EXPORT_SYMBOL_GPL(l1tf_vmx_mitigation);
+/*
+ * These CPUs all support 44bits physical address space internally in the
+ * cache but CPUID can report a smaller number of physical address bits.
+ *
+ * The L1TF mitigation uses the top most address bit for the inversion of
+ * non present PTEs. When the installed memory reaches into the top most
+ * address bit due to memory holes, which has been observed on machines
+ * which report 36bits physical address bits and have 32G RAM installed,
+ * then the mitigation range check in l1tf_select_mitigation() triggers.
+ * This is a false positive because the mitigation is still possible due to
+ * the fact that the cache uses 44bit internally. Use the cache bits
+ * instead of the reported physical bits and adjust them on the affected
+ * machines to 44bit if the reported bits are less than 44.
+ */
+static void override_cache_bits(struct cpuinfo_x86 *c)
+{
+ if (c->x86 != 6)
+ return;
+
+ switch (c->x86_model) {
+ case INTEL_FAM6_NEHALEM:
+ case INTEL_FAM6_WESTMERE:
+ case INTEL_FAM6_SANDYBRIDGE:
+ case INTEL_FAM6_IVYBRIDGE:
+ case INTEL_FAM6_HASWELL_CORE:
+ case INTEL_FAM6_HASWELL_ULT:
+ case INTEL_FAM6_HASWELL_GT3E:
+ case INTEL_FAM6_BROADWELL_CORE:
+ case INTEL_FAM6_BROADWELL_GT3E:
+ case INTEL_FAM6_SKYLAKE_MOBILE:
+ case INTEL_FAM6_SKYLAKE_DESKTOP:
+ case INTEL_FAM6_KABYLAKE_MOBILE:
+ case INTEL_FAM6_KABYLAKE_DESKTOP:
+ if (c->x86_cache_bits < 44)
+ c->x86_cache_bits = 44;
+ break;
+ }
+}
+
static void __init l1tf_select_mitigation(void)
{
u64 half_pa;
@@ -675,6 +757,8 @@ static void __init l1tf_select_mitigation(void)
if (!boot_cpu_has_bug(X86_BUG_L1TF))
return;
+ override_cache_bits(&boot_cpu_data);
+
switch (l1tf_mitigation) {
case L1TF_MITIGATION_OFF:
case L1TF_MITIGATION_FLUSH_NOWARN:
@@ -694,11 +778,6 @@ static void __init l1tf_select_mitigation(void)
return;
#endif
- /*
- * This is extremely unlikely to happen because almost all
- * systems have far more MAX_PA/2 than RAM can be fit into
- * DIMM slots.
- */
half_pa = (u64)l1tf_pfn_limit() << PAGE_SHIFT;
if (e820__mapped_any(half_pa, ULLONG_MAX - half_pa, E820_TYPE_RAM)) {
pr_warn("System has more than MAX_PA/2 memory. L1TF mitigation not effective.\n");
@@ -778,6 +857,8 @@ static ssize_t l1tf_show_state(char *buf)
static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
char *buf, unsigned int bug)
{
+ int ret;
+
if (!boot_cpu_has_bug(bug))
return sprintf(buf, "Not affected\n");
@@ -795,10 +876,13 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
return sprintf(buf, "Mitigation: __user pointer sanitization\n");
case X86_BUG_SPECTRE_V2:
- return sprintf(buf, "%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled],
+ ret = sprintf(buf, "%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled],
boot_cpu_has(X86_FEATURE_USE_IBPB) ? ", IBPB" : "",
boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "",
+ (x86_spec_ctrl_base & SPEC_CTRL_STIBP) ? ", STIBP" : "",
+ boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "",
spectre_v2_module_string());
+ return ret;
case X86_BUG_SPEC_STORE_BYPASS:
return sprintf(buf, "%s\n", ssb_strings[ssb_mode]);
diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c
index 0c5fcbd998cf..dc1b9342e9c4 100644
--- a/arch/x86/kernel/cpu/cacheinfo.c
+++ b/arch/x86/kernel/cpu/cacheinfo.c
@@ -602,6 +602,10 @@ cpuid4_cache_lookup_regs(int index, struct _cpuid4_info_regs *this_leaf)
else
amd_cpuid4(index, &eax, &ebx, &ecx);
amd_init_l3_cache(this_leaf, index);
+ } else if (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) {
+ cpuid_count(0x8000001d, index, &eax.full,
+ &ebx.full, &ecx.full, &edx);
+ amd_init_l3_cache(this_leaf, index);
} else {
cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
}
@@ -625,7 +629,8 @@ static int find_num_cache_leaves(struct cpuinfo_x86 *c)
union _cpuid4_leaf_eax cache_eax;
int i = -1;
- if (c->x86_vendor == X86_VENDOR_AMD)
+ if (c->x86_vendor == X86_VENDOR_AMD ||
+ c->x86_vendor == X86_VENDOR_HYGON)
op = 0x8000001d;
else
op = 4;
@@ -678,6 +683,22 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id)
}
}
+void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id)
+{
+ /*
+ * We may have multiple LLCs if L3 caches exist, so check if we
+ * have an L3 cache by looking at the L3 cache CPUID leaf.
+ */
+ if (!cpuid_edx(0x80000006))
+ return;
+
+ /*
+ * LLC is at the core complex level.
+ * Core complex ID is ApicId[3] for these processors.
+ */
+ per_cpu(cpu_llc_id, cpu) = c->apicid >> 3;
+}
+
void init_amd_cacheinfo(struct cpuinfo_x86 *c)
{
@@ -691,6 +712,11 @@ void init_amd_cacheinfo(struct cpuinfo_x86 *c)
}
}
+void init_hygon_cacheinfo(struct cpuinfo_x86 *c)
+{
+ num_cache_leaves = find_num_cache_leaves(c);
+}
+
void init_intel_cacheinfo(struct cpuinfo_x86 *c)
{
/* Cache sizes */
@@ -913,7 +939,8 @@ static void __cache_cpumap_setup(unsigned int cpu, int index,
int index_msb, i;
struct cpuinfo_x86 *c = &cpu_data(cpu);
- if (c->x86_vendor == X86_VENDOR_AMD) {
+ if (c->x86_vendor == X86_VENDOR_AMD ||
+ c->x86_vendor == X86_VENDOR_HYGON) {
if (__cache_amd_cpumap_setup(cpu, index, base))
return;
}
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 84dee5ab745a..660d0b22e962 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -919,6 +919,7 @@ void get_cpu_address_sizes(struct cpuinfo_x86 *c)
else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36))
c->x86_phys_bits = 36;
#endif
+ c->x86_cache_bits = c->x86_phys_bits;
}
static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
@@ -948,11 +949,11 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
}
static const __initconst struct x86_cpu_id cpu_no_speculation[] = {
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CEDARVIEW, X86_FEATURE_ANY },
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CLOVERVIEW, X86_FEATURE_ANY },
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_LINCROFT, X86_FEATURE_ANY },
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PENWELL, X86_FEATURE_ANY },
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PINEVIEW, X86_FEATURE_ANY },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SALTWELL, X86_FEATURE_ANY },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SALTWELL_TABLET, X86_FEATURE_ANY },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_BONNELL_MID, X86_FEATURE_ANY },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SALTWELL_MID, X86_FEATURE_ANY },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_BONNELL, X86_FEATURE_ANY },
{ X86_VENDOR_CENTAUR, 5 },
{ X86_VENDOR_INTEL, 5 },
{ X86_VENDOR_NSC, 5 },
@@ -962,15 +963,16 @@ static const __initconst struct x86_cpu_id cpu_no_speculation[] = {
static const __initconst struct x86_cpu_id cpu_no_meltdown[] = {
{ X86_VENDOR_AMD },
+ { X86_VENDOR_HYGON },
{}
};
/* Only list CPUs which speculate but are non susceptible to SSB */
static const __initconst struct x86_cpu_id cpu_no_spec_store_bypass[] = {
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1 },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT },
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT },
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT2 },
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MERRIFIELD },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT_X },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT_MID },
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_CORE_YONAH },
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNL },
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNM },
@@ -983,14 +985,14 @@ static const __initconst struct x86_cpu_id cpu_no_spec_store_bypass[] = {
static const __initconst struct x86_cpu_id cpu_no_l1tf[] = {
/* in addition to cpu_no_speculation */
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1 },
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT2 },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT_X },
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT },
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MERRIFIELD },
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MOOREFIELD },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT_MID },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT_MID },
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GOLDMONT },
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_DENVERTON },
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GEMINI_LAKE },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GOLDMONT_X },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GOLDMONT_PLUS },
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNL },
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNM },
{}
@@ -1075,6 +1077,9 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
memset(&c->x86_capability, 0, sizeof c->x86_capability);
c->extended_cpuid_level = 0;
+ if (!have_cpuid_p())
+ identify_cpu_without_cpuid(c);
+
/* cyrix could have cpuid enabled via c_identify()*/
if (have_cpuid_p()) {
cpu_detect(c);
@@ -1092,7 +1097,6 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
if (this_cpu->c_bsp_init)
this_cpu->c_bsp_init(c);
} else {
- identify_cpu_without_cpuid(c);
setup_clear_cpu_cap(X86_FEATURE_CPUID);
}
@@ -1239,10 +1243,10 @@ static void generic_identify(struct cpuinfo_x86 *c)
* ESPFIX issue, we can change this.
*/
#ifdef CONFIG_X86_32
-# ifdef CONFIG_PARAVIRT
+# ifdef CONFIG_PARAVIRT_XXL
do {
extern void native_iret(void);
- if (pv_cpu_ops.iret == native_iret)
+ if (pv_ops.cpu.iret == native_iret)
set_cpu_bug(c, X86_BUG_ESPFIX);
} while (0);
# else
@@ -1530,19 +1534,8 @@ EXPORT_PER_CPU_SYMBOL(__preempt_count);
/* May not be marked __init: used by software suspend */
void syscall_init(void)
{
- extern char _entry_trampoline[];
- extern char entry_SYSCALL_64_trampoline[];
-
- int cpu = smp_processor_id();
- unsigned long SYSCALL64_entry_trampoline =
- (unsigned long)get_cpu_entry_area(cpu)->entry_trampoline +
- (entry_SYSCALL_64_trampoline - _entry_trampoline);
-
wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
- if (static_cpu_has(X86_FEATURE_PTI))
- wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
- else
- wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
+ wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
#ifdef CONFIG_IA32_EMULATION
wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
@@ -1553,7 +1546,8 @@ void syscall_init(void)
* AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
*/
wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
- wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1));
+ wrmsrl_safe(MSR_IA32_SYSENTER_ESP,
+ (unsigned long)(cpu_entry_stack(smp_processor_id()) + 1));
wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
#else
wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);
@@ -1668,6 +1662,29 @@ static void wait_for_master_cpu(int cpu)
#endif
}
+#ifdef CONFIG_X86_64
+static void setup_getcpu(int cpu)
+{
+ unsigned long cpudata = vdso_encode_cpunode(cpu, early_cpu_to_node(cpu));
+ struct desc_struct d = { };
+
+ if (static_cpu_has(X86_FEATURE_RDTSCP))
+ write_rdtscp_aux(cpudata);
+
+ /* Store CPU and node number in limit. */
+ d.limit0 = cpudata;
+ d.limit1 = cpudata >> 16;
+
+ d.type = 5; /* RO data, expand down, accessed */
+ d.dpl = 3; /* Visible to user code */
+ d.s = 1; /* Not a system segment */
+ d.p = 1; /* Present */
+ d.d = 1; /* 32-bit */
+
+ write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_CPUNODE, &d, DESCTYPE_S);
+}
+#endif
+
/*
* cpu_init() initializes state that is per-CPU. Some data is already
* initialized (naturally) in the bootstrap process, such as the GDT
@@ -1705,6 +1722,7 @@ void cpu_init(void)
early_cpu_to_node(cpu) != NUMA_NO_NODE)
set_numa_node(early_cpu_to_node(cpu));
#endif
+ setup_getcpu(cpu);
me = current;
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 7b229afa0a37..da5446acc241 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -54,6 +54,7 @@ extern u32 get_scattered_cpuid_leaf(unsigned int level,
enum cpuid_regs_idx reg);
extern void init_intel_cacheinfo(struct cpuinfo_x86 *c);
extern void init_amd_cacheinfo(struct cpuinfo_x86 *c);
+extern void init_hygon_cacheinfo(struct cpuinfo_x86 *c);
extern void detect_num_cpu_cores(struct cpuinfo_x86 *c);
extern int detect_extended_topology_early(struct cpuinfo_x86 *c);
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index 8949b7ae6d92..d12226f60168 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -437,7 +437,7 @@ static void cyrix_identify(struct cpuinfo_x86 *c)
/* enable MAPEN */
setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10);
/* enable cpuid */
- setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x80);
+ setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x80);
/* disable MAPEN */
setCx86(CX86_CCR3, ccr3);
local_irq_restore(flags);
diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c
new file mode 100644
index 000000000000..cf25405444ab
--- /dev/null
+++ b/arch/x86/kernel/cpu/hygon.c
@@ -0,0 +1,408 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Hygon Processor Support for Linux
+ *
+ * Copyright (C) 2018 Chengdu Haiguang IC Design Co., Ltd.
+ *
+ * Author: Pu Wen <puwen@hygon.cn>
+ */
+#include <linux/io.h>
+
+#include <asm/cpu.h>
+#include <asm/smp.h>
+#include <asm/cacheinfo.h>
+#include <asm/spec-ctrl.h>
+#include <asm/delay.h>
+#ifdef CONFIG_X86_64
+# include <asm/set_memory.h>
+#endif
+
+#include "cpu.h"
+
+/*
+ * nodes_per_socket: Stores the number of nodes per socket.
+ * Refer to CPUID Fn8000_001E_ECX Node Identifiers[10:8]
+ */
+static u32 nodes_per_socket = 1;
+
+#ifdef CONFIG_NUMA
+/*
+ * To workaround broken NUMA config. Read the comment in
+ * srat_detect_node().
+ */
+static int nearby_node(int apicid)
+{
+ int i, node;
+
+ for (i = apicid - 1; i >= 0; i--) {
+ node = __apicid_to_node[i];
+ if (node != NUMA_NO_NODE && node_online(node))
+ return node;
+ }
+ for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
+ node = __apicid_to_node[i];
+ if (node != NUMA_NO_NODE && node_online(node))
+ return node;
+ }
+ return first_node(node_online_map); /* Shouldn't happen */
+}
+#endif
+
+static void hygon_get_topology_early(struct cpuinfo_x86 *c)
+{
+ if (cpu_has(c, X86_FEATURE_TOPOEXT))
+ smp_num_siblings = ((cpuid_ebx(0x8000001e) >> 8) & 0xff) + 1;
+}
+
+/*
+ * Fixup core topology information for
+ * (1) Hygon multi-node processors
+ * Assumption: Number of cores in each internal node is the same.
+ * (2) Hygon processors supporting compute units
+ */
+static void hygon_get_topology(struct cpuinfo_x86 *c)
+{
+ u8 node_id;
+ int cpu = smp_processor_id();
+
+ /* get information required for multi-node processors */
+ if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
+ int err;
+ u32 eax, ebx, ecx, edx;
+
+ cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
+
+ node_id = ecx & 0xff;
+
+ c->cpu_core_id = ebx & 0xff;
+
+ if (smp_num_siblings > 1)
+ c->x86_max_cores /= smp_num_siblings;
+
+ /*
+ * In case leaf B is available, use it to derive
+ * topology information.
+ */
+ err = detect_extended_topology(c);
+ if (!err)
+ c->x86_coreid_bits = get_count_order(c->x86_max_cores);
+
+ cacheinfo_hygon_init_llc_id(c, cpu, node_id);
+ } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) {
+ u64 value;
+
+ rdmsrl(MSR_FAM10H_NODE_ID, value);
+ node_id = value & 7;
+
+ per_cpu(cpu_llc_id, cpu) = node_id;
+ } else
+ return;
+
+ if (nodes_per_socket > 1)
+ set_cpu_cap(c, X86_FEATURE_AMD_DCM);
+}
+
+/*
+ * On Hygon setup the lower bits of the APIC id distinguish the cores.
+ * Assumes number of cores is a power of two.
+ */
+static void hygon_detect_cmp(struct cpuinfo_x86 *c)
+{
+ unsigned int bits;
+ int cpu = smp_processor_id();
+
+ bits = c->x86_coreid_bits;
+ /* Low order bits define the core id (index of core in socket) */
+ c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
+ /* Convert the initial APIC ID into the socket ID */
+ c->phys_proc_id = c->initial_apicid >> bits;
+ /* use socket ID also for last level cache */
+ per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
+}
+
+static void srat_detect_node(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_NUMA
+ int cpu = smp_processor_id();
+ int node;
+ unsigned int apicid = c->apicid;
+
+ node = numa_cpu_node(cpu);
+ if (node == NUMA_NO_NODE)
+ node = per_cpu(cpu_llc_id, cpu);
+
+ /*
+ * On multi-fabric platform (e.g. Numascale NumaChip) a
+ * platform-specific handler needs to be called to fixup some
+ * IDs of the CPU.
+ */
+ if (x86_cpuinit.fixup_cpu_id)
+ x86_cpuinit.fixup_cpu_id(c, node);
+
+ if (!node_online(node)) {
+ /*
+ * Two possibilities here:
+ *
+ * - The CPU is missing memory and no node was created. In
+ * that case try picking one from a nearby CPU.
+ *
+ * - The APIC IDs differ from the HyperTransport node IDs.
+ * Assume they are all increased by a constant offset, but
+ * in the same order as the HT nodeids. If that doesn't
+ * result in a usable node fall back to the path for the
+ * previous case.
+ *
+ * This workaround operates directly on the mapping between
+ * APIC ID and NUMA node, assuming certain relationship
+ * between APIC ID, HT node ID and NUMA topology. As going
+ * through CPU mapping may alter the outcome, directly
+ * access __apicid_to_node[].
+ */
+ int ht_nodeid = c->initial_apicid;
+
+ if (__apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
+ node = __apicid_to_node[ht_nodeid];
+ /* Pick a nearby node */
+ if (!node_online(node))
+ node = nearby_node(apicid);
+ }
+ numa_set_node(cpu, node);
+#endif
+}
+
+static void early_init_hygon_mc(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+ unsigned int bits, ecx;
+
+ /* Multi core CPU? */
+ if (c->extended_cpuid_level < 0x80000008)
+ return;
+
+ ecx = cpuid_ecx(0x80000008);
+
+ c->x86_max_cores = (ecx & 0xff) + 1;
+
+ /* CPU telling us the core id bits shift? */
+ bits = (ecx >> 12) & 0xF;
+
+ /* Otherwise recompute */
+ if (bits == 0) {
+ while ((1 << bits) < c->x86_max_cores)
+ bits++;
+ }
+
+ c->x86_coreid_bits = bits;
+#endif
+}
+
+static void bsp_init_hygon(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_X86_64
+ unsigned long long tseg;
+
+ /*
+ * Split up direct mapping around the TSEG SMM area.
+ * Don't do it for gbpages because there seems very little
+ * benefit in doing so.
+ */
+ if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
+ unsigned long pfn = tseg >> PAGE_SHIFT;
+
+ pr_debug("tseg: %010llx\n", tseg);
+ if (pfn_range_is_mapped(pfn, pfn + 1))
+ set_memory_4k((unsigned long)__va(tseg), 1);
+ }
+#endif
+
+ if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) {
+ u64 val;
+
+ rdmsrl(MSR_K7_HWCR, val);
+ if (!(val & BIT(24)))
+ pr_warn(FW_BUG "TSC doesn't count with P0 frequency!\n");
+ }
+
+ if (cpu_has(c, X86_FEATURE_MWAITX))
+ use_mwaitx_delay();
+
+ if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
+ u32 ecx;
+
+ ecx = cpuid_ecx(0x8000001e);
+ nodes_per_socket = ((ecx >> 8) & 7) + 1;
+ } else if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) {
+ u64 value;
+
+ rdmsrl(MSR_FAM10H_NODE_ID, value);
+ nodes_per_socket = ((value >> 3) & 7) + 1;
+ }
+
+ if (!boot_cpu_has(X86_FEATURE_AMD_SSBD) &&
+ !boot_cpu_has(X86_FEATURE_VIRT_SSBD)) {
+ /*
+ * Try to cache the base value so further operations can
+ * avoid RMW. If that faults, do not enable SSBD.
+ */
+ if (!rdmsrl_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) {
+ setup_force_cpu_cap(X86_FEATURE_LS_CFG_SSBD);
+ setup_force_cpu_cap(X86_FEATURE_SSBD);
+ x86_amd_ls_cfg_ssbd_mask = 1ULL << 10;
+ }
+ }
+}
+
+static void early_init_hygon(struct cpuinfo_x86 *c)
+{
+ u32 dummy;
+
+ early_init_hygon_mc(c);
+
+ set_cpu_cap(c, X86_FEATURE_K8);
+
+ rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
+
+ /*
+ * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate
+ * with P/T states and does not stop in deep C-states
+ */
+ if (c->x86_power & (1 << 8)) {
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+ set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
+ }
+
+ /* Bit 12 of 8000_0007 edx is accumulated power mechanism. */
+ if (c->x86_power & BIT(12))
+ set_cpu_cap(c, X86_FEATURE_ACC_POWER);
+
+#ifdef CONFIG_X86_64
+ set_cpu_cap(c, X86_FEATURE_SYSCALL32);
+#endif
+
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI)
+ /*
+ * ApicID can always be treated as an 8-bit value for Hygon APIC So, we
+ * can safely set X86_FEATURE_EXTD_APICID unconditionally.
+ */
+ if (boot_cpu_has(X86_FEATURE_APIC))
+ set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
+#endif
+
+ /*
+ * This is only needed to tell the kernel whether to use VMCALL
+ * and VMMCALL. VMMCALL is never executed except under virt, so
+ * we can set it unconditionally.
+ */
+ set_cpu_cap(c, X86_FEATURE_VMMCALL);
+
+ hygon_get_topology_early(c);
+}
+
+static void init_hygon(struct cpuinfo_x86 *c)
+{
+ early_init_hygon(c);
+
+ /*
+ * Bit 31 in normal CPUID used for nonstandard 3DNow ID;
+ * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
+ */
+ clear_cpu_cap(c, 0*32+31);
+
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+
+ /* get apicid instead of initial apic id from cpuid */
+ c->apicid = hard_smp_processor_id();
+
+ set_cpu_cap(c, X86_FEATURE_ZEN);
+ set_cpu_cap(c, X86_FEATURE_CPB);
+
+ cpu_detect_cache_sizes(c);
+
+ hygon_detect_cmp(c);
+ hygon_get_topology(c);
+ srat_detect_node(c);
+
+ init_hygon_cacheinfo(c);
+
+ if (cpu_has(c, X86_FEATURE_XMM2)) {
+ unsigned long long val;
+ int ret;
+
+ /*
+ * A serializing LFENCE has less overhead than MFENCE, so
+ * use it for execution serialization. On families which
+ * don't have that MSR, LFENCE is already serializing.
+ * msr_set_bit() uses the safe accessors, too, even if the MSR
+ * is not present.
+ */
+ msr_set_bit(MSR_F10H_DECFG,
+ MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT);
+
+ /*
+ * Verify that the MSR write was successful (could be running
+ * under a hypervisor) and only then assume that LFENCE is
+ * serializing.
+ */
+ ret = rdmsrl_safe(MSR_F10H_DECFG, &val);
+ if (!ret && (val & MSR_F10H_DECFG_LFENCE_SERIALIZE)) {
+ /* A serializing LFENCE stops RDTSC speculation */
+ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
+ } else {
+ /* MFENCE stops RDTSC speculation */
+ set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
+ }
+ }
+
+ /*
+ * Hygon processors have APIC timer running in deep C states.
+ */
+ set_cpu_cap(c, X86_FEATURE_ARAT);
+
+ /* Hygon CPUs don't reset SS attributes on SYSRET, Xen does. */
+ if (!cpu_has(c, X86_FEATURE_XENPV))
+ set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
+}
+
+static void cpu_detect_tlb_hygon(struct cpuinfo_x86 *c)
+{
+ u32 ebx, eax, ecx, edx;
+ u16 mask = 0xfff;
+
+ if (c->extended_cpuid_level < 0x80000006)
+ return;
+
+ cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
+
+ tlb_lld_4k[ENTRIES] = (ebx >> 16) & mask;
+ tlb_lli_4k[ENTRIES] = ebx & mask;
+
+ /* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
+ if (!((eax >> 16) & mask))
+ tlb_lld_2m[ENTRIES] = (cpuid_eax(0x80000005) >> 16) & 0xff;
+ else
+ tlb_lld_2m[ENTRIES] = (eax >> 16) & mask;
+
+ /* a 4M entry uses two 2M entries */
+ tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1;
+
+ /* Handle ITLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
+ if (!(eax & mask)) {
+ cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
+ tlb_lli_2m[ENTRIES] = eax & 0xff;
+ } else
+ tlb_lli_2m[ENTRIES] = eax & mask;
+
+ tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1;
+}
+
+static const struct cpu_dev hygon_cpu_dev = {
+ .c_vendor = "Hygon",
+ .c_ident = { "HygonGenuine" },
+ .c_early_init = early_init_hygon,
+ .c_detect_tlb = cpu_detect_tlb_hygon,
+ .c_bsp_init = bsp_init_hygon,
+ .c_init = init_hygon,
+ .c_x86_vendor = X86_VENDOR_HYGON,
+};
+
+cpu_dev_register(hygon_cpu_dev);
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 401e8c133108..fc3c07fe7df5 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -150,6 +150,9 @@ static bool bad_spectre_microcode(struct cpuinfo_x86 *c)
if (cpu_has(c, X86_FEATURE_HYPERVISOR))
return false;
+ if (c->x86 != 6)
+ return false;
+
for (i = 0; i < ARRAY_SIZE(spectre_bad_microcodes); i++) {
if (c->x86_model == spectre_bad_microcodes[i].model &&
c->x86_stepping == spectre_bad_microcodes[i].stepping)
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
index abb71ac70443..44272b7107ad 100644
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -485,9 +485,7 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d)
size_t tsize;
if (is_llc_occupancy_enabled()) {
- d->rmid_busy_llc = kcalloc(BITS_TO_LONGS(r->num_rmid),
- sizeof(unsigned long),
- GFP_KERNEL);
+ d->rmid_busy_llc = bitmap_zalloc(r->num_rmid, GFP_KERNEL);
if (!d->rmid_busy_llc)
return -ENOMEM;
INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo);
@@ -496,7 +494,7 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d)
tsize = sizeof(*d->mbm_total);
d->mbm_total = kcalloc(r->num_rmid, tsize, GFP_KERNEL);
if (!d->mbm_total) {
- kfree(d->rmid_busy_llc);
+ bitmap_free(d->rmid_busy_llc);
return -ENOMEM;
}
}
@@ -504,7 +502,7 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d)
tsize = sizeof(*d->mbm_local);
d->mbm_local = kcalloc(r->num_rmid, tsize, GFP_KERNEL);
if (!d->mbm_local) {
- kfree(d->rmid_busy_llc);
+ bitmap_free(d->rmid_busy_llc);
kfree(d->mbm_total);
return -ENOMEM;
}
@@ -610,9 +608,16 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r)
cancel_delayed_work(&d->cqm_limbo);
}
+ /*
+ * rdt_domain "d" is going to be freed below, so clear
+ * its pointer from pseudo_lock_region struct.
+ */
+ if (d->plr)
+ d->plr->d = NULL;
+
kfree(d->ctrl_val);
kfree(d->mbps_val);
- kfree(d->rmid_busy_llc);
+ bitmap_free(d->rmid_busy_llc);
kfree(d->mbm_total);
kfree(d->mbm_local);
kfree(d);
diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h
index 4e588f36228f..3736f6dc9545 100644
--- a/arch/x86/kernel/cpu/intel_rdt.h
+++ b/arch/x86/kernel/cpu/intel_rdt.h
@@ -382,6 +382,11 @@ static inline bool is_mbm_event(int e)
e <= QOS_L3_MBM_LOCAL_EVENT_ID);
}
+struct rdt_parse_data {
+ struct rdtgroup *rdtgrp;
+ char *buf;
+};
+
/**
* struct rdt_resource - attributes of an RDT resource
* @rid: The index of the resource
@@ -423,16 +428,19 @@ struct rdt_resource {
struct rdt_cache cache;
struct rdt_membw membw;
const char *format_str;
- int (*parse_ctrlval) (void *data, struct rdt_resource *r,
- struct rdt_domain *d);
+ int (*parse_ctrlval)(struct rdt_parse_data *data,
+ struct rdt_resource *r,
+ struct rdt_domain *d);
struct list_head evt_list;
int num_rmid;
unsigned int mon_scale;
unsigned long fflags;
};
-int parse_cbm(void *_data, struct rdt_resource *r, struct rdt_domain *d);
-int parse_bw(void *_buf, struct rdt_resource *r, struct rdt_domain *d);
+int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r,
+ struct rdt_domain *d);
+int parse_bw(struct rdt_parse_data *data, struct rdt_resource *r,
+ struct rdt_domain *d);
extern struct mutex rdtgroup_mutex;
@@ -521,14 +529,14 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
int rdtgroup_schemata_show(struct kernfs_open_file *of,
struct seq_file *s, void *v);
bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d,
- u32 _cbm, int closid, bool exclusive);
+ unsigned long cbm, int closid, bool exclusive);
unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_domain *d,
- u32 cbm);
+ unsigned long cbm);
enum rdtgrp_mode rdtgroup_mode_by_closid(int closid);
int rdtgroup_tasks_assigned(struct rdtgroup *r);
int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp);
int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp);
-bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, u32 _cbm);
+bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, unsigned long cbm);
bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d);
int rdt_pseudo_lock_init(void);
void rdt_pseudo_lock_release(void);
@@ -536,6 +544,7 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp);
void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp);
struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r);
int update_domains(struct rdt_resource *r, int closid);
+int closids_supported(void);
void closid_free(int closid);
int alloc_rmid(void);
void free_rmid(u32 rmid);
diff --git a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
index af358ca05160..27937458c231 100644
--- a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
+++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
@@ -64,19 +64,19 @@ static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r)
return true;
}
-int parse_bw(void *_buf, struct rdt_resource *r, struct rdt_domain *d)
+int parse_bw(struct rdt_parse_data *data, struct rdt_resource *r,
+ struct rdt_domain *d)
{
- unsigned long data;
- char *buf = _buf;
+ unsigned long bw_val;
if (d->have_new_ctrl) {
rdt_last_cmd_printf("duplicate domain %d\n", d->id);
return -EINVAL;
}
- if (!bw_validate(buf, &data, r))
+ if (!bw_validate(data->buf, &bw_val, r))
return -EINVAL;
- d->new_ctrl = data;
+ d->new_ctrl = bw_val;
d->have_new_ctrl = true;
return 0;
@@ -123,18 +123,13 @@ static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r)
return true;
}
-struct rdt_cbm_parse_data {
- struct rdtgroup *rdtgrp;
- char *buf;
-};
-
/*
* Read one cache bit mask (hex). Check that it is valid for the current
* resource type.
*/
-int parse_cbm(void *_data, struct rdt_resource *r, struct rdt_domain *d)
+int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r,
+ struct rdt_domain *d)
{
- struct rdt_cbm_parse_data *data = _data;
struct rdtgroup *rdtgrp = data->rdtgrp;
u32 cbm_val;
@@ -195,11 +190,17 @@ int parse_cbm(void *_data, struct rdt_resource *r, struct rdt_domain *d)
static int parse_line(char *line, struct rdt_resource *r,
struct rdtgroup *rdtgrp)
{
- struct rdt_cbm_parse_data data;
+ struct rdt_parse_data data;
char *dom = NULL, *id;
struct rdt_domain *d;
unsigned long dom_id;
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP &&
+ r->rid == RDT_RESOURCE_MBA) {
+ rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n");
+ return -EINVAL;
+ }
+
next:
if (!line || line[0] == '\0')
return 0;
@@ -403,8 +404,16 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of,
for_each_alloc_enabled_rdt_resource(r)
seq_printf(s, "%s:uninitialized\n", r->name);
} else if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
- seq_printf(s, "%s:%d=%x\n", rdtgrp->plr->r->name,
- rdtgrp->plr->d->id, rdtgrp->plr->cbm);
+ if (!rdtgrp->plr->d) {
+ rdt_last_cmd_clear();
+ rdt_last_cmd_puts("Cache domain offline\n");
+ ret = -ENODEV;
+ } else {
+ seq_printf(s, "%s:%d=%x\n",
+ rdtgrp->plr->r->name,
+ rdtgrp->plr->d->id,
+ rdtgrp->plr->cbm);
+ }
} else {
closid = rdtgrp->closid;
for_each_alloc_enabled_rdt_resource(r) {
diff --git a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c
index 40f3903ae5d9..815b4e92522c 100644
--- a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c
+++ b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c
@@ -17,6 +17,7 @@
#include <linux/debugfs.h>
#include <linux/kthread.h>
#include <linux/mman.h>
+#include <linux/perf_event.h>
#include <linux/pm_qos.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
@@ -26,6 +27,7 @@
#include <asm/intel_rdt_sched.h>
#include <asm/perf_event.h>
+#include "../../events/perf_event.h" /* For X86_CONFIG() */
#include "intel_rdt.h"
#define CREATE_TRACE_POINTS
@@ -91,7 +93,7 @@ static u64 get_prefetch_disable_bits(void)
*/
return 0xF;
case INTEL_FAM6_ATOM_GOLDMONT:
- case INTEL_FAM6_ATOM_GEMINI_LAKE:
+ case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
/*
* SDM defines bits of MSR_MISC_FEATURE_CONTROL register
* as:
@@ -106,16 +108,6 @@ static u64 get_prefetch_disable_bits(void)
return 0;
}
-/*
- * Helper to write 64bit value to MSR without tracing. Used when
- * use of the cache should be restricted and use of registers used
- * for local variables avoided.
- */
-static inline void pseudo_wrmsrl_notrace(unsigned int msr, u64 val)
-{
- __wrmsr(msr, (u32)(val & 0xffffffffULL), (u32)(val >> 32));
-}
-
/**
* pseudo_lock_minor_get - Obtain available minor number
* @minor: Pointer to where new minor number will be stored
@@ -797,25 +789,27 @@ int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp)
/**
* rdtgroup_cbm_overlaps_pseudo_locked - Test if CBM or portion is pseudo-locked
* @d: RDT domain
- * @_cbm: CBM to test
+ * @cbm: CBM to test
*
- * @d represents a cache instance and @_cbm a capacity bitmask that is
- * considered for it. Determine if @_cbm overlaps with any existing
+ * @d represents a cache instance and @cbm a capacity bitmask that is
+ * considered for it. Determine if @cbm overlaps with any existing
* pseudo-locked region on @d.
*
- * Return: true if @_cbm overlaps with pseudo-locked region on @d, false
+ * @cbm is unsigned long, even if only 32 bits are used, to make the
+ * bitmap functions work correctly.
+ *
+ * Return: true if @cbm overlaps with pseudo-locked region on @d, false
* otherwise.
*/
-bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, u32 _cbm)
+bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, unsigned long cbm)
{
- unsigned long *cbm = (unsigned long *)&_cbm;
- unsigned long *cbm_b;
unsigned int cbm_len;
+ unsigned long cbm_b;
if (d->plr) {
cbm_len = d->plr->r->cache.cbm_len;
- cbm_b = (unsigned long *)&d->plr->cbm;
- if (bitmap_intersects(cbm, cbm_b, cbm_len))
+ cbm_b = d->plr->cbm;
+ if (bitmap_intersects(&cbm, &cbm_b, cbm_len))
return true;
}
return false;
@@ -886,31 +880,14 @@ static int measure_cycles_lat_fn(void *_plr)
struct pseudo_lock_region *plr = _plr;
unsigned long i;
u64 start, end;
-#ifdef CONFIG_KASAN
- /*
- * The registers used for local register variables are also used
- * when KASAN is active. When KASAN is active we use a regular
- * variable to ensure we always use a valid pointer to access memory.
- * The cost is that accessing this pointer, which could be in
- * cache, will be included in the measurement of memory read latency.
- */
void *mem_r;
-#else
-#ifdef CONFIG_X86_64
- register void *mem_r asm("rbx");
-#else
- register void *mem_r asm("ebx");
-#endif /* CONFIG_X86_64 */
-#endif /* CONFIG_KASAN */
local_irq_disable();
/*
- * The wrmsr call may be reordered with the assignment below it.
- * Call wrmsr as directly as possible to avoid tracing clobbering
- * local register variable used for memory pointer.
+ * Disable hardware prefetchers.
*/
- __wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0);
- mem_r = plr->kmem;
+ wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0);
+ mem_r = READ_ONCE(plr->kmem);
/*
* Dummy execute of the time measurement to load the needed
* instructions into the L1 instruction cache.
@@ -932,157 +909,240 @@ static int measure_cycles_lat_fn(void *_plr)
return 0;
}
-static int measure_cycles_perf_fn(void *_plr)
+/*
+ * Create a perf_event_attr for the hit and miss perf events that will
+ * be used during the performance measurement. A perf_event maintains
+ * a pointer to its perf_event_attr so a unique attribute structure is
+ * created for each perf_event.
+ *
+ * The actual configuration of the event is set right before use in order
+ * to use the X86_CONFIG macro.
+ */
+static struct perf_event_attr perf_miss_attr = {
+ .type = PERF_TYPE_RAW,
+ .size = sizeof(struct perf_event_attr),
+ .pinned = 1,
+ .disabled = 0,
+ .exclude_user = 1,
+};
+
+static struct perf_event_attr perf_hit_attr = {
+ .type = PERF_TYPE_RAW,
+ .size = sizeof(struct perf_event_attr),
+ .pinned = 1,
+ .disabled = 0,
+ .exclude_user = 1,
+};
+
+struct residency_counts {
+ u64 miss_before, hits_before;
+ u64 miss_after, hits_after;
+};
+
+static int measure_residency_fn(struct perf_event_attr *miss_attr,
+ struct perf_event_attr *hit_attr,
+ struct pseudo_lock_region *plr,
+ struct residency_counts *counts)
{
- unsigned long long l3_hits = 0, l3_miss = 0;
- u64 l3_hit_bits = 0, l3_miss_bits = 0;
- struct pseudo_lock_region *plr = _plr;
- unsigned long long l2_hits, l2_miss;
- u64 l2_hit_bits, l2_miss_bits;
- unsigned long i;
-#ifdef CONFIG_KASAN
- /*
- * The registers used for local register variables are also used
- * when KASAN is active. When KASAN is active we use regular variables
- * at the cost of including cache access latency to these variables
- * in the measurements.
- */
+ u64 hits_before = 0, hits_after = 0, miss_before = 0, miss_after = 0;
+ struct perf_event *miss_event, *hit_event;
+ int hit_pmcnum, miss_pmcnum;
unsigned int line_size;
unsigned int size;
+ unsigned long i;
void *mem_r;
-#else
- register unsigned int line_size asm("esi");
- register unsigned int size asm("edi");
-#ifdef CONFIG_X86_64
- register void *mem_r asm("rbx");
-#else
- register void *mem_r asm("ebx");
-#endif /* CONFIG_X86_64 */
-#endif /* CONFIG_KASAN */
+ u64 tmp;
+
+ miss_event = perf_event_create_kernel_counter(miss_attr, plr->cpu,
+ NULL, NULL, NULL);
+ if (IS_ERR(miss_event))
+ goto out;
+
+ hit_event = perf_event_create_kernel_counter(hit_attr, plr->cpu,
+ NULL, NULL, NULL);
+ if (IS_ERR(hit_event))
+ goto out_miss;
+
+ local_irq_disable();
+ /*
+ * Check any possible error state of events used by performing
+ * one local read.
+ */
+ if (perf_event_read_local(miss_event, &tmp, NULL, NULL)) {
+ local_irq_enable();
+ goto out_hit;
+ }
+ if (perf_event_read_local(hit_event, &tmp, NULL, NULL)) {
+ local_irq_enable();
+ goto out_hit;
+ }
+
+ /*
+ * Disable hardware prefetchers.
+ */
+ wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0);
+
+ /* Initialize rest of local variables */
+ /*
+ * Performance event has been validated right before this with
+ * interrupts disabled - it is thus safe to read the counter index.
+ */
+ miss_pmcnum = x86_perf_rdpmc_index(miss_event);
+ hit_pmcnum = x86_perf_rdpmc_index(hit_event);
+ line_size = READ_ONCE(plr->line_size);
+ mem_r = READ_ONCE(plr->kmem);
+ size = READ_ONCE(plr->size);
+
+ /*
+ * Read counter variables twice - first to load the instructions
+ * used in L1 cache, second to capture accurate value that does not
+ * include cache misses incurred because of instruction loads.
+ */
+ rdpmcl(hit_pmcnum, hits_before);
+ rdpmcl(miss_pmcnum, miss_before);
+ /*
+ * From SDM: Performing back-to-back fast reads are not guaranteed
+ * to be monotonic.
+ * Use LFENCE to ensure all previous instructions are retired
+ * before proceeding.
+ */
+ rmb();
+ rdpmcl(hit_pmcnum, hits_before);
+ rdpmcl(miss_pmcnum, miss_before);
+ /*
+ * Use LFENCE to ensure all previous instructions are retired
+ * before proceeding.
+ */
+ rmb();
+ for (i = 0; i < size; i += line_size) {
+ /*
+ * Add a barrier to prevent speculative execution of this
+ * loop reading beyond the end of the buffer.
+ */
+ rmb();
+ asm volatile("mov (%0,%1,1), %%eax\n\t"
+ :
+ : "r" (mem_r), "r" (i)
+ : "%eax", "memory");
+ }
+ /*
+ * Use LFENCE to ensure all previous instructions are retired
+ * before proceeding.
+ */
+ rmb();
+ rdpmcl(hit_pmcnum, hits_after);
+ rdpmcl(miss_pmcnum, miss_after);
+ /*
+ * Use LFENCE to ensure all previous instructions are retired
+ * before proceeding.
+ */
+ rmb();
+ /* Re-enable hardware prefetchers */
+ wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0);
+ local_irq_enable();
+out_hit:
+ perf_event_release_kernel(hit_event);
+out_miss:
+ perf_event_release_kernel(miss_event);
+out:
+ /*
+ * All counts will be zero on failure.
+ */
+ counts->miss_before = miss_before;
+ counts->hits_before = hits_before;
+ counts->miss_after = miss_after;
+ counts->hits_after = hits_after;
+ return 0;
+}
+
+static int measure_l2_residency(void *_plr)
+{
+ struct pseudo_lock_region *plr = _plr;
+ struct residency_counts counts = {0};
/*
* Non-architectural event for the Goldmont Microarchitecture
* from Intel x86 Architecture Software Developer Manual (SDM):
* MEM_LOAD_UOPS_RETIRED D1H (event number)
* Umask values:
- * L1_HIT 01H
* L2_HIT 02H
- * L1_MISS 08H
* L2_MISS 10H
- *
- * On Broadwell Microarchitecture the MEM_LOAD_UOPS_RETIRED event
- * has two "no fix" errata associated with it: BDM35 and BDM100. On
- * this platform we use the following events instead:
- * L2_RQSTS 24H (Documented in https://download.01.org/perfmon/BDW/)
- * REFERENCES FFH
- * MISS 3FH
- * LONGEST_LAT_CACHE 2EH (Documented in SDM)
- * REFERENCE 4FH
- * MISS 41H
*/
-
- /*
- * Start by setting flags for IA32_PERFEVTSELx:
- * OS (Operating system mode) 0x2
- * INT (APIC interrupt enable) 0x10
- * EN (Enable counter) 0x40
- *
- * Then add the Umask value and event number to select performance
- * event.
- */
-
switch (boot_cpu_data.x86_model) {
case INTEL_FAM6_ATOM_GOLDMONT:
- case INTEL_FAM6_ATOM_GEMINI_LAKE:
- l2_hit_bits = (0x52ULL << 16) | (0x2 << 8) | 0xd1;
- l2_miss_bits = (0x52ULL << 16) | (0x10 << 8) | 0xd1;
- break;
- case INTEL_FAM6_BROADWELL_X:
- /* On BDW the l2_hit_bits count references, not hits */
- l2_hit_bits = (0x52ULL << 16) | (0xff << 8) | 0x24;
- l2_miss_bits = (0x52ULL << 16) | (0x3f << 8) | 0x24;
- /* On BDW the l3_hit_bits count references, not hits */
- l3_hit_bits = (0x52ULL << 16) | (0x4f << 8) | 0x2e;
- l3_miss_bits = (0x52ULL << 16) | (0x41 << 8) | 0x2e;
+ case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
+ perf_miss_attr.config = X86_CONFIG(.event = 0xd1,
+ .umask = 0x10);
+ perf_hit_attr.config = X86_CONFIG(.event = 0xd1,
+ .umask = 0x2);
break;
default:
goto out;
}
- local_irq_disable();
+ measure_residency_fn(&perf_miss_attr, &perf_hit_attr, plr, &counts);
/*
- * Call wrmsr direcly to avoid the local register variables from
- * being overwritten due to reordering of their assignment with
- * the wrmsr calls.
+ * If a failure prevented the measurements from succeeding
+ * tracepoints will still be written and all counts will be zero.
*/
- __wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0);
- /* Disable events and reset counters */
- pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0, 0x0);
- pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 1, 0x0);
- pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0, 0x0);
- pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0 + 1, 0x0);
- if (l3_hit_bits > 0) {
- pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 2, 0x0);
- pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 3, 0x0);
- pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0 + 2, 0x0);
- pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0 + 3, 0x0);
- }
- /* Set and enable the L2 counters */
- pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0, l2_hit_bits);
- pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 1, l2_miss_bits);
- if (l3_hit_bits > 0) {
- pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 2,
- l3_hit_bits);
- pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 3,
- l3_miss_bits);
- }
- mem_r = plr->kmem;
- size = plr->size;
- line_size = plr->line_size;
- for (i = 0; i < size; i += line_size) {
- asm volatile("mov (%0,%1,1), %%eax\n\t"
- :
- : "r" (mem_r), "r" (i)
- : "%eax", "memory");
- }
+ trace_pseudo_lock_l2(counts.hits_after - counts.hits_before,
+ counts.miss_after - counts.miss_before);
+out:
+ plr->thread_done = 1;
+ wake_up_interruptible(&plr->lock_thread_wq);
+ return 0;
+}
+
+static int measure_l3_residency(void *_plr)
+{
+ struct pseudo_lock_region *plr = _plr;
+ struct residency_counts counts = {0};
+
/*
- * Call wrmsr directly (no tracing) to not influence
- * the cache access counters as they are disabled.
+ * On Broadwell Microarchitecture the MEM_LOAD_UOPS_RETIRED event
+ * has two "no fix" errata associated with it: BDM35 and BDM100. On
+ * this platform the following events are used instead:
+ * LONGEST_LAT_CACHE 2EH (Documented in SDM)
+ * REFERENCE 4FH
+ * MISS 41H
*/
- pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0,
- l2_hit_bits & ~(0x40ULL << 16));
- pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 1,
- l2_miss_bits & ~(0x40ULL << 16));
- if (l3_hit_bits > 0) {
- pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 2,
- l3_hit_bits & ~(0x40ULL << 16));
- pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 3,
- l3_miss_bits & ~(0x40ULL << 16));
- }
- l2_hits = native_read_pmc(0);
- l2_miss = native_read_pmc(1);
- if (l3_hit_bits > 0) {
- l3_hits = native_read_pmc(2);
- l3_miss = native_read_pmc(3);
+
+ switch (boot_cpu_data.x86_model) {
+ case INTEL_FAM6_BROADWELL_X:
+ /* On BDW the hit event counts references, not hits */
+ perf_hit_attr.config = X86_CONFIG(.event = 0x2e,
+ .umask = 0x4f);
+ perf_miss_attr.config = X86_CONFIG(.event = 0x2e,
+ .umask = 0x41);
+ break;
+ default:
+ goto out;
}
- wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0);
- local_irq_enable();
+
+ measure_residency_fn(&perf_miss_attr, &perf_hit_attr, plr, &counts);
/*
- * On BDW we count references and misses, need to adjust. Sometimes
- * the "hits" counter is a bit more than the references, for
- * example, x references but x + 1 hits. To not report invalid
- * hit values in this case we treat that as misses eaqual to
- * references.
+ * If a failure prevented the measurements from succeeding
+ * tracepoints will still be written and all counts will be zero.
*/
- if (boot_cpu_data.x86_model == INTEL_FAM6_BROADWELL_X)
- l2_hits -= (l2_miss > l2_hits ? l2_hits : l2_miss);
- trace_pseudo_lock_l2(l2_hits, l2_miss);
- if (l3_hit_bits > 0) {
- if (boot_cpu_data.x86_model == INTEL_FAM6_BROADWELL_X)
- l3_hits -= (l3_miss > l3_hits ? l3_hits : l3_miss);
- trace_pseudo_lock_l3(l3_hits, l3_miss);
+
+ counts.miss_after -= counts.miss_before;
+ if (boot_cpu_data.x86_model == INTEL_FAM6_BROADWELL_X) {
+ /*
+ * On BDW references and misses are counted, need to adjust.
+ * Sometimes the "hits" counter is a bit more than the
+ * references, for example, x references but x + 1 hits.
+ * To not report invalid hit values in this case we treat
+ * that as misses equal to references.
+ */
+ /* First compute the number of cache references measured */
+ counts.hits_after -= counts.hits_before;
+ /* Next convert references to cache hits */
+ counts.hits_after -= min(counts.miss_after, counts.hits_after);
+ } else {
+ counts.hits_after -= counts.hits_before;
}
+ trace_pseudo_lock_l3(counts.hits_after, counts.miss_after);
out:
plr->thread_done = 1;
wake_up_interruptible(&plr->lock_thread_wq);
@@ -1114,6 +1174,11 @@ static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel)
goto out;
}
+ if (!plr->d) {
+ ret = -ENODEV;
+ goto out;
+ }
+
plr->thread_done = 0;
cpu = cpumask_first(&plr->d->cpu_mask);
if (!cpu_online(cpu)) {
@@ -1121,13 +1186,20 @@ static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel)
goto out;
}
+ plr->cpu = cpu;
+
if (sel == 1)
thread = kthread_create_on_node(measure_cycles_lat_fn, plr,
cpu_to_node(cpu),
"pseudo_lock_measure/%u",
cpu);
else if (sel == 2)
- thread = kthread_create_on_node(measure_cycles_perf_fn, plr,
+ thread = kthread_create_on_node(measure_l2_residency, plr,
+ cpu_to_node(cpu),
+ "pseudo_lock_measure/%u",
+ cpu);
+ else if (sel == 3)
+ thread = kthread_create_on_node(measure_l3_residency, plr,
cpu_to_node(cpu),
"pseudo_lock_measure/%u",
cpu);
@@ -1171,7 +1243,7 @@ static ssize_t pseudo_lock_measure_trigger(struct file *file,
buf[buf_size] = '\0';
ret = kstrtoint(buf, 10, &sel);
if (ret == 0) {
- if (sel != 1)
+ if (sel != 1 && sel != 2 && sel != 3)
return -EINVAL;
ret = debugfs_file_get(file->f_path.dentry);
if (ret)
@@ -1427,6 +1499,11 @@ static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma)
plr = rdtgrp->plr;
+ if (!plr->d) {
+ mutex_unlock(&rdtgroup_mutex);
+ return -ENODEV;
+ }
+
/*
* Task is required to run with affinity to the cpus associated
* with the pseudo-locked region. If this is not the case the task
diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
index b799c00bef09..f27b8115ffa2 100644
--- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
+++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
@@ -97,6 +97,12 @@ void rdt_last_cmd_printf(const char *fmt, ...)
* limited as the number of resources grows.
*/
static int closid_free_map;
+static int closid_free_map_len;
+
+int closids_supported(void)
+{
+ return closid_free_map_len;
+}
static void closid_init(void)
{
@@ -111,6 +117,7 @@ static void closid_init(void)
/* CLOSID 0 is always reserved for the default group */
closid_free_map &= ~1;
+ closid_free_map_len = rdt_min_closid;
}
static int closid_alloc(void)
@@ -261,17 +268,27 @@ static int rdtgroup_cpus_show(struct kernfs_open_file *of,
struct seq_file *s, void *v)
{
struct rdtgroup *rdtgrp;
+ struct cpumask *mask;
int ret = 0;
rdtgrp = rdtgroup_kn_lock_live(of->kn);
if (rdtgrp) {
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
- seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n",
- cpumask_pr_args(&rdtgrp->plr->d->cpu_mask));
- else
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
+ if (!rdtgrp->plr->d) {
+ rdt_last_cmd_clear();
+ rdt_last_cmd_puts("Cache domain offline\n");
+ ret = -ENODEV;
+ } else {
+ mask = &rdtgrp->plr->d->cpu_mask;
+ seq_printf(s, is_cpu_list(of) ?
+ "%*pbl\n" : "%*pb\n",
+ cpumask_pr_args(mask));
+ }
+ } else {
seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n",
cpumask_pr_args(&rdtgrp->cpu_mask));
+ }
} else {
ret = -ENOENT;
}
@@ -802,7 +819,7 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of,
sw_shareable = 0;
exclusive = 0;
seq_printf(seq, "%d=", dom->id);
- for (i = 0; i < r->num_closid; i++, ctrl++) {
+ for (i = 0; i < closids_supported(); i++, ctrl++) {
if (!closid_allocated(i))
continue;
mode = rdtgroup_mode_by_closid(i);
@@ -954,7 +971,78 @@ static int rdtgroup_mode_show(struct kernfs_open_file *of,
}
/**
- * rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other
+ * rdt_cdp_peer_get - Retrieve CDP peer if it exists
+ * @r: RDT resource to which RDT domain @d belongs
+ * @d: Cache instance for which a CDP peer is requested
+ * @r_cdp: RDT resource that shares hardware with @r (RDT resource peer)
+ * Used to return the result.
+ * @d_cdp: RDT domain that shares hardware with @d (RDT domain peer)
+ * Used to return the result.
+ *
+ * RDT resources are managed independently and by extension the RDT domains
+ * (RDT resource instances) are managed independently also. The Code and
+ * Data Prioritization (CDP) RDT resources, while managed independently,
+ * could refer to the same underlying hardware. For example,
+ * RDT_RESOURCE_L2CODE and RDT_RESOURCE_L2DATA both refer to the L2 cache.
+ *
+ * When provided with an RDT resource @r and an instance of that RDT
+ * resource @d rdt_cdp_peer_get() will return if there is a peer RDT
+ * resource and the exact instance that shares the same hardware.
+ *
+ * Return: 0 if a CDP peer was found, <0 on error or if no CDP peer exists.
+ * If a CDP peer was found, @r_cdp will point to the peer RDT resource
+ * and @d_cdp will point to the peer RDT domain.
+ */
+static int rdt_cdp_peer_get(struct rdt_resource *r, struct rdt_domain *d,
+ struct rdt_resource **r_cdp,
+ struct rdt_domain **d_cdp)
+{
+ struct rdt_resource *_r_cdp = NULL;
+ struct rdt_domain *_d_cdp = NULL;
+ int ret = 0;
+
+ switch (r->rid) {
+ case RDT_RESOURCE_L3DATA:
+ _r_cdp = &rdt_resources_all[RDT_RESOURCE_L3CODE];
+ break;
+ case RDT_RESOURCE_L3CODE:
+ _r_cdp = &rdt_resources_all[RDT_RESOURCE_L3DATA];
+ break;
+ case RDT_RESOURCE_L2DATA:
+ _r_cdp = &rdt_resources_all[RDT_RESOURCE_L2CODE];
+ break;
+ case RDT_RESOURCE_L2CODE:
+ _r_cdp = &rdt_resources_all[RDT_RESOURCE_L2DATA];
+ break;
+ default:
+ ret = -ENOENT;
+ goto out;
+ }
+
+ /*
+ * When a new CPU comes online and CDP is enabled then the new
+ * RDT domains (if any) associated with both CDP RDT resources
+ * are added in the same CPU online routine while the
+ * rdtgroup_mutex is held. It should thus not happen for one
+ * RDT domain to exist and be associated with its RDT CDP
+ * resource but there is no RDT domain associated with the
+ * peer RDT CDP resource. Hence the WARN.
+ */
+ _d_cdp = rdt_find_domain(_r_cdp, d->id, NULL);
+ if (WARN_ON(!_d_cdp)) {
+ _r_cdp = NULL;
+ ret = -EINVAL;
+ }
+
+out:
+ *r_cdp = _r_cdp;
+ *d_cdp = _d_cdp;
+
+ return ret;
+}
+
+/**
+ * __rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other
* @r: Resource to which domain instance @d belongs.
* @d: The domain instance for which @closid is being tested.
* @cbm: Capacity bitmask being tested.
@@ -968,33 +1056,34 @@ static int rdtgroup_mode_show(struct kernfs_open_file *of,
* is false then overlaps with any resource group or hardware entities
* will be considered.
*
+ * @cbm is unsigned long, even if only 32 bits are used, to make the
+ * bitmap functions work correctly.
+ *
* Return: false if CBM does not overlap, true if it does.
*/
-bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d,
- u32 _cbm, int closid, bool exclusive)
+static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d,
+ unsigned long cbm, int closid, bool exclusive)
{
- unsigned long *cbm = (unsigned long *)&_cbm;
- unsigned long *ctrl_b;
enum rdtgrp_mode mode;
+ unsigned long ctrl_b;
u32 *ctrl;
int i;
/* Check for any overlap with regions used by hardware directly */
if (!exclusive) {
- if (bitmap_intersects(cbm,
- (unsigned long *)&r->cache.shareable_bits,
- r->cache.cbm_len))
+ ctrl_b = r->cache.shareable_bits;
+ if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len))
return true;
}
/* Check for overlap with other resource groups */
ctrl = d->ctrl_val;
- for (i = 0; i < r->num_closid; i++, ctrl++) {
- ctrl_b = (unsigned long *)ctrl;
+ for (i = 0; i < closids_supported(); i++, ctrl++) {
+ ctrl_b = *ctrl;
mode = rdtgroup_mode_by_closid(i);
if (closid_allocated(i) && i != closid &&
mode != RDT_MODE_PSEUDO_LOCKSETUP) {
- if (bitmap_intersects(cbm, ctrl_b, r->cache.cbm_len)) {
+ if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) {
if (exclusive) {
if (mode == RDT_MODE_EXCLUSIVE)
return true;
@@ -1009,6 +1098,41 @@ bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d,
}
/**
+ * rdtgroup_cbm_overlaps - Does CBM overlap with other use of hardware
+ * @r: Resource to which domain instance @d belongs.
+ * @d: The domain instance for which @closid is being tested.
+ * @cbm: Capacity bitmask being tested.
+ * @closid: Intended closid for @cbm.
+ * @exclusive: Only check if overlaps with exclusive resource groups
+ *
+ * Resources that can be allocated using a CBM can use the CBM to control
+ * the overlap of these allocations. rdtgroup_cmb_overlaps() is the test
+ * for overlap. Overlap test is not limited to the specific resource for
+ * which the CBM is intended though - when dealing with CDP resources that
+ * share the underlying hardware the overlap check should be performed on
+ * the CDP resource sharing the hardware also.
+ *
+ * Refer to description of __rdtgroup_cbm_overlaps() for the details of the
+ * overlap test.
+ *
+ * Return: true if CBM overlap detected, false if there is no overlap
+ */
+bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d,
+ unsigned long cbm, int closid, bool exclusive)
+{
+ struct rdt_resource *r_cdp;
+ struct rdt_domain *d_cdp;
+
+ if (__rdtgroup_cbm_overlaps(r, d, cbm, closid, exclusive))
+ return true;
+
+ if (rdt_cdp_peer_get(r, d, &r_cdp, &d_cdp) < 0)
+ return false;
+
+ return __rdtgroup_cbm_overlaps(r_cdp, d_cdp, cbm, closid, exclusive);
+}
+
+/**
* rdtgroup_mode_test_exclusive - Test if this resource group can be exclusive
*
* An exclusive resource group implies that there should be no sharing of
@@ -1024,16 +1148,27 @@ static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp)
{
int closid = rdtgrp->closid;
struct rdt_resource *r;
+ bool has_cache = false;
struct rdt_domain *d;
for_each_alloc_enabled_rdt_resource(r) {
+ if (r->rid == RDT_RESOURCE_MBA)
+ continue;
+ has_cache = true;
list_for_each_entry(d, &r->domains, list) {
if (rdtgroup_cbm_overlaps(r, d, d->ctrl_val[closid],
- rdtgrp->closid, false))
+ rdtgrp->closid, false)) {
+ rdt_last_cmd_puts("schemata overlaps\n");
return false;
+ }
}
}
+ if (!has_cache) {
+ rdt_last_cmd_puts("cannot be exclusive without CAT/CDP\n");
+ return false;
+ }
+
return true;
}
@@ -1085,7 +1220,6 @@ static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of,
rdtgrp->mode = RDT_MODE_SHAREABLE;
} else if (!strcmp(buf, "exclusive")) {
if (!rdtgroup_mode_test_exclusive(rdtgrp)) {
- rdt_last_cmd_printf("schemata overlaps\n");
ret = -EINVAL;
goto out;
}
@@ -1121,15 +1255,18 @@ out:
* computed by first dividing the total cache size by the CBM length to
* determine how many bytes each bit in the bitmask represents. The result
* is multiplied with the number of bits set in the bitmask.
+ *
+ * @cbm is unsigned long, even if only 32 bits are used to make the
+ * bitmap functions work correctly.
*/
unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r,
- struct rdt_domain *d, u32 cbm)
+ struct rdt_domain *d, unsigned long cbm)
{
struct cpu_cacheinfo *ci;
unsigned int size = 0;
int num_b, i;
- num_b = bitmap_weight((unsigned long *)&cbm, r->cache.cbm_len);
+ num_b = bitmap_weight(&cbm, r->cache.cbm_len);
ci = get_cpu_cacheinfo(cpumask_any(&d->cpu_mask));
for (i = 0; i < ci->num_leaves; i++) {
if (ci->info_list[i].level == r->cache_level) {
@@ -1155,8 +1292,9 @@ static int rdtgroup_size_show(struct kernfs_open_file *of,
struct rdt_resource *r;
struct rdt_domain *d;
unsigned int size;
- bool sep = false;
- u32 cbm;
+ int ret = 0;
+ bool sep;
+ u32 ctrl;
rdtgrp = rdtgroup_kn_lock_live(of->kn);
if (!rdtgrp) {
@@ -1165,15 +1303,23 @@ static int rdtgroup_size_show(struct kernfs_open_file *of,
}
if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
- seq_printf(s, "%*s:", max_name_width, rdtgrp->plr->r->name);
- size = rdtgroup_cbm_to_size(rdtgrp->plr->r,
- rdtgrp->plr->d,
- rdtgrp->plr->cbm);
- seq_printf(s, "%d=%u\n", rdtgrp->plr->d->id, size);
+ if (!rdtgrp->plr->d) {
+ rdt_last_cmd_clear();
+ rdt_last_cmd_puts("Cache domain offline\n");
+ ret = -ENODEV;
+ } else {
+ seq_printf(s, "%*s:", max_name_width,
+ rdtgrp->plr->r->name);
+ size = rdtgroup_cbm_to_size(rdtgrp->plr->r,
+ rdtgrp->plr->d,
+ rdtgrp->plr->cbm);
+ seq_printf(s, "%d=%u\n", rdtgrp->plr->d->id, size);
+ }
goto out;
}
for_each_alloc_enabled_rdt_resource(r) {
+ sep = false;
seq_printf(s, "%*s:", max_name_width, r->name);
list_for_each_entry(d, &r->domains, list) {
if (sep)
@@ -1181,8 +1327,13 @@ static int rdtgroup_size_show(struct kernfs_open_file *of,
if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
size = 0;
} else {
- cbm = d->ctrl_val[rdtgrp->closid];
- size = rdtgroup_cbm_to_size(r, d, cbm);
+ ctrl = (!is_mba_sc(r) ?
+ d->ctrl_val[rdtgrp->closid] :
+ d->mbps_val[rdtgrp->closid]);
+ if (r->rid == RDT_RESOURCE_MBA)
+ size = ctrl;
+ else
+ size = rdtgroup_cbm_to_size(r, d, ctrl);
}
seq_printf(s, "%d=%u", d->id, size);
sep = true;
@@ -1193,7 +1344,7 @@ static int rdtgroup_size_show(struct kernfs_open_file *of,
out:
rdtgroup_kn_unlock(of->kn);
- return 0;
+ return ret;
}
/* rdtgroup information files for one cache resource. */
@@ -2327,28 +2478,48 @@ static void cbm_ensure_valid(u32 *_val, struct rdt_resource *r)
*/
static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp)
{
+ struct rdt_resource *r_cdp = NULL;
+ struct rdt_domain *d_cdp = NULL;
u32 used_b = 0, unused_b = 0;
u32 closid = rdtgrp->closid;
struct rdt_resource *r;
+ unsigned long tmp_cbm;
enum rdtgrp_mode mode;
struct rdt_domain *d;
+ u32 peer_ctl, *ctrl;
int i, ret;
- u32 *ctrl;
for_each_alloc_enabled_rdt_resource(r) {
+ /*
+ * Only initialize default allocations for CBM cache
+ * resources
+ */
+ if (r->rid == RDT_RESOURCE_MBA)
+ continue;
list_for_each_entry(d, &r->domains, list) {
+ rdt_cdp_peer_get(r, d, &r_cdp, &d_cdp);
d->have_new_ctrl = false;
d->new_ctrl = r->cache.shareable_bits;
used_b = r->cache.shareable_bits;
ctrl = d->ctrl_val;
- for (i = 0; i < r->num_closid; i++, ctrl++) {
+ for (i = 0; i < closids_supported(); i++, ctrl++) {
if (closid_allocated(i) && i != closid) {
mode = rdtgroup_mode_by_closid(i);
if (mode == RDT_MODE_PSEUDO_LOCKSETUP)
break;
- used_b |= *ctrl;
+ /*
+ * If CDP is active include peer
+ * domain's usage to ensure there
+ * is no overlap with an exclusive
+ * group.
+ */
+ if (d_cdp)
+ peer_ctl = d_cdp->ctrl_val[i];
+ else
+ peer_ctl = 0;
+ used_b |= *ctrl | peer_ctl;
if (mode == RDT_MODE_SHAREABLE)
- d->new_ctrl |= *ctrl;
+ d->new_ctrl |= *ctrl | peer_ctl;
}
}
if (d->plr && d->plr->cbm > 0)
@@ -2361,9 +2532,14 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp)
* modify the CBM based on system availability.
*/
cbm_ensure_valid(&d->new_ctrl, r);
- if (bitmap_weight((unsigned long *) &d->new_ctrl,
- r->cache.cbm_len) <
- r->cache.min_cbm_bits) {
+ /*
+ * Assign the u32 CBM to an unsigned long to ensure
+ * that bitmap_weight() does not access out-of-bound
+ * memory.
+ */
+ tmp_cbm = d->new_ctrl;
+ if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) <
+ r->cache.min_cbm_bits) {
rdt_last_cmd_printf("no space on %s:%d\n",
r->name, d->id);
return -ENOSPC;
@@ -2373,6 +2549,12 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp)
}
for_each_alloc_enabled_rdt_resource(r) {
+ /*
+ * Only initialize default allocations for CBM cache
+ * resources
+ */
+ if (r->rid == RDT_RESOURCE_MBA)
+ continue;
ret = update_domains(r, rdtgrp->closid);
if (ret < 0) {
rdt_last_cmd_puts("failed to initialize allocations\n");
@@ -2760,6 +2942,13 @@ static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf)
{
if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled)
seq_puts(seq, ",cdp");
+
+ if (rdt_resources_all[RDT_RESOURCE_L2DATA].alloc_enabled)
+ seq_puts(seq, ",cdpl2");
+
+ if (is_mba_sc(&rdt_resources_all[RDT_RESOURCE_MBA]))
+ seq_puts(seq, ",mba_MBps");
+
return 0;
}
diff --git a/arch/x86/kernel/cpu/mcheck/dev-mcelog.c b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c
index 97685a0c3175..27f394ac983f 100644
--- a/arch/x86/kernel/cpu/mcheck/dev-mcelog.c
+++ b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c
@@ -38,9 +38,6 @@ static struct mce_log_buffer mcelog = {
static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
-/* User mode helper program triggered by machine check event */
-extern char mce_helper[128];
-
static int dev_mce_log(struct notifier_block *nb, unsigned long val,
void *data)
{
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index c805a06e14c3..1fc424c40a31 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -108,6 +108,9 @@ static void setup_inj_struct(struct mce *m)
memset(m, 0, sizeof(struct mce));
m->cpuvendor = boot_cpu_data.x86_vendor;
+ m->time = ktime_get_real_seconds();
+ m->cpuid = cpuid_eax(1);
+ m->microcode = boot_cpu_data.microcode;
}
/* Update fake mce registers on current CPU. */
@@ -576,6 +579,9 @@ static int inj_bank_set(void *data, u64 val)
m->bank = val;
do_inject();
+ /* Reset injection struct */
+ setup_inj_struct(&i_mce);
+
return 0;
}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index f34d89c01edc..44396d521987 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -336,7 +336,8 @@ int (*mce_severity)(struct mce *m, int tolerant, char **msg, bool is_excp) =
void __init mcheck_vendor_init_severity(void)
{
- if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
+ boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
mce_severity = mce_severity_amd;
}
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 953b3ce92dcc..8cb3c02980cf 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -270,7 +270,7 @@ static void print_mce(struct mce *m)
{
__print_mce(m);
- if (m->cpuvendor != X86_VENDOR_AMD)
+ if (m->cpuvendor != X86_VENDOR_AMD && m->cpuvendor != X86_VENDOR_HYGON)
pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
}
@@ -508,9 +508,9 @@ static int mce_usable_address(struct mce *m)
bool mce_is_memory_error(struct mce *m)
{
- if (m->cpuvendor == X86_VENDOR_AMD) {
+ if (m->cpuvendor == X86_VENDOR_AMD ||
+ m->cpuvendor == X86_VENDOR_HYGON) {
return amd_mce_is_memory_error(m);
-
} else if (m->cpuvendor == X86_VENDOR_INTEL) {
/*
* Intel SDM Volume 3B - 15.9.2 Compound Error Codes
@@ -539,6 +539,9 @@ static bool mce_is_correctable(struct mce *m)
if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED)
return false;
+ if (m->cpuvendor == X86_VENDOR_HYGON && m->status & MCI_STATUS_DEFERRED)
+ return false;
+
if (m->status & MCI_STATUS_UC)
return false;
@@ -1315,7 +1318,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
local_irq_disable();
ist_end_non_atomic();
} else {
- if (!fixup_exception(regs, X86_TRAP_MC))
+ if (!fixup_exception(regs, X86_TRAP_MC, error_code, 0))
mce_panic("Failed kernel mode recovery", &m, NULL);
}
@@ -1705,7 +1708,7 @@ static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
*/
static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c)
{
- if (c->x86_vendor == X86_VENDOR_AMD) {
+ if (c->x86_vendor == X86_VENDOR_AMD || c->x86_vendor == X86_VENDOR_HYGON) {
mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV);
mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR);
mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA);
@@ -1746,6 +1749,11 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
mce_amd_feature_init(c);
break;
}
+
+ case X86_VENDOR_HYGON:
+ mce_hygon_feature_init(c);
+ break;
+
case X86_VENDOR_CENTAUR:
mce_centaur_feature_init(c);
break;
@@ -1971,12 +1979,14 @@ static void mce_disable_error_reporting(void)
static void vendor_disable_error_reporting(void)
{
/*
- * Don't clear on Intel or AMD CPUs. Some of these MSRs are socket-wide.
+ * Don't clear on Intel or AMD or Hygon CPUs. Some of these MSRs
+ * are socket-wide.
* Disabling them for just a single offlined CPU is bad, since it will
* inhibit reporting for all shared resources on the socket like the
* last level cache (LLC), the integrated memory controller (iMC), etc.
*/
if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ||
+ boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ||
boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
return;
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 0624957aa068..07b5fc00b188 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -504,6 +504,7 @@ static enum ucode_state apply_microcode_amd(int cpu)
struct microcode_amd *mc_amd;
struct ucode_cpu_info *uci;
struct ucode_patch *p;
+ enum ucode_state ret;
u32 rev, dummy;
BUG_ON(raw_smp_processor_id() != cpu);
@@ -521,9 +522,8 @@ static enum ucode_state apply_microcode_amd(int cpu)
/* need to apply patch? */
if (rev >= mc_amd->hdr.patch_id) {
- c->microcode = rev;
- uci->cpu_sig.rev = rev;
- return UCODE_OK;
+ ret = UCODE_OK;
+ goto out;
}
if (__apply_microcode_amd(mc_amd)) {
@@ -531,13 +531,21 @@ static enum ucode_state apply_microcode_amd(int cpu)
cpu, mc_amd->hdr.patch_id);
return UCODE_ERROR;
}
- pr_info("CPU%d: new patch_level=0x%08x\n", cpu,
- mc_amd->hdr.patch_id);
- uci->cpu_sig.rev = mc_amd->hdr.patch_id;
- c->microcode = mc_amd->hdr.patch_id;
+ rev = mc_amd->hdr.patch_id;
+ ret = UCODE_UPDATED;
+
+ pr_info("CPU%d: new patch_level=0x%08x\n", cpu, rev);
- return UCODE_UPDATED;
+out:
+ uci->cpu_sig.rev = rev;
+ c->microcode = rev;
+
+ /* Update boot_cpu_data's revision too, if we're on the BSP: */
+ if (c->cpu_index == boot_cpu_data.cpu_index)
+ boot_cpu_data.microcode = rev;
+
+ return ret;
}
static int install_equiv_cpu_table(const u8 *buf)
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 97ccf4c3b45b..16936a24795c 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -795,6 +795,7 @@ static enum ucode_state apply_microcode_intel(int cpu)
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
struct cpuinfo_x86 *c = &cpu_data(cpu);
struct microcode_intel *mc;
+ enum ucode_state ret;
static int prev_rev;
u32 rev;
@@ -817,9 +818,8 @@ static enum ucode_state apply_microcode_intel(int cpu)
*/
rev = intel_get_microcode_revision();
if (rev >= mc->hdr.rev) {
- uci->cpu_sig.rev = rev;
- c->microcode = rev;
- return UCODE_OK;
+ ret = UCODE_OK;
+ goto out;
}
/*
@@ -848,10 +848,17 @@ static enum ucode_state apply_microcode_intel(int cpu)
prev_rev = rev;
}
+ ret = UCODE_UPDATED;
+
+out:
uci->cpu_sig.rev = rev;
- c->microcode = rev;
+ c->microcode = rev;
+
+ /* Update boot_cpu_data's revision too, if we're on the BSP: */
+ if (c->cpu_index == boot_cpu_data.cpu_index)
+ boot_cpu_data.microcode = rev;
- return UCODE_UPDATED;
+ return ret;
}
static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index ad12733f6058..1c72f3819eb1 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -199,6 +199,16 @@ static unsigned long hv_get_tsc_khz(void)
return freq / 1000;
}
+#if defined(CONFIG_SMP) && IS_ENABLED(CONFIG_HYPERV)
+static void __init hv_smp_prepare_boot_cpu(void)
+{
+ native_smp_prepare_boot_cpu();
+#if defined(CONFIG_X86_64) && defined(CONFIG_PARAVIRT_SPINLOCKS)
+ hv_init_spinlocks();
+#endif
+}
+#endif
+
static void __init ms_hyperv_init_platform(void)
{
int hv_host_info_eax;
@@ -303,6 +313,10 @@ static void __init ms_hyperv_init_platform(void)
if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE)
alloc_intr_gate(HYPERV_STIMER0_VECTOR,
hv_stimer0_callback_vector);
+
+# ifdef CONFIG_SMP
+ smp_ops.smp_prepare_boot_cpu = hv_smp_prepare_boot_cpu;
+# endif
#endif
}
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 765afd599039..3668c5df90c6 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -831,7 +831,8 @@ int __init amd_special_default_mtrr(void)
{
u32 l, h;
- if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
return 0;
if (boot_cpu_data.x86 < 0xf)
return 0;
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c
index 9a19c800fe40..507039c20128 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.c
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.c
@@ -127,7 +127,7 @@ static void __init set_num_var_ranges(void)
if (use_intel())
rdmsr(MSR_MTRRcap, config, dummy);
- else if (is_cpu(AMD))
+ else if (is_cpu(AMD) || is_cpu(HYGON))
config = 2;
else if (is_cpu(CYRIX) || is_cpu(CENTAUR))
config = 8;
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index d389083330c5..9556930cd8c1 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -46,6 +46,7 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
{
/* returns the bit offset of the performance counter register */
switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_HYGON:
case X86_VENDOR_AMD:
if (msr >= MSR_F15H_PERF_CTR)
return (msr - MSR_F15H_PERF_CTR) >> 1;
@@ -74,6 +75,7 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
{
/* returns the bit offset of the event selection register */
switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_HYGON:
case X86_VENDOR_AMD:
if (msr >= MSR_F15H_PERF_CTL)
return (msr - MSR_F15H_PERF_CTL) >> 1;
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 8e005329648b..d9ab49bed8af 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -97,14 +97,14 @@ static void __init vmware_sched_clock_setup(void)
d->cyc2ns_offset = mul_u64_u32_shr(tsc_now, d->cyc2ns_mul,
d->cyc2ns_shift);
- pv_time_ops.sched_clock = vmware_sched_clock;
+ pv_ops.time.sched_clock = vmware_sched_clock;
pr_info("using sched offset of %llu ns\n", d->cyc2ns_offset);
}
static void __init vmware_paravirt_ops_setup(void)
{
pv_info.name = "VMware hypervisor";
- pv_cpu_ops.io_delay = paravirt_nop;
+ pv_ops.cpu.io_delay = paravirt_nop;
if (vmware_tsc_khz && vmw_sched_clock)
vmware_sched_clock_setup();
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index 4f2e0778feac..eb8ab3915268 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -11,40 +11,62 @@
#include <linux/uaccess.h>
#include <linux/io.h>
-/**
- * copy_oldmem_page - copy one page from "oldmem"
- * @pfn: page frame number to be copied
- * @buf: target memory address for the copy; this can be in kernel address
- * space or user address space (see @userbuf)
- * @csize: number of bytes to copy
- * @offset: offset in bytes into the page (based on pfn) to begin the copy
- * @userbuf: if set, @buf is in user address space, use copy_to_user(),
- * otherwise @buf is in kernel address space, use memcpy().
- *
- * Copy a page from "oldmem". For this page, there is no pte mapped
- * in the current kernel. We stitch up a pte, similar to kmap_atomic.
- */
-ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
- size_t csize, unsigned long offset, int userbuf)
+static ssize_t __copy_oldmem_page(unsigned long pfn, char *buf, size_t csize,
+ unsigned long offset, int userbuf,
+ bool encrypted)
{
void *vaddr;
if (!csize)
return 0;
- vaddr = ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE);
+ if (encrypted)
+ vaddr = (__force void *)ioremap_encrypted(pfn << PAGE_SHIFT, PAGE_SIZE);
+ else
+ vaddr = (__force void *)ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE);
+
if (!vaddr)
return -ENOMEM;
if (userbuf) {
- if (copy_to_user(buf, vaddr + offset, csize)) {
- iounmap(vaddr);
+ if (copy_to_user((void __user *)buf, vaddr + offset, csize)) {
+ iounmap((void __iomem *)vaddr);
return -EFAULT;
}
} else
memcpy(buf, vaddr + offset, csize);
set_iounmap_nonlazy();
- iounmap(vaddr);
+ iounmap((void __iomem *)vaddr);
return csize;
}
+
+/**
+ * copy_oldmem_page - copy one page of memory
+ * @pfn: page frame number to be copied
+ * @buf: target memory address for the copy; this can be in kernel address
+ * space or user address space (see @userbuf)
+ * @csize: number of bytes to copy
+ * @offset: offset in bytes into the page (based on pfn) to begin the copy
+ * @userbuf: if set, @buf is in user address space, use copy_to_user(),
+ * otherwise @buf is in kernel address space, use memcpy().
+ *
+ * Copy a page from the old kernel's memory. For this page, there is no pte
+ * mapped in the current kernel. We stitch up a pte, similar to kmap_atomic.
+ */
+ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize,
+ unsigned long offset, int userbuf)
+{
+ return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, false);
+}
+
+/**
+ * copy_oldmem_page_encrypted - same as copy_oldmem_page() above but ioremap the
+ * memory with the encryption mask set to accomodate kdump on SME-enabled
+ * machines.
+ */
+ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize,
+ unsigned long offset, int userbuf)
+{
+ return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, true);
+}
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 9c8652974f8e..2b5886401e5f 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -17,6 +17,7 @@
#include <linux/bug.h>
#include <linux/nmi.h>
#include <linux/sysfs.h>
+#include <linux/kasan.h>
#include <asm/cpu_entry_area.h>
#include <asm/stacktrace.h>
@@ -89,14 +90,24 @@ static void printk_stack_address(unsigned long address, int reliable,
* Thus, the 2/3rds prologue and 64 byte OPCODE_BUFSIZE is just a random
* guesstimate in attempt to achieve all of the above.
*/
-void show_opcodes(u8 *rip, const char *loglvl)
+void show_opcodes(struct pt_regs *regs, const char *loglvl)
{
#define PROLOGUE_SIZE 42
#define EPILOGUE_SIZE 21
#define OPCODE_BUFSIZE (PROLOGUE_SIZE + 1 + EPILOGUE_SIZE)
u8 opcodes[OPCODE_BUFSIZE];
+ unsigned long prologue = regs->ip - PROLOGUE_SIZE;
+ bool bad_ip;
- if (probe_kernel_read(opcodes, rip - PROLOGUE_SIZE, OPCODE_BUFSIZE)) {
+ /*
+ * Make sure userspace isn't trying to trick us into dumping kernel
+ * memory by pointing the userspace instruction pointer at it.
+ */
+ bad_ip = user_mode(regs) &&
+ __chk_range_not_ok(prologue, OPCODE_BUFSIZE, TASK_SIZE_MAX);
+
+ if (bad_ip || probe_kernel_read(opcodes, (u8 *)prologue,
+ OPCODE_BUFSIZE)) {
printk("%sCode: Bad RIP value.\n", loglvl);
} else {
printk("%sCode: %" __stringify(PROLOGUE_SIZE) "ph <%02x> %"
@@ -112,7 +123,7 @@ void show_ip(struct pt_regs *regs, const char *loglvl)
#else
printk("%sRIP: %04x:%pS\n", loglvl, (int)regs->cs, (void *)regs->ip);
#endif
- show_opcodes((u8 *)regs->ip, loglvl);
+ show_opcodes(regs, loglvl);
}
void show_iret_regs(struct pt_regs *regs)
@@ -135,7 +146,7 @@ static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs,
* they can be printed in the right context.
*/
if (!partial && on_stack(info, regs, sizeof(*regs))) {
- __show_regs(regs, 0);
+ __show_regs(regs, SHOW_REGS_SHORT);
} else if (partial && on_stack(info, (void *)regs + IRET_FRAME_OFFSET,
IRET_FRAME_SIZE)) {
@@ -333,7 +344,7 @@ void oops_end(unsigned long flags, struct pt_regs *regs, int signr)
oops_exit();
/* Executive summary in case the oops scrolled away */
- __show_regs(&exec_summary_regs, true);
+ __show_regs(&exec_summary_regs, SHOW_REGS_ALL);
if (!signr)
return;
@@ -346,7 +357,10 @@ void oops_end(unsigned long flags, struct pt_regs *regs, int signr)
* We're not going to return, but we might be on an IST stack or
* have very little stack space left. Rewind the stack and kill
* the task.
+ * Before we rewind the stack, we have to tell KASAN that we're going to
+ * reuse the task stack and that existing poisons are invalid.
*/
+ kasan_unpoison_task_stack(current);
rewind_stack_do_exit(signr);
}
NOKPROBE_SYMBOL(oops_end);
@@ -393,14 +407,9 @@ void die(const char *str, struct pt_regs *regs, long err)
void show_regs(struct pt_regs *regs)
{
- bool all = true;
-
show_regs_print_info(KERN_DEFAULT);
- if (IS_ENABLED(CONFIG_X86_32))
- all = !user_mode(regs);
-
- __show_regs(regs, all);
+ __show_regs(regs, user_mode(regs) ? SHOW_REGS_USER : SHOW_REGS_ALL);
/*
* When in-kernel, we also print out the stack at the time of the fault..
diff --git a/arch/x86/kernel/eisa.c b/arch/x86/kernel/eisa.c
index f260e452e4f8..e8c8c5d78dbd 100644
--- a/arch/x86/kernel/eisa.c
+++ b/arch/x86/kernel/eisa.c
@@ -7,11 +7,17 @@
#include <linux/eisa.h>
#include <linux/io.h>
+#include <xen/xen.h>
+
static __init int eisa_bus_probe(void)
{
- void __iomem *p = ioremap(0x0FFFD9, 4);
+ void __iomem *p;
+
+ if (xen_pv_domain() && !xen_initial_domain())
+ return 0;
- if (readl(p) == 'E' + ('I'<<8) + ('S'<<16) + ('A'<<24))
+ p = ioremap(0x0FFFD9, 4);
+ if (p && readl(p) == 'E' + ('I' << 8) + ('S' << 16) + ('A' << 24))
EISA_bus = 1;
iounmap(p);
return 0;
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 23f1691670b6..61a949d84dfa 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -314,7 +314,6 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
* thread's fpu state, reconstruct fxstate from the fsave
* header. Validate and sanitize the copied state.
*/
- struct fpu *fpu = &tsk->thread.fpu;
struct user_i387_ia32_struct env;
int err = 0;
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index ec6fefbfd3c0..76fa3b836598 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -37,6 +37,7 @@ asmlinkage __visible void __init i386_start_kernel(void)
cr4_init_shadow();
sanitize_boot_params(&boot_params);
+ x86_verify_bootdata_version();
x86_early_init_platform_quirks();
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 8047379e575a..5dc377dc9d7b 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -35,6 +35,7 @@
#include <asm/bootparam_utils.h>
#include <asm/microcode.h>
#include <asm/kasan.h>
+#include <asm/fixmap.h>
/*
* Manage page tables very early on.
@@ -112,6 +113,7 @@ static bool __head check_la57_support(unsigned long physaddr)
unsigned long __head __startup_64(unsigned long physaddr,
struct boot_params *bp)
{
+ unsigned long vaddr, vaddr_end;
unsigned long load_delta, *p;
unsigned long pgtable_flags;
pgdval_t *pgd;
@@ -165,7 +167,8 @@ unsigned long __head __startup_64(unsigned long physaddr,
pud[511] += load_delta;
pmd = fixup_pointer(level2_fixmap_pgt, physaddr);
- pmd[506] += load_delta;
+ for (i = FIXMAP_PMD_TOP; i > FIXMAP_PMD_TOP - FIXMAP_PMD_NUM; i--)
+ pmd[i] += load_delta;
/*
* Set up the identity mapping for the switchover. These
@@ -235,6 +238,21 @@ unsigned long __head __startup_64(unsigned long physaddr,
sme_encrypt_kernel(bp);
/*
+ * Clear the memory encryption mask from the .bss..decrypted section.
+ * The bss section will be memset to zero later in the initialization so
+ * there is no need to zero it after changing the memory encryption
+ * attribute.
+ */
+ if (mem_encrypt_active()) {
+ vaddr = (unsigned long)__start_bss_decrypted;
+ vaddr_end = (unsigned long)__end_bss_decrypted;
+ for (; vaddr < vaddr_end; vaddr += PMD_SIZE) {
+ i = pmd_index(vaddr);
+ pmd[i] -= sme_get_me_mask();
+ }
+ }
+
+ /*
* Return the SME encryption mask (if SME is active) to be used as a
* modifier for the initial pgdir entry programmed into CR3.
*/
@@ -439,6 +457,8 @@ void __init x86_64_start_reservations(char *real_mode_data)
if (!boot_params.hdr.version)
copy_bootdata(__va(real_mode_data));
+ x86_verify_bootdata_version();
+
x86_early_init_platform_quirks();
switch (boot_params.hdr.hardware_subarch) {
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 15ebc2fc166e..747c758f67b7 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -24,8 +24,9 @@
#include "../entry/calling.h"
#include <asm/export.h>
#include <asm/nospec-branch.h>
+#include <asm/fixmap.h>
-#ifdef CONFIG_PARAVIRT
+#ifdef CONFIG_PARAVIRT_XXL
#include <asm/asm-offsets.h>
#include <asm/paravirt.h>
#define GET_CR2_INTO(reg) GET_CR2_INTO_RAX ; movq %rax, reg
@@ -445,13 +446,20 @@ NEXT_PAGE(level2_kernel_pgt)
KERNEL_IMAGE_SIZE/PMD_SIZE)
NEXT_PAGE(level2_fixmap_pgt)
- .fill 506,8,0
- .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
- /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
- .fill 5,8,0
+ .fill (512 - 4 - FIXMAP_PMD_NUM),8,0
+ pgtno = 0
+ .rept (FIXMAP_PMD_NUM)
+ .quad level1_fixmap_pgt + (pgtno << PAGE_SHIFT) - __START_KERNEL_map \
+ + _PAGE_TABLE_NOENC;
+ pgtno = pgtno + 1
+ .endr
+ /* 6 MB reserved space + a 2MB hole */
+ .fill 4,8,0
NEXT_PAGE(level1_fixmap_pgt)
+ .rept (FIXMAP_PMD_NUM)
.fill 512,8,0
+ .endr
#undef PMDS
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index eeea935e9bb5..aac0c1f7e354 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -42,55 +42,40 @@ static void __ref __jump_label_transform(struct jump_entry *entry,
void *(*poker)(void *, const void *, size_t),
int init)
{
- union jump_code_union code;
+ union jump_code_union jmp;
const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP };
const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5];
+ const void *expect, *code;
+ int line;
+
+ jmp.jump = 0xe9;
+ jmp.offset = jump_entry_target(entry) -
+ (jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE);
if (early_boot_irqs_disabled)
poker = text_poke_early;
if (type == JUMP_LABEL_JMP) {
if (init) {
- /*
- * Jump label is enabled for the first time.
- * So we expect a default_nop...
- */
- if (unlikely(memcmp((void *)entry->code, default_nop, 5)
- != 0))
- bug_at((void *)entry->code, __LINE__);
+ expect = default_nop; line = __LINE__;
} else {
- /*
- * ...otherwise expect an ideal_nop. Otherwise
- * something went horribly wrong.
- */
- if (unlikely(memcmp((void *)entry->code, ideal_nop, 5)
- != 0))
- bug_at((void *)entry->code, __LINE__);
+ expect = ideal_nop; line = __LINE__;
}
- code.jump = 0xe9;
- code.offset = entry->target -
- (entry->code + JUMP_LABEL_NOP_SIZE);
+ code = &jmp.code;
} else {
- /*
- * We are disabling this jump label. If it is not what
- * we think it is, then something must have gone wrong.
- * If this is the first initialization call, then we
- * are converting the default nop to the ideal nop.
- */
if (init) {
- if (unlikely(memcmp((void *)entry->code, default_nop, 5) != 0))
- bug_at((void *)entry->code, __LINE__);
+ expect = default_nop; line = __LINE__;
} else {
- code.jump = 0xe9;
- code.offset = entry->target -
- (entry->code + JUMP_LABEL_NOP_SIZE);
- if (unlikely(memcmp((void *)entry->code, &code, 5) != 0))
- bug_at((void *)entry->code, __LINE__);
+ expect = &jmp.code; line = __LINE__;
}
- memcpy(&code, ideal_nops[NOP_ATOMIC5], JUMP_LABEL_NOP_SIZE);
+
+ code = ideal_nop;
}
+ if (memcmp((void *)jump_entry_code(entry), expect, JUMP_LABEL_NOP_SIZE))
+ bug_at((void *)jump_entry_code(entry), line);
+
/*
* Make text_poke_bp() a default fallback poker.
*
@@ -99,11 +84,14 @@ static void __ref __jump_label_transform(struct jump_entry *entry,
* always nop being the 'currently valid' instruction
*
*/
- if (poker)
- (*poker)((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE);
- else
- text_poke_bp((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE,
- (void *)entry->code + JUMP_LABEL_NOP_SIZE);
+ if (poker) {
+ (*poker)((void *)jump_entry_code(entry), code,
+ JUMP_LABEL_NOP_SIZE);
+ return;
+ }
+
+ text_poke_bp((void *)jump_entry_code(entry), code, JUMP_LABEL_NOP_SIZE,
+ (void *)jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE);
}
void arch_jump_label_transform(struct jump_entry *entry,
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index b0d1e81c96bb..c33b06f5faa4 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -1020,64 +1020,18 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
*/
if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
return 1;
-
- /*
- * In case the user-specified fault handler returned
- * zero, try to fix up.
- */
- if (fixup_exception(regs, trapnr))
- return 1;
-
- /*
- * fixup routine could not handle it,
- * Let do_page_fault() fix it.
- */
}
return 0;
}
NOKPROBE_SYMBOL(kprobe_fault_handler);
-/*
- * Wrapper routine for handling exceptions.
- */
-int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val,
- void *data)
-{
- struct die_args *args = data;
- int ret = NOTIFY_DONE;
-
- if (args->regs && user_mode(args->regs))
- return ret;
-
- if (val == DIE_GPF) {
- /*
- * To be potentially processing a kprobe fault and to
- * trust the result from kprobe_running(), we have
- * be non-preemptible.
- */
- if (!preemptible() && kprobe_running() &&
- kprobe_fault_handler(args->regs, args->trapnr))
- ret = NOTIFY_STOP;
- }
- return ret;
-}
-NOKPROBE_SYMBOL(kprobe_exceptions_notify);
-
bool arch_within_kprobe_blacklist(unsigned long addr)
{
- bool is_in_entry_trampoline_section = false;
-
-#ifdef CONFIG_X86_64
- is_in_entry_trampoline_section =
- (addr >= (unsigned long)__entry_trampoline_start &&
- addr < (unsigned long)__entry_trampoline_end);
-#endif
return (addr >= (unsigned long)__kprobes_text_start &&
addr < (unsigned long)__kprobes_text_end) ||
(addr >= (unsigned long)__entry_text_start &&
- addr < (unsigned long)__entry_text_end) ||
- is_in_entry_trampoline_section;
+ addr < (unsigned long)__entry_text_end);
}
int __init arch_init_kprobes(void)
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index eaf02f2e7300..40b16b270656 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -179,7 +179,7 @@ optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs)
opt_pre_handler(&op->kp, regs);
__this_cpu_write(current_kprobe, NULL);
}
- preempt_enable_no_resched();
+ preempt_enable();
}
NOKPROBE_SYMBOL(optimized_callback);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index d9b71924c23c..ba4bfb7f6a36 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -283,7 +283,7 @@ static void __init paravirt_ops_setup(void)
pv_info.name = "KVM";
if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
- pv_cpu_ops.io_delay = kvm_io_delay;
+ pv_ops.cpu.io_delay = kvm_io_delay;
#ifdef CONFIG_X86_IO_APIC
no_timer_check = 1;
@@ -632,14 +632,14 @@ static void __init kvm_guest_init(void)
if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
has_steal_clock = 1;
- pv_time_ops.steal_clock = kvm_steal_clock;
+ pv_ops.time.steal_clock = kvm_steal_clock;
}
if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
!kvm_para_has_hint(KVM_HINTS_REALTIME) &&
kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
- pv_mmu_ops.flush_tlb_others = kvm_flush_tlb_others;
- pv_mmu_ops.tlb_remove_table = tlb_remove_table;
+ pv_ops.mmu.flush_tlb_others = kvm_flush_tlb_others;
+ pv_ops.mmu.tlb_remove_table = tlb_remove_table;
}
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
@@ -850,13 +850,14 @@ void __init kvm_spinlock_init(void)
return;
__pv_init_lock_hash();
- pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
- pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock);
- pv_lock_ops.wait = kvm_wait;
- pv_lock_ops.kick = kvm_kick_cpu;
+ pv_ops.lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
+ pv_ops.lock.queued_spin_unlock =
+ PV_CALLEE_SAVE(__pv_queued_spin_unlock);
+ pv_ops.lock.wait = kvm_wait;
+ pv_ops.lock.kick = kvm_kick_cpu;
if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
- pv_lock_ops.vcpu_is_preempted =
+ pv_ops.lock.vcpu_is_preempted =
PV_CALLEE_SAVE(__kvm_vcpu_is_preempted);
}
}
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 1e6764648af3..30084ecaa20f 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -28,6 +28,7 @@
#include <linux/sched/clock.h>
#include <linux/mm.h>
#include <linux/slab.h>
+#include <linux/set_memory.h>
#include <asm/hypervisor.h>
#include <asm/mem_encrypt.h>
@@ -61,9 +62,10 @@ early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);
(PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info))
static struct pvclock_vsyscall_time_info
- hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __aligned(PAGE_SIZE);
-static struct pvclock_wall_clock wall_clock;
+ hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __bss_decrypted __aligned(PAGE_SIZE);
+static struct pvclock_wall_clock wall_clock __bss_decrypted;
static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);
+static struct pvclock_vsyscall_time_info *hvclock_mem;
static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void)
{
@@ -116,13 +118,13 @@ static u64 kvm_sched_clock_read(void)
static inline void kvm_sched_clock_init(bool stable)
{
if (!stable) {
- pv_time_ops.sched_clock = kvm_clock_read;
+ pv_ops.time.sched_clock = kvm_clock_read;
clear_sched_clock_stable();
return;
}
kvm_sched_clock_offset = kvm_clock_read();
- pv_time_ops.sched_clock = kvm_sched_clock_read;
+ pv_ops.time.sched_clock = kvm_sched_clock_read;
pr_info("kvm-clock: using sched offset of %llu cycles",
kvm_sched_clock_offset);
@@ -236,6 +238,45 @@ static void kvm_shutdown(void)
native_machine_shutdown();
}
+static void __init kvmclock_init_mem(void)
+{
+ unsigned long ncpus;
+ unsigned int order;
+ struct page *p;
+ int r;
+
+ if (HVC_BOOT_ARRAY_SIZE >= num_possible_cpus())
+ return;
+
+ ncpus = num_possible_cpus() - HVC_BOOT_ARRAY_SIZE;
+ order = get_order(ncpus * sizeof(*hvclock_mem));
+
+ p = alloc_pages(GFP_KERNEL, order);
+ if (!p) {
+ pr_warn("%s: failed to alloc %d pages", __func__, (1U << order));
+ return;
+ }
+
+ hvclock_mem = page_address(p);
+
+ /*
+ * hvclock is shared between the guest and the hypervisor, must
+ * be mapped decrypted.
+ */
+ if (sev_active()) {
+ r = set_memory_decrypted((unsigned long) hvclock_mem,
+ 1UL << order);
+ if (r) {
+ __free_pages(p, order);
+ hvclock_mem = NULL;
+ pr_warn("kvmclock: set_memory_decrypted() failed. Disabling\n");
+ return;
+ }
+ }
+
+ memset(hvclock_mem, 0, PAGE_SIZE << order);
+}
+
static int __init kvm_setup_vsyscall_timeinfo(void)
{
#ifdef CONFIG_X86_64
@@ -250,6 +291,9 @@ static int __init kvm_setup_vsyscall_timeinfo(void)
kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK;
#endif
+
+ kvmclock_init_mem();
+
return 0;
}
early_initcall(kvm_setup_vsyscall_timeinfo);
@@ -269,8 +313,10 @@ static int kvmclock_setup_percpu(unsigned int cpu)
/* Use the static page for the first CPUs, allocate otherwise */
if (cpu < HVC_BOOT_ARRAY_SIZE)
p = &hv_clock_boot[cpu];
+ else if (hvclock_mem)
+ p = hvclock_mem + cpu - HVC_BOOT_ARRAY_SIZE;
else
- p = kzalloc(sizeof(*p), GFP_KERNEL);
+ return -ENOMEM;
per_cpu(hv_clock_per_cpu, cpu) = p;
return p ? 0 : -ENOMEM;
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 733e6ace0fa4..ab18e0884dc6 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -273,7 +273,7 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
map_ldt_struct_to_user(mm);
va = (unsigned long)ldt_slot_va(slot);
- flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0);
+ flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, PAGE_SHIFT, false);
ldt->slot = slot;
return 0;
diff --git a/arch/x86/kernel/macros.S b/arch/x86/kernel/macros.S
new file mode 100644
index 000000000000..161c95059044
--- /dev/null
+++ b/arch/x86/kernel/macros.S
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * This file includes headers whose assembly part includes macros which are
+ * commonly used. The macros are precompiled into assmebly file which is later
+ * assembled together with each compiled file.
+ */
+
+#include <linux/compiler.h>
+#include <asm/refcount.h>
+#include <asm/alternative-asm.h>
+#include <asm/bug.h>
+#include <asm/paravirt.h>
+#include <asm/asm.h>
+#include <asm/cpufeature.h>
+#include <asm/jump_label.h>
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index f58336af095c..b052e883dd8c 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -201,6 +201,12 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
goto overflow;
#endif
break;
+ case R_X86_64_PC64:
+ if (*(u64 *)loc != 0)
+ goto invalid_relocation;
+ val -= (u64)loc;
+ *(u64 *)loc = val;
+ break;
default:
pr_err("%s: Unknown rela relocation: %llu\n",
me->name, ELF64_R_TYPE(rel[i].r_info));
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
index 71f2d1125ec0..4f75d0cf6305 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -17,7 +17,7 @@ PV_CALLEE_SAVE_REGS_THUNK(__native_queued_spin_unlock);
bool pv_is_native_spin_unlock(void)
{
- return pv_lock_ops.queued_spin_unlock.func ==
+ return pv_ops.lock.queued_spin_unlock.func ==
__raw_callee_save___native_queued_spin_unlock;
}
@@ -29,17 +29,6 @@ PV_CALLEE_SAVE_REGS_THUNK(__native_vcpu_is_preempted);
bool pv_is_native_vcpu_is_preempted(void)
{
- return pv_lock_ops.vcpu_is_preempted.func ==
+ return pv_ops.lock.vcpu_is_preempted.func ==
__raw_callee_save___native_vcpu_is_preempted;
}
-
-struct pv_lock_ops pv_lock_ops = {
-#ifdef CONFIG_SMP
- .queued_spin_lock_slowpath = native_queued_spin_lock_slowpath,
- .queued_spin_unlock = PV_CALLEE_SAVE(__native_queued_spin_unlock),
- .wait = paravirt_nop,
- .kick = paravirt_nop,
- .vcpu_is_preempted = PV_CALLEE_SAVE(__native_vcpu_is_preempted),
-#endif /* SMP */
-};
-EXPORT_SYMBOL(pv_lock_ops);
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index afdb303285f8..e4d4df37922a 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -81,17 +81,15 @@ struct branch {
u32 delta;
} __attribute__((packed));
-unsigned paravirt_patch_call(void *insnbuf,
- const void *target, u16 tgt_clobbers,
- unsigned long addr, u16 site_clobbers,
- unsigned len)
+static unsigned paravirt_patch_call(void *insnbuf, const void *target,
+ unsigned long addr, unsigned len)
{
struct branch *b = insnbuf;
unsigned long delta = (unsigned long)target - (addr+5);
if (len < 5) {
#ifdef CONFIG_RETPOLINE
- WARN_ONCE("Failing to patch indirect CALL in %ps\n", (void *)addr);
+ WARN_ONCE(1, "Failing to patch indirect CALL in %ps\n", (void *)addr);
#endif
return len; /* call too long for patch site */
}
@@ -103,15 +101,16 @@ unsigned paravirt_patch_call(void *insnbuf,
return 5;
}
-unsigned paravirt_patch_jmp(void *insnbuf, const void *target,
- unsigned long addr, unsigned len)
+#ifdef CONFIG_PARAVIRT_XXL
+static unsigned paravirt_patch_jmp(void *insnbuf, const void *target,
+ unsigned long addr, unsigned len)
{
struct branch *b = insnbuf;
unsigned long delta = (unsigned long)target - (addr+5);
if (len < 5) {
#ifdef CONFIG_RETPOLINE
- WARN_ONCE("Failing to patch indirect JMP in %ps\n", (void *)addr);
+ WARN_ONCE(1, "Failing to patch indirect JMP in %ps\n", (void *)addr);
#endif
return len; /* call too long for patch site */
}
@@ -121,6 +120,7 @@ unsigned paravirt_patch_jmp(void *insnbuf, const void *target,
return 5;
}
+#endif
DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key);
@@ -130,29 +130,14 @@ void __init native_pv_lock_init(void)
static_branch_disable(&virt_spin_lock_key);
}
-/*
- * Neat trick to map patch type back to the call within the
- * corresponding structure.
- */
-static void *get_call_destination(u8 type)
-{
- struct paravirt_patch_template tmpl = {
- .pv_init_ops = pv_init_ops,
- .pv_time_ops = pv_time_ops,
- .pv_cpu_ops = pv_cpu_ops,
- .pv_irq_ops = pv_irq_ops,
- .pv_mmu_ops = pv_mmu_ops,
-#ifdef CONFIG_PARAVIRT_SPINLOCKS
- .pv_lock_ops = pv_lock_ops,
-#endif
- };
- return *((void **)&tmpl + type);
-}
-
-unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
+unsigned paravirt_patch_default(u8 type, void *insnbuf,
unsigned long addr, unsigned len)
{
- void *opfunc = get_call_destination(type);
+ /*
+ * Neat trick to map patch type back to the call within the
+ * corresponding structure.
+ */
+ void *opfunc = *((void **)&pv_ops + type);
unsigned ret;
if (opfunc == NULL)
@@ -167,15 +152,15 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
else if (opfunc == _paravirt_ident_64)
ret = paravirt_patch_ident_64(insnbuf, len);
- else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) ||
- type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret64))
+#ifdef CONFIG_PARAVIRT_XXL
+ else if (type == PARAVIRT_PATCH(cpu.iret) ||
+ type == PARAVIRT_PATCH(cpu.usergs_sysret64))
/* If operation requires a jmp, then jmp */
ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len);
+#endif
else
- /* Otherwise call the function; assume target could
- clobber any caller-save reg */
- ret = paravirt_patch_call(insnbuf, opfunc, CLBR_ANY,
- addr, clobbers, len);
+ /* Otherwise call the function. */
+ ret = paravirt_patch_call(insnbuf, opfunc, addr, len);
return ret;
}
@@ -281,6 +266,7 @@ void paravirt_flush_lazy_mmu(void)
preempt_enable();
}
+#ifdef CONFIG_PARAVIRT_XXL
void paravirt_start_context_switch(struct task_struct *prev)
{
BUG_ON(preemptible());
@@ -301,6 +287,7 @@ void paravirt_end_context_switch(struct task_struct *next)
if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES))
arch_enter_lazy_mmu_mode();
}
+#endif
enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
{
@@ -312,85 +299,16 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
struct pv_info pv_info = {
.name = "bare hardware",
+#ifdef CONFIG_PARAVIRT_XXL
.kernel_rpl = 0,
.shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */
#ifdef CONFIG_X86_64
.extra_user_64bit_cs = __USER_CS,
#endif
-};
-
-struct pv_init_ops pv_init_ops = {
- .patch = native_patch,
-};
-
-struct pv_time_ops pv_time_ops = {
- .sched_clock = native_sched_clock,
- .steal_clock = native_steal_clock,
-};
-
-__visible struct pv_irq_ops pv_irq_ops = {
- .save_fl = __PV_IS_CALLEE_SAVE(native_save_fl),
- .restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl),
- .irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable),
- .irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable),
- .safe_halt = native_safe_halt,
- .halt = native_halt,
-};
-
-__visible struct pv_cpu_ops pv_cpu_ops = {
- .cpuid = native_cpuid,
- .get_debugreg = native_get_debugreg,
- .set_debugreg = native_set_debugreg,
- .read_cr0 = native_read_cr0,
- .write_cr0 = native_write_cr0,
- .write_cr4 = native_write_cr4,
-#ifdef CONFIG_X86_64
- .read_cr8 = native_read_cr8,
- .write_cr8 = native_write_cr8,
#endif
- .wbinvd = native_wbinvd,
- .read_msr = native_read_msr,
- .write_msr = native_write_msr,
- .read_msr_safe = native_read_msr_safe,
- .write_msr_safe = native_write_msr_safe,
- .read_pmc = native_read_pmc,
- .load_tr_desc = native_load_tr_desc,
- .set_ldt = native_set_ldt,
- .load_gdt = native_load_gdt,
- .load_idt = native_load_idt,
- .store_tr = native_store_tr,
- .load_tls = native_load_tls,
-#ifdef CONFIG_X86_64
- .load_gs_index = native_load_gs_index,
-#endif
- .write_ldt_entry = native_write_ldt_entry,
- .write_gdt_entry = native_write_gdt_entry,
- .write_idt_entry = native_write_idt_entry,
-
- .alloc_ldt = paravirt_nop,
- .free_ldt = paravirt_nop,
-
- .load_sp0 = native_load_sp0,
-
-#ifdef CONFIG_X86_64
- .usergs_sysret64 = native_usergs_sysret64,
-#endif
- .iret = native_iret,
- .swapgs = native_swapgs,
-
- .set_iopl_mask = native_set_iopl_mask,
- .io_delay = native_io_delay,
-
- .start_context_switch = paravirt_nop,
- .end_context_switch = paravirt_nop,
};
-/* At this point, native_get/set_debugreg has real function entries */
-NOKPROBE_SYMBOL(native_get_debugreg);
-NOKPROBE_SYMBOL(native_set_debugreg);
-NOKPROBE_SYMBOL(native_load_idt);
-
#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_PAE)
/* 32-bit pagetable entries */
#define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_32)
@@ -399,85 +317,171 @@ NOKPROBE_SYMBOL(native_load_idt);
#define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_64)
#endif
-struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
+struct paravirt_patch_template pv_ops = {
+ /* Init ops. */
+ .init.patch = native_patch,
- .read_cr2 = native_read_cr2,
- .write_cr2 = native_write_cr2,
- .read_cr3 = __native_read_cr3,
- .write_cr3 = native_write_cr3,
+ /* Time ops. */
+ .time.sched_clock = native_sched_clock,
+ .time.steal_clock = native_steal_clock,
- .flush_tlb_user = native_flush_tlb,
- .flush_tlb_kernel = native_flush_tlb_global,
- .flush_tlb_one_user = native_flush_tlb_one_user,
- .flush_tlb_others = native_flush_tlb_others,
- .tlb_remove_table = (void (*)(struct mmu_gather *, void *))tlb_remove_page,
+ /* Cpu ops. */
+ .cpu.io_delay = native_io_delay,
- .pgd_alloc = __paravirt_pgd_alloc,
- .pgd_free = paravirt_nop,
+#ifdef CONFIG_PARAVIRT_XXL
+ .cpu.cpuid = native_cpuid,
+ .cpu.get_debugreg = native_get_debugreg,
+ .cpu.set_debugreg = native_set_debugreg,
+ .cpu.read_cr0 = native_read_cr0,
+ .cpu.write_cr0 = native_write_cr0,
+ .cpu.write_cr4 = native_write_cr4,
+#ifdef CONFIG_X86_64
+ .cpu.read_cr8 = native_read_cr8,
+ .cpu.write_cr8 = native_write_cr8,
+#endif
+ .cpu.wbinvd = native_wbinvd,
+ .cpu.read_msr = native_read_msr,
+ .cpu.write_msr = native_write_msr,
+ .cpu.read_msr_safe = native_read_msr_safe,
+ .cpu.write_msr_safe = native_write_msr_safe,
+ .cpu.read_pmc = native_read_pmc,
+ .cpu.load_tr_desc = native_load_tr_desc,
+ .cpu.set_ldt = native_set_ldt,
+ .cpu.load_gdt = native_load_gdt,
+ .cpu.load_idt = native_load_idt,
+ .cpu.store_tr = native_store_tr,
+ .cpu.load_tls = native_load_tls,
+#ifdef CONFIG_X86_64
+ .cpu.load_gs_index = native_load_gs_index,
+#endif
+ .cpu.write_ldt_entry = native_write_ldt_entry,
+ .cpu.write_gdt_entry = native_write_gdt_entry,
+ .cpu.write_idt_entry = native_write_idt_entry,
- .alloc_pte = paravirt_nop,
- .alloc_pmd = paravirt_nop,
- .alloc_pud = paravirt_nop,
- .alloc_p4d = paravirt_nop,
- .release_pte = paravirt_nop,
- .release_pmd = paravirt_nop,
- .release_pud = paravirt_nop,
- .release_p4d = paravirt_nop,
+ .cpu.alloc_ldt = paravirt_nop,
+ .cpu.free_ldt = paravirt_nop,
- .set_pte = native_set_pte,
- .set_pte_at = native_set_pte_at,
- .set_pmd = native_set_pmd,
+ .cpu.load_sp0 = native_load_sp0,
- .ptep_modify_prot_start = __ptep_modify_prot_start,
- .ptep_modify_prot_commit = __ptep_modify_prot_commit,
+#ifdef CONFIG_X86_64
+ .cpu.usergs_sysret64 = native_usergs_sysret64,
+#endif
+ .cpu.iret = native_iret,
+ .cpu.swapgs = native_swapgs,
+
+ .cpu.set_iopl_mask = native_set_iopl_mask,
+
+ .cpu.start_context_switch = paravirt_nop,
+ .cpu.end_context_switch = paravirt_nop,
+
+ /* Irq ops. */
+ .irq.save_fl = __PV_IS_CALLEE_SAVE(native_save_fl),
+ .irq.restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl),
+ .irq.irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable),
+ .irq.irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable),
+ .irq.safe_halt = native_safe_halt,
+ .irq.halt = native_halt,
+#endif /* CONFIG_PARAVIRT_XXL */
+
+ /* Mmu ops. */
+ .mmu.flush_tlb_user = native_flush_tlb,
+ .mmu.flush_tlb_kernel = native_flush_tlb_global,
+ .mmu.flush_tlb_one_user = native_flush_tlb_one_user,
+ .mmu.flush_tlb_others = native_flush_tlb_others,
+ .mmu.tlb_remove_table =
+ (void (*)(struct mmu_gather *, void *))tlb_remove_page,
+
+ .mmu.exit_mmap = paravirt_nop,
+
+#ifdef CONFIG_PARAVIRT_XXL
+ .mmu.read_cr2 = native_read_cr2,
+ .mmu.write_cr2 = native_write_cr2,
+ .mmu.read_cr3 = __native_read_cr3,
+ .mmu.write_cr3 = native_write_cr3,
+
+ .mmu.pgd_alloc = __paravirt_pgd_alloc,
+ .mmu.pgd_free = paravirt_nop,
+
+ .mmu.alloc_pte = paravirt_nop,
+ .mmu.alloc_pmd = paravirt_nop,
+ .mmu.alloc_pud = paravirt_nop,
+ .mmu.alloc_p4d = paravirt_nop,
+ .mmu.release_pte = paravirt_nop,
+ .mmu.release_pmd = paravirt_nop,
+ .mmu.release_pud = paravirt_nop,
+ .mmu.release_p4d = paravirt_nop,
+
+ .mmu.set_pte = native_set_pte,
+ .mmu.set_pte_at = native_set_pte_at,
+ .mmu.set_pmd = native_set_pmd,
+
+ .mmu.ptep_modify_prot_start = __ptep_modify_prot_start,
+ .mmu.ptep_modify_prot_commit = __ptep_modify_prot_commit,
#if CONFIG_PGTABLE_LEVELS >= 3
#ifdef CONFIG_X86_PAE
- .set_pte_atomic = native_set_pte_atomic,
- .pte_clear = native_pte_clear,
- .pmd_clear = native_pmd_clear,
+ .mmu.set_pte_atomic = native_set_pte_atomic,
+ .mmu.pte_clear = native_pte_clear,
+ .mmu.pmd_clear = native_pmd_clear,
#endif
- .set_pud = native_set_pud,
+ .mmu.set_pud = native_set_pud,
- .pmd_val = PTE_IDENT,
- .make_pmd = PTE_IDENT,
+ .mmu.pmd_val = PTE_IDENT,
+ .mmu.make_pmd = PTE_IDENT,
#if CONFIG_PGTABLE_LEVELS >= 4
- .pud_val = PTE_IDENT,
- .make_pud = PTE_IDENT,
+ .mmu.pud_val = PTE_IDENT,
+ .mmu.make_pud = PTE_IDENT,
- .set_p4d = native_set_p4d,
+ .mmu.set_p4d = native_set_p4d,
#if CONFIG_PGTABLE_LEVELS >= 5
- .p4d_val = PTE_IDENT,
- .make_p4d = PTE_IDENT,
+ .mmu.p4d_val = PTE_IDENT,
+ .mmu.make_p4d = PTE_IDENT,
- .set_pgd = native_set_pgd,
+ .mmu.set_pgd = native_set_pgd,
#endif /* CONFIG_PGTABLE_LEVELS >= 5 */
#endif /* CONFIG_PGTABLE_LEVELS >= 4 */
#endif /* CONFIG_PGTABLE_LEVELS >= 3 */
- .pte_val = PTE_IDENT,
- .pgd_val = PTE_IDENT,
+ .mmu.pte_val = PTE_IDENT,
+ .mmu.pgd_val = PTE_IDENT,
- .make_pte = PTE_IDENT,
- .make_pgd = PTE_IDENT,
+ .mmu.make_pte = PTE_IDENT,
+ .mmu.make_pgd = PTE_IDENT,
- .dup_mmap = paravirt_nop,
- .exit_mmap = paravirt_nop,
- .activate_mm = paravirt_nop,
+ .mmu.dup_mmap = paravirt_nop,
+ .mmu.activate_mm = paravirt_nop,
- .lazy_mode = {
- .enter = paravirt_nop,
- .leave = paravirt_nop,
- .flush = paravirt_nop,
+ .mmu.lazy_mode = {
+ .enter = paravirt_nop,
+ .leave = paravirt_nop,
+ .flush = paravirt_nop,
},
- .set_fixmap = native_set_fixmap,
+ .mmu.set_fixmap = native_set_fixmap,
+#endif /* CONFIG_PARAVIRT_XXL */
+
+#if defined(CONFIG_PARAVIRT_SPINLOCKS)
+ /* Lock ops. */
+#ifdef CONFIG_SMP
+ .lock.queued_spin_lock_slowpath = native_queued_spin_lock_slowpath,
+ .lock.queued_spin_unlock =
+ PV_CALLEE_SAVE(__native_queued_spin_unlock),
+ .lock.wait = paravirt_nop,
+ .lock.kick = paravirt_nop,
+ .lock.vcpu_is_preempted =
+ PV_CALLEE_SAVE(__native_vcpu_is_preempted),
+#endif /* SMP */
+#endif
};
-EXPORT_SYMBOL_GPL(pv_time_ops);
-EXPORT_SYMBOL (pv_cpu_ops);
-EXPORT_SYMBOL (pv_mmu_ops);
+#ifdef CONFIG_PARAVIRT_XXL
+/* At this point, native_get/set_debugreg has real function entries */
+NOKPROBE_SYMBOL(native_get_debugreg);
+NOKPROBE_SYMBOL(native_set_debugreg);
+NOKPROBE_SYMBOL(native_load_idt);
+#endif
+
+EXPORT_SYMBOL_GPL(pv_ops);
EXPORT_SYMBOL_GPL(pv_info);
-EXPORT_SYMBOL (pv_irq_ops);
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
index 758e69d72ebf..6368c22fa1fa 100644
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -1,18 +1,20 @@
// SPDX-License-Identifier: GPL-2.0
#include <asm/paravirt.h>
-DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
-DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
-DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf");
-DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax");
-DEF_NATIVE(pv_cpu_ops, iret, "iret");
-DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax");
-DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3");
-DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
+#ifdef CONFIG_PARAVIRT_XXL
+DEF_NATIVE(irq, irq_disable, "cli");
+DEF_NATIVE(irq, irq_enable, "sti");
+DEF_NATIVE(irq, restore_fl, "push %eax; popf");
+DEF_NATIVE(irq, save_fl, "pushf; pop %eax");
+DEF_NATIVE(cpu, iret, "iret");
+DEF_NATIVE(mmu, read_cr2, "mov %cr2, %eax");
+DEF_NATIVE(mmu, write_cr3, "mov %eax, %cr3");
+DEF_NATIVE(mmu, read_cr3, "mov %cr3, %eax");
+#endif
#if defined(CONFIG_PARAVIRT_SPINLOCKS)
-DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%eax)");
-DEF_NATIVE(pv_lock_ops, vcpu_is_preempted, "xor %eax, %eax");
+DEF_NATIVE(lock, queued_spin_unlock, "movb $0, (%eax)");
+DEF_NATIVE(lock, vcpu_is_preempted, "xor %eax, %eax");
#endif
unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
@@ -30,53 +32,42 @@ unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
extern bool pv_is_native_spin_unlock(void);
extern bool pv_is_native_vcpu_is_preempted(void);
-unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
- unsigned long addr, unsigned len)
+unsigned native_patch(u8 type, void *ibuf, unsigned long addr, unsigned len)
{
- const unsigned char *start, *end;
- unsigned ret;
-
#define PATCH_SITE(ops, x) \
- case PARAVIRT_PATCH(ops.x): \
- start = start_##ops##_##x; \
- end = end_##ops##_##x; \
- goto patch_site
+ case PARAVIRT_PATCH(ops.x): \
+ return paravirt_patch_insns(ibuf, len, start_##ops##_##x, end_##ops##_##x)
+
switch (type) {
- PATCH_SITE(pv_irq_ops, irq_disable);
- PATCH_SITE(pv_irq_ops, irq_enable);
- PATCH_SITE(pv_irq_ops, restore_fl);
- PATCH_SITE(pv_irq_ops, save_fl);
- PATCH_SITE(pv_cpu_ops, iret);
- PATCH_SITE(pv_mmu_ops, read_cr2);
- PATCH_SITE(pv_mmu_ops, read_cr3);
- PATCH_SITE(pv_mmu_ops, write_cr3);
+#ifdef CONFIG_PARAVIRT_XXL
+ PATCH_SITE(irq, irq_disable);
+ PATCH_SITE(irq, irq_enable);
+ PATCH_SITE(irq, restore_fl);
+ PATCH_SITE(irq, save_fl);
+ PATCH_SITE(cpu, iret);
+ PATCH_SITE(mmu, read_cr2);
+ PATCH_SITE(mmu, read_cr3);
+ PATCH_SITE(mmu, write_cr3);
+#endif
#if defined(CONFIG_PARAVIRT_SPINLOCKS)
- case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
- if (pv_is_native_spin_unlock()) {
- start = start_pv_lock_ops_queued_spin_unlock;
- end = end_pv_lock_ops_queued_spin_unlock;
- goto patch_site;
- }
- goto patch_default;
+ case PARAVIRT_PATCH(lock.queued_spin_unlock):
+ if (pv_is_native_spin_unlock())
+ return paravirt_patch_insns(ibuf, len,
+ start_lock_queued_spin_unlock,
+ end_lock_queued_spin_unlock);
+ break;
- case PARAVIRT_PATCH(pv_lock_ops.vcpu_is_preempted):
- if (pv_is_native_vcpu_is_preempted()) {
- start = start_pv_lock_ops_vcpu_is_preempted;
- end = end_pv_lock_ops_vcpu_is_preempted;
- goto patch_site;
- }
- goto patch_default;
+ case PARAVIRT_PATCH(lock.vcpu_is_preempted):
+ if (pv_is_native_vcpu_is_preempted())
+ return paravirt_patch_insns(ibuf, len,
+ start_lock_vcpu_is_preempted,
+ end_lock_vcpu_is_preempted);
+ break;
#endif
default:
-patch_default: __maybe_unused
- ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
- break;
-
-patch_site:
- ret = paravirt_patch_insns(ibuf, len, start, end);
break;
}
#undef PATCH_SITE
- return ret;
+ return paravirt_patch_default(type, ibuf, addr, len);
}
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index 9cb98f7b07c9..7ca9cb726f4d 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -3,24 +3,26 @@
#include <asm/asm-offsets.h>
#include <linux/stringify.h>
-DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
-DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
-DEF_NATIVE(pv_irq_ops, restore_fl, "pushq %rdi; popfq");
-DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
-DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
-DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
-DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
-DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");
+#ifdef CONFIG_PARAVIRT_XXL
+DEF_NATIVE(irq, irq_disable, "cli");
+DEF_NATIVE(irq, irq_enable, "sti");
+DEF_NATIVE(irq, restore_fl, "pushq %rdi; popfq");
+DEF_NATIVE(irq, save_fl, "pushfq; popq %rax");
+DEF_NATIVE(mmu, read_cr2, "movq %cr2, %rax");
+DEF_NATIVE(mmu, read_cr3, "movq %cr3, %rax");
+DEF_NATIVE(mmu, write_cr3, "movq %rdi, %cr3");
+DEF_NATIVE(cpu, wbinvd, "wbinvd");
-DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq");
-DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs");
+DEF_NATIVE(cpu, usergs_sysret64, "swapgs; sysretq");
+DEF_NATIVE(cpu, swapgs, "swapgs");
+#endif
DEF_NATIVE(, mov32, "mov %edi, %eax");
DEF_NATIVE(, mov64, "mov %rdi, %rax");
#if defined(CONFIG_PARAVIRT_SPINLOCKS)
-DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%rdi)");
-DEF_NATIVE(pv_lock_ops, vcpu_is_preempted, "xor %eax, %eax");
+DEF_NATIVE(lock, queued_spin_unlock, "movb $0, (%rdi)");
+DEF_NATIVE(lock, vcpu_is_preempted, "xor %eax, %eax");
#endif
unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
@@ -38,55 +40,44 @@ unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
extern bool pv_is_native_spin_unlock(void);
extern bool pv_is_native_vcpu_is_preempted(void);
-unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
- unsigned long addr, unsigned len)
+unsigned native_patch(u8 type, void *ibuf, unsigned long addr, unsigned len)
{
- const unsigned char *start, *end;
- unsigned ret;
-
#define PATCH_SITE(ops, x) \
- case PARAVIRT_PATCH(ops.x): \
- start = start_##ops##_##x; \
- end = end_##ops##_##x; \
- goto patch_site
- switch(type) {
- PATCH_SITE(pv_irq_ops, restore_fl);
- PATCH_SITE(pv_irq_ops, save_fl);
- PATCH_SITE(pv_irq_ops, irq_enable);
- PATCH_SITE(pv_irq_ops, irq_disable);
- PATCH_SITE(pv_cpu_ops, usergs_sysret64);
- PATCH_SITE(pv_cpu_ops, swapgs);
- PATCH_SITE(pv_mmu_ops, read_cr2);
- PATCH_SITE(pv_mmu_ops, read_cr3);
- PATCH_SITE(pv_mmu_ops, write_cr3);
- PATCH_SITE(pv_cpu_ops, wbinvd);
-#if defined(CONFIG_PARAVIRT_SPINLOCKS)
- case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
- if (pv_is_native_spin_unlock()) {
- start = start_pv_lock_ops_queued_spin_unlock;
- end = end_pv_lock_ops_queued_spin_unlock;
- goto patch_site;
- }
- goto patch_default;
+ case PARAVIRT_PATCH(ops.x): \
+ return paravirt_patch_insns(ibuf, len, start_##ops##_##x, end_##ops##_##x)
- case PARAVIRT_PATCH(pv_lock_ops.vcpu_is_preempted):
- if (pv_is_native_vcpu_is_preempted()) {
- start = start_pv_lock_ops_vcpu_is_preempted;
- end = end_pv_lock_ops_vcpu_is_preempted;
- goto patch_site;
- }
- goto patch_default;
+ switch (type) {
+#ifdef CONFIG_PARAVIRT_XXL
+ PATCH_SITE(irq, restore_fl);
+ PATCH_SITE(irq, save_fl);
+ PATCH_SITE(irq, irq_enable);
+ PATCH_SITE(irq, irq_disable);
+ PATCH_SITE(cpu, usergs_sysret64);
+ PATCH_SITE(cpu, swapgs);
+ PATCH_SITE(cpu, wbinvd);
+ PATCH_SITE(mmu, read_cr2);
+ PATCH_SITE(mmu, read_cr3);
+ PATCH_SITE(mmu, write_cr3);
#endif
+#if defined(CONFIG_PARAVIRT_SPINLOCKS)
+ case PARAVIRT_PATCH(lock.queued_spin_unlock):
+ if (pv_is_native_spin_unlock())
+ return paravirt_patch_insns(ibuf, len,
+ start_lock_queued_spin_unlock,
+ end_lock_queued_spin_unlock);
+ break;
- default:
-patch_default: __maybe_unused
- ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
+ case PARAVIRT_PATCH(lock.vcpu_is_preempted):
+ if (pv_is_native_vcpu_is_preempted())
+ return paravirt_patch_insns(ibuf, len,
+ start_lock_vcpu_is_preempted,
+ end_lock_vcpu_is_preempted);
break;
+#endif
-patch_site:
- ret = paravirt_patch_insns(ibuf, len, start, end);
+ default:
break;
}
#undef PATCH_SITE
- return ret;
+ return paravirt_patch_default(type, ibuf, addr, len);
}
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index 661583662430..71c0b01d93b1 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -42,10 +42,8 @@ IOMMU_INIT_FINISH(pci_swiotlb_detect_override,
int __init pci_swiotlb_detect_4gb(void)
{
/* don't initialize swiotlb if iommu=off (no_iommu=1) */
-#ifdef CONFIG_X86_64
if (!no_iommu && max_possible_pfn > MAX_DMA32_PFN)
swiotlb = 1;
-#endif
/*
* If SME is active then swiotlb will be set to 1 so that bounce
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 2924fd447e61..5046a3c9dec2 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -59,7 +59,7 @@
#include <asm/intel_rdt_sched.h>
#include <asm/proto.h>
-void __show_regs(struct pt_regs *regs, int all)
+void __show_regs(struct pt_regs *regs, enum show_regs_mode mode)
{
unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
unsigned long d0, d1, d2, d3, d6, d7;
@@ -85,7 +85,7 @@ void __show_regs(struct pt_regs *regs, int all)
printk(KERN_DEFAULT "DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x EFLAGS: %08lx\n",
(u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss, regs->flags);
- if (!all)
+ if (mode != SHOW_REGS_ALL)
return;
cr0 = read_cr0();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index a451bc374b9b..31b4755369f0 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -54,15 +54,14 @@
#include <asm/vdso.h>
#include <asm/intel_rdt_sched.h>
#include <asm/unistd.h>
+#include <asm/fsgsbase.h>
#ifdef CONFIG_IA32_EMULATION
/* Not included via unistd.h */
#include <asm/unistd_32_ia32.h>
#endif
-__visible DEFINE_PER_CPU(unsigned long, rsp_scratch);
-
/* Prints also some state that isn't saved in the pt_regs */
-void __show_regs(struct pt_regs *regs, int all)
+void __show_regs(struct pt_regs *regs, enum show_regs_mode mode)
{
unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
unsigned long d0, d1, d2, d3, d6, d7;
@@ -87,9 +86,17 @@ void __show_regs(struct pt_regs *regs, int all)
printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
regs->r13, regs->r14, regs->r15);
- if (!all)
+ if (mode == SHOW_REGS_SHORT)
return;
+ if (mode == SHOW_REGS_USER) {
+ rdmsrl(MSR_FS_BASE, fs);
+ rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
+ printk(KERN_DEFAULT "FS: %016lx GS: %016lx\n",
+ fs, shadowgs);
+ return;
+ }
+
asm("movl %%ds,%0" : "=r" (ds));
asm("movl %%cs,%0" : "=r" (cs));
asm("movl %%es,%0" : "=r" (es));
@@ -278,6 +285,138 @@ static __always_inline void load_seg_legacy(unsigned short prev_index,
}
}
+static __always_inline void x86_fsgsbase_load(struct thread_struct *prev,
+ struct thread_struct *next)
+{
+ load_seg_legacy(prev->fsindex, prev->fsbase,
+ next->fsindex, next->fsbase, FS);
+ load_seg_legacy(prev->gsindex, prev->gsbase,
+ next->gsindex, next->gsbase, GS);
+}
+
+static unsigned long x86_fsgsbase_read_task(struct task_struct *task,
+ unsigned short selector)
+{
+ unsigned short idx = selector >> 3;
+ unsigned long base;
+
+ if (likely((selector & SEGMENT_TI_MASK) == 0)) {
+ if (unlikely(idx >= GDT_ENTRIES))
+ return 0;
+
+ /*
+ * There are no user segments in the GDT with nonzero bases
+ * other than the TLS segments.
+ */
+ if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
+ return 0;
+
+ idx -= GDT_ENTRY_TLS_MIN;
+ base = get_desc_base(&task->thread.tls_array[idx]);
+ } else {
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ struct ldt_struct *ldt;
+
+ /*
+ * If performance here mattered, we could protect the LDT
+ * with RCU. This is a slow path, though, so we can just
+ * take the mutex.
+ */
+ mutex_lock(&task->mm->context.lock);
+ ldt = task->mm->context.ldt;
+ if (unlikely(idx >= ldt->nr_entries))
+ base = 0;
+ else
+ base = get_desc_base(ldt->entries + idx);
+ mutex_unlock(&task->mm->context.lock);
+#else
+ base = 0;
+#endif
+ }
+
+ return base;
+}
+
+void x86_fsbase_write_cpu(unsigned long fsbase)
+{
+ /*
+ * Set the selector to 0 as a notion, that the segment base is
+ * overwritten, which will be checked for skipping the segment load
+ * during context switch.
+ */
+ loadseg(FS, 0);
+ wrmsrl(MSR_FS_BASE, fsbase);
+}
+
+void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
+{
+ /* Set the selector to 0 for the same reason as %fs above. */
+ loadseg(GS, 0);
+ wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
+}
+
+unsigned long x86_fsbase_read_task(struct task_struct *task)
+{
+ unsigned long fsbase;
+
+ if (task == current)
+ fsbase = x86_fsbase_read_cpu();
+ else if (task->thread.fsindex == 0)
+ fsbase = task->thread.fsbase;
+ else
+ fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex);
+
+ return fsbase;
+}
+
+unsigned long x86_gsbase_read_task(struct task_struct *task)
+{
+ unsigned long gsbase;
+
+ if (task == current)
+ gsbase = x86_gsbase_read_cpu_inactive();
+ else if (task->thread.gsindex == 0)
+ gsbase = task->thread.gsbase;
+ else
+ gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex);
+
+ return gsbase;
+}
+
+int x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase)
+{
+ /*
+ * Not strictly needed for %fs, but do it for symmetry
+ * with %gs
+ */
+ if (unlikely(fsbase >= TASK_SIZE_MAX))
+ return -EPERM;
+
+ preempt_disable();
+ task->thread.fsbase = fsbase;
+ if (task == current)
+ x86_fsbase_write_cpu(fsbase);
+ task->thread.fsindex = 0;
+ preempt_enable();
+
+ return 0;
+}
+
+int x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase)
+{
+ if (unlikely(gsbase >= TASK_SIZE_MAX))
+ return -EPERM;
+
+ preempt_disable();
+ task->thread.gsbase = gsbase;
+ if (task == current)
+ x86_gsbase_write_cpu_inactive(gsbase);
+ task->thread.gsindex = 0;
+ preempt_enable();
+
+ return 0;
+}
+
int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
unsigned long arg, struct task_struct *p, unsigned long tls)
{
@@ -465,10 +604,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
if (unlikely(next->ds | prev->ds))
loadsegment(ds, next->ds);
- load_seg_legacy(prev->fsindex, prev->fsbase,
- next->fsindex, next->fsbase, FS);
- load_seg_legacy(prev->gsindex, prev->gsbase,
- next->gsindex, next->gsbase, GS);
+ x86_fsgsbase_load(prev, next);
switch_fpu_finish(next_fpu, cpu);
@@ -619,54 +755,25 @@ static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
{
int ret = 0;
- int doit = task == current;
- int cpu;
switch (option) {
- case ARCH_SET_GS:
- if (arg2 >= TASK_SIZE_MAX)
- return -EPERM;
- cpu = get_cpu();
- task->thread.gsindex = 0;
- task->thread.gsbase = arg2;
- if (doit) {
- load_gs_index(0);
- ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, arg2);
- }
- put_cpu();
+ case ARCH_SET_GS: {
+ ret = x86_gsbase_write_task(task, arg2);
break;
- case ARCH_SET_FS:
- /* Not strictly needed for fs, but do it for symmetry
- with gs */
- if (arg2 >= TASK_SIZE_MAX)
- return -EPERM;
- cpu = get_cpu();
- task->thread.fsindex = 0;
- task->thread.fsbase = arg2;
- if (doit) {
- /* set the selector to 0 to not confuse __switch_to */
- loadsegment(fs, 0);
- ret = wrmsrl_safe(MSR_FS_BASE, arg2);
- }
- put_cpu();
+ }
+ case ARCH_SET_FS: {
+ ret = x86_fsbase_write_task(task, arg2);
break;
+ }
case ARCH_GET_FS: {
- unsigned long base;
+ unsigned long base = x86_fsbase_read_task(task);
- if (doit)
- rdmsrl(MSR_FS_BASE, base);
- else
- base = task->thread.fsbase;
ret = put_user(base, (unsigned long __user *)arg2);
break;
}
case ARCH_GET_GS: {
- unsigned long base;
+ unsigned long base = x86_gsbase_read_task(task);
- if (doit)
- rdmsrl(MSR_KERNEL_GS_BASE, base);
- else
- base = task->thread.gsbase;
ret = put_user(base, (unsigned long __user *)arg2);
break;
}
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index e2ee403865eb..ffae9b9740fd 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -39,6 +39,7 @@
#include <asm/hw_breakpoint.h>
#include <asm/traps.h>
#include <asm/syscall.h>
+#include <asm/fsgsbase.h>
#include "tls.h"
@@ -396,12 +397,11 @@ static int putreg(struct task_struct *child,
if (value >= TASK_SIZE_MAX)
return -EIO;
/*
- * When changing the segment base, use do_arch_prctl_64
- * to set either thread.fs or thread.fsindex and the
- * corresponding GDT slot.
+ * When changing the FS base, use the same
+ * mechanism as for do_arch_prctl_64().
*/
if (child->thread.fsbase != value)
- return do_arch_prctl_64(child, ARCH_SET_FS, value);
+ return x86_fsbase_write_task(child, value);
return 0;
case offsetof(struct user_regs_struct,gs_base):
/*
@@ -410,7 +410,7 @@ static int putreg(struct task_struct *child,
if (value >= TASK_SIZE_MAX)
return -EIO;
if (child->thread.gsbase != value)
- return do_arch_prctl_64(child, ARCH_SET_GS, value);
+ return x86_gsbase_write_task(child, value);
return 0;
#endif
}
@@ -434,20 +434,10 @@ static unsigned long getreg(struct task_struct *task, unsigned long offset)
return get_flags(task);
#ifdef CONFIG_X86_64
- case offsetof(struct user_regs_struct, fs_base): {
- /*
- * XXX: This will not behave as expected if called on
- * current or if fsindex != 0.
- */
- return task->thread.fsbase;
- }
- case offsetof(struct user_regs_struct, gs_base): {
- /*
- * XXX: This will not behave as expected if called on
- * current or if fsindex != 0.
- */
- return task->thread.gsbase;
- }
+ case offsetof(struct user_regs_struct, fs_base):
+ return x86_fsbase_read_task(task);
+ case offsetof(struct user_regs_struct, gs_base):
+ return x86_gsbase_read_task(task);
#endif
}
@@ -1369,33 +1359,18 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task)
#endif
}
-static void fill_sigtrap_info(struct task_struct *tsk,
- struct pt_regs *regs,
- int error_code, int si_code,
- struct siginfo *info)
+void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
+ int error_code, int si_code)
{
tsk->thread.trap_nr = X86_TRAP_DB;
tsk->thread.error_code = error_code;
- info->si_signo = SIGTRAP;
- info->si_code = si_code;
- info->si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL;
-}
-
-void user_single_step_siginfo(struct task_struct *tsk,
- struct pt_regs *regs,
- struct siginfo *info)
-{
- fill_sigtrap_info(tsk, regs, 0, TRAP_BRKPT, info);
+ /* Send us the fake SIGTRAP */
+ force_sig_fault(SIGTRAP, si_code,
+ user_mode(regs) ? (void __user *)regs->ip : NULL, tsk);
}
-void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
- int error_code, int si_code)
+void user_single_step_report(struct pt_regs *regs)
{
- struct siginfo info;
-
- clear_siginfo(&info);
- fill_sigtrap_info(tsk, regs, error_code, si_code, &info);
- /* Send us the fake SIGTRAP */
- force_sig_info(SIGTRAP, &info, tsk);
+ send_sigtrap(current, regs, 0, TRAP_BRKPT);
}
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index b4866badb235..7005f89bf3b2 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1251,7 +1251,7 @@ void __init setup_arch(char **cmdline_p)
x86_init.hyper.guest_late_init();
e820__reserve_resources();
- e820__register_nosave_regions(max_low_pfn);
+ e820__register_nosave_regions(max_pfn);
x86_init.resources.reserve_resources();
@@ -1281,6 +1281,23 @@ void __init setup_arch(char **cmdline_p)
unwind_init();
}
+/*
+ * From boot protocol 2.14 onwards we expect the bootloader to set the
+ * version to "0x8000 | <used version>". In case we find a version >= 2.14
+ * without the 0x8000 we assume the boot loader supports 2.13 only and
+ * reset the version accordingly. The 0x8000 flag is removed in any case.
+ */
+void __init x86_verify_bootdata_version(void)
+{
+ if (boot_params.hdr.version & VERSION_WRITTEN)
+ boot_params.hdr.version &= ~VERSION_WRITTEN;
+ else if (boot_params.hdr.version >= 0x020e)
+ boot_params.hdr.version = 0x020d;
+
+ if (boot_params.hdr.version < 0x020e)
+ boot_params.hdr.acpi_rsdp_addr = 0;
+}
+
#ifdef CONFIG_X86_32
static struct resource video_ram_resource = {
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index f02ecaf97904..5369d7fac797 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -676,6 +676,7 @@ static void __init smp_quirk_init_udelay(void)
/* if modern processor, use no delay */
if (((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 6)) ||
+ ((boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) && (boot_cpu_data.x86 >= 0x18)) ||
((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF))) {
init_udelay = 0;
return;
@@ -1592,7 +1593,8 @@ static inline void mwait_play_dead(void)
void *mwait_ptr;
int i;
- if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
+ boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
return;
if (!this_cpu_has(X86_FEATURE_MWAIT))
return;
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index be01328eb755..0e14f6c0d35e 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -10,6 +10,7 @@
*
*/
+#include <linux/clocksource.h>
#include <linux/clockchips.h>
#include <linux/interrupt.h>
#include <linux/irq.h>
@@ -25,7 +26,7 @@
#include <asm/time.h>
#ifdef CONFIG_X86_64
-__visible volatile unsigned long jiffies __cacheline_aligned = INITIAL_JIFFIES;
+__visible volatile unsigned long jiffies __cacheline_aligned_in_smp = INITIAL_JIFFIES;
#endif
unsigned long profile_pc(struct pt_regs *regs)
@@ -105,3 +106,24 @@ void __init time_init(void)
{
late_time_init = x86_late_time_init;
}
+
+/*
+ * Sanity check the vdso related archdata content.
+ */
+void clocksource_arch_init(struct clocksource *cs)
+{
+ if (cs->archdata.vclock_mode == VCLOCK_NONE)
+ return;
+
+ if (cs->archdata.vclock_mode > VCLOCK_MAX) {
+ pr_warn("clocksource %s registered with invalid vclock_mode %d. Disabling vclock.\n",
+ cs->name, cs->archdata.vclock_mode);
+ cs->archdata.vclock_mode = VCLOCK_NONE;
+ }
+
+ if (cs->mask != CLOCKSOURCE_MASK(64)) {
+ pr_warn("clocksource %s registered with invalid mask %016llx. Disabling vclock.\n",
+ cs->name, cs->mask);
+ cs->archdata.vclock_mode = VCLOCK_NONE;
+ }
+}
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
index 12cbe2b88c0f..738bf42b0218 100644
--- a/arch/x86/kernel/topology.c
+++ b/arch/x86/kernel/topology.c
@@ -111,8 +111,10 @@ int arch_register_cpu(int num)
/*
* Currently CPU0 is only hotpluggable on Intel platforms. Other
* vendors can add hotplug support later.
+ * Xen PV guests don't support CPU0 hotplug at all.
*/
- if (c->x86_vendor != X86_VENDOR_INTEL)
+ if (c->x86_vendor != X86_VENDOR_INTEL ||
+ boot_cpu_has(X86_FEATURE_XENPV))
cpu0_hotpluggable = 0;
/*
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index e6db475164ed..8f6dcd88202e 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -189,7 +189,7 @@ int fixup_bug(struct pt_regs *regs, int trapnr)
}
static nokprobe_inline int
-do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
+do_trap_no_signal(struct task_struct *tsk, int trapnr, const char *str,
struct pt_regs *regs, long error_code)
{
if (v8086_mode(regs)) {
@@ -202,11 +202,8 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
error_code, trapnr))
return 0;
}
- return -1;
- }
-
- if (!user_mode(regs)) {
- if (fixup_exception(regs, trapnr))
+ } else if (!user_mode(regs)) {
+ if (fixup_exception(regs, trapnr, error_code, 0))
return 0;
tsk->thread.error_code = error_code;
@@ -214,49 +211,6 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
die(str, regs, error_code);
}
- return -1;
-}
-
-static siginfo_t *fill_trap_info(struct pt_regs *regs, int signr, int trapnr,
- siginfo_t *info)
-{
- unsigned long siaddr;
- int sicode;
-
- switch (trapnr) {
- default:
- return SEND_SIG_PRIV;
-
- case X86_TRAP_DE:
- sicode = FPE_INTDIV;
- siaddr = uprobe_get_trap_addr(regs);
- break;
- case X86_TRAP_UD:
- sicode = ILL_ILLOPN;
- siaddr = uprobe_get_trap_addr(regs);
- break;
- case X86_TRAP_AC:
- sicode = BUS_ADRALN;
- siaddr = 0;
- break;
- }
-
- info->si_signo = signr;
- info->si_errno = 0;
- info->si_code = sicode;
- info->si_addr = (void __user *)siaddr;
- return info;
-}
-
-static void
-do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
- long error_code, siginfo_t *info)
-{
- struct task_struct *tsk = current;
-
-
- if (!do_trap_no_signal(tsk, trapnr, str, regs, error_code))
- return;
/*
* We want error_code and trap_nr set for userspace faults and
* kernelspace faults which result in die(), but not
@@ -269,24 +223,45 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
tsk->thread.error_code = error_code;
tsk->thread.trap_nr = trapnr;
+ return -1;
+}
+
+static void show_signal(struct task_struct *tsk, int signr,
+ const char *type, const char *desc,
+ struct pt_regs *regs, long error_code)
+{
if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
printk_ratelimit()) {
- pr_info("%s[%d] trap %s ip:%lx sp:%lx error:%lx",
- tsk->comm, tsk->pid, str,
+ pr_info("%s[%d] %s%s ip:%lx sp:%lx error:%lx",
+ tsk->comm, task_pid_nr(tsk), type, desc,
regs->ip, regs->sp, error_code);
print_vma_addr(KERN_CONT " in ", regs->ip);
pr_cont("\n");
}
+}
- force_sig_info(signr, info ?: SEND_SIG_PRIV, tsk);
+static void
+do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
+ long error_code, int sicode, void __user *addr)
+{
+ struct task_struct *tsk = current;
+
+
+ if (!do_trap_no_signal(tsk, trapnr, str, regs, error_code))
+ return;
+
+ show_signal(tsk, signr, "trap ", str, regs, error_code);
+
+ if (!sicode)
+ force_sig(signr, tsk);
+ else
+ force_sig_fault(signr, sicode, addr, tsk);
}
NOKPROBE_SYMBOL(do_trap);
static void do_error_trap(struct pt_regs *regs, long error_code, char *str,
- unsigned long trapnr, int signr)
+ unsigned long trapnr, int signr, int sicode, void __user *addr)
{
- siginfo_t info;
-
RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
/*
@@ -299,26 +274,26 @@ static void do_error_trap(struct pt_regs *regs, long error_code, char *str,
if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) !=
NOTIFY_STOP) {
cond_local_irq_enable(regs);
- clear_siginfo(&info);
- do_trap(trapnr, signr, str, regs, error_code,
- fill_trap_info(regs, signr, trapnr, &info));
+ do_trap(trapnr, signr, str, regs, error_code, sicode, addr);
}
}
-#define DO_ERROR(trapnr, signr, str, name) \
-dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
-{ \
- do_error_trap(regs, error_code, str, trapnr, signr); \
+#define IP ((void __user *)uprobe_get_trap_addr(regs))
+#define DO_ERROR(trapnr, signr, sicode, addr, str, name) \
+dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
+{ \
+ do_error_trap(regs, error_code, str, trapnr, signr, sicode, addr); \
}
-DO_ERROR(X86_TRAP_DE, SIGFPE, "divide error", divide_error)
-DO_ERROR(X86_TRAP_OF, SIGSEGV, "overflow", overflow)
-DO_ERROR(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op)
-DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun",coprocessor_segment_overrun)
-DO_ERROR(X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS)
-DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present)
-DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment)
-DO_ERROR(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check)
+DO_ERROR(X86_TRAP_DE, SIGFPE, FPE_INTDIV, IP, "divide error", divide_error)
+DO_ERROR(X86_TRAP_OF, SIGSEGV, 0, NULL, "overflow", overflow)
+DO_ERROR(X86_TRAP_UD, SIGILL, ILL_ILLOPN, IP, "invalid opcode", invalid_op)
+DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, 0, NULL, "coprocessor segment overrun", coprocessor_segment_overrun)
+DO_ERROR(X86_TRAP_TS, SIGSEGV, 0, NULL, "invalid TSS", invalid_TSS)
+DO_ERROR(X86_TRAP_NP, SIGBUS, 0, NULL, "segment not present", segment_not_present)
+DO_ERROR(X86_TRAP_SS, SIGBUS, 0, NULL, "stack segment", stack_segment)
+DO_ERROR(X86_TRAP_AC, SIGBUS, BUS_ADRALN, NULL, "alignment check", alignment_check)
+#undef IP
#ifdef CONFIG_VMAP_STACK
__visible void __noreturn handle_stack_overflow(const char *message,
@@ -383,6 +358,10 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
* we won't enable interupts or schedule before we invoke
* general_protection, so nothing will clobber the stack
* frame we just set up.
+ *
+ * We will enter general_protection with kernel GSBASE,
+ * which is what the stub expects, given that the faulting
+ * RIP will be the IRET instruction.
*/
regs->ip = (unsigned long)general_protection;
regs->sp = (unsigned long)&gpregs->orig_ax;
@@ -455,7 +434,6 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
{
const struct mpx_bndcsr *bndcsr;
- siginfo_t *info;
RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
if (notify_die(DIE_TRAP, "bounds", regs, error_code,
@@ -493,8 +471,11 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
goto exit_trap;
break; /* Success, it was handled */
case 1: /* Bound violation. */
- info = mpx_generate_siginfo(regs);
- if (IS_ERR(info)) {
+ {
+ struct task_struct *tsk = current;
+ struct mpx_fault_info mpx;
+
+ if (mpx_fault_info(&mpx, regs)) {
/*
* We failed to decode the MPX instruction. Act as if
* the exception was not caused by MPX.
@@ -503,14 +484,20 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
}
/*
* Success, we decoded the instruction and retrieved
- * an 'info' containing the address being accessed
+ * an 'mpx' containing the address being accessed
* which caused the exception. This information
* allows and application to possibly handle the
* #BR exception itself.
*/
- do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, info);
- kfree(info);
+ if (!do_trap_no_signal(tsk, X86_TRAP_BR, "bounds", regs,
+ error_code))
+ break;
+
+ show_signal(tsk, SIGSEGV, "trap ", "bounds", regs, error_code);
+
+ force_sig_bnderr(mpx.addr, mpx.lower, mpx.upper);
break;
+ }
case 0: /* No exception caused by Intel MPX operations. */
goto exit_trap;
default:
@@ -527,12 +514,13 @@ exit_trap:
* up here if the kernel has MPX turned off at compile
* time..
*/
- do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, NULL);
+ do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, 0, NULL);
}
dotraplinkage void
do_general_protection(struct pt_regs *regs, long error_code)
{
+ const char *desc = "general protection fault";
struct task_struct *tsk;
RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
@@ -551,30 +539,33 @@ do_general_protection(struct pt_regs *regs, long error_code)
tsk = current;
if (!user_mode(regs)) {
- if (fixup_exception(regs, X86_TRAP_GP))
+ if (fixup_exception(regs, X86_TRAP_GP, error_code, 0))
return;
tsk->thread.error_code = error_code;
tsk->thread.trap_nr = X86_TRAP_GP;
- if (notify_die(DIE_GPF, "general protection fault", regs, error_code,
+
+ /*
+ * To be potentially processing a kprobe fault and to
+ * trust the result from kprobe_running(), we have to
+ * be non-preemptible.
+ */
+ if (!preemptible() && kprobe_running() &&
+ kprobe_fault_handler(regs, X86_TRAP_GP))
+ return;
+
+ if (notify_die(DIE_GPF, desc, regs, error_code,
X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP)
- die("general protection fault", regs, error_code);
+ die(desc, regs, error_code);
return;
}
tsk->thread.error_code = error_code;
tsk->thread.trap_nr = X86_TRAP_GP;
- if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
- printk_ratelimit()) {
- pr_info("%s[%d] general protection ip:%lx sp:%lx error:%lx",
- tsk->comm, task_pid_nr(tsk),
- regs->ip, regs->sp, error_code);
- print_vma_addr(KERN_CONT " in ", regs->ip);
- pr_cont("\n");
- }
+ show_signal(tsk, SIGSEGV, "", desc, regs, error_code);
- force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk);
+ force_sig(SIGSEGV, tsk);
}
NOKPROBE_SYMBOL(do_general_protection);
@@ -617,7 +608,7 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
goto exit;
cond_local_irq_enable(regs);
- do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL);
+ do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, 0, NULL);
cond_local_irq_disable(regs);
exit:
@@ -831,14 +822,14 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
{
struct task_struct *task = current;
struct fpu *fpu = &task->thread.fpu;
- siginfo_t info;
+ int si_code;
char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" :
"simd exception";
cond_local_irq_enable(regs);
if (!user_mode(regs)) {
- if (fixup_exception(regs, trapnr))
+ if (fixup_exception(regs, trapnr, error_code, 0))
return;
task->thread.error_code = error_code;
@@ -857,18 +848,14 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
task->thread.trap_nr = trapnr;
task->thread.error_code = error_code;
- clear_siginfo(&info);
- info.si_signo = SIGFPE;
- info.si_errno = 0;
- info.si_addr = (void __user *)uprobe_get_trap_addr(regs);
-
- info.si_code = fpu__exception_code(fpu, trapnr);
+ si_code = fpu__exception_code(fpu, trapnr);
/* Retry when we get spurious exceptions: */
- if (!info.si_code)
+ if (!si_code)
return;
- force_sig_info(SIGFPE, &info, task);
+ force_sig_fault(SIGFPE, si_code,
+ (void __user *)uprobe_get_trap_addr(regs), task);
}
dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
@@ -928,20 +915,13 @@ NOKPROBE_SYMBOL(do_device_not_available);
#ifdef CONFIG_X86_32
dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
{
- siginfo_t info;
-
RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
local_irq_enable();
- clear_siginfo(&info);
- info.si_signo = SIGILL;
- info.si_errno = 0;
- info.si_code = ILL_BADSTK;
- info.si_addr = NULL;
if (notify_die(DIE_TRAP, "iret exception", regs, error_code,
X86_TRAP_IRET, SIGILL) != NOTIFY_STOP) {
do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code,
- &info);
+ ILL_BADSTK, (void __user *)NULL);
}
}
#endif
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 1463468ba9a0..e9f777bfed40 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -26,6 +26,7 @@
#include <asm/apic.h>
#include <asm/intel-family.h>
#include <asm/i8259.h>
+#include <asm/uv/uv.h>
unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */
EXPORT_SYMBOL(cpu_khz);
@@ -57,7 +58,7 @@ struct cyc2ns {
static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);
-void cyc2ns_read_begin(struct cyc2ns_data *data)
+void __always_inline cyc2ns_read_begin(struct cyc2ns_data *data)
{
int seq, idx;
@@ -74,7 +75,7 @@ void cyc2ns_read_begin(struct cyc2ns_data *data)
} while (unlikely(seq != this_cpu_read(cyc2ns.seq.sequence)));
}
-void cyc2ns_read_end(void)
+void __always_inline cyc2ns_read_end(void)
{
preempt_enable_notrace();
}
@@ -103,7 +104,7 @@ void cyc2ns_read_end(void)
* -johnstul@us.ibm.com "math is hard, lets go shopping!"
*/
-static inline unsigned long long cycles_2_ns(unsigned long long cyc)
+static __always_inline unsigned long long cycles_2_ns(unsigned long long cyc)
{
struct cyc2ns_data data;
unsigned long long ns;
@@ -246,7 +247,7 @@ unsigned long long sched_clock(void)
bool using_native_sched_clock(void)
{
- return pv_time_ops.sched_clock == native_sched_clock;
+ return pv_ops.time.sched_clock == native_sched_clock;
}
#else
unsigned long long
@@ -635,7 +636,7 @@ unsigned long native_calibrate_tsc(void)
case INTEL_FAM6_KABYLAKE_DESKTOP:
crystal_khz = 24000; /* 24.0 MHz */
break;
- case INTEL_FAM6_ATOM_DENVERTON:
+ case INTEL_FAM6_ATOM_GOLDMONT_X:
crystal_khz = 25000; /* 25.0 MHz */
break;
case INTEL_FAM6_ATOM_GOLDMONT:
@@ -1415,7 +1416,7 @@ static bool __init determine_cpu_tsc_frequencies(bool early)
static unsigned long __init get_loops_per_jiffy(void)
{
- unsigned long lpj = tsc_khz * KHZ;
+ u64 lpj = (u64)tsc_khz * KHZ;
do_div(lpj, HZ);
return lpj;
@@ -1433,6 +1434,9 @@ void __init tsc_early_init(void)
{
if (!boot_cpu_has(X86_FEATURE_TSC))
return;
+ /* Don't change UV TSC multi-chassis synchronization */
+ if (is_early_uv_system())
+ return;
if (!determine_cpu_tsc_frequencies(true))
return;
loops_per_jiffy = get_loops_per_jiffy();
diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c
index 27ef714d886c..3d0e9aeea7c8 100644
--- a/arch/x86/kernel/tsc_msr.c
+++ b/arch/x86/kernel/tsc_msr.c
@@ -59,12 +59,12 @@ static const struct freq_desc freq_desc_ann = {
};
static const struct x86_cpu_id tsc_msr_cpu_ids[] = {
- INTEL_CPU_FAM6(ATOM_PENWELL, freq_desc_pnw),
- INTEL_CPU_FAM6(ATOM_CLOVERVIEW, freq_desc_clv),
- INTEL_CPU_FAM6(ATOM_SILVERMONT1, freq_desc_byt),
+ INTEL_CPU_FAM6(ATOM_SALTWELL_MID, freq_desc_pnw),
+ INTEL_CPU_FAM6(ATOM_SALTWELL_TABLET, freq_desc_clv),
+ INTEL_CPU_FAM6(ATOM_SILVERMONT, freq_desc_byt),
+ INTEL_CPU_FAM6(ATOM_SILVERMONT_MID, freq_desc_tng),
INTEL_CPU_FAM6(ATOM_AIRMONT, freq_desc_cht),
- INTEL_CPU_FAM6(ATOM_MERRIFIELD, freq_desc_tng),
- INTEL_CPU_FAM6(ATOM_MOOREFIELD, freq_desc_ann),
+ INTEL_CPU_FAM6(ATOM_AIRMONT_MID, freq_desc_ann),
{}
};
diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c
index ff20b35e98dd..f8f3cfda01ae 100644
--- a/arch/x86/kernel/umip.c
+++ b/arch/x86/kernel/umip.c
@@ -271,19 +271,13 @@ static int emulate_umip_insn(struct insn *insn, int umip_inst,
*/
static void force_sig_info_umip_fault(void __user *addr, struct pt_regs *regs)
{
- siginfo_t info;
struct task_struct *tsk = current;
tsk->thread.cr2 = (unsigned long)addr;
tsk->thread.error_code = X86_PF_USER | X86_PF_WRITE;
tsk->thread.trap_nr = X86_TRAP_PF;
- clear_siginfo(&info);
- info.si_signo = SIGSEGV;
- info.si_errno = 0;
- info.si_code = SEGV_MAPERR;
- info.si_addr = addr;
- force_sig_info(SIGSEGV, &info, tsk);
+ force_sig_fault(SIGSEGV, SEGV_MAPERR, addr, tsk);
if (!(show_unhandled_signals && unhandled_signal(tsk, SIGSEGV)))
return;
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index deb576b23b7c..843feb94a950 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -1086,7 +1086,7 @@ arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs
pr_err("return address clobbered: pid=%d, %%sp=%#lx, %%ip=%#lx\n",
current->pid, regs->sp, regs->ip);
- force_sig_info(SIGSEGV, SEND_SIG_FORCED, current);
+ force_sig(SIGSEGV, current);
}
return -1;
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 1c03e4aa6474..c2fd39752da8 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -199,7 +199,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
pte_unmap_unlock(pte, ptl);
out:
up_write(&mm->mmap_sem);
- flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, 0UL);
+ flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, PAGE_SHIFT, false);
}
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 8bde0a419f86..0d618ee634ac 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -65,6 +65,23 @@ jiffies_64 = jiffies;
#define ALIGN_ENTRY_TEXT_BEGIN . = ALIGN(PMD_SIZE);
#define ALIGN_ENTRY_TEXT_END . = ALIGN(PMD_SIZE);
+/*
+ * This section contains data which will be mapped as decrypted. Memory
+ * encryption operates on a page basis. Make this section PMD-aligned
+ * to avoid splitting the pages while mapping the section early.
+ *
+ * Note: We use a separate section so that only this section gets
+ * decrypted to avoid exposing more than we wish.
+ */
+#define BSS_DECRYPTED \
+ . = ALIGN(PMD_SIZE); \
+ __start_bss_decrypted = .; \
+ *(.bss..decrypted); \
+ . = ALIGN(PAGE_SIZE); \
+ __start_bss_decrypted_unused = .; \
+ . = ALIGN(PMD_SIZE); \
+ __end_bss_decrypted = .; \
+
#else
#define X86_ALIGN_RODATA_BEGIN
@@ -74,6 +91,7 @@ jiffies_64 = jiffies;
#define ALIGN_ENTRY_TEXT_BEGIN
#define ALIGN_ENTRY_TEXT_END
+#define BSS_DECRYPTED
#endif
@@ -118,16 +136,6 @@ SECTIONS
*(.fixup)
*(.gnu.warning)
-#ifdef CONFIG_X86_64
- . = ALIGN(PAGE_SIZE);
- __entry_trampoline_start = .;
- _entry_trampoline = .;
- *(.entry_trampoline)
- . = ALIGN(PAGE_SIZE);
- __entry_trampoline_end = .;
- ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big");
-#endif
-
#ifdef CONFIG_RETPOLINE
__indirect_thunk_start = .;
*(.text.__x86.indirect_thunk)
@@ -355,6 +363,7 @@ SECTIONS
__bss_start = .;
*(.bss..page_aligned)
*(.bss)
+ BSS_DECRYPTED
. = ALIGN(PAGE_SIZE);
__bss_stop = .;
}
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index 44685fb2a192..1eae5af491c2 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -26,7 +26,7 @@
#define TOPOLOGY_REGISTER_OFFSET 0x10
-#if defined CONFIG_PCI && defined CONFIG_PARAVIRT
+#if defined CONFIG_PCI && defined CONFIG_PARAVIRT_XXL
/*
* Interrupt control on vSMPowered systems:
* ~AC is a shadow of IF. If IF is 'on' AC should be 'off'
@@ -69,17 +69,17 @@ asmlinkage __visible void vsmp_irq_enable(void)
}
PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_enable);
-static unsigned __init vsmp_patch(u8 type, u16 clobbers, void *ibuf,
+static unsigned __init vsmp_patch(u8 type, void *ibuf,
unsigned long addr, unsigned len)
{
switch (type) {
- case PARAVIRT_PATCH(pv_irq_ops.irq_enable):
- case PARAVIRT_PATCH(pv_irq_ops.irq_disable):
- case PARAVIRT_PATCH(pv_irq_ops.save_fl):
- case PARAVIRT_PATCH(pv_irq_ops.restore_fl):
- return paravirt_patch_default(type, clobbers, ibuf, addr, len);
+ case PARAVIRT_PATCH(irq.irq_enable):
+ case PARAVIRT_PATCH(irq.irq_disable):
+ case PARAVIRT_PATCH(irq.save_fl):
+ case PARAVIRT_PATCH(irq.restore_fl):
+ return paravirt_patch_default(type, ibuf, addr, len);
default:
- return native_patch(type, clobbers, ibuf, addr, len);
+ return native_patch(type, ibuf, addr, len);
}
}
@@ -111,11 +111,11 @@ static void __init set_vsmp_pv_ops(void)
if (cap & ctl & (1 << 4)) {
/* Setup irq ops and turn on vSMP IRQ fastpath handling */
- pv_irq_ops.irq_disable = PV_CALLEE_SAVE(vsmp_irq_disable);
- pv_irq_ops.irq_enable = PV_CALLEE_SAVE(vsmp_irq_enable);
- pv_irq_ops.save_fl = PV_CALLEE_SAVE(vsmp_save_fl);
- pv_irq_ops.restore_fl = PV_CALLEE_SAVE(vsmp_restore_fl);
- pv_init_ops.patch = vsmp_patch;
+ pv_ops.irq.irq_disable = PV_CALLEE_SAVE(vsmp_irq_disable);
+ pv_ops.irq.irq_enable = PV_CALLEE_SAVE(vsmp_irq_enable);
+ pv_ops.irq.save_fl = PV_CALLEE_SAVE(vsmp_save_fl);
+ pv_ops.irq.restore_fl = PV_CALLEE_SAVE(vsmp_restore_fl);
+ pv_ops.init.patch = vsmp_patch;
ctl &= ~(1 << 4);
}
writel(ctl, address + 4);
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 2792b5573818..50a2b492fdd6 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -31,7 +31,6 @@ static int __init iommu_init_noop(void) { return 0; }
static void iommu_shutdown_noop(void) { }
static bool __init bool_x86_init_noop(void) { return false; }
static void x86_op_int_noop(int cpu) { }
-static u64 u64_x86_init_noop(void) { return 0; }
/*
* The platform setup functions are preset with the default functions
@@ -96,7 +95,7 @@ struct x86_init_ops x86_init __initdata = {
},
.acpi = {
- .get_root_pointer = u64_x86_init_noop,
+ .get_root_pointer = x86_default_get_root_pointer,
.reduced_hw_early_init = acpi_generic_reduced_hw_init,
},
};
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 106482da6388..34edf198708f 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2711,7 +2711,16 @@ static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt)
edx == X86EMUL_CPUID_VENDOR_AMDisbetterI_edx)
return true;
- /* default: (not Intel, not AMD), apply Intel's stricter rules... */
+ /* Hygon ("HygonGenuine") */
+ if (ebx == X86EMUL_CPUID_VENDOR_HygonGenuine_ebx &&
+ ecx == X86EMUL_CPUID_VENDOR_HygonGenuine_ecx &&
+ edx == X86EMUL_CPUID_VENDOR_HygonGenuine_edx)
+ return true;
+
+ /*
+ * default: (not Intel, not AMD, not Hygon), apply Intel's
+ * stricter rules...
+ */
return false;
}
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 0cefba28c864..fbb0e6df121b 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -548,7 +548,7 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
}
int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
- unsigned long ipi_bitmap_high, int min,
+ unsigned long ipi_bitmap_high, u32 min,
unsigned long icr, int op_64_bit)
{
int i;
@@ -571,18 +571,31 @@ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
rcu_read_lock();
map = rcu_dereference(kvm->arch.apic_map);
+ if (min > map->max_apic_id)
+ goto out;
/* Bits above cluster_size are masked in the caller. */
- for_each_set_bit(i, &ipi_bitmap_low, BITS_PER_LONG) {
- vcpu = map->phys_map[min + i]->vcpu;
- count += kvm_apic_set_irq(vcpu, &irq, NULL);
+ for_each_set_bit(i, &ipi_bitmap_low,
+ min((u32)BITS_PER_LONG, (map->max_apic_id - min + 1))) {
+ if (map->phys_map[min + i]) {
+ vcpu = map->phys_map[min + i]->vcpu;
+ count += kvm_apic_set_irq(vcpu, &irq, NULL);
+ }
}
min += cluster_size;
- for_each_set_bit(i, &ipi_bitmap_high, BITS_PER_LONG) {
- vcpu = map->phys_map[min + i]->vcpu;
- count += kvm_apic_set_irq(vcpu, &irq, NULL);
+
+ if (min > map->max_apic_id)
+ goto out;
+
+ for_each_set_bit(i, &ipi_bitmap_high,
+ min((u32)BITS_PER_LONG, (map->max_apic_id - min + 1))) {
+ if (map->phys_map[min + i]) {
+ vcpu = map->phys_map[min + i]->vcpu;
+ count += kvm_apic_set_irq(vcpu, &irq, NULL);
+ }
}
+out:
rcu_read_unlock();
return count;
}
@@ -1331,9 +1344,8 @@ EXPORT_SYMBOL_GPL(kvm_lapic_reg_read);
static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr)
{
- return kvm_apic_hw_enabled(apic) &&
- addr >= apic->base_address &&
- addr < apic->base_address + LAPIC_MMIO_LENGTH;
+ return addr >= apic->base_address &&
+ addr < apic->base_address + LAPIC_MMIO_LENGTH;
}
static int apic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
@@ -1345,6 +1357,15 @@ static int apic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
if (!apic_mmio_in_range(apic, address))
return -EOPNOTSUPP;
+ if (!kvm_apic_hw_enabled(apic) || apic_x2apic_mode(apic)) {
+ if (!kvm_check_has_quirk(vcpu->kvm,
+ KVM_X86_QUIRK_LAPIC_MMIO_HOLE))
+ return -EOPNOTSUPP;
+
+ memset(data, 0xff, len);
+ return 0;
+ }
+
kvm_lapic_reg_read(apic, offset, len, data);
return 0;
@@ -1904,6 +1925,14 @@ static int apic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
if (!apic_mmio_in_range(apic, address))
return -EOPNOTSUPP;
+ if (!kvm_apic_hw_enabled(apic) || apic_x2apic_mode(apic)) {
+ if (!kvm_check_has_quirk(vcpu->kvm,
+ KVM_X86_QUIRK_LAPIC_MMIO_HOLE))
+ return -EOPNOTSUPP;
+
+ return 0;
+ }
+
/*
* APIC register must be aligned on 128-bits boundary.
* 32/64/128 bits registers must be accessed thru 32 bits.
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a282321329b5..e843ec46609d 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -249,6 +249,17 @@ static u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
*/
static const u64 shadow_nonpresent_or_rsvd_mask_len = 5;
+/*
+ * In some cases, we need to preserve the GFN of a non-present or reserved
+ * SPTE when we usurp the upper five bits of the physical address space to
+ * defend against L1TF, e.g. for MMIO SPTEs. To preserve the GFN, we'll
+ * shift bits of the GFN that overlap with shadow_nonpresent_or_rsvd_mask
+ * left into the reserved bits, i.e. the GFN in the SPTE will be split into
+ * high and low parts. This mask covers the lower bits of the GFN.
+ */
+static u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
+
+
static void mmu_spte_set(u64 *sptep, u64 spte);
static union kvm_mmu_page_role
kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu);
@@ -357,9 +368,7 @@ static bool is_mmio_spte(u64 spte)
static gfn_t get_mmio_spte_gfn(u64 spte)
{
- u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask |
- shadow_nonpresent_or_rsvd_mask;
- u64 gpa = spte & ~mask;
+ u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask;
gpa |= (spte >> shadow_nonpresent_or_rsvd_mask_len)
& shadow_nonpresent_or_rsvd_mask;
@@ -423,6 +432,8 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
static void kvm_mmu_reset_all_pte_masks(void)
{
+ u8 low_phys_bits;
+
shadow_user_mask = 0;
shadow_accessed_mask = 0;
shadow_dirty_mask = 0;
@@ -437,12 +448,17 @@ static void kvm_mmu_reset_all_pte_masks(void)
* appropriate mask to guard against L1TF attacks. Otherwise, it is
* assumed that the CPU is not vulnerable to L1TF.
*/
+ low_phys_bits = boot_cpu_data.x86_phys_bits;
if (boot_cpu_data.x86_phys_bits <
- 52 - shadow_nonpresent_or_rsvd_mask_len)
+ 52 - shadow_nonpresent_or_rsvd_mask_len) {
shadow_nonpresent_or_rsvd_mask =
rsvd_bits(boot_cpu_data.x86_phys_bits -
shadow_nonpresent_or_rsvd_mask_len,
boot_cpu_data.x86_phys_bits - 1);
+ low_phys_bits -= shadow_nonpresent_or_rsvd_mask_len;
+ }
+ shadow_nonpresent_or_rsvd_lower_gfn_mask =
+ GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT);
}
static int is_cpuid_PSE36(void)
@@ -899,7 +915,7 @@ static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
{
/*
* Make sure the write to vcpu->mode is not reordered in front of
- * reads to sptes. If it does, kvm_commit_zap_page() can see us
+ * reads to sptes. If it does, kvm_mmu_commit_zap_page() can see us
* OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
*/
smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
@@ -1853,11 +1869,6 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
}
-int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
-{
- return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp);
-}
-
int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
{
return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
@@ -3114,16 +3125,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable,
static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
{
- siginfo_t info;
-
- clear_siginfo(&info);
- info.si_signo = SIGBUS;
- info.si_errno = 0;
- info.si_code = BUS_MCEERR_AR;
- info.si_addr = (void __user *)address;
- info.si_addr_lsb = PAGE_SHIFT;
-
- send_sig_info(SIGBUS, &info, tsk);
+ send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, PAGE_SHIFT, tsk);
}
static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
@@ -5217,7 +5219,7 @@ static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
void *insn, int insn_len)
{
- int r, emulation_type = EMULTYPE_RETRY;
+ int r, emulation_type = 0;
enum emulation_result er;
bool direct = vcpu->arch.mmu.direct_map;
@@ -5230,10 +5232,8 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
r = RET_PF_INVALID;
if (unlikely(error_code & PFERR_RSVD_MASK)) {
r = handle_mmio_page_fault(vcpu, cr2, direct);
- if (r == RET_PF_EMULATE) {
- emulation_type = 0;
+ if (r == RET_PF_EMULATE)
goto emulate;
- }
}
if (r == RET_PF_INVALID) {
@@ -5260,8 +5260,19 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
return 1;
}
- if (mmio_info_in_cache(vcpu, cr2, direct))
- emulation_type = 0;
+ /*
+ * vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still
+ * optimistically try to just unprotect the page and let the processor
+ * re-execute the instruction that caused the page fault. Do not allow
+ * retrying MMIO emulation, as it's not only pointless but could also
+ * cause us to enter an infinite loop because the processor will keep
+ * faulting on the non-existent MMIO address. Retrying an instruction
+ * from a nested guest is also pointless and dangerous as we are only
+ * explicitly shadowing L1's page tables, i.e. unprotecting something
+ * for L1 isn't going to magically fix whatever issue cause L2 to fail.
+ */
+ if (!mmio_info_in_cache(vcpu, cr2, direct) && !is_guest_mode(vcpu))
+ emulation_type = EMULTYPE_ALLOW_RETRY;
emulate:
/*
* On AMD platforms, under certain conditions insn_len may be zero on #NPF.
@@ -5413,7 +5424,12 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu)
{
MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa));
- kvm_init_mmu(vcpu, true);
+ /*
+ * kvm_mmu_setup() is called only on vCPU initialization.
+ * Therefore, no need to reset mmu roots as they are not yet
+ * initialized.
+ */
+ kvm_init_mmu(vcpu, false);
}
static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 6276140044d0..61ccfb13899e 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -436,14 +436,18 @@ static inline struct kvm_svm *to_kvm_svm(struct kvm *kvm)
static inline bool svm_sev_enabled(void)
{
- return max_sev_asid;
+ return IS_ENABLED(CONFIG_KVM_AMD_SEV) ? max_sev_asid : 0;
}
static inline bool sev_guest(struct kvm *kvm)
{
+#ifdef CONFIG_KVM_AMD_SEV
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
return sev->active;
+#else
+ return false;
+#endif
}
static inline int sev_get_asid(struct kvm *kvm)
@@ -776,7 +780,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
}
if (!svm->next_rip) {
- if (emulate_instruction(vcpu, EMULTYPE_SKIP) !=
+ if (kvm_emulate_instruction(vcpu, EMULTYPE_SKIP) !=
EMULATE_DONE)
printk(KERN_DEBUG "%s: NOP\n", __func__);
return;
@@ -1226,8 +1230,7 @@ static __init int sev_hardware_setup(void)
min_sev_asid = cpuid_edx(0x8000001F);
/* Initialize SEV ASID bitmap */
- sev_asid_bitmap = kcalloc(BITS_TO_LONGS(max_sev_asid),
- sizeof(unsigned long), GFP_KERNEL);
+ sev_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL);
if (!sev_asid_bitmap)
return 1;
@@ -1405,7 +1408,7 @@ static __exit void svm_hardware_unsetup(void)
int cpu;
if (svm_sev_enabled())
- kfree(sev_asid_bitmap);
+ bitmap_free(sev_asid_bitmap);
for_each_possible_cpu(cpu)
svm_cpu_uninit(cpu);
@@ -2715,7 +2718,7 @@ static int gp_interception(struct vcpu_svm *svm)
WARN_ON_ONCE(!enable_vmware_backdoor);
- er = emulate_instruction(vcpu,
+ er = kvm_emulate_instruction(vcpu,
EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL);
if (er == EMULATE_USER_EXIT)
return 0;
@@ -2819,7 +2822,7 @@ static int io_interception(struct vcpu_svm *svm)
string = (io_info & SVM_IOIO_STR_MASK) != 0;
in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
if (string)
- return emulate_instruction(vcpu, 0) == EMULATE_DONE;
+ return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
port = io_info >> 16;
size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
@@ -3861,7 +3864,7 @@ static int iret_interception(struct vcpu_svm *svm)
static int invlpg_interception(struct vcpu_svm *svm)
{
if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
- return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
+ return kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1);
return kvm_skip_emulated_instruction(&svm->vcpu);
@@ -3869,13 +3872,13 @@ static int invlpg_interception(struct vcpu_svm *svm)
static int emulate_on_interception(struct vcpu_svm *svm)
{
- return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
+ return kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
}
static int rsm_interception(struct vcpu_svm *svm)
{
- return x86_emulate_instruction(&svm->vcpu, 0, 0,
- rsm_ins_bytes, 2) == EMULATE_DONE;
+ return kvm_emulate_instruction_from_buffer(&svm->vcpu,
+ rsm_ins_bytes, 2) == EMULATE_DONE;
}
static int rdpmc_interception(struct vcpu_svm *svm)
@@ -4700,7 +4703,7 @@ static int avic_unaccelerated_access_interception(struct vcpu_svm *svm)
ret = avic_unaccel_trap_write(svm);
} else {
/* Handling Fault */
- ret = (emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE);
+ ret = (kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE);
}
return ret;
@@ -6747,7 +6750,7 @@ e_free:
static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec)
{
unsigned long vaddr, vaddr_end, next_vaddr;
- unsigned long dst_vaddr, dst_vaddr_end;
+ unsigned long dst_vaddr;
struct page **src_p, **dst_p;
struct kvm_sev_dbg debug;
unsigned long n;
@@ -6763,7 +6766,6 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec)
size = debug.len;
vaddr_end = vaddr + size;
dst_vaddr = debug.dst_uaddr;
- dst_vaddr_end = dst_vaddr + size;
for (; vaddr < vaddr_end; vaddr = next_vaddr) {
int len, s_off, d_off;
@@ -7150,6 +7152,8 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.check_intercept = svm_check_intercept,
.handle_external_intr = svm_handle_external_intr,
+ .request_immediate_exit = __kvm_request_immediate_exit,
+
.sched_in = svm_sched_in,
.pmu_ops = &amd_pmu_ops,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 1d26f3c4985b..e665aa7167cf 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -121,7 +121,6 @@ module_param_named(pml, enable_pml, bool, S_IRUGO);
#define MSR_BITMAP_MODE_X2APIC 1
#define MSR_BITMAP_MODE_X2APIC_APICV 2
-#define MSR_BITMAP_MODE_LM 4
#define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL
@@ -397,6 +396,7 @@ struct loaded_vmcs {
int cpu;
bool launched;
bool nmi_known_unmasked;
+ bool hv_timer_armed;
/* Support for vnmi-less CPUs */
int soft_vnmi_blocked;
ktime_t entry_time;
@@ -856,6 +856,7 @@ struct nested_vmx {
/* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
u64 vmcs01_debugctl;
+ u64 vmcs01_guest_bndcfgs;
u16 vpid02;
u16 last_vpid;
@@ -1019,6 +1020,8 @@ struct vcpu_vmx {
int ple_window;
bool ple_window_dirty;
+ bool req_immediate_exit;
+
/* Support for PML */
#define PML_ENTITY_NUM 512
struct page *pml_pg;
@@ -1569,8 +1572,12 @@ static int vmx_hv_remote_flush_tlb(struct kvm *kvm)
goto out;
}
+ /*
+ * FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs the address of the
+ * base of EPT PML4 table, strip off EPT configuration information.
+ */
ret = hyperv_flush_guest_mapping(
- to_vmx(kvm_get_vcpu(kvm, 0))->ept_pointer);
+ to_vmx(kvm_get_vcpu(kvm, 0))->ept_pointer & PAGE_MASK);
out:
spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
@@ -2864,6 +2871,8 @@ static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
u16 fs_sel, gs_sel;
int i;
+ vmx->req_immediate_exit = false;
+
if (vmx->loaded_cpu_state)
return;
@@ -2894,8 +2903,7 @@ static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
}
- if (is_long_mode(&vmx->vcpu))
- wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+ wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#else
savesegment(fs, fs_sel);
savesegment(gs, gs_sel);
@@ -2946,8 +2954,7 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
vmx->loaded_cpu_state = NULL;
#ifdef CONFIG_X86_64
- if (is_long_mode(&vmx->vcpu))
- rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+ rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#endif
if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
kvm_load_ldt(host_state->ldt_sel);
@@ -2975,24 +2982,19 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
#ifdef CONFIG_X86_64
static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
{
- if (is_long_mode(&vmx->vcpu)) {
- preempt_disable();
- if (vmx->loaded_cpu_state)
- rdmsrl(MSR_KERNEL_GS_BASE,
- vmx->msr_guest_kernel_gs_base);
- preempt_enable();
- }
+ preempt_disable();
+ if (vmx->loaded_cpu_state)
+ rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+ preempt_enable();
return vmx->msr_guest_kernel_gs_base;
}
static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
{
- if (is_long_mode(&vmx->vcpu)) {
- preempt_disable();
- if (vmx->loaded_cpu_state)
- wrmsrl(MSR_KERNEL_GS_BASE, data);
- preempt_enable();
- }
+ preempt_disable();
+ if (vmx->loaded_cpu_state)
+ wrmsrl(MSR_KERNEL_GS_BASE, data);
+ preempt_enable();
vmx->msr_guest_kernel_gs_base = data;
}
#endif
@@ -3528,9 +3530,6 @@ static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv)
VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
- if (kvm_mpx_supported())
- msrs->exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
-
/* We support free control of debug control saving. */
msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
@@ -3547,8 +3546,6 @@ static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv)
VM_ENTRY_LOAD_IA32_PAT;
msrs->entry_ctls_high |=
(VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
- if (kvm_mpx_supported())
- msrs->entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
/* We support free control of debug control loading. */
msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
@@ -3596,12 +3593,12 @@ static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv)
msrs->secondary_ctls_high);
msrs->secondary_ctls_low = 0;
msrs->secondary_ctls_high &=
- SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_DESC |
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_WBINVD_EXITING;
+
/*
* We can emulate "VMCS shadowing," even if the hardware
* doesn't support it.
@@ -3658,6 +3655,10 @@ static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv)
msrs->secondary_ctls_high |=
SECONDARY_EXEC_UNRESTRICTED_GUEST;
+ if (flexpriority_enabled)
+ msrs->secondary_ctls_high |=
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+
/* miscellaneous data */
rdmsr(MSR_IA32_VMX_MISC,
msrs->misc_low,
@@ -5068,19 +5069,6 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
if (!msr)
return;
- /*
- * MSR_KERNEL_GS_BASE is not intercepted when the guest is in
- * 64-bit mode as a 64-bit kernel may frequently access the
- * MSR. This means we need to manually save/restore the MSR
- * when switching between guest and host state, but only if
- * the guest is in 64-bit mode. Sync our cached value if the
- * guest is transitioning to 32-bit mode and the CPU contains
- * guest state, i.e. the cache is stale.
- */
-#ifdef CONFIG_X86_64
- if (!(efer & EFER_LMA))
- (void)vmx_read_guest_kernel_gs_base(vmx);
-#endif
vcpu->arch.efer = efer;
if (efer & EFER_LMA) {
vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
@@ -5393,9 +5381,10 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
* To use VMXON (and later other VMX instructions), a guest
* must first be able to turn on cr4.VMXE (see handle_vmon()).
* So basically the check on whether to allow nested VMX
- * is here.
+ * is here. We operate under the default treatment of SMM,
+ * so VMX cannot be enabled under SMM.
*/
- if (!nested_vmx_allowed(vcpu))
+ if (!nested_vmx_allowed(vcpu) || is_smm(vcpu))
return 1;
}
@@ -6072,9 +6061,6 @@ static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
mode |= MSR_BITMAP_MODE_X2APIC_APICV;
}
- if (is_long_mode(vcpu))
- mode |= MSR_BITMAP_MODE_LM;
-
return mode;
}
@@ -6115,9 +6101,6 @@ static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
if (!changed)
return;
- vmx_set_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW,
- !(mode & MSR_BITMAP_MODE_LM));
-
if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV))
vmx_update_msr_bitmap_x2apic(msr_bitmap, mode);
@@ -6183,6 +6166,32 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
nested_mark_vmcs12_pages_dirty(vcpu);
}
+static u8 vmx_get_rvi(void)
+{
+ return vmcs_read16(GUEST_INTR_STATUS) & 0xff;
+}
+
+static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ void *vapic_page;
+ u32 vppr;
+ int rvi;
+
+ if (WARN_ON_ONCE(!is_guest_mode(vcpu)) ||
+ !nested_cpu_has_vid(get_vmcs12(vcpu)) ||
+ WARN_ON_ONCE(!vmx->nested.virtual_apic_page))
+ return false;
+
+ rvi = vmx_get_rvi();
+
+ vapic_page = kmap(vmx->nested.virtual_apic_page);
+ vppr = *((u32 *)(vapic_page + APIC_PROCPRI));
+ kunmap(vmx->nested.virtual_apic_page);
+
+ return ((rvi & 0xf0) > (vppr & 0xf0));
+}
+
static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
bool nested)
{
@@ -6983,7 +6992,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
* Cause the #SS fault with 0 error code in VM86 mode.
*/
if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {
- if (emulate_instruction(vcpu, 0) == EMULATE_DONE) {
+ if (kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE) {
if (vcpu->arch.halt_request) {
vcpu->arch.halt_request = 0;
return kvm_vcpu_halt(vcpu);
@@ -7054,7 +7063,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) {
WARN_ON_ONCE(!enable_vmware_backdoor);
- er = emulate_instruction(vcpu,
+ er = kvm_emulate_instruction(vcpu,
EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL);
if (er == EMULATE_USER_EXIT)
return 0;
@@ -7157,7 +7166,7 @@ static int handle_io(struct kvm_vcpu *vcpu)
++vcpu->stat.io_exits;
if (string)
- return emulate_instruction(vcpu, 0) == EMULATE_DONE;
+ return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
port = exit_qualification >> 16;
size = (exit_qualification & 7) + 1;
@@ -7231,7 +7240,7 @@ static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
static int handle_desc(struct kvm_vcpu *vcpu)
{
WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP));
- return emulate_instruction(vcpu, 0) == EMULATE_DONE;
+ return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
}
static int handle_cr(struct kvm_vcpu *vcpu)
@@ -7480,7 +7489,7 @@ static int handle_vmcall(struct kvm_vcpu *vcpu)
static int handle_invd(struct kvm_vcpu *vcpu)
{
- return emulate_instruction(vcpu, 0) == EMULATE_DONE;
+ return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
}
static int handle_invlpg(struct kvm_vcpu *vcpu)
@@ -7547,7 +7556,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu)
return kvm_skip_emulated_instruction(vcpu);
}
}
- return emulate_instruction(vcpu, 0) == EMULATE_DONE;
+ return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
}
static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
@@ -7704,8 +7713,8 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
if (!static_cpu_has(X86_FEATURE_HYPERVISOR))
return kvm_skip_emulated_instruction(vcpu);
else
- return x86_emulate_instruction(vcpu, gpa, EMULTYPE_SKIP,
- NULL, 0) == EMULATE_DONE;
+ return kvm_emulate_instruction(vcpu, EMULTYPE_SKIP) ==
+ EMULATE_DONE;
}
return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
@@ -7748,7 +7757,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
if (kvm_test_request(KVM_REQ_EVENT, vcpu))
return 1;
- err = emulate_instruction(vcpu, 0);
+ err = kvm_emulate_instruction(vcpu, 0);
if (err == EMULATE_USER_EXIT) {
++vcpu->stat.mmio_exits;
@@ -7966,6 +7975,9 @@ static __init int hardware_setup(void)
kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
}
+ if (!cpu_has_vmx_preemption_timer())
+ kvm_x86_ops->request_immediate_exit = __kvm_request_immediate_exit;
+
if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) {
u64 vmx_msr;
@@ -9208,7 +9220,8 @@ static int handle_pml_full(struct kvm_vcpu *vcpu)
static int handle_preemption_timer(struct kvm_vcpu *vcpu)
{
- kvm_lapic_expired_hv_timer(vcpu);
+ if (!to_vmx(vcpu)->req_immediate_exit)
+ kvm_lapic_expired_hv_timer(vcpu);
return 1;
}
@@ -10214,15 +10227,16 @@ static void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
if (!lapic_in_kernel(vcpu))
return;
+ if (!flexpriority_enabled &&
+ !cpu_has_vmx_virtualize_x2apic_mode())
+ return;
+
/* Postpone execution until vmcs01 is the current VMCS. */
if (is_guest_mode(vcpu)) {
to_vmx(vcpu)->nested.change_vmcs01_virtual_apic_mode = true;
return;
}
- if (!cpu_need_tpr_shadow(vcpu))
- return;
-
sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
@@ -10344,6 +10358,14 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
return max_irr;
}
+static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu)
+{
+ u8 rvi = vmx_get_rvi();
+ u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI);
+
+ return ((rvi & 0xf0) > (vppr & 0xf0));
+}
+
static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
{
if (!kvm_vcpu_apicv_active(vcpu))
@@ -10595,24 +10617,43 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
msrs[i].host, false);
}
-static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
+static void vmx_arm_hv_timer(struct vcpu_vmx *vmx, u32 val)
+{
+ vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, val);
+ if (!vmx->loaded_vmcs->hv_timer_armed)
+ vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
+ PIN_BASED_VMX_PREEMPTION_TIMER);
+ vmx->loaded_vmcs->hv_timer_armed = true;
+}
+
+static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u64 tscl;
u32 delta_tsc;
- if (vmx->hv_deadline_tsc == -1)
+ if (vmx->req_immediate_exit) {
+ vmx_arm_hv_timer(vmx, 0);
return;
+ }
- tscl = rdtsc();
- if (vmx->hv_deadline_tsc > tscl)
- /* sure to be 32 bit only because checked on set_hv_timer */
- delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
- cpu_preemption_timer_multi);
- else
- delta_tsc = 0;
+ if (vmx->hv_deadline_tsc != -1) {
+ tscl = rdtsc();
+ if (vmx->hv_deadline_tsc > tscl)
+ /* set_hv_timer ensures the delta fits in 32-bits */
+ delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
+ cpu_preemption_timer_multi);
+ else
+ delta_tsc = 0;
- vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
+ vmx_arm_hv_timer(vmx, delta_tsc);
+ return;
+ }
+
+ if (vmx->loaded_vmcs->hv_timer_armed)
+ vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
+ PIN_BASED_VMX_PREEMPTION_TIMER);
+ vmx->loaded_vmcs->hv_timer_armed = false;
}
static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
@@ -10672,7 +10713,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
atomic_switch_perf_msrs(vmx);
- vmx_arm_hv_timer(vcpu);
+ vmx_update_hv_timer(vcpu);
/*
* If this vCPU has touched SPEC_CTRL, restore the guest's value if
@@ -11214,6 +11255,23 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
#undef cr4_fixed1_update
}
+static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (kvm_mpx_supported()) {
+ bool mpx_enabled = guest_cpuid_has(vcpu, X86_FEATURE_MPX);
+
+ if (mpx_enabled) {
+ vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
+ vmx->nested.msrs.exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
+ } else {
+ vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_BNDCFGS;
+ vmx->nested.msrs.exit_ctls_high &= ~VM_EXIT_CLEAR_BNDCFGS;
+ }
+ }
+}
+
static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -11230,8 +11288,10 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
- if (nested_vmx_allowed(vcpu))
+ if (nested_vmx_allowed(vcpu)) {
nested_vmx_cr_fixed1_bits_update(vcpu);
+ nested_vmx_entry_exit_ctls_update(vcpu);
+ }
}
static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
@@ -11427,16 +11487,18 @@ static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value;
struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (vcpu->arch.virtual_tsc_khz == 0)
- return;
-
- /* Make sure short timeouts reliably trigger an immediate vmexit.
- * hrtimer_start does not guarantee this. */
- if (preemption_timeout <= 1) {
+ /*
+ * A timer value of zero is architecturally guaranteed to cause
+ * a VMExit prior to executing any instructions in the guest.
+ */
+ if (preemption_timeout == 0) {
vmx_preemption_timer_fn(&vmx->nested.preemption_timer);
return;
}
+ if (vcpu->arch.virtual_tsc_khz == 0)
+ return;
+
preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
preemption_timeout *= 1000000;
do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
@@ -11646,11 +11708,15 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
* bits 15:8 should be zero in posted_intr_nv,
* the descriptor address has been already checked
* in nested_get_vmcs12_pages.
+ *
+ * bits 5:0 of posted_intr_desc_addr should be zero.
*/
if (nested_cpu_has_posted_intr(vmcs12) &&
(!nested_cpu_has_vid(vmcs12) ||
!nested_exit_intr_ack_set(vcpu) ||
- vmcs12->posted_intr_nv & 0xff00))
+ (vmcs12->posted_intr_nv & 0xff00) ||
+ (vmcs12->posted_intr_desc_addr & 0x3f) ||
+ (!page_address_valid(vcpu, vmcs12->posted_intr_desc_addr))))
return -EINVAL;
/* tpr shadow is needed by all apicv features. */
@@ -11993,8 +12059,13 @@ static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
set_cr4_guest_host_mask(vmx);
- if (vmx_mpx_supported())
- vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
+ if (kvm_mpx_supported()) {
+ if (vmx->nested.nested_run_pending &&
+ (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
+ vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
+ else
+ vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
+ }
if (enable_vpid) {
if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
@@ -12076,11 +12147,10 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
exec_control = vmcs12->pin_based_vm_exec_control;
- /* Preemption timer setting is only taken from vmcs01. */
- exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+ /* Preemption timer setting is computed directly in vmx_vcpu_run. */
exec_control |= vmcs_config.pin_based_exec_ctrl;
- if (vmx->hv_deadline_tsc == -1)
- exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+ exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+ vmx->loaded_vmcs->hv_timer_armed = false;
/* Posted interrupts setting is only taken from vmcs12. */
if (nested_cpu_has_posted_intr(vmcs12)) {
@@ -12318,6 +12388,9 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT)
return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+ if (nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
if (nested_vmx_check_io_bitmap_controls(vcpu, vmcs12))
return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
@@ -12537,12 +12610,21 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual)
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
bool from_vmentry = !!exit_qual;
u32 dummy_exit_qual;
+ bool evaluate_pending_interrupts;
int r = 0;
+ evaluate_pending_interrupts = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
+ (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING);
+ if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
+ evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
+
enter_guest_mode(vcpu);
if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+ if (kvm_mpx_supported() &&
+ !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
+ vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
vmx_segment_cache_clear(vmx);
@@ -12575,6 +12657,23 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual)
}
/*
+ * If L1 had a pending IRQ/NMI until it executed
+ * VMLAUNCH/VMRESUME which wasn't delivered because it was
+ * disallowed (e.g. interrupts disabled), L0 needs to
+ * evaluate if this pending event should cause an exit from L2
+ * to L1 or delivered directly to L2 (e.g. In case L1 don't
+ * intercept EXTERNAL_INTERRUPT).
+ *
+ * Usually this would be handled by the processor noticing an
+ * IRQ/NMI window request, or checking RVI during evaluation of
+ * pending virtual interrupts. However, this setting was done
+ * on VMCS01 and now VMCS02 is active instead. Thus, we force L0
+ * to perform pending event evaluation by requesting a KVM_REQ_EVENT.
+ */
+ if (unlikely(evaluate_pending_interrupts))
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ /*
* Note no nested_vmx_succeed or nested_vmx_fail here. At this point
* we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
* returned as far as L1 is concerned. It will only return (and set
@@ -12841,6 +12940,11 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
return 0;
}
+static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
+{
+ to_vmx(vcpu)->req_immediate_exit = true;
+}
+
static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
{
ktime_t remaining =
@@ -13231,12 +13335,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
- if (vmx->hv_deadline_tsc == -1)
- vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
- PIN_BASED_VMX_PREEMPTION_TIMER);
- else
- vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
- PIN_BASED_VMX_PREEMPTION_TIMER);
+
if (kvm_has_tsc_control)
decache_tsc_multiplier(vmx);
@@ -13440,18 +13539,12 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
return -ERANGE;
vmx->hv_deadline_tsc = tscl + delta_tsc;
- vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
- PIN_BASED_VMX_PREEMPTION_TIMER);
-
return delta_tsc == 0;
}
static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- vmx->hv_deadline_tsc = -1;
- vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
- PIN_BASED_VMX_PREEMPTION_TIMER);
+ to_vmx(vcpu)->hv_deadline_tsc = -1;
}
#endif
@@ -13932,6 +14025,14 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
return -EINVAL;
+ /*
+ * SMM temporarily disables VMX, so we cannot be in guest mode,
+ * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags
+ * must be zero.
+ */
+ if (is_smm(vcpu) ? kvm_state->flags : kvm_state->vmx.smm.flags)
+ return -EINVAL;
+
if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
!(kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
return -EINVAL;
@@ -13988,9 +14089,6 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
check_vmentry_postreqs(vcpu, vmcs12, &exit_qual))
return -EINVAL;
- if (kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING)
- vmx->nested.nested_run_pending = 1;
-
vmx->nested.dirty_vmcs12 = true;
ret = enter_vmx_non_root_mode(vcpu, NULL);
if (ret)
@@ -14078,6 +14176,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.apicv_post_state_restore = vmx_apicv_post_state_restore,
.hwapic_irr_update = vmx_hwapic_irr_update,
.hwapic_isr_update = vmx_hwapic_isr_update,
+ .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
.sync_pir_to_irr = vmx_sync_pir_to_irr,
.deliver_posted_interrupt = vmx_deliver_posted_interrupt,
@@ -14111,6 +14210,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.umip_emulated = vmx_umip_emulated,
.check_nested_events = vmx_check_nested_events,
+ .request_immediate_exit = vmx_request_immediate_exit,
.sched_in = vmx_sched_in,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 506bd2b4b8bb..ca717737347e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -628,7 +628,7 @@ bool pdptrs_changed(struct kvm_vcpu *vcpu)
gfn_t gfn;
int r;
- if (is_long_mode(vcpu) || !is_pae(vcpu))
+ if (is_long_mode(vcpu) || !is_pae(vcpu) || !is_paging(vcpu))
return false;
if (!test_bit(VCPU_EXREG_PDPTR,
@@ -2537,7 +2537,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case MSR_PLATFORM_INFO:
if (!msr_info->host_initiated ||
- data & ~MSR_PLATFORM_INFO_CPUID_FAULT ||
(!(data & MSR_PLATFORM_INFO_CPUID_FAULT) &&
cpuid_fault_enabled(vcpu)))
return 1;
@@ -2780,6 +2779,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = vcpu->arch.osvw.status;
break;
case MSR_PLATFORM_INFO:
+ if (!msr_info->host_initiated &&
+ !vcpu->kvm->arch.guest_can_read_msr_platform_info)
+ return 1;
msr_info->data = vcpu->arch.msr_platform_info;
break;
case MSR_MISC_FEATURES_ENABLES:
@@ -2927,6 +2929,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_SPLIT_IRQCHIP:
case KVM_CAP_IMMEDIATE_EXIT:
case KVM_CAP_GET_MSR_FEATURES:
+ case KVM_CAP_MSR_PLATFORM_INFO:
r = 1;
break;
case KVM_CAP_SYNC_REGS:
@@ -4007,19 +4010,23 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
break;
BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size));
+ r = -EFAULT;
if (get_user(user_data_size, &user_kvm_nested_state->size))
- return -EFAULT;
+ break;
r = kvm_x86_ops->get_nested_state(vcpu, user_kvm_nested_state,
user_data_size);
if (r < 0)
- return r;
+ break;
if (r > user_data_size) {
if (put_user(r, &user_kvm_nested_state->size))
- return -EFAULT;
- return -E2BIG;
+ r = -EFAULT;
+ else
+ r = -E2BIG;
+ break;
}
+
r = 0;
break;
}
@@ -4031,19 +4038,21 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
if (!kvm_x86_ops->set_nested_state)
break;
+ r = -EFAULT;
if (copy_from_user(&kvm_state, user_kvm_nested_state, sizeof(kvm_state)))
- return -EFAULT;
+ break;
+ r = -EINVAL;
if (kvm_state.size < sizeof(kvm_state))
- return -EINVAL;
+ break;
if (kvm_state.flags &
~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE))
- return -EINVAL;
+ break;
/* nested_run_pending implies guest_mode. */
if (kvm_state.flags == KVM_STATE_NESTED_RUN_PENDING)
- return -EINVAL;
+ break;
r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state);
break;
@@ -4350,6 +4359,10 @@ split_irqchip_unlock:
kvm->arch.pause_in_guest = true;
r = 0;
break;
+ case KVM_CAP_MSR_PLATFORM_INFO:
+ kvm->arch.guest_can_read_msr_platform_info = cap->args[0];
+ r = 0;
+ break;
default:
r = -EINVAL;
break;
@@ -4685,7 +4698,7 @@ static void kvm_init_msr_list(void)
*/
switch (msrs_to_save[i]) {
case MSR_IA32_BNDCFGS:
- if (!kvm_x86_ops->mpx_supported())
+ if (!kvm_mpx_supported())
continue;
break;
case MSR_TSC_AUX:
@@ -4987,7 +5000,7 @@ int handle_ud(struct kvm_vcpu *vcpu)
emul_type = 0;
}
- er = emulate_instruction(vcpu, emul_type);
+ er = kvm_emulate_instruction(vcpu, emul_type);
if (er == EMULATE_USER_EXIT)
return 0;
if (er != EMULATE_DONE)
@@ -5870,7 +5883,10 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
gpa_t gpa = cr2;
kvm_pfn_t pfn;
- if (emulation_type & EMULTYPE_NO_REEXECUTE)
+ if (!(emulation_type & EMULTYPE_ALLOW_RETRY))
+ return false;
+
+ if (WARN_ON_ONCE(is_guest_mode(vcpu)))
return false;
if (!vcpu->arch.mmu.direct_map) {
@@ -5958,7 +5974,10 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
*/
vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0;
- if (!(emulation_type & EMULTYPE_RETRY))
+ if (!(emulation_type & EMULTYPE_ALLOW_RETRY))
+ return false;
+
+ if (WARN_ON_ONCE(is_guest_mode(vcpu)))
return false;
if (x86_page_table_writing_insn(ctxt))
@@ -6276,7 +6295,19 @@ restart:
return r;
}
-EXPORT_SYMBOL_GPL(x86_emulate_instruction);
+
+int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type)
+{
+ return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_instruction);
+
+int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu,
+ void *insn, int insn_len)
+{
+ return x86_emulate_instruction(vcpu, 0, 0, insn, insn_len);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_instruction_from_buffer);
static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size,
unsigned short port)
@@ -7343,6 +7374,12 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_vcpu_reload_apic_access_page);
+void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
+{
+ smp_send_reschedule(vcpu->cpu);
+}
+EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit);
+
/*
* Returns 1 to let vcpu_run() continue the guest execution loop without
* exiting to the userspace. Otherwise, the value will be returned to the
@@ -7547,7 +7584,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (req_immediate_exit) {
kvm_make_request(KVM_REQ_EVENT, vcpu);
- smp_send_reschedule(vcpu->cpu);
+ kvm_x86_ops->request_immediate_exit(vcpu);
}
trace_kvm_entry(vcpu->vcpu_id);
@@ -7734,7 +7771,7 @@ static inline int complete_emulated_io(struct kvm_vcpu *vcpu)
{
int r;
vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
- r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
+ r = kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
if (r != EMULATE_DONE)
return 0;
@@ -7811,6 +7848,29 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
return 0;
}
+/* Swap (qemu) user FPU context for the guest FPU context. */
+static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
+{
+ preempt_disable();
+ copy_fpregs_to_fpstate(&vcpu->arch.user_fpu);
+ /* PKRU is separately restored in kvm_x86_ops->run. */
+ __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state,
+ ~XFEATURE_MASK_PKRU);
+ preempt_enable();
+ trace_kvm_fpu(1);
+}
+
+/* When vcpu_run ends, restore user space FPU context. */
+static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
+{
+ preempt_disable();
+ copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu);
+ copy_kernel_to_fpregs(&vcpu->arch.user_fpu.state);
+ preempt_enable();
+ ++vcpu->stat.fpu_reload;
+ trace_kvm_fpu(0);
+}
+
int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
int r;
@@ -8159,7 +8219,7 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
kvm_update_cpuid(vcpu);
idx = srcu_read_lock(&vcpu->kvm->srcu);
- if (!is_long_mode(vcpu) && is_pae(vcpu)) {
+ if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu)) {
load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
mmu_reset_needed = 1;
}
@@ -8388,29 +8448,6 @@ static void fx_init(struct kvm_vcpu *vcpu)
vcpu->arch.cr0 |= X86_CR0_ET;
}
-/* Swap (qemu) user FPU context for the guest FPU context. */
-void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
-{
- preempt_disable();
- copy_fpregs_to_fpstate(&vcpu->arch.user_fpu);
- /* PKRU is separately restored in kvm_x86_ops->run. */
- __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state,
- ~XFEATURE_MASK_PKRU);
- preempt_enable();
- trace_kvm_fpu(1);
-}
-
-/* When vcpu_run ends, restore user space FPU context. */
-void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
-{
- preempt_disable();
- copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu);
- copy_kernel_to_fpregs(&vcpu->arch.user_fpu.state);
- preempt_enable();
- ++vcpu->stat.fpu_reload;
- trace_kvm_fpu(0);
-}
-
void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
{
void *wbinvd_dirty_mask = vcpu->arch.wbinvd_dirty_mask;
@@ -8834,6 +8871,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
kvm->arch.kvmclock_offset = -ktime_get_boot_ns();
pvclock_update_vm_gtod_copy(kvm);
+ kvm->arch.guest_can_read_msr_platform_info = true;
+
INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
@@ -9182,6 +9221,13 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
kvm_page_track_flush_slot(kvm, slot);
}
+static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
+{
+ return (is_guest_mode(vcpu) &&
+ kvm_x86_ops->guest_apic_has_interrupt &&
+ kvm_x86_ops->guest_apic_has_interrupt(vcpu));
+}
+
static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
{
if (!list_empty_careful(&vcpu->async_pf.done))
@@ -9206,7 +9252,8 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
return true;
if (kvm_arch_interrupt_allowed(vcpu) &&
- kvm_cpu_has_interrupt(vcpu))
+ (kvm_cpu_has_interrupt(vcpu) ||
+ kvm_guest_apic_has_interrupt(vcpu)))
return true;
if (kvm_hv_has_stimer_pending(vcpu))
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 257f27620bc2..67b9568613f3 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -274,6 +274,8 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
int page_num);
bool kvm_vector_hashing_enabled(void);
+int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2,
+ int emulation_type, void *insn, int insn_len);
#define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
| XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S
index 46e71a74e612..ad8e0906d1ea 100644
--- a/arch/x86/lib/checksum_32.S
+++ b/arch/x86/lib/checksum_32.S
@@ -273,11 +273,11 @@ unsigned int csum_partial_copy_generic (const char *src, char *dst,
#define SRC(y...) \
9999: y; \
- _ASM_EXTABLE(9999b, 6001f)
+ _ASM_EXTABLE_UA(9999b, 6001f)
#define DST(y...) \
9999: y; \
- _ASM_EXTABLE(9999b, 6002f)
+ _ASM_EXTABLE_UA(9999b, 6002f)
#ifndef CONFIG_X86_USE_PPRO_CHECKSUM
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index 020f75cc8cf6..db4e5aa0858b 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -92,26 +92,26 @@ ENTRY(copy_user_generic_unrolled)
60: jmp copy_user_handle_tail /* ecx is zerorest also */
.previous
- _ASM_EXTABLE(1b,30b)
- _ASM_EXTABLE(2b,30b)
- _ASM_EXTABLE(3b,30b)
- _ASM_EXTABLE(4b,30b)
- _ASM_EXTABLE(5b,30b)
- _ASM_EXTABLE(6b,30b)
- _ASM_EXTABLE(7b,30b)
- _ASM_EXTABLE(8b,30b)
- _ASM_EXTABLE(9b,30b)
- _ASM_EXTABLE(10b,30b)
- _ASM_EXTABLE(11b,30b)
- _ASM_EXTABLE(12b,30b)
- _ASM_EXTABLE(13b,30b)
- _ASM_EXTABLE(14b,30b)
- _ASM_EXTABLE(15b,30b)
- _ASM_EXTABLE(16b,30b)
- _ASM_EXTABLE(18b,40b)
- _ASM_EXTABLE(19b,40b)
- _ASM_EXTABLE(21b,50b)
- _ASM_EXTABLE(22b,50b)
+ _ASM_EXTABLE_UA(1b, 30b)
+ _ASM_EXTABLE_UA(2b, 30b)
+ _ASM_EXTABLE_UA(3b, 30b)
+ _ASM_EXTABLE_UA(4b, 30b)
+ _ASM_EXTABLE_UA(5b, 30b)
+ _ASM_EXTABLE_UA(6b, 30b)
+ _ASM_EXTABLE_UA(7b, 30b)
+ _ASM_EXTABLE_UA(8b, 30b)
+ _ASM_EXTABLE_UA(9b, 30b)
+ _ASM_EXTABLE_UA(10b, 30b)
+ _ASM_EXTABLE_UA(11b, 30b)
+ _ASM_EXTABLE_UA(12b, 30b)
+ _ASM_EXTABLE_UA(13b, 30b)
+ _ASM_EXTABLE_UA(14b, 30b)
+ _ASM_EXTABLE_UA(15b, 30b)
+ _ASM_EXTABLE_UA(16b, 30b)
+ _ASM_EXTABLE_UA(18b, 40b)
+ _ASM_EXTABLE_UA(19b, 40b)
+ _ASM_EXTABLE_UA(21b, 50b)
+ _ASM_EXTABLE_UA(22b, 50b)
ENDPROC(copy_user_generic_unrolled)
EXPORT_SYMBOL(copy_user_generic_unrolled)
@@ -156,8 +156,8 @@ ENTRY(copy_user_generic_string)
jmp copy_user_handle_tail
.previous
- _ASM_EXTABLE(1b,11b)
- _ASM_EXTABLE(3b,12b)
+ _ASM_EXTABLE_UA(1b, 11b)
+ _ASM_EXTABLE_UA(3b, 12b)
ENDPROC(copy_user_generic_string)
EXPORT_SYMBOL(copy_user_generic_string)
@@ -189,7 +189,7 @@ ENTRY(copy_user_enhanced_fast_string)
jmp copy_user_handle_tail
.previous
- _ASM_EXTABLE(1b,12b)
+ _ASM_EXTABLE_UA(1b, 12b)
ENDPROC(copy_user_enhanced_fast_string)
EXPORT_SYMBOL(copy_user_enhanced_fast_string)
@@ -319,27 +319,27 @@ ENTRY(__copy_user_nocache)
jmp copy_user_handle_tail
.previous
- _ASM_EXTABLE(1b,.L_fixup_4x8b_copy)
- _ASM_EXTABLE(2b,.L_fixup_4x8b_copy)
- _ASM_EXTABLE(3b,.L_fixup_4x8b_copy)
- _ASM_EXTABLE(4b,.L_fixup_4x8b_copy)
- _ASM_EXTABLE(5b,.L_fixup_4x8b_copy)
- _ASM_EXTABLE(6b,.L_fixup_4x8b_copy)
- _ASM_EXTABLE(7b,.L_fixup_4x8b_copy)
- _ASM_EXTABLE(8b,.L_fixup_4x8b_copy)
- _ASM_EXTABLE(9b,.L_fixup_4x8b_copy)
- _ASM_EXTABLE(10b,.L_fixup_4x8b_copy)
- _ASM_EXTABLE(11b,.L_fixup_4x8b_copy)
- _ASM_EXTABLE(12b,.L_fixup_4x8b_copy)
- _ASM_EXTABLE(13b,.L_fixup_4x8b_copy)
- _ASM_EXTABLE(14b,.L_fixup_4x8b_copy)
- _ASM_EXTABLE(15b,.L_fixup_4x8b_copy)
- _ASM_EXTABLE(16b,.L_fixup_4x8b_copy)
- _ASM_EXTABLE(20b,.L_fixup_8b_copy)
- _ASM_EXTABLE(21b,.L_fixup_8b_copy)
- _ASM_EXTABLE(30b,.L_fixup_4b_copy)
- _ASM_EXTABLE(31b,.L_fixup_4b_copy)
- _ASM_EXTABLE(40b,.L_fixup_1b_copy)
- _ASM_EXTABLE(41b,.L_fixup_1b_copy)
+ _ASM_EXTABLE_UA(1b, .L_fixup_4x8b_copy)
+ _ASM_EXTABLE_UA(2b, .L_fixup_4x8b_copy)
+ _ASM_EXTABLE_UA(3b, .L_fixup_4x8b_copy)
+ _ASM_EXTABLE_UA(4b, .L_fixup_4x8b_copy)
+ _ASM_EXTABLE_UA(5b, .L_fixup_4x8b_copy)
+ _ASM_EXTABLE_UA(6b, .L_fixup_4x8b_copy)
+ _ASM_EXTABLE_UA(7b, .L_fixup_4x8b_copy)
+ _ASM_EXTABLE_UA(8b, .L_fixup_4x8b_copy)
+ _ASM_EXTABLE_UA(9b, .L_fixup_4x8b_copy)
+ _ASM_EXTABLE_UA(10b, .L_fixup_4x8b_copy)
+ _ASM_EXTABLE_UA(11b, .L_fixup_4x8b_copy)
+ _ASM_EXTABLE_UA(12b, .L_fixup_4x8b_copy)
+ _ASM_EXTABLE_UA(13b, .L_fixup_4x8b_copy)
+ _ASM_EXTABLE_UA(14b, .L_fixup_4x8b_copy)
+ _ASM_EXTABLE_UA(15b, .L_fixup_4x8b_copy)
+ _ASM_EXTABLE_UA(16b, .L_fixup_4x8b_copy)
+ _ASM_EXTABLE_UA(20b, .L_fixup_8b_copy)
+ _ASM_EXTABLE_UA(21b, .L_fixup_8b_copy)
+ _ASM_EXTABLE_UA(30b, .L_fixup_4b_copy)
+ _ASM_EXTABLE_UA(31b, .L_fixup_4b_copy)
+ _ASM_EXTABLE_UA(40b, .L_fixup_1b_copy)
+ _ASM_EXTABLE_UA(41b, .L_fixup_1b_copy)
ENDPROC(__copy_user_nocache)
EXPORT_SYMBOL(__copy_user_nocache)
diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S
index 45a53dfe1859..a4a379e79259 100644
--- a/arch/x86/lib/csum-copy_64.S
+++ b/arch/x86/lib/csum-copy_64.S
@@ -31,14 +31,18 @@
.macro source
10:
- _ASM_EXTABLE(10b, .Lbad_source)
+ _ASM_EXTABLE_UA(10b, .Lbad_source)
.endm
.macro dest
20:
- _ASM_EXTABLE(20b, .Lbad_dest)
+ _ASM_EXTABLE_UA(20b, .Lbad_dest)
.endm
+ /*
+ * No _ASM_EXTABLE_UA; this is used for intentional prefetch on a
+ * potentially unmapped kernel address.
+ */
.macro ignore L=.Lignore
30:
_ASM_EXTABLE(30b, \L)
diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S
index 49b167f73215..74fdff968ea3 100644
--- a/arch/x86/lib/getuser.S
+++ b/arch/x86/lib/getuser.S
@@ -132,12 +132,12 @@ bad_get_user_8:
END(bad_get_user_8)
#endif
- _ASM_EXTABLE(1b,bad_get_user)
- _ASM_EXTABLE(2b,bad_get_user)
- _ASM_EXTABLE(3b,bad_get_user)
+ _ASM_EXTABLE_UA(1b, bad_get_user)
+ _ASM_EXTABLE_UA(2b, bad_get_user)
+ _ASM_EXTABLE_UA(3b, bad_get_user)
#ifdef CONFIG_X86_64
- _ASM_EXTABLE(4b,bad_get_user)
+ _ASM_EXTABLE_UA(4b, bad_get_user)
#else
- _ASM_EXTABLE(4b,bad_get_user_8)
- _ASM_EXTABLE(5b,bad_get_user_8)
+ _ASM_EXTABLE_UA(4b, bad_get_user_8)
+ _ASM_EXTABLE_UA(5b, bad_get_user_8)
#endif
diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S
index 96dce5fe2a35..d2e5c9c39601 100644
--- a/arch/x86/lib/putuser.S
+++ b/arch/x86/lib/putuser.S
@@ -94,10 +94,10 @@ bad_put_user:
EXIT
END(bad_put_user)
- _ASM_EXTABLE(1b,bad_put_user)
- _ASM_EXTABLE(2b,bad_put_user)
- _ASM_EXTABLE(3b,bad_put_user)
- _ASM_EXTABLE(4b,bad_put_user)
+ _ASM_EXTABLE_UA(1b, bad_put_user)
+ _ASM_EXTABLE_UA(2b, bad_put_user)
+ _ASM_EXTABLE_UA(3b, bad_put_user)
+ _ASM_EXTABLE_UA(4b, bad_put_user)
#ifdef CONFIG_X86_32
- _ASM_EXTABLE(5b,bad_put_user)
+ _ASM_EXTABLE_UA(5b, bad_put_user)
#endif
diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c
index c8c6ad0d58b8..3f435d7fca5e 100644
--- a/arch/x86/lib/usercopy.c
+++ b/arch/x86/lib/usercopy.c
@@ -7,6 +7,8 @@
#include <linux/uaccess.h>
#include <linux/export.h>
+#include <asm/tlbflush.h>
+
/*
* We rely on the nested NMI work to allow atomic faults from the NMI path; the
* nested NMI paths are careful to preserve CR2.
@@ -19,6 +21,9 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
if (__range_not_ok(from, n, TASK_SIZE))
return n;
+ if (!nmi_uaccess_okay())
+ return n;
+
/*
* Even though this function is typically called from NMI/IRQ context
* disable pagefaults so that its behaviour is consistent even when
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index 7add8ba06887..71fb58d44d58 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -47,8 +47,8 @@ do { \
"3: lea 0(%2,%0,4),%0\n" \
" jmp 2b\n" \
".previous\n" \
- _ASM_EXTABLE(0b,3b) \
- _ASM_EXTABLE(1b,2b) \
+ _ASM_EXTABLE_UA(0b, 3b) \
+ _ASM_EXTABLE_UA(1b, 2b) \
: "=&c"(size), "=&D" (__d0) \
: "r"(size & 3), "0"(size / 4), "1"(addr), "a"(0)); \
} while (0)
@@ -153,44 +153,44 @@ __copy_user_intel(void __user *to, const void *from, unsigned long size)
"101: lea 0(%%eax,%0,4),%0\n"
" jmp 100b\n"
".previous\n"
- _ASM_EXTABLE(1b,100b)
- _ASM_EXTABLE(2b,100b)
- _ASM_EXTABLE(3b,100b)
- _ASM_EXTABLE(4b,100b)
- _ASM_EXTABLE(5b,100b)
- _ASM_EXTABLE(6b,100b)
- _ASM_EXTABLE(7b,100b)
- _ASM_EXTABLE(8b,100b)
- _ASM_EXTABLE(9b,100b)
- _ASM_EXTABLE(10b,100b)
- _ASM_EXTABLE(11b,100b)
- _ASM_EXTABLE(12b,100b)
- _ASM_EXTABLE(13b,100b)
- _ASM_EXTABLE(14b,100b)
- _ASM_EXTABLE(15b,100b)
- _ASM_EXTABLE(16b,100b)
- _ASM_EXTABLE(17b,100b)
- _ASM_EXTABLE(18b,100b)
- _ASM_EXTABLE(19b,100b)
- _ASM_EXTABLE(20b,100b)
- _ASM_EXTABLE(21b,100b)
- _ASM_EXTABLE(22b,100b)
- _ASM_EXTABLE(23b,100b)
- _ASM_EXTABLE(24b,100b)
- _ASM_EXTABLE(25b,100b)
- _ASM_EXTABLE(26b,100b)
- _ASM_EXTABLE(27b,100b)
- _ASM_EXTABLE(28b,100b)
- _ASM_EXTABLE(29b,100b)
- _ASM_EXTABLE(30b,100b)
- _ASM_EXTABLE(31b,100b)
- _ASM_EXTABLE(32b,100b)
- _ASM_EXTABLE(33b,100b)
- _ASM_EXTABLE(34b,100b)
- _ASM_EXTABLE(35b,100b)
- _ASM_EXTABLE(36b,100b)
- _ASM_EXTABLE(37b,100b)
- _ASM_EXTABLE(99b,101b)
+ _ASM_EXTABLE_UA(1b, 100b)
+ _ASM_EXTABLE_UA(2b, 100b)
+ _ASM_EXTABLE_UA(3b, 100b)
+ _ASM_EXTABLE_UA(4b, 100b)
+ _ASM_EXTABLE_UA(5b, 100b)
+ _ASM_EXTABLE_UA(6b, 100b)
+ _ASM_EXTABLE_UA(7b, 100b)
+ _ASM_EXTABLE_UA(8b, 100b)
+ _ASM_EXTABLE_UA(9b, 100b)
+ _ASM_EXTABLE_UA(10b, 100b)
+ _ASM_EXTABLE_UA(11b, 100b)
+ _ASM_EXTABLE_UA(12b, 100b)
+ _ASM_EXTABLE_UA(13b, 100b)
+ _ASM_EXTABLE_UA(14b, 100b)
+ _ASM_EXTABLE_UA(15b, 100b)
+ _ASM_EXTABLE_UA(16b, 100b)
+ _ASM_EXTABLE_UA(17b, 100b)
+ _ASM_EXTABLE_UA(18b, 100b)
+ _ASM_EXTABLE_UA(19b, 100b)
+ _ASM_EXTABLE_UA(20b, 100b)
+ _ASM_EXTABLE_UA(21b, 100b)
+ _ASM_EXTABLE_UA(22b, 100b)
+ _ASM_EXTABLE_UA(23b, 100b)
+ _ASM_EXTABLE_UA(24b, 100b)
+ _ASM_EXTABLE_UA(25b, 100b)
+ _ASM_EXTABLE_UA(26b, 100b)
+ _ASM_EXTABLE_UA(27b, 100b)
+ _ASM_EXTABLE_UA(28b, 100b)
+ _ASM_EXTABLE_UA(29b, 100b)
+ _ASM_EXTABLE_UA(30b, 100b)
+ _ASM_EXTABLE_UA(31b, 100b)
+ _ASM_EXTABLE_UA(32b, 100b)
+ _ASM_EXTABLE_UA(33b, 100b)
+ _ASM_EXTABLE_UA(34b, 100b)
+ _ASM_EXTABLE_UA(35b, 100b)
+ _ASM_EXTABLE_UA(36b, 100b)
+ _ASM_EXTABLE_UA(37b, 100b)
+ _ASM_EXTABLE_UA(99b, 101b)
: "=&c"(size), "=&D" (d0), "=&S" (d1)
: "1"(to), "2"(from), "0"(size)
: "eax", "edx", "memory");
@@ -259,26 +259,26 @@ static unsigned long __copy_user_intel_nocache(void *to,
"9: lea 0(%%eax,%0,4),%0\n"
"16: jmp 8b\n"
".previous\n"
- _ASM_EXTABLE(0b,16b)
- _ASM_EXTABLE(1b,16b)
- _ASM_EXTABLE(2b,16b)
- _ASM_EXTABLE(21b,16b)
- _ASM_EXTABLE(3b,16b)
- _ASM_EXTABLE(31b,16b)
- _ASM_EXTABLE(4b,16b)
- _ASM_EXTABLE(41b,16b)
- _ASM_EXTABLE(10b,16b)
- _ASM_EXTABLE(51b,16b)
- _ASM_EXTABLE(11b,16b)
- _ASM_EXTABLE(61b,16b)
- _ASM_EXTABLE(12b,16b)
- _ASM_EXTABLE(71b,16b)
- _ASM_EXTABLE(13b,16b)
- _ASM_EXTABLE(81b,16b)
- _ASM_EXTABLE(14b,16b)
- _ASM_EXTABLE(91b,16b)
- _ASM_EXTABLE(6b,9b)
- _ASM_EXTABLE(7b,16b)
+ _ASM_EXTABLE_UA(0b, 16b)
+ _ASM_EXTABLE_UA(1b, 16b)
+ _ASM_EXTABLE_UA(2b, 16b)
+ _ASM_EXTABLE_UA(21b, 16b)
+ _ASM_EXTABLE_UA(3b, 16b)
+ _ASM_EXTABLE_UA(31b, 16b)
+ _ASM_EXTABLE_UA(4b, 16b)
+ _ASM_EXTABLE_UA(41b, 16b)
+ _ASM_EXTABLE_UA(10b, 16b)
+ _ASM_EXTABLE_UA(51b, 16b)
+ _ASM_EXTABLE_UA(11b, 16b)
+ _ASM_EXTABLE_UA(61b, 16b)
+ _ASM_EXTABLE_UA(12b, 16b)
+ _ASM_EXTABLE_UA(71b, 16b)
+ _ASM_EXTABLE_UA(13b, 16b)
+ _ASM_EXTABLE_UA(81b, 16b)
+ _ASM_EXTABLE_UA(14b, 16b)
+ _ASM_EXTABLE_UA(91b, 16b)
+ _ASM_EXTABLE_UA(6b, 9b)
+ _ASM_EXTABLE_UA(7b, 16b)
: "=&c"(size), "=&D" (d0), "=&S" (d1)
: "1"(to), "2"(from), "0"(size)
: "eax", "edx", "memory");
@@ -321,9 +321,9 @@ do { \
"3: lea 0(%3,%0,4),%0\n" \
" jmp 2b\n" \
".previous\n" \
- _ASM_EXTABLE(4b,5b) \
- _ASM_EXTABLE(0b,3b) \
- _ASM_EXTABLE(1b,2b) \
+ _ASM_EXTABLE_UA(4b, 5b) \
+ _ASM_EXTABLE_UA(0b, 3b) \
+ _ASM_EXTABLE_UA(1b, 2b) \
: "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2) \
: "3"(size), "0"(size), "1"(to), "2"(from) \
: "memory"); \
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index 9c5606d88f61..1bd837cdc4b1 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -37,8 +37,8 @@ unsigned long __clear_user(void __user *addr, unsigned long size)
"3: lea 0(%[size1],%[size8],8),%[size8]\n"
" jmp 2b\n"
".previous\n"
- _ASM_EXTABLE(0b,3b)
- _ASM_EXTABLE(1b,2b)
+ _ASM_EXTABLE_UA(0b, 3b)
+ _ASM_EXTABLE_UA(1b, 2b)
: [size8] "=&c"(size), [dst] "=&D" (__d0)
: [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr));
clac();
@@ -153,7 +153,7 @@ long __copy_user_flushcache(void *dst, const void __user *src, unsigned size)
return rc;
}
-void memcpy_flushcache(void *_dst, const void *_src, size_t size)
+void __memcpy_flushcache(void *_dst, const void *_src, size_t size)
{
unsigned long dest = (unsigned long) _dst;
unsigned long source = (unsigned long) _src;
@@ -216,7 +216,7 @@ void memcpy_flushcache(void *_dst, const void *_src, size_t size)
clean_cache_range((void *) dest, size);
}
}
-EXPORT_SYMBOL_GPL(memcpy_flushcache);
+EXPORT_SYMBOL_GPL(__memcpy_flushcache);
void memcpy_page_flushcache(char *to, struct page *page, size_t offset,
size_t len)
diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c
index 076ebdce9bd4..12d7e7fb4efd 100644
--- a/arch/x86/mm/cpu_entry_area.c
+++ b/arch/x86/mm/cpu_entry_area.c
@@ -15,7 +15,6 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage)
#ifdef CONFIG_X86_64
static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
-static DEFINE_PER_CPU(struct kcore_list, kcore_entry_trampoline);
#endif
struct cpu_entry_area *get_cpu_entry_area(int cpu)
@@ -83,8 +82,6 @@ static void percpu_setup_debug_store(int cpu)
static void __init setup_cpu_entry_area(int cpu)
{
#ifdef CONFIG_X86_64
- extern char _entry_trampoline[];
-
/* On 64-bit systems, we use a read-only fixmap GDT and TSS. */
pgprot_t gdt_prot = PAGE_KERNEL_RO;
pgprot_t tss_prot = PAGE_KERNEL_RO;
@@ -146,43 +143,10 @@ static void __init setup_cpu_entry_area(int cpu)
cea_map_percpu_pages(&get_cpu_entry_area(cpu)->exception_stacks,
&per_cpu(exception_stacks, cpu),
sizeof(exception_stacks) / PAGE_SIZE, PAGE_KERNEL);
-
- cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline,
- __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
- /*
- * The cpu_entry_area alias addresses are not in the kernel binary
- * so they do not show up in /proc/kcore normally. This adds entries
- * for them manually.
- */
- kclist_add_remap(&per_cpu(kcore_entry_trampoline, cpu),
- _entry_trampoline,
- &get_cpu_entry_area(cpu)->entry_trampoline, PAGE_SIZE);
#endif
percpu_setup_debug_store(cpu);
}
-#ifdef CONFIG_X86_64
-int arch_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
- char *name)
-{
- unsigned int cpu, ncpu = 0;
-
- if (symnum >= num_possible_cpus())
- return -EINVAL;
-
- for_each_possible_cpu(cpu) {
- if (ncpu++ >= symnum)
- break;
- }
-
- *value = (unsigned long)&get_cpu_entry_area(cpu)->entry_trampoline;
- *type = 't';
- strlcpy(name, "__entry_SYSCALL_64_trampoline", KSYM_NAME_LEN);
-
- return 0;
-}
-#endif
-
static __init void setup_cpu_entry_area_ptes(void)
{
#ifdef CONFIG_X86_32
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index a12afff146d1..fc37bbd23eb8 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -19,7 +19,9 @@
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/highmem.h>
+#include <linux/pci.h>
+#include <asm/e820/types.h>
#include <asm/pgtable.h>
/*
@@ -241,6 +243,29 @@ static unsigned long normalize_addr(unsigned long u)
return (signed long)(u << shift) >> shift;
}
+static void note_wx(struct pg_state *st)
+{
+ unsigned long npages;
+
+ npages = (st->current_address - st->start_address) / PAGE_SIZE;
+
+#ifdef CONFIG_PCI_BIOS
+ /*
+ * If PCI BIOS is enabled, the PCI BIOS area is forced to WX.
+ * Inform about it, but avoid the warning.
+ */
+ if (pcibios_enabled && st->start_address >= PAGE_OFFSET + BIOS_BEGIN &&
+ st->current_address <= PAGE_OFFSET + BIOS_END) {
+ pr_warn_once("x86/mm: PCI BIOS W+X mapping %lu pages\n", npages);
+ return;
+ }
+#endif
+ /* Account the WX pages */
+ st->wx_pages += npages;
+ WARN_ONCE(1, "x86/mm: Found insecure W+X mapping at address %pS\n",
+ (void *)st->start_address);
+}
+
/*
* This function gets called on a break in a continuous series
* of PTE entries; the next one is different so we need to
@@ -276,14 +301,8 @@ static void note_page(struct seq_file *m, struct pg_state *st,
unsigned long delta;
int width = sizeof(unsigned long) * 2;
- if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX)) {
- WARN_ONCE(1,
- "x86/mm: Found insecure W+X mapping at address %p/%pS\n",
- (void *)st->start_address,
- (void *)st->start_address);
- st->wx_pages += (st->current_address -
- st->start_address) / PAGE_SIZE;
- }
+ if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX))
+ note_wx(st);
/*
* Now print the actual finished series
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 45f5d6cf65ae..6521134057e8 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -8,7 +8,8 @@
#include <asm/kdebug.h>
typedef bool (*ex_handler_t)(const struct exception_table_entry *,
- struct pt_regs *, int);
+ struct pt_regs *, int, unsigned long,
+ unsigned long);
static inline unsigned long
ex_fixup_addr(const struct exception_table_entry *x)
@@ -22,7 +23,9 @@ ex_fixup_handler(const struct exception_table_entry *x)
}
__visible bool ex_handler_default(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr)
+ struct pt_regs *regs, int trapnr,
+ unsigned long error_code,
+ unsigned long fault_addr)
{
regs->ip = ex_fixup_addr(fixup);
return true;
@@ -30,7 +33,9 @@ __visible bool ex_handler_default(const struct exception_table_entry *fixup,
EXPORT_SYMBOL(ex_handler_default);
__visible bool ex_handler_fault(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr)
+ struct pt_regs *regs, int trapnr,
+ unsigned long error_code,
+ unsigned long fault_addr)
{
regs->ip = ex_fixup_addr(fixup);
regs->ax = trapnr;
@@ -43,7 +48,9 @@ EXPORT_SYMBOL_GPL(ex_handler_fault);
* result of a refcount inc/dec/add/sub.
*/
__visible bool ex_handler_refcount(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr)
+ struct pt_regs *regs, int trapnr,
+ unsigned long error_code,
+ unsigned long fault_addr)
{
/* First unconditionally saturate the refcount. */
*(int *)regs->cx = INT_MIN / 2;
@@ -96,7 +103,9 @@ EXPORT_SYMBOL(ex_handler_refcount);
* out all the FPU registers) if we can't restore from the task's FPU state.
*/
__visible bool ex_handler_fprestore(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr)
+ struct pt_regs *regs, int trapnr,
+ unsigned long error_code,
+ unsigned long fault_addr)
{
regs->ip = ex_fixup_addr(fixup);
@@ -108,9 +117,79 @@ __visible bool ex_handler_fprestore(const struct exception_table_entry *fixup,
}
EXPORT_SYMBOL_GPL(ex_handler_fprestore);
+/* Helper to check whether a uaccess fault indicates a kernel bug. */
+static bool bogus_uaccess(struct pt_regs *regs, int trapnr,
+ unsigned long fault_addr)
+{
+ /* This is the normal case: #PF with a fault address in userspace. */
+ if (trapnr == X86_TRAP_PF && fault_addr < TASK_SIZE_MAX)
+ return false;
+
+ /*
+ * This code can be reached for machine checks, but only if the #MC
+ * handler has already decided that it looks like a candidate for fixup.
+ * This e.g. happens when attempting to access userspace memory which
+ * the CPU can't access because of uncorrectable bad memory.
+ */
+ if (trapnr == X86_TRAP_MC)
+ return false;
+
+ /*
+ * There are two remaining exception types we might encounter here:
+ * - #PF for faulting accesses to kernel addresses
+ * - #GP for faulting accesses to noncanonical addresses
+ * Complain about anything else.
+ */
+ if (trapnr != X86_TRAP_PF && trapnr != X86_TRAP_GP) {
+ WARN(1, "unexpected trap %d in uaccess\n", trapnr);
+ return false;
+ }
+
+ /*
+ * This is a faulting memory access in kernel space, on a kernel
+ * address, in a usercopy function. This can e.g. be caused by improper
+ * use of helpers like __put_user and by improper attempts to access
+ * userspace addresses in KERNEL_DS regions.
+ * The one (semi-)legitimate exception are probe_kernel_{read,write}(),
+ * which can be invoked from places like kgdb, /dev/mem (for reading)
+ * and privileged BPF code (for reading).
+ * The probe_kernel_*() functions set the kernel_uaccess_faults_ok flag
+ * to tell us that faulting on kernel addresses, and even noncanonical
+ * addresses, in a userspace accessor does not necessarily imply a
+ * kernel bug, root might just be doing weird stuff.
+ */
+ if (current->kernel_uaccess_faults_ok)
+ return false;
+
+ /* This is bad. Refuse the fixup so that we go into die(). */
+ if (trapnr == X86_TRAP_PF) {
+ pr_emerg("BUG: pagefault on kernel address 0x%lx in non-whitelisted uaccess\n",
+ fault_addr);
+ } else {
+ pr_emerg("BUG: GPF in non-whitelisted uaccess (non-canonical address?)\n");
+ }
+ return true;
+}
+
+__visible bool ex_handler_uaccess(const struct exception_table_entry *fixup,
+ struct pt_regs *regs, int trapnr,
+ unsigned long error_code,
+ unsigned long fault_addr)
+{
+ if (bogus_uaccess(regs, trapnr, fault_addr))
+ return false;
+ regs->ip = ex_fixup_addr(fixup);
+ return true;
+}
+EXPORT_SYMBOL(ex_handler_uaccess);
+
__visible bool ex_handler_ext(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr)
+ struct pt_regs *regs, int trapnr,
+ unsigned long error_code,
+ unsigned long fault_addr)
{
+ if (bogus_uaccess(regs, trapnr, fault_addr))
+ return false;
/* Special hack for uaccess_err */
current->thread.uaccess_err = 1;
regs->ip = ex_fixup_addr(fixup);
@@ -119,7 +198,9 @@ __visible bool ex_handler_ext(const struct exception_table_entry *fixup,
EXPORT_SYMBOL(ex_handler_ext);
__visible bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr)
+ struct pt_regs *regs, int trapnr,
+ unsigned long error_code,
+ unsigned long fault_addr)
{
if (pr_warn_once("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pF)\n",
(unsigned int)regs->cx, regs->ip, (void *)regs->ip))
@@ -134,7 +215,9 @@ __visible bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup
EXPORT_SYMBOL(ex_handler_rdmsr_unsafe);
__visible bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr)
+ struct pt_regs *regs, int trapnr,
+ unsigned long error_code,
+ unsigned long fault_addr)
{
if (pr_warn_once("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pF)\n",
(unsigned int)regs->cx, (unsigned int)regs->dx,
@@ -148,12 +231,14 @@ __visible bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup
EXPORT_SYMBOL(ex_handler_wrmsr_unsafe);
__visible bool ex_handler_clear_fs(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr)
+ struct pt_regs *regs, int trapnr,
+ unsigned long error_code,
+ unsigned long fault_addr)
{
if (static_cpu_has(X86_BUG_NULL_SEG))
asm volatile ("mov %0, %%fs" : : "rm" (__USER_DS));
asm volatile ("mov %0, %%fs" : : "rm" (0));
- return ex_handler_default(fixup, regs, trapnr);
+ return ex_handler_default(fixup, regs, trapnr, error_code, fault_addr);
}
EXPORT_SYMBOL(ex_handler_clear_fs);
@@ -170,7 +255,8 @@ __visible bool ex_has_fault_handler(unsigned long ip)
return handler == ex_handler_fault;
}
-int fixup_exception(struct pt_regs *regs, int trapnr)
+int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code,
+ unsigned long fault_addr)
{
const struct exception_table_entry *e;
ex_handler_t handler;
@@ -194,7 +280,7 @@ int fixup_exception(struct pt_regs *regs, int trapnr)
return 0;
handler = ex_fixup_handler(e);
- return handler(e, regs, trapnr);
+ return handler(e, regs, trapnr, error_code, fault_addr);
}
extern unsigned int early_recursion_flag;
@@ -230,9 +316,9 @@ void __init early_fixup_exception(struct pt_regs *regs, int trapnr)
* result in a hard-to-debug panic.
*
* Keep in mind that not all vectors actually get here. Early
- * fage faults, for example, are special.
+ * page faults, for example, are special.
*/
- if (fixup_exception(regs, trapnr))
+ if (fixup_exception(regs, trapnr, regs->orig_ax, 0))
return;
if (fixup_bug(regs, trapnr))
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index b9123c497e0a..b24eb4eb9984 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -16,6 +16,7 @@
#include <linux/prefetch.h> /* prefetchw */
#include <linux/context_tracking.h> /* exception_enter(), ... */
#include <linux/uaccess.h> /* faulthandler_disabled() */
+#include <linux/efi.h> /* efi_recover_from_page_fault()*/
#include <linux/mm_types.h>
#include <asm/cpufeature.h> /* boot_cpu_has, ... */
@@ -25,6 +26,7 @@
#include <asm/vsyscall.h> /* emulate_vsyscall */
#include <asm/vm86.h> /* struct vm86 */
#include <asm/mmu_context.h> /* vma_pkey() */
+#include <asm/efi.h> /* efi_recover_from_page_fault()*/
#define CREATE_TRACE_POINTS
#include <asm/trace/exceptions.h>
@@ -44,17 +46,19 @@ kmmio_fault(struct pt_regs *regs, unsigned long addr)
static nokprobe_inline int kprobes_fault(struct pt_regs *regs)
{
- int ret = 0;
-
- /* kprobe_running() needs smp_processor_id() */
- if (kprobes_built_in() && !user_mode(regs)) {
- preempt_disable();
- if (kprobe_running() && kprobe_fault_handler(regs, 14))
- ret = 1;
- preempt_enable();
- }
-
- return ret;
+ if (!kprobes_built_in())
+ return 0;
+ if (user_mode(regs))
+ return 0;
+ /*
+ * To be potentially processing a kprobe fault and to be allowed to call
+ * kprobe_running(), we have to be non-preemptible.
+ */
+ if (preemptible())
+ return 0;
+ if (!kprobe_running())
+ return 0;
+ return kprobe_fault_handler(regs, X86_TRAP_PF);
}
/*
@@ -153,79 +157,6 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
return prefetch;
}
-/*
- * A protection key fault means that the PKRU value did not allow
- * access to some PTE. Userspace can figure out what PKRU was
- * from the XSAVE state, and this function fills out a field in
- * siginfo so userspace can discover which protection key was set
- * on the PTE.
- *
- * If we get here, we know that the hardware signaled a X86_PF_PK
- * fault and that there was a VMA once we got in the fault
- * handler. It does *not* guarantee that the VMA we find here
- * was the one that we faulted on.
- *
- * 1. T1 : mprotect_key(foo, PAGE_SIZE, pkey=4);
- * 2. T1 : set PKRU to deny access to pkey=4, touches page
- * 3. T1 : faults...
- * 4. T2: mprotect_key(foo, PAGE_SIZE, pkey=5);
- * 5. T1 : enters fault handler, takes mmap_sem, etc...
- * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really
- * faulted on a pte with its pkey=4.
- */
-static void fill_sig_info_pkey(int si_signo, int si_code, siginfo_t *info,
- u32 *pkey)
-{
- /* This is effectively an #ifdef */
- if (!boot_cpu_has(X86_FEATURE_OSPKE))
- return;
-
- /* Fault not from Protection Keys: nothing to do */
- if ((si_code != SEGV_PKUERR) || (si_signo != SIGSEGV))
- return;
- /*
- * force_sig_info_fault() is called from a number of
- * contexts, some of which have a VMA and some of which
- * do not. The X86_PF_PK handing happens after we have a
- * valid VMA, so we should never reach this without a
- * valid VMA.
- */
- if (!pkey) {
- WARN_ONCE(1, "PKU fault with no VMA passed in");
- info->si_pkey = 0;
- return;
- }
- /*
- * si_pkey should be thought of as a strong hint, but not
- * absolutely guranteed to be 100% accurate because of
- * the race explained above.
- */
- info->si_pkey = *pkey;
-}
-
-static void
-force_sig_info_fault(int si_signo, int si_code, unsigned long address,
- struct task_struct *tsk, u32 *pkey, int fault)
-{
- unsigned lsb = 0;
- siginfo_t info;
-
- clear_siginfo(&info);
- info.si_signo = si_signo;
- info.si_errno = 0;
- info.si_code = si_code;
- info.si_addr = (void __user *)address;
- if (fault & VM_FAULT_HWPOISON_LARGE)
- lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
- if (fault & VM_FAULT_HWPOISON)
- lsb = PAGE_SHIFT;
- info.si_addr_lsb = lsb;
-
- fill_sig_info_pkey(si_signo, si_code, &info, pkey);
-
- force_sig_info(si_signo, &info, tsk);
-}
-
DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);
@@ -709,7 +640,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
int sig;
/* Are we prepared to handle this kernel fault? */
- if (fixup_exception(regs, X86_TRAP_PF)) {
+ if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) {
/*
* Any interrupt that takes a fault gets the fixup. This makes
* the below recursive fault logic only apply to a faults from
@@ -730,8 +661,8 @@ no_context(struct pt_regs *regs, unsigned long error_code,
tsk->thread.cr2 = address;
/* XXX: hwpoison faults will set the wrong code. */
- force_sig_info_fault(signal, si_code, address,
- tsk, NULL, 0);
+ force_sig_fault(signal, si_code, (void __user *)address,
+ tsk);
}
/*
@@ -789,6 +720,13 @@ no_context(struct pt_regs *regs, unsigned long error_code,
return;
/*
+ * Buggy firmware could access regions which might page fault, try to
+ * recover from such faults.
+ */
+ if (IS_ENABLED(CONFIG_EFI))
+ efi_recover_from_page_fault(address);
+
+ /*
* Oops. The kernel tried to access some bad page. We'll have to
* terminate things with extreme prejudice:
*/
@@ -837,12 +775,21 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
printk(KERN_CONT "\n");
- show_opcodes((u8 *)regs->ip, loglvl);
+ show_opcodes(regs, loglvl);
+}
+
+/*
+ * The (legacy) vsyscall page is the long page in the kernel portion
+ * of the address space that has user-accessible permissions.
+ */
+static bool is_vsyscall_vaddr(unsigned long vaddr)
+{
+ return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR);
}
static void
__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
- unsigned long address, u32 *pkey, int si_code)
+ unsigned long address, u32 pkey, int si_code)
{
struct task_struct *tsk = current;
@@ -863,18 +810,6 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
if (is_errata100(regs, address))
return;
-#ifdef CONFIG_X86_64
- /*
- * Instruction fetch faults in the vsyscall page might need
- * emulation.
- */
- if (unlikely((error_code & X86_PF_INSTR) &&
- ((address & ~0xfff) == VSYSCALL_ADDR))) {
- if (emulate_vsyscall(regs, address))
- return;
- }
-#endif
-
/*
* To avoid leaking information about the kernel page table
* layout, pretend that user-mode accesses to kernel addresses
@@ -890,7 +825,10 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
tsk->thread.error_code = error_code;
tsk->thread.trap_nr = X86_TRAP_PF;
- force_sig_info_fault(SIGSEGV, si_code, address, tsk, pkey, 0);
+ if (si_code == SEGV_PKUERR)
+ force_sig_pkuerr((void __user *)address, pkey);
+
+ force_sig_fault(SIGSEGV, si_code, (void __user *)address, tsk);
return;
}
@@ -903,35 +841,29 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
static noinline void
bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
- unsigned long address, u32 *pkey)
+ unsigned long address)
{
- __bad_area_nosemaphore(regs, error_code, address, pkey, SEGV_MAPERR);
+ __bad_area_nosemaphore(regs, error_code, address, 0, SEGV_MAPERR);
}
static void
__bad_area(struct pt_regs *regs, unsigned long error_code,
- unsigned long address, struct vm_area_struct *vma, int si_code)
+ unsigned long address, u32 pkey, int si_code)
{
struct mm_struct *mm = current->mm;
- u32 pkey;
-
- if (vma)
- pkey = vma_pkey(vma);
-
/*
* Something tried to access memory that isn't in our memory map..
* Fix it, but check if it's kernel or user first..
*/
up_read(&mm->mmap_sem);
- __bad_area_nosemaphore(regs, error_code, address,
- (vma) ? &pkey : NULL, si_code);
+ __bad_area_nosemaphore(regs, error_code, address, pkey, si_code);
}
static noinline void
bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
{
- __bad_area(regs, error_code, address, NULL, SEGV_MAPERR);
+ __bad_area(regs, error_code, address, 0, SEGV_MAPERR);
}
static inline bool bad_area_access_from_pkeys(unsigned long error_code,
@@ -960,18 +892,40 @@ bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
* But, doing it this way allows compiler optimizations
* if pkeys are compiled out.
*/
- if (bad_area_access_from_pkeys(error_code, vma))
- __bad_area(regs, error_code, address, vma, SEGV_PKUERR);
- else
- __bad_area(regs, error_code, address, vma, SEGV_ACCERR);
+ if (bad_area_access_from_pkeys(error_code, vma)) {
+ /*
+ * A protection key fault means that the PKRU value did not allow
+ * access to some PTE. Userspace can figure out what PKRU was
+ * from the XSAVE state. This function captures the pkey from
+ * the vma and passes it to userspace so userspace can discover
+ * which protection key was set on the PTE.
+ *
+ * If we get here, we know that the hardware signaled a X86_PF_PK
+ * fault and that there was a VMA once we got in the fault
+ * handler. It does *not* guarantee that the VMA we find here
+ * was the one that we faulted on.
+ *
+ * 1. T1 : mprotect_key(foo, PAGE_SIZE, pkey=4);
+ * 2. T1 : set PKRU to deny access to pkey=4, touches page
+ * 3. T1 : faults...
+ * 4. T2: mprotect_key(foo, PAGE_SIZE, pkey=5);
+ * 5. T1 : enters fault handler, takes mmap_sem, etc...
+ * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really
+ * faulted on a pte with its pkey=4.
+ */
+ u32 pkey = vma_pkey(vma);
+
+ __bad_area(regs, error_code, address, pkey, SEGV_PKUERR);
+ } else {
+ __bad_area(regs, error_code, address, 0, SEGV_ACCERR);
+ }
}
static void
do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
- u32 *pkey, unsigned int fault)
+ unsigned int fault)
{
struct task_struct *tsk = current;
- int code = BUS_ADRERR;
/* Kernel mode? Handle exceptions or die: */
if (!(error_code & X86_PF_USER)) {
@@ -989,18 +943,25 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
#ifdef CONFIG_MEMORY_FAILURE
if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
- printk(KERN_ERR
+ unsigned lsb = 0;
+
+ pr_err(
"MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
tsk->comm, tsk->pid, address);
- code = BUS_MCEERR_AR;
+ if (fault & VM_FAULT_HWPOISON_LARGE)
+ lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
+ if (fault & VM_FAULT_HWPOISON)
+ lsb = PAGE_SHIFT;
+ force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, tsk);
+ return;
}
#endif
- force_sig_info_fault(SIGBUS, code, address, tsk, pkey, fault);
+ force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address, tsk);
}
static noinline void
mm_fault_error(struct pt_regs *regs, unsigned long error_code,
- unsigned long address, u32 *pkey, vm_fault_t fault)
+ unsigned long address, vm_fault_t fault)
{
if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) {
no_context(regs, error_code, address, 0, 0);
@@ -1024,27 +985,21 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
} else {
if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
VM_FAULT_HWPOISON_LARGE))
- do_sigbus(regs, error_code, address, pkey, fault);
+ do_sigbus(regs, error_code, address, fault);
else if (fault & VM_FAULT_SIGSEGV)
- bad_area_nosemaphore(regs, error_code, address, pkey);
+ bad_area_nosemaphore(regs, error_code, address);
else
BUG();
}
}
-static int spurious_fault_check(unsigned long error_code, pte_t *pte)
+static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte)
{
if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
return 0;
if ((error_code & X86_PF_INSTR) && !pte_exec(*pte))
return 0;
- /*
- * Note: We do not do lazy flushing on protection key
- * changes, so no spurious fault will ever set X86_PF_PK.
- */
- if ((error_code & X86_PF_PK))
- return 1;
return 1;
}
@@ -1071,7 +1026,7 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
* (Optional Invalidation).
*/
static noinline int
-spurious_fault(unsigned long error_code, unsigned long address)
+spurious_kernel_fault(unsigned long error_code, unsigned long address)
{
pgd_t *pgd;
p4d_t *p4d;
@@ -1102,27 +1057,27 @@ spurious_fault(unsigned long error_code, unsigned long address)
return 0;
if (p4d_large(*p4d))
- return spurious_fault_check(error_code, (pte_t *) p4d);
+ return spurious_kernel_fault_check(error_code, (pte_t *) p4d);
pud = pud_offset(p4d, address);
if (!pud_present(*pud))
return 0;
if (pud_large(*pud))
- return spurious_fault_check(error_code, (pte_t *) pud);
+ return spurious_kernel_fault_check(error_code, (pte_t *) pud);
pmd = pmd_offset(pud, address);
if (!pmd_present(*pmd))
return 0;
if (pmd_large(*pmd))
- return spurious_fault_check(error_code, (pte_t *) pmd);
+ return spurious_kernel_fault_check(error_code, (pte_t *) pmd);
pte = pte_offset_kernel(pmd, address);
if (!pte_present(*pte))
return 0;
- ret = spurious_fault_check(error_code, pte);
+ ret = spurious_kernel_fault_check(error_code, pte);
if (!ret)
return 0;
@@ -1130,12 +1085,12 @@ spurious_fault(unsigned long error_code, unsigned long address)
* Make sure we have permissions in PMD.
* If not, then there's a bug in the page tables:
*/
- ret = spurious_fault_check(error_code, (pte_t *) pmd);
+ ret = spurious_kernel_fault_check(error_code, (pte_t *) pmd);
WARN_ONCE(!ret, "PMD has incorrect permission bits\n");
return ret;
}
-NOKPROBE_SYMBOL(spurious_fault);
+NOKPROBE_SYMBOL(spurious_kernel_fault);
int show_unhandled_signals = 1;
@@ -1182,6 +1137,14 @@ access_error(unsigned long error_code, struct vm_area_struct *vma)
static int fault_in_kernel_space(unsigned long address)
{
+ /*
+ * On 64-bit systems, the vsyscall page is at an address above
+ * TASK_SIZE_MAX, but is not considered part of the kernel
+ * address space.
+ */
+ if (IS_ENABLED(CONFIG_X86_64) && is_vsyscall_vaddr(address))
+ return false;
+
return address >= TASK_SIZE_MAX;
}
@@ -1203,31 +1166,23 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs)
}
/*
- * This routine handles page faults. It determines the address,
- * and the problem, and then passes it off to one of the appropriate
- * routines.
+ * Called for all faults where 'address' is part of the kernel address
+ * space. Might get called for faults that originate from *code* that
+ * ran in userspace or the kernel.
*/
-static noinline void
-__do_page_fault(struct pt_regs *regs, unsigned long error_code,
- unsigned long address)
+static void
+do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
+ unsigned long address)
{
- struct vm_area_struct *vma;
- struct task_struct *tsk;
- struct mm_struct *mm;
- vm_fault_t fault, major = 0;
- unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
- u32 pkey;
-
- tsk = current;
- mm = tsk->mm;
-
- prefetchw(&mm->mmap_sem);
-
- if (unlikely(kmmio_fault(regs, address)))
- return;
+ /*
+ * Protection keys exceptions only happen on user pages. We
+ * have no user pages in the kernel portion of the address
+ * space, so do not expect them here.
+ */
+ WARN_ON_ONCE(hw_error_code & X86_PF_PK);
/*
- * We fault-in kernel-space virtual memory on-demand. The
+ * We can fault-in kernel-space virtual memory on-demand. The
* 'reference' page table is init_mm.pgd.
*
* NOTE! We MUST NOT take any locks for this case. We may
@@ -1235,41 +1190,73 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
* only copy the information from the master page table,
* nothing more.
*
- * This verifies that the fault happens in kernel space
- * (error_code & 4) == 0, and that the fault was not a
- * protection error (error_code & 9) == 0.
+ * Before doing this on-demand faulting, ensure that the
+ * fault is not any of the following:
+ * 1. A fault on a PTE with a reserved bit set.
+ * 2. A fault caused by a user-mode access. (Do not demand-
+ * fault kernel memory due to user-mode accesses).
+ * 3. A fault caused by a page-level protection violation.
+ * (A demand fault would be on a non-present page which
+ * would have X86_PF_PROT==0).
*/
- if (unlikely(fault_in_kernel_space(address))) {
- if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
- if (vmalloc_fault(address) >= 0)
- return;
- }
-
- /* Can handle a stale RO->RW TLB: */
- if (spurious_fault(error_code, address))
+ if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
+ if (vmalloc_fault(address) >= 0)
return;
+ }
- /* kprobes don't want to hook the spurious faults: */
- if (kprobes_fault(regs))
- return;
- /*
- * Don't take the mm semaphore here. If we fixup a prefetch
- * fault we could otherwise deadlock:
- */
- bad_area_nosemaphore(regs, error_code, address, NULL);
+ /* Was the fault spurious, caused by lazy TLB invalidation? */
+ if (spurious_kernel_fault(hw_error_code, address))
+ return;
+ /* kprobes don't want to hook the spurious faults: */
+ if (kprobes_fault(regs))
return;
- }
+
+ /*
+ * Note, despite being a "bad area", there are quite a few
+ * acceptable reasons to get here, such as erratum fixups
+ * and handling kernel code that can fault, like get_user().
+ *
+ * Don't take the mm semaphore here. If we fixup a prefetch
+ * fault we could otherwise deadlock:
+ */
+ bad_area_nosemaphore(regs, hw_error_code, address);
+}
+NOKPROBE_SYMBOL(do_kern_addr_fault);
+
+/* Handle faults in the user portion of the address space */
+static inline
+void do_user_addr_fault(struct pt_regs *regs,
+ unsigned long hw_error_code,
+ unsigned long address)
+{
+ unsigned long sw_error_code;
+ struct vm_area_struct *vma;
+ struct task_struct *tsk;
+ struct mm_struct *mm;
+ vm_fault_t fault, major = 0;
+ unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+
+ tsk = current;
+ mm = tsk->mm;
/* kprobes don't want to hook the spurious faults: */
if (unlikely(kprobes_fault(regs)))
return;
- if (unlikely(error_code & X86_PF_RSVD))
- pgtable_bad(regs, error_code, address);
+ /*
+ * Reserved bits are never expected to be set on
+ * entries in the user portion of the page tables.
+ */
+ if (unlikely(hw_error_code & X86_PF_RSVD))
+ pgtable_bad(regs, hw_error_code, address);
- if (unlikely(smap_violation(error_code, regs))) {
- bad_area_nosemaphore(regs, error_code, address, NULL);
+ /*
+ * Check for invalid kernel (supervisor) access to user
+ * pages in the user address space.
+ */
+ if (unlikely(smap_violation(hw_error_code, regs))) {
+ bad_area_nosemaphore(regs, hw_error_code, address);
return;
}
@@ -1278,11 +1265,18 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
* in a region with pagefaults disabled then we must not take the fault
*/
if (unlikely(faulthandler_disabled() || !mm)) {
- bad_area_nosemaphore(regs, error_code, address, NULL);
+ bad_area_nosemaphore(regs, hw_error_code, address);
return;
}
/*
+ * hw_error_code is literally the "page fault error code" passed to
+ * the kernel directly from the hardware. But, we will shortly be
+ * modifying it in software, so give it a new name.
+ */
+ sw_error_code = hw_error_code;
+
+ /*
* It's safe to allow irq's after cr2 has been saved and the
* vmalloc fault has been handled.
*
@@ -1291,7 +1285,26 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
*/
if (user_mode(regs)) {
local_irq_enable();
- error_code |= X86_PF_USER;
+ /*
+ * Up to this point, X86_PF_USER set in hw_error_code
+ * indicated a user-mode access. But, after this,
+ * X86_PF_USER in sw_error_code will indicate either
+ * that, *or* an implicit kernel(supervisor)-mode access
+ * which originated from user mode.
+ */
+ if (!(hw_error_code & X86_PF_USER)) {
+ /*
+ * The CPU was in user mode, but the CPU says
+ * the fault was not a user-mode access.
+ * Must be an implicit kernel-mode access,
+ * which we do not expect to happen in the
+ * user address space.
+ */
+ pr_warn_once("kernel-mode error from user-mode: %lx\n",
+ hw_error_code);
+
+ sw_error_code |= X86_PF_USER;
+ }
flags |= FAULT_FLAG_USER;
} else {
if (regs->flags & X86_EFLAGS_IF)
@@ -1300,31 +1313,49 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
- if (error_code & X86_PF_WRITE)
+ if (sw_error_code & X86_PF_WRITE)
flags |= FAULT_FLAG_WRITE;
- if (error_code & X86_PF_INSTR)
+ if (sw_error_code & X86_PF_INSTR)
flags |= FAULT_FLAG_INSTRUCTION;
+#ifdef CONFIG_X86_64
+ /*
+ * Instruction fetch faults in the vsyscall page might need
+ * emulation. The vsyscall page is at a high address
+ * (>PAGE_OFFSET), but is considered to be part of the user
+ * address space.
+ *
+ * The vsyscall page does not have a "real" VMA, so do this
+ * emulation before we go searching for VMAs.
+ */
+ if ((sw_error_code & X86_PF_INSTR) && is_vsyscall_vaddr(address)) {
+ if (emulate_vsyscall(regs, address))
+ return;
+ }
+#endif
+
/*
- * When running in the kernel we expect faults to occur only to
- * addresses in user space. All other faults represent errors in
- * the kernel and should generate an OOPS. Unfortunately, in the
- * case of an erroneous fault occurring in a code path which already
- * holds mmap_sem we will deadlock attempting to validate the fault
- * against the address space. Luckily the kernel only validly
- * references user space from well defined areas of code, which are
- * listed in the exceptions table.
+ * Kernel-mode access to the user address space should only occur
+ * on well-defined single instructions listed in the exception
+ * tables. But, an erroneous kernel fault occurring outside one of
+ * those areas which also holds mmap_sem might deadlock attempting
+ * to validate the fault against the address space.
*
- * As the vast majority of faults will be valid we will only perform
- * the source reference check when there is a possibility of a
- * deadlock. Attempt to lock the address space, if we cannot we then
- * validate the source. If this is invalid we can skip the address
- * space check, thus avoiding the deadlock:
+ * Only do the expensive exception table search when we might be at
+ * risk of a deadlock. This happens if we
+ * 1. Failed to acquire mmap_sem, and
+ * 2. The access did not originate in userspace. Note: either the
+ * hardware or earlier page fault code may set X86_PF_USER
+ * in sw_error_code.
*/
if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
- if (!(error_code & X86_PF_USER) &&
+ if (!(sw_error_code & X86_PF_USER) &&
!search_exception_tables(regs->ip)) {
- bad_area_nosemaphore(regs, error_code, address, NULL);
+ /*
+ * Fault from code in kernel from
+ * which we do not expect faults.
+ */
+ bad_area_nosemaphore(regs, sw_error_code, address);
return;
}
retry:
@@ -1340,16 +1371,16 @@ retry:
vma = find_vma(mm, address);
if (unlikely(!vma)) {
- bad_area(regs, error_code, address);
+ bad_area(regs, sw_error_code, address);
return;
}
if (likely(vma->vm_start <= address))
goto good_area;
if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
- bad_area(regs, error_code, address);
+ bad_area(regs, sw_error_code, address);
return;
}
- if (error_code & X86_PF_USER) {
+ if (sw_error_code & X86_PF_USER) {
/*
* Accessing the stack below %sp is always a bug.
* The large cushion allows instructions like enter
@@ -1357,12 +1388,12 @@ retry:
* 32 pointers and then decrements %sp by 65535.)
*/
if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
- bad_area(regs, error_code, address);
+ bad_area(regs, sw_error_code, address);
return;
}
}
if (unlikely(expand_stack(vma, address))) {
- bad_area(regs, error_code, address);
+ bad_area(regs, sw_error_code, address);
return;
}
@@ -1371,8 +1402,8 @@ retry:
* we can handle it..
*/
good_area:
- if (unlikely(access_error(error_code, vma))) {
- bad_area_access_error(regs, error_code, address, vma);
+ if (unlikely(access_error(sw_error_code, vma))) {
+ bad_area_access_error(regs, sw_error_code, address, vma);
return;
}
@@ -1388,10 +1419,7 @@ good_area:
* (potentially after handling any pending signal during the return to
* userland). The return to userland is identified whenever
* FAULT_FLAG_USER|FAULT_FLAG_KILLABLE are both set in flags.
- * Thus we have to be careful about not touching vma after handling the
- * fault, so we read the pkey beforehand.
*/
- pkey = vma_pkey(vma);
fault = handle_mm_fault(vma, address, flags);
major |= fault & VM_FAULT_MAJOR;
@@ -1414,13 +1442,13 @@ good_area:
return;
/* Not returning to user mode? Handle exceptions or die: */
- no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
+ no_context(regs, sw_error_code, address, SIGBUS, BUS_ADRERR);
return;
}
up_read(&mm->mmap_sem);
if (unlikely(fault & VM_FAULT_ERROR)) {
- mm_fault_error(regs, error_code, address, &pkey, fault);
+ mm_fault_error(regs, sw_error_code, address, fault);
return;
}
@@ -1438,6 +1466,28 @@ good_area:
check_v8086_mode(regs, address, tsk);
}
+NOKPROBE_SYMBOL(do_user_addr_fault);
+
+/*
+ * This routine handles page faults. It determines the address,
+ * and the problem, and then passes it off to one of the appropriate
+ * routines.
+ */
+static noinline void
+__do_page_fault(struct pt_regs *regs, unsigned long hw_error_code,
+ unsigned long address)
+{
+ prefetchw(&current->mm->mmap_sem);
+
+ if (unlikely(kmmio_fault(regs, address)))
+ return;
+
+ /* Was the fault on kernel-controlled part of the address space? */
+ if (unlikely(fault_in_kernel_space(address)))
+ do_kern_addr_fault(regs, hw_error_code, address);
+ else
+ do_user_addr_fault(regs, hw_error_code, address);
+}
NOKPROBE_SYMBOL(__do_page_fault);
static nokprobe_inline void
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 7a8fc26c1115..faca978ebf9d 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -815,10 +815,14 @@ void free_kernel_image_pages(void *begin, void *end)
set_memory_np_noalias(begin_ul, len_pages);
}
+void __weak mem_encrypt_free_decrypted_mem(void) { }
+
void __ref free_initmem(void)
{
e820__reallocate_tables();
+ mem_encrypt_free_decrypted_mem();
+
free_kernel_image_pages(&__init_begin, &__init_end);
}
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 979e0a02cbe1..142c7d9f89cc 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -923,34 +923,19 @@ static void mark_nxdata_nx(void)
void mark_rodata_ro(void)
{
unsigned long start = PFN_ALIGN(_text);
- unsigned long size = PFN_ALIGN(_etext) - start;
+ unsigned long size = (unsigned long)__end_rodata - start;
set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
- printk(KERN_INFO "Write protecting the kernel text: %luk\n",
+ pr_info("Write protecting kernel text and read-only data: %luk\n",
size >> 10);
kernel_set_to_readonly = 1;
#ifdef CONFIG_CPA_DEBUG
- printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
- start, start+size);
- set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT);
-
- printk(KERN_INFO "Testing CPA: write protecting again\n");
- set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
-#endif
-
- start += size;
- size = (unsigned long)__end_rodata - start;
- set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
- printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
- size >> 10);
-
-#ifdef CONFIG_CPA_DEBUG
- printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size);
+ pr_info("Testing CPA: Reverting %lx-%lx\n", start, start + size);
set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);
- printk(KERN_INFO "Testing CPA: write protecting again\n");
+ pr_info("Testing CPA: write protecting again\n");
set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
#endif
mark_nxdata_nx();
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index c63a545ec199..24e0920a9b25 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -131,7 +131,8 @@ static void __ioremap_check_mem(resource_size_t addr, unsigned long size,
* caller shouldn't need to know that small detail.
*/
static void __iomem *__ioremap_caller(resource_size_t phys_addr,
- unsigned long size, enum page_cache_mode pcm, void *caller)
+ unsigned long size, enum page_cache_mode pcm,
+ void *caller, bool encrypted)
{
unsigned long offset, vaddr;
resource_size_t last_addr;
@@ -199,7 +200,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
* resulting mapping.
*/
prot = PAGE_KERNEL_IO;
- if (sev_active() && mem_flags.desc_other)
+ if ((sev_active() && mem_flags.desc_other) || encrypted)
prot = pgprot_encrypted(prot);
switch (pcm) {
@@ -291,7 +292,7 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC_MINUS;
return __ioremap_caller(phys_addr, size, pcm,
- __builtin_return_address(0));
+ __builtin_return_address(0), false);
}
EXPORT_SYMBOL(ioremap_nocache);
@@ -324,7 +325,7 @@ void __iomem *ioremap_uc(resource_size_t phys_addr, unsigned long size)
enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC;
return __ioremap_caller(phys_addr, size, pcm,
- __builtin_return_address(0));
+ __builtin_return_address(0), false);
}
EXPORT_SYMBOL_GPL(ioremap_uc);
@@ -341,7 +342,7 @@ EXPORT_SYMBOL_GPL(ioremap_uc);
void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size)
{
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC,
- __builtin_return_address(0));
+ __builtin_return_address(0), false);
}
EXPORT_SYMBOL(ioremap_wc);
@@ -358,14 +359,21 @@ EXPORT_SYMBOL(ioremap_wc);
void __iomem *ioremap_wt(resource_size_t phys_addr, unsigned long size)
{
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WT,
- __builtin_return_address(0));
+ __builtin_return_address(0), false);
}
EXPORT_SYMBOL(ioremap_wt);
+void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size)
+{
+ return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB,
+ __builtin_return_address(0), true);
+}
+EXPORT_SYMBOL(ioremap_encrypted);
+
void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
{
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB,
- __builtin_return_address(0));
+ __builtin_return_address(0), false);
}
EXPORT_SYMBOL(ioremap_cache);
@@ -374,7 +382,7 @@ void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
{
return __ioremap_caller(phys_addr, size,
pgprot2cachemode(__pgprot(prot_val)),
- __builtin_return_address(0));
+ __builtin_return_address(0), false);
}
EXPORT_SYMBOL(ioremap_prot);
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index b2de398d1fd3..006f373f54ab 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -348,6 +348,30 @@ bool sev_active(void)
EXPORT_SYMBOL(sev_active);
/* Architecture __weak replacement functions */
+void __init mem_encrypt_free_decrypted_mem(void)
+{
+ unsigned long vaddr, vaddr_end, npages;
+ int r;
+
+ vaddr = (unsigned long)__start_bss_decrypted_unused;
+ vaddr_end = (unsigned long)__end_bss_decrypted;
+ npages = (vaddr_end - vaddr) >> PAGE_SHIFT;
+
+ /*
+ * The unused memory range was mapped decrypted, change the encryption
+ * attribute from decrypted to encrypted before freeing it.
+ */
+ if (mem_encrypt_active()) {
+ r = set_memory_encrypted(vaddr, npages);
+ if (r) {
+ pr_warn("failed to free unused decrypted pages\n");
+ return;
+ }
+ }
+
+ free_init_pages("unused decrypted", vaddr, vaddr_end);
+}
+
void __init mem_encrypt_init(void)
{
if (!sme_me_mask)
diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c
index 7ae36868aed2..a19ef1a416ff 100644
--- a/arch/x86/mm/mem_encrypt_identity.c
+++ b/arch/x86/mm/mem_encrypt_identity.c
@@ -27,6 +27,7 @@
* be extended when new paravirt and debugging variants are added.)
*/
#undef CONFIG_PARAVIRT
+#undef CONFIG_PARAVIRT_XXL
#undef CONFIG_PARAVIRT_SPINLOCKS
#include <linux/kernel.h>
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
index e500949bae24..2385538e8065 100644
--- a/arch/x86/mm/mpx.c
+++ b/arch/x86/mm/mpx.c
@@ -118,14 +118,11 @@ bad_opcode:
* anything it wants in to the instructions. We can not
* trust anything about it. They might not be valid
* instructions or might encode invalid registers, etc...
- *
- * The caller is expected to kfree() the returned siginfo_t.
*/
-siginfo_t *mpx_generate_siginfo(struct pt_regs *regs)
+int mpx_fault_info(struct mpx_fault_info *info, struct pt_regs *regs)
{
const struct mpx_bndreg_state *bndregs;
const struct mpx_bndreg *bndreg;
- siginfo_t *info = NULL;
struct insn insn;
uint8_t bndregno;
int err;
@@ -153,11 +150,6 @@ siginfo_t *mpx_generate_siginfo(struct pt_regs *regs)
/* now go select the individual register in the set of 4 */
bndreg = &bndregs->bndreg[bndregno];
- info = kzalloc(sizeof(*info), GFP_KERNEL);
- if (!info) {
- err = -ENOMEM;
- goto err_out;
- }
/*
* The registers are always 64-bit, but the upper 32
* bits are ignored in 32-bit mode. Also, note that the
@@ -168,27 +160,23 @@ siginfo_t *mpx_generate_siginfo(struct pt_regs *regs)
* complains when casting from integers to different-size
* pointers.
*/
- info->si_lower = (void __user *)(unsigned long)bndreg->lower_bound;
- info->si_upper = (void __user *)(unsigned long)~bndreg->upper_bound;
- info->si_addr_lsb = 0;
- info->si_signo = SIGSEGV;
- info->si_errno = 0;
- info->si_code = SEGV_BNDERR;
- info->si_addr = insn_get_addr_ref(&insn, regs);
+ info->lower = (void __user *)(unsigned long)bndreg->lower_bound;
+ info->upper = (void __user *)(unsigned long)~bndreg->upper_bound;
+ info->addr = insn_get_addr_ref(&insn, regs);
+
/*
* We were not able to extract an address from the instruction,
* probably because there was something invalid in it.
*/
- if (info->si_addr == (void __user *)-1) {
+ if (info->addr == (void __user *)-1) {
err = -EINVAL;
goto err_out;
}
- trace_mpx_bounds_register_exception(info->si_addr, bndreg);
- return info;
+ trace_mpx_bounds_register_exception(info->addr, bndreg);
+ return 0;
err_out:
/* info might be NULL, but kfree() handles that */
- kfree(info);
- return ERR_PTR(err);
+ return err;
}
static __user void *mpx_get_bounds_dir(void)
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 8d6c34fe49be..62bb30b4bd2a 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -37,11 +37,20 @@ struct cpa_data {
unsigned long numpages;
int flags;
unsigned long pfn;
- unsigned force_split : 1;
+ unsigned force_split : 1,
+ force_static_prot : 1;
int curpage;
struct page **pages;
};
+enum cpa_warn {
+ CPA_CONFLICT,
+ CPA_PROTECT,
+ CPA_DETECT,
+};
+
+static const int cpa_warn_level = CPA_PROTECT;
+
/*
* Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings)
* using cpa_lock. So that we don't allow any other cpu, with stale large tlb
@@ -94,6 +103,87 @@ void arch_report_meminfo(struct seq_file *m)
static inline void split_page_count(int level) { }
#endif
+#ifdef CONFIG_X86_CPA_STATISTICS
+
+static unsigned long cpa_1g_checked;
+static unsigned long cpa_1g_sameprot;
+static unsigned long cpa_1g_preserved;
+static unsigned long cpa_2m_checked;
+static unsigned long cpa_2m_sameprot;
+static unsigned long cpa_2m_preserved;
+static unsigned long cpa_4k_install;
+
+static inline void cpa_inc_1g_checked(void)
+{
+ cpa_1g_checked++;
+}
+
+static inline void cpa_inc_2m_checked(void)
+{
+ cpa_2m_checked++;
+}
+
+static inline void cpa_inc_4k_install(void)
+{
+ cpa_4k_install++;
+}
+
+static inline void cpa_inc_lp_sameprot(int level)
+{
+ if (level == PG_LEVEL_1G)
+ cpa_1g_sameprot++;
+ else
+ cpa_2m_sameprot++;
+}
+
+static inline void cpa_inc_lp_preserved(int level)
+{
+ if (level == PG_LEVEL_1G)
+ cpa_1g_preserved++;
+ else
+ cpa_2m_preserved++;
+}
+
+static int cpastats_show(struct seq_file *m, void *p)
+{
+ seq_printf(m, "1G pages checked: %16lu\n", cpa_1g_checked);
+ seq_printf(m, "1G pages sameprot: %16lu\n", cpa_1g_sameprot);
+ seq_printf(m, "1G pages preserved: %16lu\n", cpa_1g_preserved);
+ seq_printf(m, "2M pages checked: %16lu\n", cpa_2m_checked);
+ seq_printf(m, "2M pages sameprot: %16lu\n", cpa_2m_sameprot);
+ seq_printf(m, "2M pages preserved: %16lu\n", cpa_2m_preserved);
+ seq_printf(m, "4K pages set-checked: %16lu\n", cpa_4k_install);
+ return 0;
+}
+
+static int cpastats_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, cpastats_show, NULL);
+}
+
+static const struct file_operations cpastats_fops = {
+ .open = cpastats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init cpa_stats_init(void)
+{
+ debugfs_create_file("cpa_stats", S_IRUSR, arch_debugfs_dir, NULL,
+ &cpastats_fops);
+ return 0;
+}
+late_initcall(cpa_stats_init);
+#else
+static inline void cpa_inc_1g_checked(void) { }
+static inline void cpa_inc_2m_checked(void) { }
+static inline void cpa_inc_4k_install(void) { }
+static inline void cpa_inc_lp_sameprot(int level) { }
+static inline void cpa_inc_lp_preserved(int level) { }
+#endif
+
+
static inline int
within(unsigned long addr, unsigned long start, unsigned long end)
{
@@ -195,14 +285,20 @@ static void cpa_flush_all(unsigned long cache)
on_each_cpu(__cpa_flush_all, (void *) cache, 1);
}
-static void __cpa_flush_range(void *arg)
+static bool __cpa_flush_range(unsigned long start, int numpages, int cache)
{
- /*
- * We could optimize that further and do individual per page
- * tlb invalidates for a low number of pages. Caveat: we must
- * flush the high aliases on 64bit as well.
- */
- __flush_tlb_all();
+ BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
+
+ WARN_ON(PAGE_ALIGN(start) != start);
+
+ if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) {
+ cpa_flush_all(cache);
+ return true;
+ }
+
+ flush_tlb_kernel_range(start, start + PAGE_SIZE * numpages);
+
+ return !cache;
}
static void cpa_flush_range(unsigned long start, int numpages, int cache)
@@ -210,12 +306,7 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
unsigned int i, level;
unsigned long addr;
- BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
- WARN_ON(PAGE_ALIGN(start) != start);
-
- on_each_cpu(__cpa_flush_range, NULL, 1);
-
- if (!cache)
+ if (__cpa_flush_range(start, numpages, cache))
return;
/*
@@ -235,30 +326,13 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
}
}
-static void cpa_flush_array(unsigned long *start, int numpages, int cache,
+static void cpa_flush_array(unsigned long baddr, unsigned long *start,
+ int numpages, int cache,
int in_flags, struct page **pages)
{
unsigned int i, level;
-#ifdef CONFIG_PREEMPT
- /*
- * Avoid wbinvd() because it causes latencies on all CPUs,
- * regardless of any CPU isolation that may be in effect.
- *
- * This should be extended for CAT enabled systems independent of
- * PREEMPT because wbinvd() does not respect the CAT partitions and
- * this is exposed to unpriviledged users through the graphics
- * subsystem.
- */
- unsigned long do_wbinvd = 0;
-#else
- unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */
-#endif
-
- BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
-
- on_each_cpu(__cpa_flush_all, (void *) do_wbinvd, 1);
- if (!cache || do_wbinvd)
+ if (__cpa_flush_range(baddr, numpages, cache))
return;
/*
@@ -286,84 +360,179 @@ static void cpa_flush_array(unsigned long *start, int numpages, int cache,
}
}
-/*
- * Certain areas of memory on x86 require very specific protection flags,
- * for example the BIOS area or kernel text. Callers don't always get this
- * right (again, ioremap() on BIOS memory is not uncommon) so this function
- * checks and fixes these known static required protection bits.
- */
-static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
- unsigned long pfn)
+static bool overlaps(unsigned long r1_start, unsigned long r1_end,
+ unsigned long r2_start, unsigned long r2_end)
{
- pgprot_t forbidden = __pgprot(0);
+ return (r1_start <= r2_end && r1_end >= r2_start) ||
+ (r2_start <= r1_end && r2_end >= r1_start);
+}
- /*
- * The BIOS area between 640k and 1Mb needs to be executable for
- * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
- */
#ifdef CONFIG_PCI_BIOS
- if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
- pgprot_val(forbidden) |= _PAGE_NX;
+/*
+ * The BIOS area between 640k and 1Mb needs to be executable for PCI BIOS
+ * based config access (CONFIG_PCI_GOBIOS) support.
+ */
+#define BIOS_PFN PFN_DOWN(BIOS_BEGIN)
+#define BIOS_PFN_END PFN_DOWN(BIOS_END - 1)
+
+static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn)
+{
+ if (pcibios_enabled && overlaps(spfn, epfn, BIOS_PFN, BIOS_PFN_END))
+ return _PAGE_NX;
+ return 0;
+}
+#else
+static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn)
+{
+ return 0;
+}
#endif
- /*
- * The kernel text needs to be executable for obvious reasons
- * Does not cover __inittext since that is gone later on. On
- * 64bit we do not enforce !NX on the low mapping
- */
- if (within(address, (unsigned long)_text, (unsigned long)_etext))
- pgprot_val(forbidden) |= _PAGE_NX;
+/*
+ * The .rodata section needs to be read-only. Using the pfn catches all
+ * aliases. This also includes __ro_after_init, so do not enforce until
+ * kernel_set_to_readonly is true.
+ */
+static pgprotval_t protect_rodata(unsigned long spfn, unsigned long epfn)
+{
+ unsigned long epfn_ro, spfn_ro = PFN_DOWN(__pa_symbol(__start_rodata));
/*
- * The .rodata section needs to be read-only. Using the pfn
- * catches all aliases. This also includes __ro_after_init,
- * so do not enforce until kernel_set_to_readonly is true.
+ * Note: __end_rodata is at page aligned and not inclusive, so
+ * subtract 1 to get the last enforced PFN in the rodata area.
*/
- if (kernel_set_to_readonly &&
- within(pfn, __pa_symbol(__start_rodata) >> PAGE_SHIFT,
- __pa_symbol(__end_rodata) >> PAGE_SHIFT))
- pgprot_val(forbidden) |= _PAGE_RW;
+ epfn_ro = PFN_DOWN(__pa_symbol(__end_rodata)) - 1;
+
+ if (kernel_set_to_readonly && overlaps(spfn, epfn, spfn_ro, epfn_ro))
+ return _PAGE_RW;
+ return 0;
+}
+
+/*
+ * Protect kernel text against becoming non executable by forbidding
+ * _PAGE_NX. This protects only the high kernel mapping (_text -> _etext)
+ * out of which the kernel actually executes. Do not protect the low
+ * mapping.
+ *
+ * This does not cover __inittext since that is gone after boot.
+ */
+static pgprotval_t protect_kernel_text(unsigned long start, unsigned long end)
+{
+ unsigned long t_end = (unsigned long)_etext - 1;
+ unsigned long t_start = (unsigned long)_text;
+
+ if (overlaps(start, end, t_start, t_end))
+ return _PAGE_NX;
+ return 0;
+}
#if defined(CONFIG_X86_64)
+/*
+ * Once the kernel maps the text as RO (kernel_set_to_readonly is set),
+ * kernel text mappings for the large page aligned text, rodata sections
+ * will be always read-only. For the kernel identity mappings covering the
+ * holes caused by this alignment can be anything that user asks.
+ *
+ * This will preserve the large page mappings for kernel text/data at no
+ * extra cost.
+ */
+static pgprotval_t protect_kernel_text_ro(unsigned long start,
+ unsigned long end)
+{
+ unsigned long t_end = (unsigned long)__end_rodata_hpage_align - 1;
+ unsigned long t_start = (unsigned long)_text;
+ unsigned int level;
+
+ if (!kernel_set_to_readonly || !overlaps(start, end, t_start, t_end))
+ return 0;
/*
- * Once the kernel maps the text as RO (kernel_set_to_readonly is set),
- * kernel text mappings for the large page aligned text, rodata sections
- * will be always read-only. For the kernel identity mappings covering
- * the holes caused by this alignment can be anything that user asks.
+ * Don't enforce the !RW mapping for the kernel text mapping, if
+ * the current mapping is already using small page mapping. No
+ * need to work hard to preserve large page mappings in this case.
*
- * This will preserve the large page mappings for kernel text/data
- * at no extra cost.
+ * This also fixes the Linux Xen paravirt guest boot failure caused
+ * by unexpected read-only mappings for kernel identity
+ * mappings. In this paravirt guest case, the kernel text mapping
+ * and the kernel identity mapping share the same page-table pages,
+ * so the protections for kernel text and identity mappings have to
+ * be the same.
*/
- if (kernel_set_to_readonly &&
- within(address, (unsigned long)_text,
- (unsigned long)__end_rodata_hpage_align)) {
- unsigned int level;
-
- /*
- * Don't enforce the !RW mapping for the kernel text mapping,
- * if the current mapping is already using small page mapping.
- * No need to work hard to preserve large page mappings in this
- * case.
- *
- * This also fixes the Linux Xen paravirt guest boot failure
- * (because of unexpected read-only mappings for kernel identity
- * mappings). In this paravirt guest case, the kernel text
- * mapping and the kernel identity mapping share the same
- * page-table pages. Thus we can't really use different
- * protections for the kernel text and identity mappings. Also,
- * these shared mappings are made of small page mappings.
- * Thus this don't enforce !RW mapping for small page kernel
- * text mapping logic will help Linux Xen parvirt guest boot
- * as well.
- */
- if (lookup_address(address, &level) && (level != PG_LEVEL_4K))
- pgprot_val(forbidden) |= _PAGE_RW;
- }
+ if (lookup_address(start, &level) && (level != PG_LEVEL_4K))
+ return _PAGE_RW;
+ return 0;
+}
+#else
+static pgprotval_t protect_kernel_text_ro(unsigned long start,
+ unsigned long end)
+{
+ return 0;
+}
#endif
- prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
+static inline bool conflicts(pgprot_t prot, pgprotval_t val)
+{
+ return (pgprot_val(prot) & ~val) != pgprot_val(prot);
+}
- return prot;
+static inline void check_conflict(int warnlvl, pgprot_t prot, pgprotval_t val,
+ unsigned long start, unsigned long end,
+ unsigned long pfn, const char *txt)
+{
+ static const char *lvltxt[] = {
+ [CPA_CONFLICT] = "conflict",
+ [CPA_PROTECT] = "protect",
+ [CPA_DETECT] = "detect",
+ };
+
+ if (warnlvl > cpa_warn_level || !conflicts(prot, val))
+ return;
+
+ pr_warn("CPA %8s %10s: 0x%016lx - 0x%016lx PFN %lx req %016llx prevent %016llx\n",
+ lvltxt[warnlvl], txt, start, end, pfn, (unsigned long long)pgprot_val(prot),
+ (unsigned long long)val);
+}
+
+/*
+ * Certain areas of memory on x86 require very specific protection flags,
+ * for example the BIOS area or kernel text. Callers don't always get this
+ * right (again, ioremap() on BIOS memory is not uncommon) so this function
+ * checks and fixes these known static required protection bits.
+ */
+static inline pgprot_t static_protections(pgprot_t prot, unsigned long start,
+ unsigned long pfn, unsigned long npg,
+ int warnlvl)
+{
+ pgprotval_t forbidden, res;
+ unsigned long end;
+
+ /*
+ * There is no point in checking RW/NX conflicts when the requested
+ * mapping is setting the page !PRESENT.
+ */
+ if (!(pgprot_val(prot) & _PAGE_PRESENT))
+ return prot;
+
+ /* Operate on the virtual address */
+ end = start + npg * PAGE_SIZE - 1;
+
+ res = protect_kernel_text(start, end);
+ check_conflict(warnlvl, prot, res, start, end, pfn, "Text NX");
+ forbidden = res;
+
+ res = protect_kernel_text_ro(start, end);
+ check_conflict(warnlvl, prot, res, start, end, pfn, "Text RO");
+ forbidden |= res;
+
+ /* Check the PFN directly */
+ res = protect_pci_bios(pfn, pfn + npg - 1);
+ check_conflict(warnlvl, prot, res, start, end, pfn, "PCIBIOS NX");
+ forbidden |= res;
+
+ res = protect_rodata(pfn, pfn + npg - 1);
+ check_conflict(warnlvl, prot, res, start, end, pfn, "Rodata RO");
+ forbidden |= res;
+
+ return __pgprot(pgprot_val(prot) & ~forbidden);
}
/*
@@ -421,18 +590,18 @@ pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
*/
pte_t *lookup_address(unsigned long address, unsigned int *level)
{
- return lookup_address_in_pgd(pgd_offset_k(address), address, level);
+ return lookup_address_in_pgd(pgd_offset_k(address), address, level);
}
EXPORT_SYMBOL_GPL(lookup_address);
static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
unsigned int *level)
{
- if (cpa->pgd)
+ if (cpa->pgd)
return lookup_address_in_pgd(cpa->pgd + pgd_index(address),
address, level);
- return lookup_address(address, level);
+ return lookup_address(address, level);
}
/*
@@ -549,40 +718,35 @@ static pgprot_t pgprot_clear_protnone_bits(pgprot_t prot)
return prot;
}
-static int
-try_preserve_large_page(pte_t *kpte, unsigned long address,
- struct cpa_data *cpa)
+static int __should_split_large_page(pte_t *kpte, unsigned long address,
+ struct cpa_data *cpa)
{
- unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn, old_pfn;
+ unsigned long numpages, pmask, psize, lpaddr, pfn, old_pfn;
+ pgprot_t old_prot, new_prot, req_prot, chk_prot;
pte_t new_pte, old_pte, *tmp;
- pgprot_t old_prot, new_prot, req_prot;
- int i, do_split = 1;
enum pg_level level;
- if (cpa->force_split)
- return 1;
-
- spin_lock(&pgd_lock);
/*
* Check for races, another CPU might have split this page
* up already:
*/
tmp = _lookup_address_cpa(cpa, address, &level);
if (tmp != kpte)
- goto out_unlock;
+ return 1;
switch (level) {
case PG_LEVEL_2M:
old_prot = pmd_pgprot(*(pmd_t *)kpte);
old_pfn = pmd_pfn(*(pmd_t *)kpte);
+ cpa_inc_2m_checked();
break;
case PG_LEVEL_1G:
old_prot = pud_pgprot(*(pud_t *)kpte);
old_pfn = pud_pfn(*(pud_t *)kpte);
+ cpa_inc_1g_checked();
break;
default:
- do_split = -EINVAL;
- goto out_unlock;
+ return -EINVAL;
}
psize = page_level_size(level);
@@ -592,8 +756,8 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
* Calculate the number of pages, which fit into this large
* page starting at address:
*/
- nextpage_addr = (address + psize) & pmask;
- numpages = (nextpage_addr - address) >> PAGE_SHIFT;
+ lpaddr = (address + psize) & pmask;
+ numpages = (lpaddr - address) >> PAGE_SHIFT;
if (numpages < cpa->numpages)
cpa->numpages = numpages;
@@ -620,71 +784,142 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
pgprot_val(req_prot) |= _PAGE_PSE;
/*
- * old_pfn points to the large page base pfn. So we need
- * to add the offset of the virtual address:
+ * old_pfn points to the large page base pfn. So we need to add the
+ * offset of the virtual address:
*/
pfn = old_pfn + ((address & (psize - 1)) >> PAGE_SHIFT);
cpa->pfn = pfn;
- new_prot = static_protections(req_prot, address, pfn);
+ /*
+ * Calculate the large page base address and the number of 4K pages
+ * in the large page
+ */
+ lpaddr = address & pmask;
+ numpages = psize >> PAGE_SHIFT;
/*
- * We need to check the full range, whether
- * static_protection() requires a different pgprot for one of
- * the pages in the range we try to preserve:
+ * Sanity check that the existing mapping is correct versus the static
+ * protections. static_protections() guards against !PRESENT, so no
+ * extra conditional required here.
*/
- addr = address & pmask;
- pfn = old_pfn;
- for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) {
- pgprot_t chk_prot = static_protections(req_prot, addr, pfn);
+ chk_prot = static_protections(old_prot, lpaddr, old_pfn, numpages,
+ CPA_CONFLICT);
- if (pgprot_val(chk_prot) != pgprot_val(new_prot))
- goto out_unlock;
+ if (WARN_ON_ONCE(pgprot_val(chk_prot) != pgprot_val(old_prot))) {
+ /*
+ * Split the large page and tell the split code to
+ * enforce static protections.
+ */
+ cpa->force_static_prot = 1;
+ return 1;
}
/*
- * If there are no changes, return. maxpages has been updated
- * above:
+ * Optimization: If the requested pgprot is the same as the current
+ * pgprot, then the large page can be preserved and no updates are
+ * required independent of alignment and length of the requested
+ * range. The above already established that the current pgprot is
+ * correct, which in consequence makes the requested pgprot correct
+ * as well if it is the same. The static protection scan below will
+ * not come to a different conclusion.
*/
- if (pgprot_val(new_prot) == pgprot_val(old_prot)) {
- do_split = 0;
- goto out_unlock;
+ if (pgprot_val(req_prot) == pgprot_val(old_prot)) {
+ cpa_inc_lp_sameprot(level);
+ return 0;
}
/*
- * We need to change the attributes. Check, whether we can
- * change the large page in one go. We request a split, when
- * the address is not aligned and the number of pages is
- * smaller than the number of pages in the large page. Note
- * that we limited the number of possible pages already to
- * the number of pages in the large page.
+ * If the requested range does not cover the full page, split it up
*/
- if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) {
- /*
- * The address is aligned and the number of pages
- * covers the full page.
- */
- new_pte = pfn_pte(old_pfn, new_prot);
- __set_pmd_pte(kpte, address, new_pte);
- cpa->flags |= CPA_FLUSHTLB;
- do_split = 0;
- }
+ if (address != lpaddr || cpa->numpages != numpages)
+ return 1;
+
+ /*
+ * Check whether the requested pgprot is conflicting with a static
+ * protection requirement in the large page.
+ */
+ new_prot = static_protections(req_prot, lpaddr, old_pfn, numpages,
+ CPA_DETECT);
+
+ /*
+ * If there is a conflict, split the large page.
+ *
+ * There used to be a 4k wise evaluation trying really hard to
+ * preserve the large pages, but experimentation has shown, that this
+ * does not help at all. There might be corner cases which would
+ * preserve one large page occasionally, but it's really not worth the
+ * extra code and cycles for the common case.
+ */
+ if (pgprot_val(req_prot) != pgprot_val(new_prot))
+ return 1;
-out_unlock:
+ /* All checks passed. Update the large page mapping. */
+ new_pte = pfn_pte(old_pfn, new_prot);
+ __set_pmd_pte(kpte, address, new_pte);
+ cpa->flags |= CPA_FLUSHTLB;
+ cpa_inc_lp_preserved(level);
+ return 0;
+}
+
+static int should_split_large_page(pte_t *kpte, unsigned long address,
+ struct cpa_data *cpa)
+{
+ int do_split;
+
+ if (cpa->force_split)
+ return 1;
+
+ spin_lock(&pgd_lock);
+ do_split = __should_split_large_page(kpte, address, cpa);
spin_unlock(&pgd_lock);
return do_split;
}
+static void split_set_pte(struct cpa_data *cpa, pte_t *pte, unsigned long pfn,
+ pgprot_t ref_prot, unsigned long address,
+ unsigned long size)
+{
+ unsigned int npg = PFN_DOWN(size);
+ pgprot_t prot;
+
+ /*
+ * If should_split_large_page() discovered an inconsistent mapping,
+ * remove the invalid protection in the split mapping.
+ */
+ if (!cpa->force_static_prot)
+ goto set;
+
+ prot = static_protections(ref_prot, address, pfn, npg, CPA_PROTECT);
+
+ if (pgprot_val(prot) == pgprot_val(ref_prot))
+ goto set;
+
+ /*
+ * If this is splitting a PMD, fix it up. PUD splits cannot be
+ * fixed trivially as that would require to rescan the newly
+ * installed PMD mappings after returning from split_large_page()
+ * so an eventual further split can allocate the necessary PTE
+ * pages. Warn for now and revisit it in case this actually
+ * happens.
+ */
+ if (size == PAGE_SIZE)
+ ref_prot = prot;
+ else
+ pr_warn_once("CPA: Cannot fixup static protections for PUD split\n");
+set:
+ set_pte(pte, pfn_pte(pfn, ref_prot));
+}
+
static int
__split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
struct page *base)
{
+ unsigned long lpaddr, lpinc, ref_pfn, pfn, pfninc = 1;
pte_t *pbase = (pte_t *)page_address(base);
- unsigned long ref_pfn, pfn, pfninc = 1;
unsigned int i, level;
- pte_t *tmp;
pgprot_t ref_prot;
+ pte_t *tmp;
spin_lock(&pgd_lock);
/*
@@ -707,15 +942,17 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
* PAT bit to correct position.
*/
ref_prot = pgprot_large_2_4k(ref_prot);
-
ref_pfn = pmd_pfn(*(pmd_t *)kpte);
+ lpaddr = address & PMD_MASK;
+ lpinc = PAGE_SIZE;
break;
case PG_LEVEL_1G:
ref_prot = pud_pgprot(*(pud_t *)kpte);
ref_pfn = pud_pfn(*(pud_t *)kpte);
pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
-
+ lpaddr = address & PUD_MASK;
+ lpinc = PMD_SIZE;
/*
* Clear the PSE flags if the PRESENT flag is not set
* otherwise pmd_present/pmd_huge will return true
@@ -736,8 +973,8 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
* Get the target pfn from the original entry:
*/
pfn = ref_pfn;
- for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
- set_pte(&pbase[i], pfn_pte(pfn, ref_prot));
+ for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc, lpaddr += lpinc)
+ split_set_pte(cpa, pbase + i, pfn, ref_prot, lpaddr, lpinc);
if (virt_addr_valid(address)) {
unsigned long pfn = PFN_DOWN(__pa(address));
@@ -756,14 +993,24 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
__set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE)));
/*
- * Intel Atom errata AAH41 workaround.
+ * Do a global flush tlb after splitting the large page
+ * and before we do the actual change page attribute in the PTE.
+ *
+ * Without this, we violate the TLB application note, that says:
+ * "The TLBs may contain both ordinary and large-page
+ * translations for a 4-KByte range of linear addresses. This
+ * may occur if software modifies the paging structures so that
+ * the page size used for the address range changes. If the two
+ * translations differ with respect to page frame or attributes
+ * (e.g., permissions), processor behavior is undefined and may
+ * be implementation-specific."
*
- * The real fix should be in hw or in a microcode update, but
- * we also probabilistically try to reduce the window of having
- * a large TLB mixed with 4K TLBs while instruction fetches are
- * going on.
+ * We do this global tlb flush inside the cpa_lock, so that we
+ * don't allow any other cpu, with stale tlb entries change the
+ * page attribute in parallel, that also falls into the
+ * just split large page entry.
*/
- __flush_tlb_all();
+ flush_tlb_all();
spin_unlock(&pgd_lock);
return 0;
@@ -1247,7 +1494,9 @@ repeat:
pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
- new_prot = static_protections(new_prot, address, pfn);
+ cpa_inc_4k_install();
+ new_prot = static_protections(new_prot, address, pfn, 1,
+ CPA_PROTECT);
new_prot = pgprot_clear_protnone_bits(new_prot);
@@ -1273,7 +1522,7 @@ repeat:
* Check, whether we can keep the large page intact
* and just change the pte:
*/
- do_split = try_preserve_large_page(kpte, address, cpa);
+ do_split = should_split_large_page(kpte, address, cpa);
/*
* When the range fits into the existing large page,
* return. cp->numpages and cpa->tlbflush have been updated in
@@ -1286,28 +1535,8 @@ repeat:
* We have to split the large page:
*/
err = split_large_page(cpa, kpte, address);
- if (!err) {
- /*
- * Do a global flush tlb after splitting the large page
- * and before we do the actual change page attribute in the PTE.
- *
- * With out this, we violate the TLB application note, that says
- * "The TLBs may contain both ordinary and large-page
- * translations for a 4-KByte range of linear addresses. This
- * may occur if software modifies the paging structures so that
- * the page size used for the address range changes. If the two
- * translations differ with respect to page frame or attributes
- * (e.g., permissions), processor behavior is undefined and may
- * be implementation-specific."
- *
- * We do this global tlb flush inside the cpa_lock, so that we
- * don't allow any other cpu, with stale tlb entries change the
- * page attribute in parallel, that also falls into the
- * just split large page entry.
- */
- flush_tlb_all();
+ if (!err)
goto repeat;
- }
return err;
}
@@ -1420,6 +1649,29 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
return 0;
}
+/*
+ * Machine check recovery code needs to change cache mode of poisoned
+ * pages to UC to avoid speculative access logging another error. But
+ * passing the address of the 1:1 mapping to set_memory_uc() is a fine
+ * way to encourage a speculative access. So we cheat and flip the top
+ * bit of the address. This works fine for the code that updates the
+ * page tables. But at the end of the process we need to flush the cache
+ * and the non-canonical address causes a #GP fault when used by the
+ * CLFLUSH instruction.
+ *
+ * But in the common case we already have a canonical address. This code
+ * will fix the top bit if needed and is a no-op otherwise.
+ */
+static inline unsigned long make_addr_canonical_again(unsigned long addr)
+{
+#ifdef CONFIG_X86_64
+ return (long)(addr << 1) >> 1;
+#else
+ return addr;
+#endif
+}
+
+
static int change_page_attr_set_clr(unsigned long *addr, int numpages,
pgprot_t mask_set, pgprot_t mask_clr,
int force_split, int in_flag,
@@ -1465,7 +1717,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
* Save address for cache flush. *addr is modified in the call
* to __change_page_attr_set_clr() below.
*/
- baddr = *addr;
+ baddr = make_addr_canonical_again(*addr);
}
/* Must avoid aliasing mappings in the highmem code */
@@ -1506,19 +1758,19 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
cache = !!pgprot2cachemode(mask_set);
/*
- * On success we use CLFLUSH, when the CPU supports it to
- * avoid the WBINVD. If the CPU does not support it and in the
- * error case we fall back to cpa_flush_all (which uses
- * WBINVD):
+ * On error; flush everything to be sure.
*/
- if (!ret && boot_cpu_has(X86_FEATURE_CLFLUSH)) {
- if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
- cpa_flush_array(addr, numpages, cache,
- cpa.flags, pages);
- } else
- cpa_flush_range(baddr, numpages, cache);
- } else
+ if (ret) {
cpa_flush_all(cache);
+ goto out;
+ }
+
+ if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
+ cpa_flush_array(baddr, addr, numpages, cache,
+ cpa.flags, pages);
+ } else {
+ cpa_flush_range(baddr, numpages, cache);
+ }
out:
return ret;
@@ -1833,10 +2085,7 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
/*
* Before changing the encryption attribute, we need to flush caches.
*/
- if (static_cpu_has(X86_FEATURE_CLFLUSH))
- cpa_flush_range(start, numpages, 1);
- else
- cpa_flush_all(1);
+ cpa_flush_range(start, numpages, 1);
ret = __change_page_attr_set_clr(&cpa, 1);
@@ -1847,10 +2096,7 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
* in case TLB flushing gets optimized in the cpa_flush_range()
* path use the same logic as above.
*/
- if (static_cpu_has(X86_FEATURE_CLFLUSH))
- cpa_flush_range(start, numpages, 0);
- else
- cpa_flush_all(0);
+ cpa_flush_range(start, numpages, 0);
return ret;
}
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index e848a4811785..59274e2c1ac4 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -115,6 +115,8 @@ static inline void pgd_list_del(pgd_t *pgd)
#define UNSHARED_PTRS_PER_PGD \
(SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
+#define MAX_UNSHARED_PTRS_PER_PGD \
+ max_t(size_t, KERNEL_PGD_BOUNDARY, PTRS_PER_PGD)
static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
@@ -181,6 +183,7 @@ static void pgd_dtor(pgd_t *pgd)
* and initialize the kernel pmds here.
*/
#define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD
+#define MAX_PREALLOCATED_PMDS MAX_UNSHARED_PTRS_PER_PGD
/*
* We allocate separate PMDs for the kernel part of the user page-table
@@ -189,6 +192,7 @@ static void pgd_dtor(pgd_t *pgd)
*/
#define PREALLOCATED_USER_PMDS (static_cpu_has(X86_FEATURE_PTI) ? \
KERNEL_PGD_PTRS : 0)
+#define MAX_PREALLOCATED_USER_PMDS KERNEL_PGD_PTRS
void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
{
@@ -210,7 +214,9 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
/* No need to prepopulate any pagetable entries in non-PAE modes. */
#define PREALLOCATED_PMDS 0
+#define MAX_PREALLOCATED_PMDS 0
#define PREALLOCATED_USER_PMDS 0
+#define MAX_PREALLOCATED_USER_PMDS 0
#endif /* CONFIG_X86_PAE */
static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
@@ -269,7 +275,7 @@ static void mop_up_one_pmd(struct mm_struct *mm, pgd_t *pgdp)
if (pgd_val(pgd) != 0) {
pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
- *pgdp = native_make_pgd(0);
+ pgd_clear(pgdp);
paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
pmd_free(mm, pmd);
@@ -428,8 +434,8 @@ static inline void _pgd_free(pgd_t *pgd)
pgd_t *pgd_alloc(struct mm_struct *mm)
{
pgd_t *pgd;
- pmd_t *u_pmds[PREALLOCATED_USER_PMDS];
- pmd_t *pmds[PREALLOCATED_PMDS];
+ pmd_t *u_pmds[MAX_PREALLOCATED_USER_PMDS];
+ pmd_t *pmds[MAX_PREALLOCATED_PMDS];
pgd = _pgd_alloc();
@@ -494,7 +500,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
int changed = !pte_same(*ptep, entry);
if (changed && dirty)
- *ptep = entry;
+ set_pte(ptep, entry);
return changed;
}
@@ -509,7 +515,7 @@ int pmdp_set_access_flags(struct vm_area_struct *vma,
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
if (changed && dirty) {
- *pmdp = entry;
+ set_pmd(pmdp, entry);
/*
* We had a write-protection fault here and changed the pmd
* to to more permissive. No need to flush the TLB for that,
@@ -529,7 +535,7 @@ int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
VM_BUG_ON(address & ~HPAGE_PUD_MASK);
if (changed && dirty) {
- *pudp = entry;
+ set_pud(pudp, entry);
/*
* We had a write-protection fault here and changed the pud
* to to more permissive. No need to flush the TLB for that,
@@ -637,6 +643,15 @@ void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
{
unsigned long address = __fix_to_virt(idx);
+#ifdef CONFIG_X86_64
+ /*
+ * Ensure that the static initial page tables are covering the
+ * fixmap completely.
+ */
+ BUILD_BUG_ON(__end_of_permanent_fixed_addresses >
+ (FIXMAP_PMD_NUM * PTRS_PER_PTE));
+#endif
+
if (idx >= __end_of_fixed_addresses) {
BUG();
return;
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 31341ae7309f..4fee5c3003ed 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -248,7 +248,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
*
* Returns a pointer to a PTE on success, or NULL on failure.
*/
-static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address)
+static pte_t *pti_user_pagetable_walk_pte(unsigned long address)
{
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
pmd_t *pmd;
@@ -434,11 +434,42 @@ static void __init pti_clone_p4d(unsigned long addr)
}
/*
- * Clone the CPU_ENTRY_AREA into the user space visible page table.
+ * Clone the CPU_ENTRY_AREA and associated data into the user space visible
+ * page table.
*/
static void __init pti_clone_user_shared(void)
{
+ unsigned int cpu;
+
pti_clone_p4d(CPU_ENTRY_AREA_BASE);
+
+ for_each_possible_cpu(cpu) {
+ /*
+ * The SYSCALL64 entry code needs to be able to find the
+ * thread stack and needs one word of scratch space in which
+ * to spill a register. All of this lives in the TSS, in
+ * the sp1 and sp2 slots.
+ *
+ * This is done for all possible CPUs during boot to ensure
+ * that it's propagated to all mms. If we were to add one of
+ * these mappings during CPU hotplug, we would need to take
+ * some measure to make sure that every mm that subsequently
+ * ran on that CPU would have the relevant PGD entry in its
+ * pagetables. The usual vmalloc_fault() mechanism would not
+ * work for page faults taken in entry_SYSCALL_64 before RSP
+ * is set up.
+ */
+
+ unsigned long va = (unsigned long)&per_cpu(cpu_tss_rw, cpu);
+ phys_addr_t pa = per_cpu_ptr_to_phys((void *)va);
+ pte_t *target_pte;
+
+ target_pte = pti_user_pagetable_walk_pte(va);
+ if (WARN_ON(!target_pte))
+ return;
+
+ *target_pte = pfn_pte(pa >> PAGE_SHIFT, PAGE_KERNEL);
+ }
}
#else /* CONFIG_X86_64 */
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 9517d1b2a281..bddd6b3cee1d 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -7,6 +7,7 @@
#include <linux/export.h>
#include <linux/cpu.h>
#include <linux/debugfs.h>
+#include <linux/ptrace.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
@@ -180,13 +181,29 @@ static void sync_current_stack_to_mm(struct mm_struct *mm)
}
}
+static bool ibpb_needed(struct task_struct *tsk, u64 last_ctx_id)
+{
+ /*
+ * Check if the current (previous) task has access to the memory
+ * of the @tsk (next) task. If access is denied, make sure to
+ * issue a IBPB to stop user->user Spectre-v2 attacks.
+ *
+ * Note: __ptrace_may_access() returns 0 or -ERRNO.
+ */
+ return (tsk && tsk->mm && tsk->mm->context.ctx_id != last_ctx_id &&
+ ptrace_may_access_sched(tsk, PTRACE_MODE_SPEC_IBPB));
+}
+
void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk)
{
struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+ bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy);
unsigned cpu = smp_processor_id();
u64 next_tlb_gen;
+ bool need_flush;
+ u16 new_asid;
/*
* NB: The scheduler will call us with prev == next when switching
@@ -240,20 +257,41 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
next->context.ctx_id);
/*
- * We don't currently support having a real mm loaded without
- * our cpu set in mm_cpumask(). We have all the bookkeeping
- * in place to figure out whether we would need to flush
- * if our cpu were cleared in mm_cpumask(), but we don't
- * currently use it.
+ * Even in lazy TLB mode, the CPU should stay set in the
+ * mm_cpumask. The TLB shootdown code can figure out from
+ * from cpu_tlbstate.is_lazy whether or not to send an IPI.
*/
if (WARN_ON_ONCE(real_prev != &init_mm &&
!cpumask_test_cpu(cpu, mm_cpumask(next))))
cpumask_set_cpu(cpu, mm_cpumask(next));
- return;
+ /*
+ * If the CPU is not in lazy TLB mode, we are just switching
+ * from one thread in a process to another thread in the same
+ * process. No TLB flush required.
+ */
+ if (!was_lazy)
+ return;
+
+ /*
+ * Read the tlb_gen to check whether a flush is needed.
+ * If the TLB is up to date, just use it.
+ * The barrier synchronizes with the tlb_gen increment in
+ * the TLB shootdown code.
+ */
+ smp_mb();
+ next_tlb_gen = atomic64_read(&next->context.tlb_gen);
+ if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) ==
+ next_tlb_gen)
+ return;
+
+ /*
+ * TLB contents went out of date while we were in lazy
+ * mode. Fall through to the TLB switching code below.
+ */
+ new_asid = prev_asid;
+ need_flush = true;
} else {
- u16 new_asid;
- bool need_flush;
u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
/*
@@ -262,18 +300,13 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* one process from doing Spectre-v2 attacks on another.
*
* As an optimization, flush indirect branches only when
- * switching into processes that disable dumping. This
- * protects high value processes like gpg, without having
- * too high performance overhead. IBPB is *expensive*!
- *
- * This will not flush branches when switching into kernel
- * threads. It will also not flush if we switch to idle
- * thread and back to the same process. It will flush if we
- * switch to a different non-dumpable process.
+ * switching into a processes that can't be ptrace by the
+ * current one (as in such case, attacker has much more
+ * convenient way how to tamper with the next process than
+ * branch buffer poisoning).
*/
- if (tsk && tsk->mm &&
- tsk->mm->context.ctx_id != last_ctx_id &&
- get_dumpable(tsk->mm) != SUID_DUMP_USER)
+ if (static_cpu_has(X86_FEATURE_USE_IBPB) &&
+ ibpb_needed(tsk, last_ctx_id))
indirect_branch_prediction_barrier();
if (IS_ENABLED(CONFIG_VMAP_STACK)) {
@@ -305,42 +338,51 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
- if (need_flush) {
- this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
- this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
- load_new_mm_cr3(next->pgd, new_asid, true);
-
- /*
- * NB: This gets called via leave_mm() in the idle path
- * where RCU functions differently. Tracing normally
- * uses RCU, so we need to use the _rcuidle variant.
- *
- * (There is no good reason for this. The idle code should
- * be rearranged to call this before rcu_idle_enter().)
- */
- trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
- } else {
- /* The new ASID is already up to date. */
- load_new_mm_cr3(next->pgd, new_asid, false);
+ /* Let nmi_uaccess_okay() know that we're changing CR3. */
+ this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
+ barrier();
+ }
- /* See above wrt _rcuidle. */
- trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
- }
+ if (need_flush) {
+ this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
+ this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
+ load_new_mm_cr3(next->pgd, new_asid, true);
/*
- * Record last user mm's context id, so we can avoid
- * flushing branch buffer with IBPB if we switch back
- * to the same user.
+ * NB: This gets called via leave_mm() in the idle path
+ * where RCU functions differently. Tracing normally
+ * uses RCU, so we need to use the _rcuidle variant.
+ *
+ * (There is no good reason for this. The idle code should
+ * be rearranged to call this before rcu_idle_enter().)
*/
- if (next != &init_mm)
- this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
+ trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+ } else {
+ /* The new ASID is already up to date. */
+ load_new_mm_cr3(next->pgd, new_asid, false);
- this_cpu_write(cpu_tlbstate.loaded_mm, next);
- this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
+ /* See above wrt _rcuidle. */
+ trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
}
- load_mm_cr4(next);
- switch_ldt(real_prev, next);
+ /*
+ * Record last user mm's context id, so we can avoid
+ * flushing branch buffer with IBPB if we switch back
+ * to the same user.
+ */
+ if (next != &init_mm)
+ this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
+
+ /* Make sure we write CR3 before loaded_mm. */
+ barrier();
+
+ this_cpu_write(cpu_tlbstate.loaded_mm, next);
+ this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
+
+ if (next != real_prev) {
+ load_mm_cr4(next);
+ switch_ldt(real_prev, next);
+ }
}
/*
@@ -361,20 +403,7 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
return;
- if (tlb_defer_switch_to_init_mm()) {
- /*
- * There's a significant optimization that may be possible
- * here. We have accurate enough TLB flush tracking that we
- * don't need to maintain coherence of TLB per se when we're
- * lazy. We do, however, need to maintain coherence of
- * paging-structure caches. We could, in principle, leave our
- * old mm loaded and only switch to init_mm when
- * tlb_remove_page() happens.
- */
- this_cpu_write(cpu_tlbstate.is_lazy, true);
- } else {
- switch_mm(NULL, &init_mm, NULL);
- }
+ this_cpu_write(cpu_tlbstate.is_lazy, true);
}
/*
@@ -461,6 +490,9 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
* paging-structure cache to avoid speculatively reading
* garbage into our TLB. Since switching to init_mm is barely
* slower than a minimal flush, just switch to init_mm.
+ *
+ * This should be rare, with native_flush_tlb_others skipping
+ * IPIs to lazy TLB mode CPUs.
*/
switch_mm_irqs_off(NULL, &init_mm, NULL);
return;
@@ -521,17 +553,16 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
f->new_tlb_gen == local_tlb_gen + 1 &&
f->new_tlb_gen == mm_tlb_gen) {
/* Partial flush */
- unsigned long addr;
- unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT;
+ unsigned long nr_invalidate = (f->end - f->start) >> f->stride_shift;
+ unsigned long addr = f->start;
- addr = f->start;
while (addr < f->end) {
__flush_tlb_one_user(addr);
- addr += PAGE_SIZE;
+ addr += 1UL << f->stride_shift;
}
if (local)
- count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages);
- trace_tlb_flush(reason, nr_pages);
+ count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_invalidate);
+ trace_tlb_flush(reason, nr_invalidate);
} else {
/* Full flush. */
local_flush_tlb();
@@ -564,6 +595,11 @@ static void flush_tlb_func_remote(void *info)
flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
}
+static bool tlb_is_not_lazy(int cpu, void *data)
+{
+ return !per_cpu(cpu_tlbstate.is_lazy, cpu);
+}
+
void native_flush_tlb_others(const struct cpumask *cpumask,
const struct flush_tlb_info *info)
{
@@ -599,8 +635,23 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
(void *)info, 1);
return;
}
- smp_call_function_many(cpumask, flush_tlb_func_remote,
+
+ /*
+ * If no page tables were freed, we can skip sending IPIs to
+ * CPUs in lazy TLB mode. They will flush the CPU themselves
+ * at the next context switch.
+ *
+ * However, if page tables are getting freed, we need to send the
+ * IPI everywhere, to prevent CPUs in lazy TLB mode from tripping
+ * up on the new contents of what used to be page tables, while
+ * doing a speculative memory access.
+ */
+ if (info->freed_tables)
+ smp_call_function_many(cpumask, flush_tlb_func_remote,
(void *)info, 1);
+ else
+ on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func_remote,
+ (void *)info, 1, GFP_ATOMIC, cpumask);
}
/*
@@ -616,12 +667,15 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
- unsigned long end, unsigned long vmflag)
+ unsigned long end, unsigned int stride_shift,
+ bool freed_tables)
{
int cpu;
struct flush_tlb_info info __aligned(SMP_CACHE_BYTES) = {
.mm = mm,
+ .stride_shift = stride_shift,
+ .freed_tables = freed_tables,
};
cpu = get_cpu();
@@ -631,8 +685,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
/* Should we flush just the requested range? */
if ((end != TLB_FLUSH_ALL) &&
- !(vmflag & VM_HUGETLB) &&
- ((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) {
+ ((end - start) >> stride_shift) <= tlb_single_page_flush_ceiling) {
info.start = start;
info.end = end;
} else {
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 5559dcaddd5e..948656069cdd 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -356,7 +356,7 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
} else {
struct pci_root_info *info;
- info = kzalloc_node(sizeof(*info), GFP_KERNEL, node);
+ info = kzalloc(sizeof(*info), GFP_KERNEL);
if (!info)
dev_err(&root->device->dev,
"pci_bus %04x:%02x: ignored (out of memory)\n",
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index 649bdde63e32..bfa50e65ef6c 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -93,7 +93,8 @@ static int __init early_root_info_init(void)
vendor = id & 0xffff;
device = (id>>16) & 0xffff;
- if (vendor != PCI_VENDOR_ID_AMD)
+ if (vendor != PCI_VENDOR_ID_AMD &&
+ vendor != PCI_VENDOR_ID_HYGON)
continue;
if (hb_probes[i].device == device) {
@@ -390,7 +391,8 @@ static int __init pci_io_ecs_init(void)
static int __init amd_postcore_init(void)
{
- if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
return 0;
early_root_info_init();
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 13f4485ca388..30a5111ae5fd 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -629,17 +629,11 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x8c10, quirk_apple_mbp_poweroff);
static void quirk_no_aersid(struct pci_dev *pdev)
{
/* VMD Domain */
- if (is_vmd(pdev->bus))
+ if (is_vmd(pdev->bus) && pci_is_root_bus(pdev->bus))
pdev->bus->bus_flags |= PCI_BUS_FLAGS_NO_AERSID;
}
-DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2030, quirk_no_aersid);
-DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2031, quirk_no_aersid);
-DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2032, quirk_no_aersid);
-DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2033, quirk_no_aersid);
-DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x334a, quirk_no_aersid);
-DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x334b, quirk_no_aersid);
-DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x334c, quirk_no_aersid);
-DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x334d, quirk_no_aersid);
+DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_INTEL, PCI_ANY_ID,
+ PCI_CLASS_BRIDGE_PCI, 8, quirk_no_aersid);
#ifdef CONFIG_PHYS_ADDR_T_64BIT
diff --git a/arch/x86/platform/atom/punit_atom_debug.c b/arch/x86/platform/atom/punit_atom_debug.c
index 034813d4ab1e..6cb6076223ba 100644
--- a/arch/x86/platform/atom/punit_atom_debug.c
+++ b/arch/x86/platform/atom/punit_atom_debug.c
@@ -115,7 +115,7 @@ static struct dentry *punit_dbg_file;
static int punit_dbgfs_register(struct punit_device *punit_device)
{
- static struct dentry *dev_state;
+ struct dentry *dev_state;
punit_dbg_file = debugfs_create_dir("punit_atom", NULL);
if (!punit_dbg_file)
@@ -143,8 +143,8 @@ static void punit_dbgfs_unregister(void)
(kernel_ulong_t)&drv_data }
static const struct x86_cpu_id intel_punit_cpu_ids[] = {
- ICPU(INTEL_FAM6_ATOM_SILVERMONT1, punit_device_byt),
- ICPU(INTEL_FAM6_ATOM_MERRIFIELD, punit_device_tng),
+ ICPU(INTEL_FAM6_ATOM_SILVERMONT, punit_device_byt),
+ ICPU(INTEL_FAM6_ATOM_SILVERMONT_MID, punit_device_tng),
ICPU(INTEL_FAM6_ATOM_AIRMONT, punit_device_cht),
{}
};
diff --git a/arch/x86/platform/efi/early_printk.c b/arch/x86/platform/efi/early_printk.c
index 5fdacb322ceb..7476b3b097e1 100644
--- a/arch/x86/platform/efi/early_printk.c
+++ b/arch/x86/platform/efi/early_printk.c
@@ -26,12 +26,14 @@ static bool early_efi_keep;
*/
static __init int early_efi_map_fb(void)
{
- unsigned long base, size;
+ u64 base, size;
if (!early_efi_keep)
return 0;
base = boot_params.screen_info.lfb_base;
+ if (boot_params.screen_info.capabilities & VIDEO_CAPABILITY_64BIT_BASE)
+ base |= (u64)boot_params.screen_info.ext_lfb_base << 32;
size = boot_params.screen_info.lfb_size;
efi_fb = ioremap(base, size);
@@ -46,9 +48,11 @@ early_initcall(early_efi_map_fb);
*/
static __ref void *early_efi_map(unsigned long start, unsigned long len)
{
- unsigned long base;
+ u64 base;
base = boot_params.screen_info.lfb_base;
+ if (boot_params.screen_info.capabilities & VIDEO_CAPABILITY_64BIT_BASE)
+ base |= (u64)boot_params.screen_info.ext_lfb_base << 32;
if (efi_fb)
return (efi_fb + start);
diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c
index 324b93328b37..9959657127f4 100644
--- a/arch/x86/platform/efi/efi_32.c
+++ b/arch/x86/platform/efi/efi_32.c
@@ -85,12 +85,7 @@ pgd_t * __init efi_call_phys_prolog(void)
void __init efi_call_phys_epilog(pgd_t *save_pgd)
{
- struct desc_ptr gdt_descr;
-
- gdt_descr.address = (unsigned long)get_cpu_gdt_rw(0);
- gdt_descr.size = GDT_SIZE - 1;
- load_gdt(&gdt_descr);
-
+ load_fixmap_gdt(0);
load_cr3(save_pgd);
__flush_tlb_all();
}
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index ee5d08f25ce4..e8da7f492970 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -619,18 +619,16 @@ void __init efi_dump_pagetable(void)
/*
* Makes the calling thread switch to/from efi_mm context. Can be used
- * for SetVirtualAddressMap() i.e. current->active_mm == init_mm as well
- * as during efi runtime calls i.e current->active_mm == current_mm.
- * We are not mm_dropping()/mm_grabbing() any mm, because we are not
- * losing/creating any references.
+ * in a kernel thread and user context. Preemption needs to remain disabled
+ * while the EFI-mm is borrowed. mmgrab()/mmdrop() is not used because the mm
+ * can not change under us.
+ * It should be ensured that there are no concurent calls to this function.
*/
void efi_switch_mm(struct mm_struct *mm)
{
- task_lock(current);
efi_scratch.prev_mm = current->active_mm;
current->active_mm = mm;
switch_mm(efi_scratch.prev_mm, mm, NULL);
- task_unlock(current);
}
#ifdef CONFIG_EFI_MIXED
diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
index 844d31cb8a0c..669babcaf245 100644
--- a/arch/x86/platform/efi/quirks.c
+++ b/arch/x86/platform/efi/quirks.c
@@ -16,6 +16,7 @@
#include <asm/efi.h>
#include <asm/uv/uv.h>
#include <asm/cpu_device_id.h>
+#include <asm/reboot.h>
#define EFI_MIN_RESERVE 5120
@@ -654,3 +655,80 @@ int efi_capsule_setup_info(struct capsule_info *cap_info, void *kbuff,
}
#endif
+
+/*
+ * If any access by any efi runtime service causes a page fault, then,
+ * 1. If it's efi_reset_system(), reboot through BIOS.
+ * 2. If any other efi runtime service, then
+ * a. Return error status to the efi caller process.
+ * b. Disable EFI Runtime Services forever and
+ * c. Freeze efi_rts_wq and schedule new process.
+ *
+ * @return: Returns, if the page fault is not handled. This function
+ * will never return if the page fault is handled successfully.
+ */
+void efi_recover_from_page_fault(unsigned long phys_addr)
+{
+ if (!IS_ENABLED(CONFIG_X86_64))
+ return;
+
+ /*
+ * Make sure that an efi runtime service caused the page fault.
+ * "efi_mm" cannot be used to check if the page fault had occurred
+ * in the firmware context because efi=old_map doesn't use efi_pgd.
+ */
+ if (efi_rts_work.efi_rts_id == NONE)
+ return;
+
+ /*
+ * Address range 0x0000 - 0x0fff is always mapped in the efi_pgd, so
+ * page faulting on these addresses isn't expected.
+ */
+ if (phys_addr >= 0x0000 && phys_addr <= 0x0fff)
+ return;
+
+ /*
+ * Print stack trace as it might be useful to know which EFI Runtime
+ * Service is buggy.
+ */
+ WARN(1, FW_BUG "Page fault caused by firmware at PA: 0x%lx\n",
+ phys_addr);
+
+ /*
+ * Buggy efi_reset_system() is handled differently from other EFI
+ * Runtime Services as it doesn't use efi_rts_wq. Although,
+ * native_machine_emergency_restart() says that machine_real_restart()
+ * could fail, it's better not to compilcate this fault handler
+ * because this case occurs *very* rarely and hence could be improved
+ * on a need by basis.
+ */
+ if (efi_rts_work.efi_rts_id == RESET_SYSTEM) {
+ pr_info("efi_reset_system() buggy! Reboot through BIOS\n");
+ machine_real_restart(MRR_BIOS);
+ return;
+ }
+
+ /*
+ * Before calling EFI Runtime Service, the kernel has switched the
+ * calling process to efi_mm. Hence, switch back to task_mm.
+ */
+ arch_efi_call_virt_teardown();
+
+ /* Signal error status to the efi caller process */
+ efi_rts_work.status = EFI_ABORTED;
+ complete(&efi_rts_work.efi_rts_comp);
+
+ clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
+ pr_info("Froze efi_rts_wq and disabled EFI Runtime Services\n");
+
+ /*
+ * Call schedule() in an infinite loop, so that any spurious wake ups
+ * will never run efi_rts_wq again.
+ */
+ for (;;) {
+ set_current_state(TASK_IDLE);
+ schedule();
+ }
+
+ return;
+}
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c b/arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c
index 4392c15ed9e0..dbfc5cf2aa93 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c
@@ -10,7 +10,7 @@
* of the License.
*/
-#include <linux/gpio.h>
+#include <linux/gpio/machine.h>
#include <linux/platform_device.h>
#include <linux/regulator/machine.h>
#include <linux/regulator/fixed.h>
@@ -43,7 +43,6 @@ static struct fixed_voltage_config bcm43xx_vmmc = {
* real voltage and signaling are still 1.8V.
*/
.microvolts = 2000000, /* 1.8V */
- .gpio = -EINVAL,
.startup_delay = 250 * 1000, /* 250ms */
.enable_high = 1, /* active high */
.enabled_at_boot = 0, /* disabled at boot */
@@ -58,11 +57,23 @@ static struct platform_device bcm43xx_vmmc_regulator = {
},
};
+static struct gpiod_lookup_table bcm43xx_vmmc_gpio_table = {
+ .dev_id = "reg-fixed-voltage.0",
+ .table = {
+ GPIO_LOOKUP("0000:00:0c.0", -1, NULL, GPIO_ACTIVE_LOW),
+ {}
+ },
+};
+
static int __init bcm43xx_regulator_register(void)
{
+ struct gpiod_lookup_table *table = &bcm43xx_vmmc_gpio_table;
+ struct gpiod_lookup *lookup = table->table;
int ret;
- bcm43xx_vmmc.gpio = get_gpio_by_name(WLAN_SFI_GPIO_ENABLE_NAME);
+ lookup[0].chip_hwnum = get_gpio_by_name(WLAN_SFI_GPIO_ENABLE_NAME);
+ gpiod_add_lookup_table(table);
+
ret = platform_device_register(&bcm43xx_vmmc_regulator);
if (ret) {
pr_err("%s: vmmc regulator register failed\n", __func__);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bt.c b/arch/x86/platform/intel-mid/device_libs/platform_bt.c
index 5a0483e7bf66..31dce781364c 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_bt.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_bt.c
@@ -68,7 +68,7 @@ static struct bt_sfi_data tng_bt_sfi_data __initdata = {
{ X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (kernel_ulong_t)&ddata }
static const struct x86_cpu_id bt_sfi_cpu_ids[] = {
- ICPU(INTEL_FAM6_ATOM_MERRIFIELD, tng_bt_sfi_data),
+ ICPU(INTEL_FAM6_ATOM_SILVERMONT_MID, tng_bt_sfi_data),
{}
};
diff --git a/arch/x86/platform/olpc/olpc-xo1-rtc.c b/arch/x86/platform/olpc/olpc-xo1-rtc.c
index a2b4efddd61a..8e7ddd7e313a 100644
--- a/arch/x86/platform/olpc/olpc-xo1-rtc.c
+++ b/arch/x86/platform/olpc/olpc-xo1-rtc.c
@@ -16,6 +16,7 @@
#include <asm/msr.h>
#include <asm/olpc.h>
+#include <asm/x86_init.h>
static void rtc_wake_on(struct device *dev)
{
@@ -75,6 +76,8 @@ static int __init xo1_rtc_init(void)
if (r)
return r;
+ x86_platform.legacy.rtc = 0;
+
device_init_wakeup(&xo1_rtc_device.dev, 1);
return 0;
}
diff --git a/arch/x86/platform/ts5500/ts5500.c b/arch/x86/platform/ts5500/ts5500.c
index fd39301f25ac..7e56fc74093c 100644
--- a/arch/x86/platform/ts5500/ts5500.c
+++ b/arch/x86/platform/ts5500/ts5500.c
@@ -24,7 +24,6 @@
#include <linux/kernel.h>
#include <linux/leds.h>
#include <linux/init.h>
-#include <linux/platform_data/gpio-ts5500.h>
#include <linux/platform_data/max197.h>
#include <linux/platform_device.h>
#include <linux/slab.h>
diff --git a/arch/x86/power/Makefile b/arch/x86/power/Makefile
index a4701389562c..37923d715741 100644
--- a/arch/x86/power/Makefile
+++ b/arch/x86/power/Makefile
@@ -7,4 +7,4 @@ nostackp := $(call cc-option, -fno-stack-protector)
CFLAGS_cpu.o := $(nostackp)
obj-$(CONFIG_PM_SLEEP) += cpu.o
-obj-$(CONFIG_HIBERNATION) += hibernate_$(BITS).o hibernate_asm_$(BITS).o
+obj-$(CONFIG_HIBERNATION) += hibernate_$(BITS).o hibernate_asm_$(BITS).o hibernate.o
diff --git a/arch/x86/power/hibernate.c b/arch/x86/power/hibernate.c
new file mode 100644
index 000000000000..bcddf09b5aa3
--- /dev/null
+++ b/arch/x86/power/hibernate.c
@@ -0,0 +1,248 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Hibernation support for x86
+ *
+ * Copyright (c) 2007 Rafael J. Wysocki <rjw@sisk.pl>
+ * Copyright (c) 2002 Pavel Machek <pavel@ucw.cz>
+ * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org>
+ */
+#include <linux/gfp.h>
+#include <linux/smp.h>
+#include <linux/suspend.h>
+#include <linux/scatterlist.h>
+#include <linux/kdebug.h>
+
+#include <crypto/hash.h>
+
+#include <asm/e820/api.h>
+#include <asm/init.h>
+#include <asm/proto.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/mtrr.h>
+#include <asm/sections.h>
+#include <asm/suspend.h>
+#include <asm/tlbflush.h>
+
+/*
+ * Address to jump to in the last phase of restore in order to get to the image
+ * kernel's text (this value is passed in the image header).
+ */
+unsigned long restore_jump_address __visible;
+unsigned long jump_address_phys;
+
+/*
+ * Value of the cr3 register from before the hibernation (this value is passed
+ * in the image header).
+ */
+unsigned long restore_cr3 __visible;
+unsigned long temp_pgt __visible;
+unsigned long relocated_restore_code __visible;
+
+/**
+ * pfn_is_nosave - check if given pfn is in the 'nosave' section
+ */
+int pfn_is_nosave(unsigned long pfn)
+{
+ unsigned long nosave_begin_pfn;
+ unsigned long nosave_end_pfn;
+
+ nosave_begin_pfn = __pa_symbol(&__nosave_begin) >> PAGE_SHIFT;
+ nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT;
+
+ return pfn >= nosave_begin_pfn && pfn < nosave_end_pfn;
+}
+
+
+#define MD5_DIGEST_SIZE 16
+
+struct restore_data_record {
+ unsigned long jump_address;
+ unsigned long jump_address_phys;
+ unsigned long cr3;
+ unsigned long magic;
+ u8 e820_digest[MD5_DIGEST_SIZE];
+};
+
+#if IS_BUILTIN(CONFIG_CRYPTO_MD5)
+/**
+ * get_e820_md5 - calculate md5 according to given e820 table
+ *
+ * @table: the e820 table to be calculated
+ * @buf: the md5 result to be stored to
+ */
+static int get_e820_md5(struct e820_table *table, void *buf)
+{
+ struct crypto_shash *tfm;
+ struct shash_desc *desc;
+ int size;
+ int ret = 0;
+
+ tfm = crypto_alloc_shash("md5", 0, 0);
+ if (IS_ERR(tfm))
+ return -ENOMEM;
+
+ desc = kmalloc(sizeof(struct shash_desc) + crypto_shash_descsize(tfm),
+ GFP_KERNEL);
+ if (!desc) {
+ ret = -ENOMEM;
+ goto free_tfm;
+ }
+
+ desc->tfm = tfm;
+ desc->flags = 0;
+
+ size = offsetof(struct e820_table, entries) +
+ sizeof(struct e820_entry) * table->nr_entries;
+
+ if (crypto_shash_digest(desc, (u8 *)table, size, buf))
+ ret = -EINVAL;
+
+ kzfree(desc);
+
+free_tfm:
+ crypto_free_shash(tfm);
+ return ret;
+}
+
+static int hibernation_e820_save(void *buf)
+{
+ return get_e820_md5(e820_table_firmware, buf);
+}
+
+static bool hibernation_e820_mismatch(void *buf)
+{
+ int ret;
+ u8 result[MD5_DIGEST_SIZE];
+
+ memset(result, 0, MD5_DIGEST_SIZE);
+ /* If there is no digest in suspend kernel, let it go. */
+ if (!memcmp(result, buf, MD5_DIGEST_SIZE))
+ return false;
+
+ ret = get_e820_md5(e820_table_firmware, result);
+ if (ret)
+ return true;
+
+ return memcmp(result, buf, MD5_DIGEST_SIZE) ? true : false;
+}
+#else
+static int hibernation_e820_save(void *buf)
+{
+ return 0;
+}
+
+static bool hibernation_e820_mismatch(void *buf)
+{
+ /* If md5 is not builtin for restore kernel, let it go. */
+ return false;
+}
+#endif
+
+#ifdef CONFIG_X86_64
+#define RESTORE_MAGIC 0x23456789ABCDEF01UL
+#else
+#define RESTORE_MAGIC 0x12345678UL
+#endif
+
+/**
+ * arch_hibernation_header_save - populate the architecture specific part
+ * of a hibernation image header
+ * @addr: address to save the data at
+ */
+int arch_hibernation_header_save(void *addr, unsigned int max_size)
+{
+ struct restore_data_record *rdr = addr;
+
+ if (max_size < sizeof(struct restore_data_record))
+ return -EOVERFLOW;
+ rdr->magic = RESTORE_MAGIC;
+ rdr->jump_address = (unsigned long)restore_registers;
+ rdr->jump_address_phys = __pa_symbol(restore_registers);
+
+ /*
+ * The restore code fixes up CR3 and CR4 in the following sequence:
+ *
+ * [in hibernation asm]
+ * 1. CR3 <= temporary page tables
+ * 2. CR4 <= mmu_cr4_features (from the kernel that restores us)
+ * 3. CR3 <= rdr->cr3
+ * 4. CR4 <= mmu_cr4_features (from us, i.e. the image kernel)
+ * [in restore_processor_state()]
+ * 5. CR4 <= saved CR4
+ * 6. CR3 <= saved CR3
+ *
+ * Our mmu_cr4_features has CR4.PCIDE=0, and toggling
+ * CR4.PCIDE while CR3's PCID bits are nonzero is illegal, so
+ * rdr->cr3 needs to point to valid page tables but must not
+ * have any of the PCID bits set.
+ */
+ rdr->cr3 = restore_cr3 & ~CR3_PCID_MASK;
+
+ return hibernation_e820_save(rdr->e820_digest);
+}
+
+/**
+ * arch_hibernation_header_restore - read the architecture specific data
+ * from the hibernation image header
+ * @addr: address to read the data from
+ */
+int arch_hibernation_header_restore(void *addr)
+{
+ struct restore_data_record *rdr = addr;
+
+ if (rdr->magic != RESTORE_MAGIC) {
+ pr_crit("Unrecognized hibernate image header format!\n");
+ return -EINVAL;
+ }
+
+ restore_jump_address = rdr->jump_address;
+ jump_address_phys = rdr->jump_address_phys;
+ restore_cr3 = rdr->cr3;
+
+ if (hibernation_e820_mismatch(rdr->e820_digest)) {
+ pr_crit("Hibernate inconsistent memory map detected!\n");
+ return -ENODEV;
+ }
+
+ return 0;
+}
+
+int relocate_restore_code(void)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ relocated_restore_code = get_safe_page(GFP_ATOMIC);
+ if (!relocated_restore_code)
+ return -ENOMEM;
+
+ memcpy((void *)relocated_restore_code, core_restore_code, PAGE_SIZE);
+
+ /* Make the page containing the relocated code executable */
+ pgd = (pgd_t *)__va(read_cr3_pa()) +
+ pgd_index(relocated_restore_code);
+ p4d = p4d_offset(pgd, relocated_restore_code);
+ if (p4d_large(*p4d)) {
+ set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX));
+ goto out;
+ }
+ pud = pud_offset(p4d, relocated_restore_code);
+ if (pud_large(*pud)) {
+ set_pud(pud, __pud(pud_val(*pud) & ~_PAGE_NX));
+ goto out;
+ }
+ pmd = pmd_offset(pud, relocated_restore_code);
+ if (pmd_large(*pmd)) {
+ set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_NX));
+ goto out;
+ }
+ pte = pte_offset_kernel(pmd, relocated_restore_code);
+ set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_NX));
+out:
+ __flush_tlb_all();
+ return 0;
+}
diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c
index afc4ed7b1578..15695e30f982 100644
--- a/arch/x86/power/hibernate_32.c
+++ b/arch/x86/power/hibernate_32.c
@@ -14,9 +14,7 @@
#include <asm/pgtable.h>
#include <asm/mmzone.h>
#include <asm/sections.h>
-
-/* Defined in hibernate_asm_32.S */
-extern int restore_image(void);
+#include <asm/suspend.h>
/* Pointer to the temporary resume page tables */
pgd_t *resume_pg_dir;
@@ -145,6 +143,32 @@ static inline void resume_init_first_level_page_table(pgd_t *pg_dir)
#endif
}
+static int set_up_temporary_text_mapping(pgd_t *pgd_base)
+{
+ pgd_t *pgd;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pgd = pgd_base + pgd_index(restore_jump_address);
+
+ pmd = resume_one_md_table_init(pgd);
+ if (!pmd)
+ return -ENOMEM;
+
+ if (boot_cpu_has(X86_FEATURE_PSE)) {
+ set_pmd(pmd + pmd_index(restore_jump_address),
+ __pmd((jump_address_phys & PMD_MASK) | pgprot_val(PAGE_KERNEL_LARGE_EXEC)));
+ } else {
+ pte = resume_one_page_table_init(pmd);
+ if (!pte)
+ return -ENOMEM;
+ set_pte(pte + pte_index(restore_jump_address),
+ __pte((jump_address_phys & PAGE_MASK) | pgprot_val(PAGE_KERNEL_EXEC)));
+ }
+
+ return 0;
+}
+
asmlinkage int swsusp_arch_resume(void)
{
int error;
@@ -154,22 +178,22 @@ asmlinkage int swsusp_arch_resume(void)
return -ENOMEM;
resume_init_first_level_page_table(resume_pg_dir);
+
+ error = set_up_temporary_text_mapping(resume_pg_dir);
+ if (error)
+ return error;
+
error = resume_physical_mapping_init(resume_pg_dir);
if (error)
return error;
+ temp_pgt = __pa(resume_pg_dir);
+
+ error = relocate_restore_code();
+ if (error)
+ return error;
+
/* We have got enough memory and from now on we cannot recover */
restore_image();
return 0;
}
-
-/*
- * pfn_is_nosave - check if given pfn is in the 'nosave' section
- */
-
-int pfn_is_nosave(unsigned long pfn)
-{
- unsigned long nosave_begin_pfn = __pa_symbol(&__nosave_begin) >> PAGE_SHIFT;
- unsigned long nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT;
- return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
-}
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index f8e3b668d20b..239f424ccb29 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -26,26 +26,6 @@
#include <asm/suspend.h>
#include <asm/tlbflush.h>
-/* Defined in hibernate_asm_64.S */
-extern asmlinkage __visible int restore_image(void);
-
-/*
- * Address to jump to in the last phase of restore in order to get to the image
- * kernel's text (this value is passed in the image header).
- */
-unsigned long restore_jump_address __visible;
-unsigned long jump_address_phys;
-
-/*
- * Value of the cr3 register from before the hibernation (this value is passed
- * in the image header).
- */
-unsigned long restore_cr3 __visible;
-
-unsigned long temp_level4_pgt __visible;
-
-unsigned long relocated_restore_code __visible;
-
static int set_up_temporary_text_mapping(pgd_t *pgd)
{
pmd_t *pmd;
@@ -141,46 +121,7 @@ static int set_up_temporary_mappings(void)
return result;
}
- temp_level4_pgt = __pa(pgd);
- return 0;
-}
-
-static int relocate_restore_code(void)
-{
- pgd_t *pgd;
- p4d_t *p4d;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
-
- relocated_restore_code = get_safe_page(GFP_ATOMIC);
- if (!relocated_restore_code)
- return -ENOMEM;
-
- memcpy((void *)relocated_restore_code, core_restore_code, PAGE_SIZE);
-
- /* Make the page containing the relocated code executable */
- pgd = (pgd_t *)__va(read_cr3_pa()) +
- pgd_index(relocated_restore_code);
- p4d = p4d_offset(pgd, relocated_restore_code);
- if (p4d_large(*p4d)) {
- set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX));
- goto out;
- }
- pud = pud_offset(p4d, relocated_restore_code);
- if (pud_large(*pud)) {
- set_pud(pud, __pud(pud_val(*pud) & ~_PAGE_NX));
- goto out;
- }
- pmd = pmd_offset(pud, relocated_restore_code);
- if (pmd_large(*pmd)) {
- set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_NX));
- goto out;
- }
- pte = pte_offset_kernel(pmd, relocated_restore_code);
- set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_NX));
-out:
- __flush_tlb_all();
+ temp_pgt = __pa(pgd);
return 0;
}
@@ -200,166 +141,3 @@ asmlinkage int swsusp_arch_resume(void)
restore_image();
return 0;
}
-
-/*
- * pfn_is_nosave - check if given pfn is in the 'nosave' section
- */
-
-int pfn_is_nosave(unsigned long pfn)
-{
- unsigned long nosave_begin_pfn = __pa_symbol(&__nosave_begin) >> PAGE_SHIFT;
- unsigned long nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT;
- return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
-}
-
-#define MD5_DIGEST_SIZE 16
-
-struct restore_data_record {
- unsigned long jump_address;
- unsigned long jump_address_phys;
- unsigned long cr3;
- unsigned long magic;
- u8 e820_digest[MD5_DIGEST_SIZE];
-};
-
-#define RESTORE_MAGIC 0x23456789ABCDEF01UL
-
-#if IS_BUILTIN(CONFIG_CRYPTO_MD5)
-/**
- * get_e820_md5 - calculate md5 according to given e820 table
- *
- * @table: the e820 table to be calculated
- * @buf: the md5 result to be stored to
- */
-static int get_e820_md5(struct e820_table *table, void *buf)
-{
- struct crypto_shash *tfm;
- struct shash_desc *desc;
- int size;
- int ret = 0;
-
- tfm = crypto_alloc_shash("md5", 0, 0);
- if (IS_ERR(tfm))
- return -ENOMEM;
-
- desc = kmalloc(sizeof(struct shash_desc) + crypto_shash_descsize(tfm),
- GFP_KERNEL);
- if (!desc) {
- ret = -ENOMEM;
- goto free_tfm;
- }
-
- desc->tfm = tfm;
- desc->flags = 0;
-
- size = offsetof(struct e820_table, entries) +
- sizeof(struct e820_entry) * table->nr_entries;
-
- if (crypto_shash_digest(desc, (u8 *)table, size, buf))
- ret = -EINVAL;
-
- kzfree(desc);
-
-free_tfm:
- crypto_free_shash(tfm);
- return ret;
-}
-
-static void hibernation_e820_save(void *buf)
-{
- get_e820_md5(e820_table_firmware, buf);
-}
-
-static bool hibernation_e820_mismatch(void *buf)
-{
- int ret;
- u8 result[MD5_DIGEST_SIZE];
-
- memset(result, 0, MD5_DIGEST_SIZE);
- /* If there is no digest in suspend kernel, let it go. */
- if (!memcmp(result, buf, MD5_DIGEST_SIZE))
- return false;
-
- ret = get_e820_md5(e820_table_firmware, result);
- if (ret)
- return true;
-
- return memcmp(result, buf, MD5_DIGEST_SIZE) ? true : false;
-}
-#else
-static void hibernation_e820_save(void *buf)
-{
-}
-
-static bool hibernation_e820_mismatch(void *buf)
-{
- /* If md5 is not builtin for restore kernel, let it go. */
- return false;
-}
-#endif
-
-/**
- * arch_hibernation_header_save - populate the architecture specific part
- * of a hibernation image header
- * @addr: address to save the data at
- */
-int arch_hibernation_header_save(void *addr, unsigned int max_size)
-{
- struct restore_data_record *rdr = addr;
-
- if (max_size < sizeof(struct restore_data_record))
- return -EOVERFLOW;
- rdr->jump_address = (unsigned long)restore_registers;
- rdr->jump_address_phys = __pa_symbol(restore_registers);
-
- /*
- * The restore code fixes up CR3 and CR4 in the following sequence:
- *
- * [in hibernation asm]
- * 1. CR3 <= temporary page tables
- * 2. CR4 <= mmu_cr4_features (from the kernel that restores us)
- * 3. CR3 <= rdr->cr3
- * 4. CR4 <= mmu_cr4_features (from us, i.e. the image kernel)
- * [in restore_processor_state()]
- * 5. CR4 <= saved CR4
- * 6. CR3 <= saved CR3
- *
- * Our mmu_cr4_features has CR4.PCIDE=0, and toggling
- * CR4.PCIDE while CR3's PCID bits are nonzero is illegal, so
- * rdr->cr3 needs to point to valid page tables but must not
- * have any of the PCID bits set.
- */
- rdr->cr3 = restore_cr3 & ~CR3_PCID_MASK;
-
- rdr->magic = RESTORE_MAGIC;
-
- hibernation_e820_save(rdr->e820_digest);
-
- return 0;
-}
-
-/**
- * arch_hibernation_header_restore - read the architecture specific data
- * from the hibernation image header
- * @addr: address to read the data from
- */
-int arch_hibernation_header_restore(void *addr)
-{
- struct restore_data_record *rdr = addr;
-
- restore_jump_address = rdr->jump_address;
- jump_address_phys = rdr->jump_address_phys;
- restore_cr3 = rdr->cr3;
-
- if (rdr->magic != RESTORE_MAGIC) {
- pr_crit("Unrecognized hibernate image header format!\n");
- return -EINVAL;
- }
-
- if (hibernation_e820_mismatch(rdr->e820_digest)) {
- pr_crit("Hibernate inconsistent memory map detected!\n");
- return -ENODEV;
- }
-
- return 0;
-}
diff --git a/arch/x86/power/hibernate_asm_32.S b/arch/x86/power/hibernate_asm_32.S
index 6e56815e13a0..6fe383002125 100644
--- a/arch/x86/power/hibernate_asm_32.S
+++ b/arch/x86/power/hibernate_asm_32.S
@@ -12,6 +12,7 @@
#include <asm/page_types.h>
#include <asm/asm-offsets.h>
#include <asm/processor-flags.h>
+#include <asm/frame.h>
.text
@@ -24,13 +25,30 @@ ENTRY(swsusp_arch_suspend)
pushfl
popl saved_context_eflags
+ /* save cr3 */
+ movl %cr3, %eax
+ movl %eax, restore_cr3
+
+ FRAME_BEGIN
call swsusp_save
+ FRAME_END
ret
+ENDPROC(swsusp_arch_suspend)
ENTRY(restore_image)
+ /* prepare to jump to the image kernel */
+ movl restore_jump_address, %ebx
+ movl restore_cr3, %ebp
+
movl mmu_cr4_features, %ecx
- movl resume_pg_dir, %eax
- subl $__PAGE_OFFSET, %eax
+
+ /* jump to relocated restore code */
+ movl relocated_restore_code, %eax
+ jmpl *%eax
+
+/* code below has been relocated to a safe page */
+ENTRY(core_restore_code)
+ movl temp_pgt, %eax
movl %eax, %cr3
jecxz 1f # cr4 Pentium and higher, skip if zero
@@ -49,7 +67,7 @@ copy_loop:
movl pbe_address(%edx), %esi
movl pbe_orig_address(%edx), %edi
- movl $1024, %ecx
+ movl $(PAGE_SIZE >> 2), %ecx
rep
movsl
@@ -58,10 +76,13 @@ copy_loop:
.p2align 4,,7
done:
+ jmpl *%ebx
+
+ /* code below belongs to the image kernel */
+ .align PAGE_SIZE
+ENTRY(restore_registers)
/* go back to the original page tables */
- movl $swapper_pg_dir, %eax
- subl $__PAGE_OFFSET, %eax
- movl %eax, %cr3
+ movl %ebp, %cr3
movl mmu_cr4_features, %ecx
jecxz 1f # cr4 Pentium and higher, skip if zero
movl %ecx, %cr4; # turn PGE back on
@@ -82,4 +103,8 @@ done:
xorl %eax, %eax
+ /* tell the hibernation core that we've just restored the memory */
+ movl %eax, in_suspend
+
ret
+ENDPROC(restore_registers)
diff --git a/arch/x86/power/hibernate_asm_64.S b/arch/x86/power/hibernate_asm_64.S
index fd369a6e9ff8..3008baa2fa95 100644
--- a/arch/x86/power/hibernate_asm_64.S
+++ b/arch/x86/power/hibernate_asm_64.S
@@ -59,7 +59,7 @@ ENTRY(restore_image)
movq restore_cr3(%rip), %r9
/* prepare to switch to temporary page tables */
- movq temp_level4_pgt(%rip), %rax
+ movq temp_pgt(%rip), %rax
movq mmu_cr4_features(%rip), %rbx
/* prepare to copy image data to their original locations */
diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index 3a6c8ebc8032..0b08067c45f3 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -196,6 +196,7 @@ static const char *rel_type(unsigned type)
#if ELF_BITS == 64
REL_TYPE(R_X86_64_NONE),
REL_TYPE(R_X86_64_64),
+ REL_TYPE(R_X86_64_PC64),
REL_TYPE(R_X86_64_PC32),
REL_TYPE(R_X86_64_GOT32),
REL_TYPE(R_X86_64_PLT32),
@@ -782,6 +783,15 @@ static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym,
add_reloc(&relocs32neg, offset);
break;
+ case R_X86_64_PC64:
+ /*
+ * Only used by jump labels
+ */
+ if (is_percpu_sym(sym, symname))
+ die("Invalid R_X86_64_PC64 relocation against per-CPU symbol %s\n",
+ symname);
+ break;
+
case R_X86_64_32:
case R_X86_64_32S:
case R_X86_64_64:
diff --git a/arch/x86/um/asm/elf.h b/arch/x86/um/asm/elf.h
index 548197212a45..413f3519d9a1 100644
--- a/arch/x86/um/asm/elf.h
+++ b/arch/x86/um/asm/elf.h
@@ -116,8 +116,7 @@ do { \
#define R_X86_64_PC16 13 /* 16 bit sign extended pc relative */
#define R_X86_64_8 14 /* Direct 8 bit sign extended */
#define R_X86_64_PC8 15 /* 8 bit sign extended pc relative */
-
-#define R_X86_64_NUM 16
+#define R_X86_64_PC64 24 /* Place relative 64-bit signed */
/*
* This is used to ensure we don't load something for the wrong architecture.
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index c1f98f32c45f..1ef391aa184d 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -18,6 +18,7 @@ config XEN_PV
bool "Xen PV guest support"
default y
depends on XEN
+ select PARAVIRT_XXL
select XEN_HAVE_PVMMU
select XEN_HAVE_VPMU
help
@@ -68,7 +69,6 @@ config XEN_SAVE_RESTORE
config XEN_DEBUG_FS
bool "Enable Xen debug and tuning parameters in debugfs"
depends on XEN && DEBUG_FS
- default n
help
Enable statistics output and various tuning options in debugfs.
Enabling this option may incur a significant performance overhead.
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index d83cb5478f54..dd2550d33b38 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -12,25 +12,46 @@ endif
# Make sure early boot has no stackprotector
nostackp := $(call cc-option, -fno-stack-protector)
CFLAGS_enlighten_pv.o := $(nostackp)
-CFLAGS_mmu_pv.o := $(nostackp)
+CFLAGS_mmu_pv.o := $(nostackp)
-obj-y := enlighten.o multicalls.o mmu.o irq.o \
- time.o xen-asm.o xen-asm_$(BITS).o \
- grant-table.o suspend.o platform-pci-unplug.o
+obj-y += enlighten.o
+obj-y += mmu.o
+obj-y += time.o
+obj-y += grant-table.o
+obj-y += suspend.o
-obj-$(CONFIG_XEN_PVHVM) += enlighten_hvm.o mmu_hvm.o suspend_hvm.o
-obj-$(CONFIG_XEN_PV) += setup.o apic.o pmu.o suspend_pv.o \
- p2m.o enlighten_pv.o mmu_pv.o
-obj-$(CONFIG_XEN_PVH) += enlighten_pvh.o
+obj-$(CONFIG_XEN_PVHVM) += enlighten_hvm.o
+obj-$(CONFIG_XEN_PVHVM) += mmu_hvm.o
+obj-$(CONFIG_XEN_PVHVM) += suspend_hvm.o
+obj-$(CONFIG_XEN_PVHVM) += platform-pci-unplug.o
-obj-$(CONFIG_EVENT_TRACING) += trace.o
+obj-$(CONFIG_XEN_PV) += setup.o
+obj-$(CONFIG_XEN_PV) += apic.o
+obj-$(CONFIG_XEN_PV) += pmu.o
+obj-$(CONFIG_XEN_PV) += suspend_pv.o
+obj-$(CONFIG_XEN_PV) += p2m.o
+obj-$(CONFIG_XEN_PV) += enlighten_pv.o
+obj-$(CONFIG_XEN_PV) += mmu_pv.o
+obj-$(CONFIG_XEN_PV) += irq.o
+obj-$(CONFIG_XEN_PV) += multicalls.o
+obj-$(CONFIG_XEN_PV) += xen-asm.o
+obj-$(CONFIG_XEN_PV) += xen-asm_$(BITS).o
+
+obj-$(CONFIG_XEN_PVH) += enlighten_pvh.o
+obj-$(CONFIG_XEN_PVH) += xen-pvh.o
+
+obj-$(CONFIG_EVENT_TRACING) += trace.o
obj-$(CONFIG_SMP) += smp.o
obj-$(CONFIG_XEN_PV_SMP) += smp_pv.o
obj-$(CONFIG_XEN_PVHVM_SMP) += smp_hvm.o
+
obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
+
obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o
+
obj-$(CONFIG_XEN_DOM0) += vga.o
+
obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o
+
obj-$(CONFIG_XEN_EFI) += efi.o
-obj-$(CONFIG_XEN_PVH) += xen-pvh.o
diff --git a/arch/x86/xen/efi.c b/arch/x86/xen/efi.c
index 1804b27f9632..1fbb629a9d78 100644
--- a/arch/x86/xen/efi.c
+++ b/arch/x86/xen/efi.c
@@ -1,18 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2014 Oracle Co., Daniel Kiper
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <linux/bitops.h>
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 2eeddd814653..67b2f31a1265 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1,3 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0
+
#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
#include <linux/bootmem.h>
#endif
@@ -5,6 +7,7 @@
#include <linux/kexec.h>
#include <linux/slab.h>
+#include <xen/xen.h>
#include <xen/features.h>
#include <xen/page.h>
#include <xen/interface/memory.h>
diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c
index 19c1ff542387..0e75642d42a3 100644
--- a/arch/x86/xen/enlighten_hvm.c
+++ b/arch/x86/xen/enlighten_hvm.c
@@ -1,3 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0
+
#include <linux/acpi.h>
#include <linux/cpu.h>
#include <linux/kexec.h>
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 52a7c3faee0c..ec7a4209f310 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -995,11 +995,14 @@ void __init xen_setup_vcpu_info_placement(void)
* percpu area for all cpus, so make use of it.
*/
if (xen_have_vcpu_info_placement) {
- pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
- pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
- pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
- pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(xen_irq_enable_direct);
- pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
+ pv_ops.irq.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
+ pv_ops.irq.restore_fl =
+ __PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
+ pv_ops.irq.irq_disable =
+ __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
+ pv_ops.irq.irq_enable =
+ __PV_IS_CALLEE_SAVE(xen_irq_enable_direct);
+ pv_ops.mmu.read_cr2 = xen_read_cr2_direct;
}
}
@@ -1174,14 +1177,14 @@ static void __init xen_boot_params_init_edd(void)
*/
static void __init xen_setup_gdt(int cpu)
{
- pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry_boot;
- pv_cpu_ops.load_gdt = xen_load_gdt_boot;
+ pv_ops.cpu.write_gdt_entry = xen_write_gdt_entry_boot;
+ pv_ops.cpu.load_gdt = xen_load_gdt_boot;
setup_stack_canary_segment(cpu);
switch_to_new_gdt(cpu);
- pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry;
- pv_cpu_ops.load_gdt = xen_load_gdt;
+ pv_ops.cpu.write_gdt_entry = xen_write_gdt_entry;
+ pv_ops.cpu.load_gdt = xen_load_gdt;
}
static void __init xen_dom0_set_legacy_features(void)
@@ -1206,8 +1209,8 @@ asmlinkage __visible void __init xen_start_kernel(void)
/* Install Xen paravirt ops */
pv_info = xen_info;
- pv_init_ops.patch = paravirt_patch_default;
- pv_cpu_ops = xen_cpu_ops;
+ pv_ops.init.patch = paravirt_patch_default;
+ pv_ops.cpu = xen_cpu_ops;
xen_init_irq_ops();
/*
@@ -1276,8 +1279,10 @@ asmlinkage __visible void __init xen_start_kernel(void)
#endif
if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) {
- pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start;
- pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit;
+ pv_ops.mmu.ptep_modify_prot_start =
+ xen_ptep_modify_prot_start;
+ pv_ops.mmu.ptep_modify_prot_commit =
+ xen_ptep_modify_prot_commit;
}
machine_ops = xen_machine_ops;
diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c
index c85d1a88f476..02e3ab7ff242 100644
--- a/arch/x86/xen/enlighten_pvh.c
+++ b/arch/x86/xen/enlighten_pvh.c
@@ -11,6 +11,7 @@
#include <asm/xen/interface.h>
#include <asm/xen/hypercall.h>
+#include <xen/xen.h>
#include <xen/interface/memory.h>
#include <xen/interface/hvm/start_info.h>
@@ -75,7 +76,7 @@ static void __init init_pvh_bootparams(void)
* Version 2.12 supports Xen entry point but we will use default x86/PC
* environment (i.e. hardware_subarch 0).
*/
- pvh_bootparams.hdr.version = 0x212;
+ pvh_bootparams.hdr.version = (2 << 8) | 12;
pvh_bootparams.hdr.type_of_loader = (9 << 4) | 0; /* Xen loader */
x86_init.acpi.get_root_pointer = pvh_get_root_pointer;
diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c
index 92ccc718152d..ecb0d5450334 100644
--- a/arch/x86/xen/grant-table.c
+++ b/arch/x86/xen/grant-table.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
/******************************************************************************
* grant_table.c
* x86 specific part
@@ -8,30 +9,6 @@
* Copyright (c) 2004-2005, K A Fraser
* Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
* VA Linux Systems Japan. Split out x86 specific part.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation; or, when distributed
- * separately from the Linux kernel or incorporated into other
- * software packages, subject to the following license:
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this source file (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use, copy, modify,
- * merge, publish, distribute, sublicense, and/or sell copies of the Software,
- * and to permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
*/
#include <linux/sched.h>
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index 7515a19fd324..850c93f346c7 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -128,6 +128,6 @@ static const struct pv_irq_ops xen_irq_ops __initconst = {
void __init xen_init_irq_ops(void)
{
- pv_irq_ops = xen_irq_ops;
+ pv_ops.irq = xen_irq_ops;
x86_init.irqs.intr_init = xen_init_IRQ;
}
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 96fc2f0fdbfe..60e9c37fd79f 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1,3 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0
+
#include <linux/pfn.h>
#include <asm/xen/page.h>
#include <asm/xen/hypercall.h>
@@ -6,12 +8,6 @@
#include "multicalls.h"
#include "mmu.h"
-/*
- * Protects atomic reservation decrease/increase against concurrent increases.
- * Also protects non-atomic updates of current_pages and balloon lists.
- */
-DEFINE_SPINLOCK(xen_reservation_lock);
-
unsigned long arbitrary_virt_to_mfn(void *vaddr)
{
xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);
@@ -42,186 +38,6 @@ xmaddr_t arbitrary_virt_to_machine(void *vaddr)
}
EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine);
-static noinline void xen_flush_tlb_all(void)
-{
- struct mmuext_op *op;
- struct multicall_space mcs;
-
- preempt_disable();
-
- mcs = xen_mc_entry(sizeof(*op));
-
- op = mcs.args;
- op->cmd = MMUEXT_TLB_FLUSH_ALL;
- MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
-
- xen_mc_issue(PARAVIRT_LAZY_MMU);
-
- preempt_enable();
-}
-
-#define REMAP_BATCH_SIZE 16
-
-struct remap_data {
- xen_pfn_t *pfn;
- bool contiguous;
- bool no_translate;
- pgprot_t prot;
- struct mmu_update *mmu_update;
-};
-
-static int remap_area_pfn_pte_fn(pte_t *ptep, pgtable_t token,
- unsigned long addr, void *data)
-{
- struct remap_data *rmd = data;
- pte_t pte = pte_mkspecial(mfn_pte(*rmd->pfn, rmd->prot));
-
- /*
- * If we have a contiguous range, just update the pfn itself,
- * else update pointer to be "next pfn".
- */
- if (rmd->contiguous)
- (*rmd->pfn)++;
- else
- rmd->pfn++;
-
- rmd->mmu_update->ptr = virt_to_machine(ptep).maddr;
- rmd->mmu_update->ptr |= rmd->no_translate ?
- MMU_PT_UPDATE_NO_TRANSLATE :
- MMU_NORMAL_PT_UPDATE;
- rmd->mmu_update->val = pte_val_ma(pte);
- rmd->mmu_update++;
-
- return 0;
-}
-
-static int do_remap_pfn(struct vm_area_struct *vma,
- unsigned long addr,
- xen_pfn_t *pfn, int nr,
- int *err_ptr, pgprot_t prot,
- unsigned int domid,
- bool no_translate,
- struct page **pages)
-{
- int err = 0;
- struct remap_data rmd;
- struct mmu_update mmu_update[REMAP_BATCH_SIZE];
- unsigned long range;
- int mapped = 0;
-
- BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO)));
-
- rmd.pfn = pfn;
- rmd.prot = prot;
- /*
- * We use the err_ptr to indicate if there we are doing a contiguous
- * mapping or a discontigious mapping.
- */
- rmd.contiguous = !err_ptr;
- rmd.no_translate = no_translate;
-
- while (nr) {
- int index = 0;
- int done = 0;
- int batch = min(REMAP_BATCH_SIZE, nr);
- int batch_left = batch;
- range = (unsigned long)batch << PAGE_SHIFT;
-
- rmd.mmu_update = mmu_update;
- err = apply_to_page_range(vma->vm_mm, addr, range,
- remap_area_pfn_pte_fn, &rmd);
- if (err)
- goto out;
-
- /* We record the error for each page that gives an error, but
- * continue mapping until the whole set is done */
- do {
- int i;
-
- err = HYPERVISOR_mmu_update(&mmu_update[index],
- batch_left, &done, domid);
-
- /*
- * @err_ptr may be the same buffer as @gfn, so
- * only clear it after each chunk of @gfn is
- * used.
- */
- if (err_ptr) {
- for (i = index; i < index + done; i++)
- err_ptr[i] = 0;
- }
- if (err < 0) {
- if (!err_ptr)
- goto out;
- err_ptr[i] = err;
- done++; /* Skip failed frame. */
- } else
- mapped += done;
- batch_left -= done;
- index += done;
- } while (batch_left);
-
- nr -= batch;
- addr += range;
- if (err_ptr)
- err_ptr += batch;
- cond_resched();
- }
-out:
-
- xen_flush_tlb_all();
-
- return err < 0 ? err : mapped;
-}
-
-int xen_remap_domain_gfn_range(struct vm_area_struct *vma,
- unsigned long addr,
- xen_pfn_t gfn, int nr,
- pgprot_t prot, unsigned domid,
- struct page **pages)
-{
- if (xen_feature(XENFEAT_auto_translated_physmap))
- return -EOPNOTSUPP;
-
- return do_remap_pfn(vma, addr, &gfn, nr, NULL, prot, domid, false,
- pages);
-}
-EXPORT_SYMBOL_GPL(xen_remap_domain_gfn_range);
-
-int xen_remap_domain_gfn_array(struct vm_area_struct *vma,
- unsigned long addr,
- xen_pfn_t *gfn, int nr,
- int *err_ptr, pgprot_t prot,
- unsigned domid, struct page **pages)
-{
- if (xen_feature(XENFEAT_auto_translated_physmap))
- return xen_xlate_remap_gfn_array(vma, addr, gfn, nr, err_ptr,
- prot, domid, pages);
-
- /* We BUG_ON because it's a programmer error to pass a NULL err_ptr,
- * and the consequences later is quite hard to detect what the actual
- * cause of "wrong memory was mapped in".
- */
- BUG_ON(err_ptr == NULL);
- return do_remap_pfn(vma, addr, gfn, nr, err_ptr, prot, domid,
- false, pages);
-}
-EXPORT_SYMBOL_GPL(xen_remap_domain_gfn_array);
-
-int xen_remap_domain_mfn_array(struct vm_area_struct *vma,
- unsigned long addr,
- xen_pfn_t *mfn, int nr,
- int *err_ptr, pgprot_t prot,
- unsigned int domid, struct page **pages)
-{
- if (xen_feature(XENFEAT_auto_translated_physmap))
- return -EOPNOTSUPP;
-
- return do_remap_pfn(vma, addr, mfn, nr, err_ptr, prot, domid,
- true, pages);
-}
-EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_array);
-
/* Returns: 0 success */
int xen_unmap_domain_gfn_range(struct vm_area_struct *vma,
int nr, struct page **pages)
diff --git a/arch/x86/xen/mmu_hvm.c b/arch/x86/xen/mmu_hvm.c
index dd2ad82eee80..57409373750f 100644
--- a/arch/x86/xen/mmu_hvm.c
+++ b/arch/x86/xen/mmu_hvm.c
@@ -73,7 +73,7 @@ static int is_pagetable_dying_supported(void)
void __init xen_hvm_init_mmu_ops(void)
{
if (is_pagetable_dying_supported())
- pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap;
+ pv_ops.mmu.exit_mmap = xen_hvm_exit_mmap;
#ifdef CONFIG_PROC_VMCORE
WARN_ON(register_oldmem_pfn_is_ram(&xen_oldmem_pfn_is_ram));
#endif
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 45b700ac5fe7..70ea598a37d2 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -1,3 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0
+
/*
* Xen mmu operations
*
@@ -99,6 +101,12 @@ static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
#endif /* CONFIG_X86_64 */
/*
+ * Protects atomic reservation decrease/increase against concurrent increases.
+ * Also protects non-atomic updates of current_pages and balloon lists.
+ */
+static DEFINE_SPINLOCK(xen_reservation_lock);
+
+/*
* Note about cr3 (pagetable base) values:
*
* xen_cr3 contains the current logical cr3 value; it contains the
@@ -435,14 +443,13 @@ static void xen_set_pud(pud_t *ptr, pud_t val)
static void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
{
trace_xen_mmu_set_pte_atomic(ptep, pte);
- set_64bit((u64 *)ptep, native_pte_val(pte));
+ __xen_set_pte(ptep, pte);
}
static void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
trace_xen_mmu_pte_clear(mm, addr, ptep);
- if (!xen_batched_set_pte(ptep, native_make_pte(0)))
- native_pte_clear(mm, addr, ptep);
+ __xen_set_pte(ptep, native_make_pte(0));
}
static void xen_pmd_clear(pmd_t *pmdp)
@@ -1570,7 +1577,7 @@ static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
pte_val_ma(pte));
#endif
- native_set_pte(ptep, pte);
+ __xen_set_pte(ptep, pte);
}
/* Early in boot, while setting up the initial pagetable, assume
@@ -1908,7 +1915,7 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
/* L3_k[511] -> level2_fixmap_pgt */
convert_pfn_mfn(level3_kernel_pgt);
- /* L3_k[511][506] -> level1_fixmap_pgt */
+ /* L3_k[511][508-FIXMAP_PMD_NUM ... 507] -> level1_fixmap_pgt */
convert_pfn_mfn(level2_fixmap_pgt);
/* We get [511][511] and have Xen's version of level2_kernel_pgt */
@@ -1953,7 +1960,11 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO);
set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
- set_page_prot(level1_fixmap_pgt, PAGE_KERNEL_RO);
+
+ for (i = 0; i < FIXMAP_PMD_NUM; i++) {
+ set_page_prot(level1_fixmap_pgt + i * PTRS_PER_PTE,
+ PAGE_KERNEL_RO);
+ }
/* Pin down new L4 */
pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
@@ -2061,7 +2072,6 @@ void __init xen_relocate_p2m(void)
pud_t *pud;
pgd_t *pgd;
unsigned long *new_p2m;
- int save_pud;
size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
n_pte = roundup(size, PAGE_SIZE) >> PAGE_SHIFT;
@@ -2091,7 +2101,6 @@ void __init xen_relocate_p2m(void)
pgd = __va(read_cr3_pa());
new_p2m = (unsigned long *)(2 * PGDIR_SIZE);
- save_pud = n_pud;
for (idx_pud = 0; idx_pud < n_pud; idx_pud++) {
pud = early_memremap(pud_phys, PAGE_SIZE);
clear_page(pud);
@@ -2208,7 +2217,7 @@ static void __init xen_write_cr3_init(unsigned long cr3)
set_page_prot(initial_page_table, PAGE_KERNEL);
set_page_prot(initial_kernel_pmd, PAGE_KERNEL);
- pv_mmu_ops.write_cr3 = &xen_write_cr3;
+ pv_ops.mmu.write_cr3 = &xen_write_cr3;
}
/*
@@ -2357,27 +2366,27 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
static void __init xen_post_allocator_init(void)
{
- pv_mmu_ops.set_pte = xen_set_pte;
- pv_mmu_ops.set_pmd = xen_set_pmd;
- pv_mmu_ops.set_pud = xen_set_pud;
+ pv_ops.mmu.set_pte = xen_set_pte;
+ pv_ops.mmu.set_pmd = xen_set_pmd;
+ pv_ops.mmu.set_pud = xen_set_pud;
#ifdef CONFIG_X86_64
- pv_mmu_ops.set_p4d = xen_set_p4d;
+ pv_ops.mmu.set_p4d = xen_set_p4d;
#endif
/* This will work as long as patching hasn't happened yet
(which it hasn't) */
- pv_mmu_ops.alloc_pte = xen_alloc_pte;
- pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
- pv_mmu_ops.release_pte = xen_release_pte;
- pv_mmu_ops.release_pmd = xen_release_pmd;
+ pv_ops.mmu.alloc_pte = xen_alloc_pte;
+ pv_ops.mmu.alloc_pmd = xen_alloc_pmd;
+ pv_ops.mmu.release_pte = xen_release_pte;
+ pv_ops.mmu.release_pmd = xen_release_pmd;
#ifdef CONFIG_X86_64
- pv_mmu_ops.alloc_pud = xen_alloc_pud;
- pv_mmu_ops.release_pud = xen_release_pud;
+ pv_ops.mmu.alloc_pud = xen_alloc_pud;
+ pv_ops.mmu.release_pud = xen_release_pud;
#endif
- pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte);
+ pv_ops.mmu.make_pte = PV_CALLEE_SAVE(xen_make_pte);
#ifdef CONFIG_X86_64
- pv_mmu_ops.write_cr3 = &xen_write_cr3;
+ pv_ops.mmu.write_cr3 = &xen_write_cr3;
#endif
}
@@ -2465,7 +2474,7 @@ void __init xen_init_mmu_ops(void)
x86_init.paging.pagetable_init = xen_pagetable_init;
x86_init.hyper.init_after_bootmem = xen_after_bootmem;
- pv_mmu_ops = xen_mmu_ops;
+ pv_ops.mmu = xen_mmu_ops;
memset(dummy_mapping, 0xff, PAGE_SIZE);
}
@@ -2665,6 +2674,138 @@ void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order)
}
EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
+static noinline void xen_flush_tlb_all(void)
+{
+ struct mmuext_op *op;
+ struct multicall_space mcs;
+
+ preempt_disable();
+
+ mcs = xen_mc_entry(sizeof(*op));
+
+ op = mcs.args;
+ op->cmd = MMUEXT_TLB_FLUSH_ALL;
+ MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+
+ preempt_enable();
+}
+
+#define REMAP_BATCH_SIZE 16
+
+struct remap_data {
+ xen_pfn_t *pfn;
+ bool contiguous;
+ bool no_translate;
+ pgprot_t prot;
+ struct mmu_update *mmu_update;
+};
+
+static int remap_area_pfn_pte_fn(pte_t *ptep, pgtable_t token,
+ unsigned long addr, void *data)
+{
+ struct remap_data *rmd = data;
+ pte_t pte = pte_mkspecial(mfn_pte(*rmd->pfn, rmd->prot));
+
+ /*
+ * If we have a contiguous range, just update the pfn itself,
+ * else update pointer to be "next pfn".
+ */
+ if (rmd->contiguous)
+ (*rmd->pfn)++;
+ else
+ rmd->pfn++;
+
+ rmd->mmu_update->ptr = virt_to_machine(ptep).maddr;
+ rmd->mmu_update->ptr |= rmd->no_translate ?
+ MMU_PT_UPDATE_NO_TRANSLATE :
+ MMU_NORMAL_PT_UPDATE;
+ rmd->mmu_update->val = pte_val_ma(pte);
+ rmd->mmu_update++;
+
+ return 0;
+}
+
+int xen_remap_pfn(struct vm_area_struct *vma, unsigned long addr,
+ xen_pfn_t *pfn, int nr, int *err_ptr, pgprot_t prot,
+ unsigned int domid, bool no_translate, struct page **pages)
+{
+ int err = 0;
+ struct remap_data rmd;
+ struct mmu_update mmu_update[REMAP_BATCH_SIZE];
+ unsigned long range;
+ int mapped = 0;
+
+ BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO)));
+
+ rmd.pfn = pfn;
+ rmd.prot = prot;
+ /*
+ * We use the err_ptr to indicate if there we are doing a contiguous
+ * mapping or a discontigious mapping.
+ */
+ rmd.contiguous = !err_ptr;
+ rmd.no_translate = no_translate;
+
+ while (nr) {
+ int index = 0;
+ int done = 0;
+ int batch = min(REMAP_BATCH_SIZE, nr);
+ int batch_left = batch;
+
+ range = (unsigned long)batch << PAGE_SHIFT;
+
+ rmd.mmu_update = mmu_update;
+ err = apply_to_page_range(vma->vm_mm, addr, range,
+ remap_area_pfn_pte_fn, &rmd);
+ if (err)
+ goto out;
+
+ /*
+ * We record the error for each page that gives an error, but
+ * continue mapping until the whole set is done
+ */
+ do {
+ int i;
+
+ err = HYPERVISOR_mmu_update(&mmu_update[index],
+ batch_left, &done, domid);
+
+ /*
+ * @err_ptr may be the same buffer as @gfn, so
+ * only clear it after each chunk of @gfn is
+ * used.
+ */
+ if (err_ptr) {
+ for (i = index; i < index + done; i++)
+ err_ptr[i] = 0;
+ }
+ if (err < 0) {
+ if (!err_ptr)
+ goto out;
+ err_ptr[i] = err;
+ done++; /* Skip failed frame. */
+ } else
+ mapped += done;
+ batch_left -= done;
+ index += done;
+ } while (batch_left);
+
+ nr -= batch;
+ addr += range;
+ if (err_ptr)
+ err_ptr += batch;
+ cond_resched();
+ }
+out:
+
+ xen_flush_tlb_all();
+
+ return err < 0 ? err : mapped;
+}
+EXPORT_SYMBOL_GPL(xen_remap_pfn);
+
#ifdef CONFIG_KEXEC_CORE
phys_addr_t paddr_vmcoreinfo_note(void)
{
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 159a897151d6..d6d74efd8912 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -1,3 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0
+
/*
* Xen leaves the responsibility for maintaining p2m mappings to the
* guests themselves, but it must also access and update the p2m array
diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c
index 37c6056a7bba..33293ce01d8d 100644
--- a/arch/x86/xen/pci-swiotlb-xen.c
+++ b/arch/x86/xen/pci-swiotlb-xen.c
@@ -1,3 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0
+
/* Glue code to lib/swiotlb-xen.c */
#include <linux/dma-mapping.h>
diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c
index 33a783c77d96..66ab96a4e2b3 100644
--- a/arch/x86/xen/platform-pci-unplug.c
+++ b/arch/x86/xen/platform-pci-unplug.c
@@ -1,28 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0
+
/******************************************************************************
* platform-pci-unplug.c
*
* Xen platform PCI device driver
* Copyright (c) 2010, Citrix
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
- *
*/
#include <linux/init.h>
#include <linux/io.h>
#include <linux/export.h>
+#include <xen/xen.h>
#include <xen/platform_pci.h>
#include "xen-ops.h"
@@ -30,7 +19,6 @@
#define XEN_PLATFORM_ERR_PROTOCOL -2
#define XEN_PLATFORM_ERR_BLACKLIST -3
-#ifdef CONFIG_XEN_PVHVM
/* store the value of xen_emul_unplug after the unplug is done */
static int xen_platform_pci_unplug;
static int xen_emul_unplug;
@@ -214,4 +202,3 @@ static int __init parse_xen_emul_unplug(char *arg)
return 0;
}
early_param("xen_emul_unplug", parse_xen_emul_unplug);
-#endif
diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c
index 7d00d4ad44d4..e13b0b49fcdf 100644
--- a/arch/x86/xen/pmu.c
+++ b/arch/x86/xen/pmu.c
@@ -3,6 +3,7 @@
#include <linux/interrupt.h>
#include <asm/xen/hypercall.h>
+#include <xen/xen.h>
#include <xen/page.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>
@@ -90,6 +91,12 @@ static void xen_pmu_arch_init(void)
k7_counters_mirrored = 0;
break;
}
+ } else if (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) {
+ amd_num_counters = F10H_NUM_COUNTERS;
+ amd_counters_base = MSR_K7_PERFCTR0;
+ amd_ctrls_base = MSR_K7_EVNTSEL0;
+ amd_msr_step = 1;
+ k7_counters_mirrored = 0;
} else {
uint32_t eax, ebx, ecx, edx;
@@ -285,7 +292,7 @@ static bool xen_amd_pmu_emulate(unsigned int msr, u64 *val, bool is_read)
bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err)
{
- if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
if (is_amd_pmu_msr(msr)) {
if (!xen_amd_pmu_emulate(msr, val, 1))
*val = native_read_msr_safe(msr, err);
@@ -308,7 +315,7 @@ bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err)
{
uint64_t val = ((uint64_t)high << 32) | low;
- if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
if (is_amd_pmu_msr(msr)) {
if (!xen_amd_pmu_emulate(msr, &val, 0))
*err = native_write_msr_safe(msr, low, high);
@@ -379,7 +386,7 @@ static unsigned long long xen_intel_read_pmc(int counter)
unsigned long long xen_read_pmc(int counter)
{
- if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
return xen_amd_read_pmc(counter);
else
return xen_intel_read_pmc(counter);
@@ -478,7 +485,7 @@ static void xen_convert_regs(const struct xen_pmu_regs *xen_regs,
irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id)
{
int err, ret = IRQ_NONE;
- struct pt_regs regs;
+ struct pt_regs regs = {0};
const struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
uint8_t xenpmu_flags = get_xenpmu_flags();
diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c
index e3b18ad49889..145506f9fdbe 100644
--- a/arch/x86/xen/smp_pv.c
+++ b/arch/x86/xen/smp_pv.c
@@ -22,6 +22,7 @@
#include <linux/tick.h>
#include <linux/nmi.h>
#include <linux/cpuhotplug.h>
+#include <linux/stackprotector.h>
#include <asm/paravirt.h>
#include <asm/desc.h>
@@ -88,6 +89,7 @@ static void cpu_bringup(void)
asmlinkage __visible void cpu_bringup_and_idle(void)
{
cpu_bringup();
+ boot_init_stack_canary();
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
}
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index 973f10e05211..23f6793af88a 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -141,11 +141,12 @@ void __init xen_init_spinlocks(void)
printk(KERN_DEBUG "xen: PV spinlocks enabled\n");
__pv_init_lock_hash();
- pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
- pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock);
- pv_lock_ops.wait = xen_qlock_wait;
- pv_lock_ops.kick = xen_qlock_kick;
- pv_lock_ops.vcpu_is_preempted = PV_CALLEE_SAVE(xen_vcpu_stolen);
+ pv_ops.lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
+ pv_ops.lock.queued_spin_unlock =
+ PV_CALLEE_SAVE(__pv_queued_spin_unlock);
+ pv_ops.lock.wait = xen_qlock_wait;
+ pv_ops.lock.kick = xen_qlock_kick;
+ pv_ops.lock.vcpu_is_preempted = PV_CALLEE_SAVE(xen_vcpu_stolen);
}
static __init int xen_parse_nopvspin(char *arg)
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index c84f1e039d84..72bf446c3fee 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -513,7 +513,7 @@ static void __init xen_time_init(void)
void __init xen_init_time_ops(void)
{
xen_sched_clock_offset = xen_clocksource_read();
- pv_time_ops = xen_time_ops;
+ pv_ops.time = xen_time_ops;
x86_init.timers.timer_init = xen_time_init;
x86_init.timers.setup_percpu_clockev = x86_init_noop;
@@ -555,7 +555,7 @@ void __init xen_hvm_init_time_ops(void)
}
xen_sched_clock_offset = xen_clocksource_read();
- pv_time_ops = xen_time_ops;
+ pv_ops.time = xen_time_ops;
x86_init.timers.setup_percpu_clockev = xen_time_init;
x86_cpuinit.setup_percpu_clockev = xen_hvm_setup_cpu_clockevents;
diff --git a/arch/x86/xen/vdso.h b/arch/x86/xen/vdso.h
index 861fedfe5230..873c54c488fe 100644
--- a/arch/x86/xen/vdso.h
+++ b/arch/x86/xen/vdso.h
@@ -1,3 +1,5 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
/* Bit used for the pseudo-hwcap for non-negative segments. We use
bit 1 to avoid bugs in some versions of glibc when bit 0 is
used; the choice is otherwise arbitrary. */
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index 417b339e5c8e..bb1c2da0381d 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -91,13 +91,15 @@ ENTRY(xen_iret)
ENTRY(xen_sysret64)
/*
* We're already on the usermode stack at this point, but
- * still with the kernel gs, so we can easily switch back
+ * still with the kernel gs, so we can easily switch back.
+ *
+ * tss.sp2 is scratch space.
*/
- movq %rsp, PER_CPU_VAR(rsp_scratch)
+ movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
pushq $__USER_DS
- pushq PER_CPU_VAR(rsp_scratch)
+ pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
pushq %r11
pushq $__USER_CS
pushq %rcx
diff --git a/arch/x86/xen/xen-pvh.S b/arch/x86/xen/xen-pvh.S
index ca2d3b2bf2af..b0e471506cd8 100644
--- a/arch/x86/xen/xen-pvh.S
+++ b/arch/x86/xen/xen-pvh.S
@@ -1,18 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
/*
* Copyright C 2016, Oracle and/or its affiliates. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program. If not, see <http://www.gnu.org/licenses/>.
*/
.code32