diff options
Diffstat (limited to 'arch/x86')
123 files changed, 2201 insertions, 1031 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c07f492b871a..1fe24b624d44 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -28,6 +28,8 @@ config X86_64 select ARCH_USE_CMPXCHG_LOCKREF select HAVE_ARCH_SOFT_DIRTY select MODULES_USE_ELF_RELA + select NEED_DMA_MAP_STATE + select SWIOTLB select X86_DEV_DMA_OPS select ARCH_HAS_SYSCALL_WRAPPER @@ -60,6 +62,7 @@ config X86 select ARCH_HAS_PMEM_API if X86_64 select ARCH_HAS_REFCOUNT select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 + select ARCH_HAS_UACCESS_MCSAFE if X86_64 select ARCH_HAS_SET_MEMORY select ARCH_HAS_SG_CHAIN select ARCH_HAS_STRICT_KERNEL_RWX @@ -134,7 +137,6 @@ config X86 select HAVE_C_RECORDMCOUNT select HAVE_DEBUG_KMEMLEAK select HAVE_DEBUG_STACKOVERFLOW - select HAVE_DMA_API_DEBUG select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_REGS @@ -184,6 +186,7 @@ config X86 select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_USER_RETURN_NOTIFIER select IRQ_FORCED_THREADING + select NEED_SG_DMA_LENGTH select PCI_LOCKLESS_CONFIG select PERF_EVENTS select RTC_LIB @@ -236,13 +239,6 @@ config ARCH_MMAP_RND_COMPAT_BITS_MAX config SBUS bool -config NEED_DMA_MAP_STATE - def_bool y - depends on X86_64 || INTEL_IOMMU || DMA_API_DEBUG || SWIOTLB - -config NEED_SG_DMA_LENGTH - def_bool y - config GENERIC_ISA_DMA def_bool y depends on ISA_DMA_API @@ -875,6 +871,7 @@ config DMI config GART_IOMMU bool "Old AMD GART IOMMU support" + select IOMMU_HELPER select SWIOTLB depends on X86_64 && PCI && AMD_NB ---help--- @@ -896,6 +893,7 @@ config GART_IOMMU config CALGARY_IOMMU bool "IBM Calgary IOMMU support" + select IOMMU_HELPER select SWIOTLB depends on X86_64 && PCI ---help--- @@ -923,20 +921,6 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT Calgary anyway, pass 'iommu=calgary' on the kernel command line. If unsure, say Y. -# need this always selected by IOMMU for the VIA workaround -config SWIOTLB - def_bool y if X86_64 - ---help--- - Support for software bounce buffers used on x86-64 systems - which don't have a hardware IOMMU. Using this PCI devices - which can only access 32-bits of memory can be used on systems - with more than 3 GB of memory. - If unsure, say Y. - -config IOMMU_HELPER - def_bool y - depends on CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU - config MAXSMP bool "Enable Maximum number of SMP Processors and NUMA Nodes" depends on X86_64 && SMP && DEBUG_KERNEL @@ -1458,6 +1442,7 @@ config HIGHMEM config X86_PAE bool "PAE (Physical Address Extension) Support" depends on X86_32 && !HIGHMEM4G + select PHYS_ADDR_T_64BIT select SWIOTLB ---help--- PAE is required for NX support, and furthermore enables @@ -1485,14 +1470,6 @@ config X86_5LEVEL Say N if unsure. -config ARCH_PHYS_ADDR_T_64BIT - def_bool y - depends on X86_64 || X86_PAE - -config ARCH_DMA_ADDR_T_64BIT - def_bool y - depends on X86_64 || HIGHMEM64G - config X86_DIRECT_GBPAGES def_bool y depends on X86_64 && !DEBUG_PAGEALLOC diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c index 0cb325734cfb..af6cda0b7900 100644 --- a/arch/x86/boot/compressed/cmdline.c +++ b/arch/x86/boot/compressed/cmdline.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "misc.h" -#if CONFIG_EARLY_PRINTK || CONFIG_RANDOMIZE_BASE +#if CONFIG_EARLY_PRINTK || CONFIG_RANDOMIZE_BASE || CONFIG_X86_5LEVEL static unsigned long fs; static inline void set_fs(unsigned long seg) diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index 47d3efff6805..a8a8642d2b0b 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -109,23 +109,34 @@ void efi_char16_printk(efi_system_table_t *table, efi_char16_t *str) } static efi_status_t -__setup_efi_pci32(efi_pci_io_protocol_32 *pci, struct pci_setup_rom **__rom) +__setup_efi_pci(efi_pci_io_protocol_t *pci, struct pci_setup_rom **__rom) { struct pci_setup_rom *rom = NULL; efi_status_t status; unsigned long size; - uint64_t attributes; + uint64_t attributes, romsize; + void *romimage; - status = efi_early->call(pci->attributes, pci, - EfiPciIoAttributeOperationGet, 0, 0, - &attributes); + status = efi_call_proto(efi_pci_io_protocol, attributes, pci, + EfiPciIoAttributeOperationGet, 0, 0, + &attributes); if (status != EFI_SUCCESS) return status; - if (!pci->romimage || !pci->romsize) + /* + * Some firmware images contain EFI function pointers at the place where the + * romimage and romsize fields are supposed to be. Typically the EFI + * code is mapped at high addresses, translating to an unrealistically + * large romsize. The UEFI spec limits the size of option ROMs to 16 + * MiB so we reject any ROMs over 16 MiB in size to catch this. + */ + romimage = (void *)(unsigned long)efi_table_attr(efi_pci_io_protocol, + romimage, pci); + romsize = efi_table_attr(efi_pci_io_protocol, romsize, pci); + if (!romimage || !romsize || romsize > SZ_16M) return EFI_INVALID_PARAMETER; - size = pci->romsize + sizeof(*rom); + size = romsize + sizeof(*rom); status = efi_call_early(allocate_pool, EFI_LOADER_DATA, size, &rom); if (status != EFI_SUCCESS) { @@ -141,29 +152,32 @@ __setup_efi_pci32(efi_pci_io_protocol_32 *pci, struct pci_setup_rom **__rom) rom->pcilen = pci->romsize; *__rom = rom; - status = efi_early->call(pci->pci.read, pci, EfiPciIoWidthUint16, - PCI_VENDOR_ID, 1, &(rom->vendor)); + status = efi_call_proto(efi_pci_io_protocol, pci.read, pci, + EfiPciIoWidthUint16, PCI_VENDOR_ID, 1, + &rom->vendor); if (status != EFI_SUCCESS) { efi_printk(sys_table, "Failed to read rom->vendor\n"); goto free_struct; } - status = efi_early->call(pci->pci.read, pci, EfiPciIoWidthUint16, - PCI_DEVICE_ID, 1, &(rom->devid)); + status = efi_call_proto(efi_pci_io_protocol, pci.read, pci, + EfiPciIoWidthUint16, PCI_DEVICE_ID, 1, + &rom->devid); if (status != EFI_SUCCESS) { efi_printk(sys_table, "Failed to read rom->devid\n"); goto free_struct; } - status = efi_early->call(pci->get_location, pci, &(rom->segment), - &(rom->bus), &(rom->device), &(rom->function)); + status = efi_call_proto(efi_pci_io_protocol, get_location, pci, + &rom->segment, &rom->bus, &rom->device, + &rom->function); if (status != EFI_SUCCESS) goto free_struct; - memcpy(rom->romdata, pci->romimage, pci->romsize); + memcpy(rom->romdata, romimage, romsize); return status; free_struct: @@ -175,7 +189,7 @@ static void setup_efi_pci32(struct boot_params *params, void **pci_handle, unsigned long size) { - efi_pci_io_protocol_32 *pci = NULL; + efi_pci_io_protocol_t *pci = NULL; efi_guid_t pci_proto = EFI_PCI_IO_PROTOCOL_GUID; u32 *handles = (u32 *)(unsigned long)pci_handle; efi_status_t status; @@ -202,7 +216,7 @@ setup_efi_pci32(struct boot_params *params, void **pci_handle, if (!pci) continue; - status = __setup_efi_pci32(pci, &rom); + status = __setup_efi_pci(pci, &rom); if (status != EFI_SUCCESS) continue; @@ -216,73 +230,11 @@ setup_efi_pci32(struct boot_params *params, void **pci_handle, } } -static efi_status_t -__setup_efi_pci64(efi_pci_io_protocol_64 *pci, struct pci_setup_rom **__rom) -{ - struct pci_setup_rom *rom; - efi_status_t status; - unsigned long size; - uint64_t attributes; - - status = efi_early->call(pci->attributes, pci, - EfiPciIoAttributeOperationGet, 0, - &attributes); - if (status != EFI_SUCCESS) - return status; - - if (!pci->romimage || !pci->romsize) - return EFI_INVALID_PARAMETER; - - size = pci->romsize + sizeof(*rom); - - status = efi_call_early(allocate_pool, EFI_LOADER_DATA, size, &rom); - if (status != EFI_SUCCESS) { - efi_printk(sys_table, "Failed to alloc mem for rom\n"); - return status; - } - - rom->data.type = SETUP_PCI; - rom->data.len = size - sizeof(struct setup_data); - rom->data.next = 0; - rom->pcilen = pci->romsize; - *__rom = rom; - - status = efi_early->call(pci->pci.read, pci, EfiPciIoWidthUint16, - PCI_VENDOR_ID, 1, &(rom->vendor)); - - if (status != EFI_SUCCESS) { - efi_printk(sys_table, "Failed to read rom->vendor\n"); - goto free_struct; - } - - status = efi_early->call(pci->pci.read, pci, EfiPciIoWidthUint16, - PCI_DEVICE_ID, 1, &(rom->devid)); - - if (status != EFI_SUCCESS) { - efi_printk(sys_table, "Failed to read rom->devid\n"); - goto free_struct; - } - - status = efi_early->call(pci->get_location, pci, &(rom->segment), - &(rom->bus), &(rom->device), &(rom->function)); - - if (status != EFI_SUCCESS) - goto free_struct; - - memcpy(rom->romdata, pci->romimage, pci->romsize); - return status; - -free_struct: - efi_call_early(free_pool, rom); - return status; - -} - static void setup_efi_pci64(struct boot_params *params, void **pci_handle, unsigned long size) { - efi_pci_io_protocol_64 *pci = NULL; + efi_pci_io_protocol_t *pci = NULL; efi_guid_t pci_proto = EFI_PCI_IO_PROTOCOL_GUID; u64 *handles = (u64 *)(unsigned long)pci_handle; efi_status_t status; @@ -309,7 +261,7 @@ setup_efi_pci64(struct boot_params *params, void **pci_handle, if (!pci) continue; - status = __setup_efi_pci64(pci, &rom); + status = __setup_efi_pci(pci, &rom); if (status != EFI_SUCCESS) continue; diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index fca012baba19..64037895b085 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -306,6 +306,25 @@ ENTRY(startup_64) leaq boot_stack_end(%rbx), %rsp /* + * paging_prepare() and cleanup_trampoline() below can have GOT + * references. Adjust the table with address we are running at. + * + * Zero RAX for adjust_got: the GOT was not adjusted before; + * there's no adjustment to undo. + */ + xorq %rax, %rax + + /* + * Calculate the address the binary is loaded at and use it as + * a GOT adjustment. + */ + call 1f +1: popq %rdi + subq $1b, %rdi + + call adjust_got + + /* * At this point we are in long mode with 4-level paging enabled, * but we might want to enable 5-level paging or vice versa. * @@ -346,6 +365,7 @@ ENTRY(startup_64) * this function call. */ pushq %rsi + movq %rsi, %rdi /* real mode address */ call paging_prepare popq %rsi @@ -370,10 +390,14 @@ trampoline_return: /* * cleanup_trampoline() would restore trampoline memory. * + * RDI is address of the page table to use instead of page table + * in trampoline memory (if required). + * * RSI holds real mode data and needs to be preserved across * this function call. */ pushq %rsi + leaq top_pgtable(%rbx), %rdi call cleanup_trampoline popq %rsi @@ -381,6 +405,21 @@ trampoline_return: pushq $0 popfq + /* + * Previously we've adjusted the GOT with address the binary was + * loaded at. Now we need to re-adjust for relocation address. + * + * Calculate the address the binary is loaded at, so that we can + * undo the previous GOT adjustment. + */ + call 1f +1: popq %rax + subq $1b, %rax + + /* The new adjustment is the relocation address */ + movq %rbx, %rdi + call adjust_got + /* * Copy the compressed kernel to the end of our buffer * where decompression in place becomes safe. @@ -482,19 +521,6 @@ relocated: rep stosq /* - * Adjust our own GOT - */ - leaq _got(%rip), %rdx - leaq _egot(%rip), %rcx -1: - cmpq %rcx, %rdx - jae 2f - addq %rbx, (%rdx) - addq $8, %rdx - jmp 1b -2: - -/* * Do the extraction, and jump to the new kernel.. */ pushq %rsi /* Save the real mode argument */ @@ -512,6 +538,27 @@ relocated: */ jmp *%rax +/* + * Adjust the global offset table + * + * RAX is the previous adjustment of the table to undo (use 0 if it's the + * first time we touch GOT). + * RDI is the new adjustment to apply. + */ +adjust_got: + /* Walk through the GOT adding the address to the entries */ + leaq _got(%rip), %rdx + leaq _egot(%rip), %rcx +1: + cmpq %rcx, %rdx + jae 2f + subq %rax, (%rdx) /* Undo previous adjustment */ + addq %rdi, (%rdx) /* Apply the new adjustment */ + addq $8, %rdx + jmp 1b +2: + ret + .code32 /* * This is the 32-bit trampoline that will be copied over to low memory. @@ -649,3 +696,10 @@ boot_stack_end: .balign 4096 pgtable: .fill BOOT_PGT_SIZE, 1, 0 + +/* + * The page table is going to be used instead of page table in the trampoline + * memory. + */ +top_pgtable: + .fill PAGE_SIZE, 1, 0 diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index a0a50b91ecef..b87a7582853d 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -47,7 +47,7 @@ #include <linux/decompress/mm.h> #ifdef CONFIG_X86_5LEVEL -unsigned int pgtable_l5_enabled __ro_after_init; +unsigned int __pgtable_l5_enabled; unsigned int pgdir_shift __ro_after_init = 39; unsigned int ptrs_per_p4d __ro_after_init = 1; #endif @@ -734,7 +734,7 @@ void choose_random_location(unsigned long input, #ifdef CONFIG_X86_5LEVEL if (__read_cr4() & X86_CR4_LA57) { - pgtable_l5_enabled = 1; + __pgtable_l5_enabled = 1; pgdir_shift = 48; ptrs_per_p4d = 512; } diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 9e11be4cae19..a423bdb42686 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -12,10 +12,8 @@ #undef CONFIG_PARAVIRT_SPINLOCKS #undef CONFIG_KASAN -#ifdef CONFIG_X86_5LEVEL -/* cpu_feature_enabled() cannot be used that early */ -#define pgtable_l5_enabled __pgtable_l5_enabled -#endif +/* cpu_feature_enabled() cannot be used this early */ +#define USE_EARLY_PGTABLE_L5 #include <linux/linkage.h> #include <linux/screen_info.h> diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c index 32af1cbcd903..8c5107545251 100644 --- a/arch/x86/boot/compressed/pgtable_64.c +++ b/arch/x86/boot/compressed/pgtable_64.c @@ -23,14 +23,6 @@ struct paging_config { static char trampoline_save[TRAMPOLINE_32BIT_SIZE]; /* - * The page table is going to be used instead of page table in the trampoline - * memory. - * - * It must not be in BSS as BSS is cleared after cleanup_trampoline(). - */ -static char top_pgtable[PAGE_SIZE] __aligned(PAGE_SIZE) __section(.data); - -/* * Trampoline address will be printed by extract_kernel() for debugging * purposes. * @@ -39,16 +31,23 @@ static char top_pgtable[PAGE_SIZE] __aligned(PAGE_SIZE) __section(.data); */ unsigned long *trampoline_32bit __section(.data); -struct paging_config paging_prepare(void) +extern struct boot_params *boot_params; +int cmdline_find_option_bool(const char *option); + +struct paging_config paging_prepare(void *rmode) { struct paging_config paging_config = {}; unsigned long bios_start, ebda_start; + /* Initialize boot_params. Required for cmdline_find_option_bool(). */ + boot_params = rmode; + /* * Check if LA57 is desired and supported. * - * There are two parts to the check: + * There are several parts to the check: * - if the kernel supports 5-level paging: CONFIG_X86_5LEVEL=y + * - if user asked to disable 5-level paging: no5lvl in cmdline * - if the machine supports 5-level paging: * + CPUID leaf 7 is supported * + the leaf has the feature bit set @@ -56,6 +55,7 @@ struct paging_config paging_prepare(void) * That's substitute for boot_cpu_has() in early boot code. */ if (IS_ENABLED(CONFIG_X86_5LEVEL) && + !cmdline_find_option_bool("no5lvl") && native_cpuid_eax(0) >= 7 && (native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)))) { paging_config.l5_required = 1; @@ -134,19 +134,19 @@ out: return paging_config; } -void cleanup_trampoline(void) +void cleanup_trampoline(void *pgtable) { void *trampoline_pgtable; - trampoline_pgtable = trampoline_32bit + TRAMPOLINE_32BIT_PGTABLE_OFFSET; + trampoline_pgtable = trampoline_32bit + TRAMPOLINE_32BIT_PGTABLE_OFFSET / sizeof(unsigned long); /* * Move the top level page table out of trampoline memory, * if it's there. */ if ((void *)__native_read_cr3() == trampoline_pgtable) { - memcpy(top_pgtable, trampoline_pgtable, PAGE_SIZE); - native_write_cr3((unsigned long)top_pgtable); + memcpy(pgtable, trampoline_pgtable, PAGE_SIZE); + native_write_cr3((unsigned long)pgtable); } /* Restore trampoline memory */ diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index d6b27dab1b30..14a2f996e543 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -396,3 +396,4 @@ 382 i386 pkey_free sys_pkey_free __ia32_sys_pkey_free 383 i386 statx sys_statx __ia32_sys_statx 384 i386 arch_prctl sys_arch_prctl __ia32_compat_sys_arch_prctl +385 i386 io_pgetevents sys_io_pgetevents __ia32_compat_sys_io_pgetevents diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 4dfe42666d0c..cd36232ab62f 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -341,6 +341,7 @@ 330 common pkey_alloc __x64_sys_pkey_alloc 331 common pkey_free __x64_sys_pkey_free 332 common statx __x64_sys_statx +333 common io_pgetevents __x64_sys_io_pgetevents # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index d998a487c9b1..261802b1cc50 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -44,14 +44,14 @@ obj-y += $(vdso_img_objs) targets += $(vdso_img_cfiles) targets += $(vdso_img_sodbg) $(vdso_img-y:%=vdso%.so) -export CPPFLAGS_vdso.lds += -P -C +CPPFLAGS_vdso.lds += -P -C VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \ -Wl,--no-undefined \ -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 \ $(DISABLE_LTO) -$(obj)/vdso64.so.dbg: $(src)/vdso.lds $(vobjs) FORCE +$(obj)/vdso64.so.dbg: $(obj)/vdso.lds $(vobjs) FORCE $(call if_changed,vdso) HOST_EXTRACFLAGS += -I$(srctree)/tools/include -I$(srctree)/include/uapi -I$(srctree)/arch/$(SUBARCH)/include/uapi @@ -100,11 +100,8 @@ VDSO_LDFLAGS_vdsox32.lds = -Wl,-m,elf32_x86_64 \ -Wl,-z,max-page-size=4096 \ -Wl,-z,common-page-size=4096 -# 64-bit objects to re-brand as x32 -vobjs64-for-x32 := $(filter-out $(vobjs-nox32),$(vobjs-y)) - # x32-rebranded versions -vobjx32s-y := $(vobjs64-for-x32:.o=-x32.o) +vobjx32s-y := $(vobjs-y:.o=-x32.o) # same thing, but in the output directory vobjx32s := $(foreach F,$(vobjx32s-y),$(obj)/$F) @@ -122,7 +119,7 @@ $(obj)/%.so: OBJCOPYFLAGS := -S $(obj)/%.so: $(obj)/%.so.dbg $(call if_changed,objcopy) -$(obj)/vdsox32.so.dbg: $(src)/vdsox32.lds $(vobjx32s) FORCE +$(obj)/vdsox32.so.dbg: $(obj)/vdsox32.lds $(vobjx32s) FORCE $(call if_changed,vdso) CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds) diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index 70b7845434cb..7782cdbcd67d 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -107,7 +107,7 @@ static bool write_ok_or_segv(unsigned long ptr, size_t size) thread->cr2 = ptr; thread->trap_nr = X86_TRAP_PF; - memset(&info, 0, sizeof(info)); + clear_siginfo(&info); info.si_signo = SIGSEGV; info.si_errno = 0; info.si_code = SEGV_MAPERR; diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 786fd875de92..4b98101209a1 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -889,7 +889,7 @@ static void force_ibs_eilvt_setup(void) if (!ibs_eilvt_valid()) goto out; - pr_info("IBS: LVT offset %d assigned\n", offset); + pr_info("LVT offset %d assigned\n", offset); return; out: diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index f5cbbba99283..981ba5e8241b 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -19,6 +19,7 @@ #include <asm/cpufeature.h> #include <asm/perf_event.h> #include <asm/msr.h> +#include <asm/smp.h> #define NUM_COUNTERS_NB 4 #define NUM_COUNTERS_L2 4 @@ -399,26 +400,8 @@ static int amd_uncore_cpu_starting(unsigned int cpu) } if (amd_uncore_llc) { - unsigned int apicid = cpu_data(cpu).apicid; - unsigned int nshared, subleaf, prev_eax = 0; - uncore = *per_cpu_ptr(amd_uncore_llc, cpu); - /* - * Iterate over Cache Topology Definition leaves until no - * more cache descriptions are available. - */ - for (subleaf = 0; subleaf < 5; subleaf++) { - cpuid_count(0x8000001d, subleaf, &eax, &ebx, &ecx, &edx); - - /* EAX[0:4] gives type of cache */ - if (!(eax & 0x1f)) - break; - - prev_eax = eax; - } - nshared = ((prev_eax >> 14) & 0xfff) + 1; - - uncore->id = apicid - (apicid % nshared); + uncore->id = per_cpu(cpu_llc_id, cpu); uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_llc); *per_cpu_ptr(amd_uncore_llc, cpu) = uncore; diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 45b2b1c93d04..6e461fb1e0d4 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2397,7 +2397,7 @@ static unsigned long get_segment_base(unsigned int segment) #ifdef CONFIG_IA32_EMULATION -#include <asm/compat.h> +#include <linux/compat.h> static inline int perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry) diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 3b993942a0e4..8d016ce5b80d 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -1194,7 +1194,7 @@ static int pt_event_addr_filters_validate(struct list_head *filters) filter->action == PERF_ADDR_FILTER_ACTION_START) return -EOPNOTSUPP; - if (!filter->inode) { + if (!filter->path.dentry) { if (!valid_kernel_ip(filter->offset)) return -EINVAL; @@ -1221,7 +1221,7 @@ static void pt_event_addr_filters_sync(struct perf_event *event) return; list_for_each_entry(filter, &head->list, entry) { - if (filter->inode && !offs[range]) { + if (filter->path.dentry && !offs[range]) { msr_a = msr_b = 0; } else { /* apply the offset */ diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index a7956fc7ca1d..15b07379e72d 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -203,7 +203,7 @@ static void uncore_assign_hw_event(struct intel_uncore_box *box, hwc->idx = idx; hwc->last_tag = ++box->tags[idx]; - if (hwc->idx == UNCORE_PMC_IDX_FIXED) { + if (uncore_pmc_fixed(hwc->idx)) { hwc->event_base = uncore_fixed_ctr(box); hwc->config_base = uncore_fixed_ctl(box); return; @@ -218,7 +218,9 @@ void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *e u64 prev_count, new_count, delta; int shift; - if (event->hw.idx >= UNCORE_PMC_IDX_FIXED) + if (uncore_pmc_freerunning(event->hw.idx)) + shift = 64 - uncore_freerunning_bits(box, event); + else if (uncore_pmc_fixed(event->hw.idx)) shift = 64 - uncore_fixed_ctr_bits(box); else shift = 64 - uncore_perf_ctr_bits(box); @@ -449,15 +451,30 @@ static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int return ret ? -EINVAL : 0; } -static void uncore_pmu_event_start(struct perf_event *event, int flags) +void uncore_pmu_event_start(struct perf_event *event, int flags) { struct intel_uncore_box *box = uncore_event_to_box(event); int idx = event->hw.idx; - if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + if (WARN_ON_ONCE(idx == -1 || idx >= UNCORE_PMC_IDX_MAX)) return; - if (WARN_ON_ONCE(idx == -1 || idx >= UNCORE_PMC_IDX_MAX)) + /* + * Free running counter is read-only and always active. + * Use the current counter value as start point. + * There is no overflow interrupt for free running counter. + * Use hrtimer to periodically poll the counter to avoid overflow. + */ + if (uncore_pmc_freerunning(event->hw.idx)) { + list_add_tail(&event->active_entry, &box->active_list); + local64_set(&event->hw.prev_count, + uncore_read_counter(box, event)); + if (box->n_active++ == 0) + uncore_pmu_start_hrtimer(box); + return; + } + + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) return; event->hw.state = 0; @@ -474,11 +491,20 @@ static void uncore_pmu_event_start(struct perf_event *event, int flags) } } -static void uncore_pmu_event_stop(struct perf_event *event, int flags) +void uncore_pmu_event_stop(struct perf_event *event, int flags) { struct intel_uncore_box *box = uncore_event_to_box(event); struct hw_perf_event *hwc = &event->hw; + /* Cannot disable free running counter which is read-only */ + if (uncore_pmc_freerunning(hwc->idx)) { + list_del(&event->active_entry); + if (--box->n_active == 0) + uncore_pmu_cancel_hrtimer(box); + uncore_perf_event_update(box, event); + return; + } + if (__test_and_clear_bit(hwc->idx, box->active_mask)) { uncore_disable_event(box, event); box->n_active--; @@ -502,7 +528,7 @@ static void uncore_pmu_event_stop(struct perf_event *event, int flags) } } -static int uncore_pmu_event_add(struct perf_event *event, int flags) +int uncore_pmu_event_add(struct perf_event *event, int flags) { struct intel_uncore_box *box = uncore_event_to_box(event); struct hw_perf_event *hwc = &event->hw; @@ -512,6 +538,17 @@ static int uncore_pmu_event_add(struct perf_event *event, int flags) if (!box) return -ENODEV; + /* + * The free funning counter is assigned in event_init(). + * The free running counter event and free running counter + * are 1:1 mapped. It doesn't need to be tracked in event_list. + */ + if (uncore_pmc_freerunning(hwc->idx)) { + if (flags & PERF_EF_START) + uncore_pmu_event_start(event, 0); + return 0; + } + ret = n = uncore_collect_events(box, event, false); if (ret < 0) return ret; @@ -563,13 +600,21 @@ static int uncore_pmu_event_add(struct perf_event *event, int flags) return 0; } -static void uncore_pmu_event_del(struct perf_event *event, int flags) +void uncore_pmu_event_del(struct perf_event *event, int flags) { struct intel_uncore_box *box = uncore_event_to_box(event); int i; uncore_pmu_event_stop(event, PERF_EF_UPDATE); + /* + * The event for free running counter is not tracked by event_list. + * It doesn't need to force event->hw.idx = -1 to reassign the counter. + * Because the event and the free running counter are 1:1 mapped. + */ + if (uncore_pmc_freerunning(event->hw.idx)) + return; + for (i = 0; i < box->n_events; i++) { if (event == box->event_list[i]) { uncore_put_event_constraint(box, event); @@ -603,6 +648,10 @@ static int uncore_validate_group(struct intel_uncore_pmu *pmu, struct intel_uncore_box *fake_box; int ret = -EINVAL, n; + /* The free running counter is always active. */ + if (uncore_pmc_freerunning(event->hw.idx)) + return 0; + fake_box = uncore_alloc_box(pmu->type, NUMA_NO_NODE); if (!fake_box) return -ENOMEM; @@ -690,6 +739,17 @@ static int uncore_pmu_event_init(struct perf_event *event) /* fixed counters have event field hardcoded to zero */ hwc->config = 0ULL; + } else if (is_freerunning_event(event)) { + if (!check_valid_freerunning_event(box, event)) + return -EINVAL; + event->hw.idx = UNCORE_PMC_IDX_FREERUNNING; + /* + * The free running counter event and free running counter + * are always 1:1 mapped. + * The free running counter is always active. + * Assign the free running counter here. + */ + event->hw.event_base = uncore_freerunning_counter(box, event); } else { hwc->config = event->attr.config & (pmu->type->event_mask | ((u64)pmu->type->event_mask_ext << 32)); diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 414dc7e7c950..c9e1e0bef3c3 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -12,8 +12,13 @@ #define UNCORE_FIXED_EVENT 0xff #define UNCORE_PMC_IDX_MAX_GENERIC 8 +#define UNCORE_PMC_IDX_MAX_FIXED 1 +#define UNCORE_PMC_IDX_MAX_FREERUNNING 1 #define UNCORE_PMC_IDX_FIXED UNCORE_PMC_IDX_MAX_GENERIC -#define UNCORE_PMC_IDX_MAX (UNCORE_PMC_IDX_FIXED + 1) +#define UNCORE_PMC_IDX_FREERUNNING (UNCORE_PMC_IDX_FIXED + \ + UNCORE_PMC_IDX_MAX_FIXED) +#define UNCORE_PMC_IDX_MAX (UNCORE_PMC_IDX_FREERUNNING + \ + UNCORE_PMC_IDX_MAX_FREERUNNING) #define UNCORE_PCI_DEV_FULL_DATA(dev, func, type, idx) \ ((dev << 24) | (func << 16) | (type << 8) | idx) @@ -35,6 +40,7 @@ struct intel_uncore_ops; struct intel_uncore_pmu; struct intel_uncore_box; struct uncore_event_desc; +struct freerunning_counters; struct intel_uncore_type { const char *name; @@ -42,6 +48,7 @@ struct intel_uncore_type { int num_boxes; int perf_ctr_bits; int fixed_ctr_bits; + int num_freerunning_types; unsigned perf_ctr; unsigned event_ctl; unsigned event_mask; @@ -59,6 +66,7 @@ struct intel_uncore_type { struct intel_uncore_pmu *pmus; struct intel_uncore_ops *ops; struct uncore_event_desc *event_descs; + struct freerunning_counters *freerunning; const struct attribute_group *attr_groups[4]; struct pmu *pmu; /* for custom pmu ops */ }; @@ -129,6 +137,14 @@ struct uncore_event_desc { const char *config; }; +struct freerunning_counters { + unsigned int counter_base; + unsigned int counter_offset; + unsigned int box_offset; + unsigned int num_counters; + unsigned int bits; +}; + struct pci2phy_map { struct list_head list; int segment; @@ -157,6 +173,16 @@ static ssize_t __uncore_##_var##_show(struct kobject *kobj, \ static struct kobj_attribute format_attr_##_var = \ __ATTR(_name, 0444, __uncore_##_var##_show, NULL) +static inline bool uncore_pmc_fixed(int idx) +{ + return idx == UNCORE_PMC_IDX_FIXED; +} + +static inline bool uncore_pmc_freerunning(int idx) +{ + return idx == UNCORE_PMC_IDX_FREERUNNING; +} + static inline unsigned uncore_pci_box_ctl(struct intel_uncore_box *box) { return box->pmu->type->box_ctl; @@ -214,6 +240,60 @@ static inline unsigned uncore_msr_fixed_ctr(struct intel_uncore_box *box) return box->pmu->type->fixed_ctr + uncore_msr_box_offset(box); } + +/* + * In the uncore document, there is no event-code assigned to free running + * counters. Some events need to be defined to indicate the free running + * counters. The events are encoded as event-code + umask-code. + * + * The event-code for all free running counters is 0xff, which is the same as + * the fixed counters. + * + * The umask-code is used to distinguish a fixed counter and a free running + * counter, and different types of free running counters. + * - For fixed counters, the umask-code is 0x0X. + * X indicates the index of the fixed counter, which starts from 0. + * - For free running counters, the umask-code uses the rest of the space. + * It would bare the format of 0xXY. + * X stands for the type of free running counters, which starts from 1. + * Y stands for the index of free running counters of same type, which + * starts from 0. + * + * For example, there are three types of IIO free running counters on Skylake + * server, IO CLOCKS counters, BANDWIDTH counters and UTILIZATION counters. + * The event-code for all the free running counters is 0xff. + * 'ioclk' is the first counter of IO CLOCKS. IO CLOCKS is the first type, + * which umask-code starts from 0x10. + * So 'ioclk' is encoded as event=0xff,umask=0x10 + * 'bw_in_port2' is the third counter of BANDWIDTH counters. BANDWIDTH is + * the second type, which umask-code starts from 0x20. + * So 'bw_in_port2' is encoded as event=0xff,umask=0x22 + */ +static inline unsigned int uncore_freerunning_idx(u64 config) +{ + return ((config >> 8) & 0xf); +} + +#define UNCORE_FREERUNNING_UMASK_START 0x10 + +static inline unsigned int uncore_freerunning_type(u64 config) +{ + return ((((config >> 8) - UNCORE_FREERUNNING_UMASK_START) >> 4) & 0xf); +} + +static inline +unsigned int uncore_freerunning_counter(struct intel_uncore_box *box, + struct perf_event *event) +{ + unsigned int type = uncore_freerunning_type(event->attr.config); + unsigned int idx = uncore_freerunning_idx(event->attr.config); + struct intel_uncore_pmu *pmu = box->pmu; + + return pmu->type->freerunning[type].counter_base + + pmu->type->freerunning[type].counter_offset * idx + + pmu->type->freerunning[type].box_offset * pmu->pmu_idx; +} + static inline unsigned uncore_msr_event_ctl(struct intel_uncore_box *box, int idx) { @@ -276,11 +356,52 @@ static inline int uncore_fixed_ctr_bits(struct intel_uncore_box *box) return box->pmu->type->fixed_ctr_bits; } +static inline +unsigned int uncore_freerunning_bits(struct intel_uncore_box *box, + struct perf_event *event) +{ + unsigned int type = uncore_freerunning_type(event->attr.config); + + return box->pmu->type->freerunning[type].bits; +} + +static inline int uncore_num_freerunning(struct intel_uncore_box *box, + struct perf_event *event) +{ + unsigned int type = uncore_freerunning_type(event->attr.config); + + return box->pmu->type->freerunning[type].num_counters; +} + +static inline int uncore_num_freerunning_types(struct intel_uncore_box *box, + struct perf_event *event) +{ + return box->pmu->type->num_freerunning_types; +} + +static inline bool check_valid_freerunning_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + unsigned int type = uncore_freerunning_type(event->attr.config); + unsigned int idx = uncore_freerunning_idx(event->attr.config); + + return (type < uncore_num_freerunning_types(box, event)) && + (idx < uncore_num_freerunning(box, event)); +} + static inline int uncore_num_counters(struct intel_uncore_box *box) { return box->pmu->type->num_counters; } +static inline bool is_freerunning_event(struct perf_event *event) +{ + u64 cfg = event->attr.config; + + return ((cfg & UNCORE_FIXED_EVENT) == UNCORE_FIXED_EVENT) && + (((cfg >> 8) & 0xff) >= UNCORE_FREERUNNING_UMASK_START); +} + static inline void uncore_disable_box(struct intel_uncore_box *box) { if (box->pmu->type->ops->disable_box) @@ -346,6 +467,10 @@ struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event); void uncore_pmu_start_hrtimer(struct intel_uncore_box *box); void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box); +void uncore_pmu_event_start(struct perf_event *event, int flags); +void uncore_pmu_event_stop(struct perf_event *event, int flags); +int uncore_pmu_event_add(struct perf_event *event, int flags); +void uncore_pmu_event_del(struct perf_event *event, int flags); void uncore_pmu_event_read(struct perf_event *event); void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event); struct event_constraint * diff --git a/arch/x86/events/intel/uncore_nhmex.c b/arch/x86/events/intel/uncore_nhmex.c index 93e7a8397cde..173e2674be6e 100644 --- a/arch/x86/events/intel/uncore_nhmex.c +++ b/arch/x86/events/intel/uncore_nhmex.c @@ -246,7 +246,7 @@ static void nhmex_uncore_msr_enable_event(struct intel_uncore_box *box, struct p { struct hw_perf_event *hwc = &event->hw; - if (hwc->idx >= UNCORE_PMC_IDX_FIXED) + if (hwc->idx == UNCORE_PMC_IDX_FIXED) wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0); else if (box->pmu->type->event_mask & NHMEX_PMON_CTL_EN_BIT0) wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22); diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index aee5e8496be4..8527c3e1038b 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -285,6 +285,15 @@ static struct uncore_event_desc snb_uncore_imc_events[] = { #define SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE 0x5054 #define SNB_UNCORE_PCI_IMC_CTR_BASE SNB_UNCORE_PCI_IMC_DATA_READS_BASE +enum perf_snb_uncore_imc_freerunning_types { + SNB_PCI_UNCORE_IMC_DATA = 0, + SNB_PCI_UNCORE_IMC_FREERUNNING_TYPE_MAX, +}; + +static struct freerunning_counters snb_uncore_imc_freerunning[] = { + [SNB_PCI_UNCORE_IMC_DATA] = { SNB_UNCORE_PCI_IMC_DATA_READS_BASE, 0x4, 0x0, 2, 32 }, +}; + static struct attribute *snb_uncore_imc_formats_attr[] = { &format_attr_event.attr, NULL, @@ -341,9 +350,8 @@ static u64 snb_uncore_imc_read_counter(struct intel_uncore_box *box, struct perf } /* - * custom event_init() function because we define our own fixed, free - * running counters, so we do not want to conflict with generic uncore - * logic. Also simplifies processing + * Keep the custom event_init() function compatible with old event + * encoding for free running counters. */ static int snb_uncore_imc_event_init(struct perf_event *event) { @@ -405,11 +413,11 @@ static int snb_uncore_imc_event_init(struct perf_event *event) switch (cfg) { case SNB_UNCORE_PCI_IMC_DATA_READS: base = SNB_UNCORE_PCI_IMC_DATA_READS_BASE; - idx = UNCORE_PMC_IDX_FIXED; + idx = UNCORE_PMC_IDX_FREERUNNING; break; case SNB_UNCORE_PCI_IMC_DATA_WRITES: base = SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE; - idx = UNCORE_PMC_IDX_FIXED + 1; + idx = UNCORE_PMC_IDX_FREERUNNING; break; default: return -EINVAL; @@ -430,75 +438,6 @@ static int snb_uncore_imc_hw_config(struct intel_uncore_box *box, struct perf_ev return 0; } -static void snb_uncore_imc_event_start(struct perf_event *event, int flags) -{ - struct intel_uncore_box *box = uncore_event_to_box(event); - u64 count; - - if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) - return; - - event->hw.state = 0; - box->n_active++; - - list_add_tail(&event->active_entry, &box->active_list); - - count = snb_uncore_imc_read_counter(box, event); - local64_set(&event->hw.prev_count, count); - - if (box->n_active == 1) - uncore_pmu_start_hrtimer(box); -} - -static void snb_uncore_imc_event_stop(struct perf_event *event, int flags) -{ - struct intel_uncore_box *box = uncore_event_to_box(event); - struct hw_perf_event *hwc = &event->hw; - - if (!(hwc->state & PERF_HES_STOPPED)) { - box->n_active--; - - WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); - hwc->state |= PERF_HES_STOPPED; - - list_del(&event->active_entry); - - if (box->n_active == 0) - uncore_pmu_cancel_hrtimer(box); - } - - if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { - /* - * Drain the remaining delta count out of a event - * that we are disabling: - */ - uncore_perf_event_update(box, event); - hwc->state |= PERF_HES_UPTODATE; - } -} - -static int snb_uncore_imc_event_add(struct perf_event *event, int flags) -{ - struct intel_uncore_box *box = uncore_event_to_box(event); - struct hw_perf_event *hwc = &event->hw; - - if (!box) - return -ENODEV; - - hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; - if (!(flags & PERF_EF_START)) - hwc->state |= PERF_HES_ARCH; - - snb_uncore_imc_event_start(event, 0); - - return 0; -} - -static void snb_uncore_imc_event_del(struct perf_event *event, int flags) -{ - snb_uncore_imc_event_stop(event, PERF_EF_UPDATE); -} - int snb_pci2phy_map_init(int devid) { struct pci_dev *dev = NULL; @@ -530,10 +469,10 @@ int snb_pci2phy_map_init(int devid) static struct pmu snb_uncore_imc_pmu = { .task_ctx_nr = perf_invalid_context, .event_init = snb_uncore_imc_event_init, - .add = snb_uncore_imc_event_add, - .del = snb_uncore_imc_event_del, - .start = snb_uncore_imc_event_start, - .stop = snb_uncore_imc_event_stop, + .add = uncore_pmu_event_add, + .del = uncore_pmu_event_del, + .start = uncore_pmu_event_start, + .stop = uncore_pmu_event_stop, .read = uncore_pmu_event_read, }; @@ -552,12 +491,10 @@ static struct intel_uncore_type snb_uncore_imc = { .name = "imc", .num_counters = 2, .num_boxes = 1, - .fixed_ctr_bits = 32, - .fixed_ctr = SNB_UNCORE_PCI_IMC_CTR_BASE, + .num_freerunning_types = SNB_PCI_UNCORE_IMC_FREERUNNING_TYPE_MAX, + .freerunning = snb_uncore_imc_freerunning, .event_descs = snb_uncore_imc_events, .format_group = &snb_uncore_imc_format_group, - .perf_ctr = SNB_UNCORE_PCI_IMC_DATA_READS_BASE, - .event_mask = SNB_UNCORE_PCI_IMC_EVENT_MASK, .ops = &snb_uncore_imc_ops, .pmu = &snb_uncore_imc_pmu, }; diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 77076a102e34..87dc0263a2e1 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -3522,6 +3522,87 @@ static struct intel_uncore_type skx_uncore_iio = { .format_group = &skx_uncore_iio_format_group, }; +enum perf_uncore_iio_freerunning_type_id { + SKX_IIO_MSR_IOCLK = 0, + SKX_IIO_MSR_BW = 1, + SKX_IIO_MSR_UTIL = 2, + + SKX_IIO_FREERUNNING_TYPE_MAX, +}; + + +static struct freerunning_counters skx_iio_freerunning[] = { + [SKX_IIO_MSR_IOCLK] = { 0xa45, 0x1, 0x20, 1, 36 }, + [SKX_IIO_MSR_BW] = { 0xb00, 0x1, 0x10, 8, 36 }, + [SKX_IIO_MSR_UTIL] = { 0xb08, 0x1, 0x10, 8, 36 }, +}; + +static struct uncore_event_desc skx_uncore_iio_freerunning_events[] = { + /* Free-Running IO CLOCKS Counter */ + INTEL_UNCORE_EVENT_DESC(ioclk, "event=0xff,umask=0x10"), + /* Free-Running IIO BANDWIDTH Counters */ + INTEL_UNCORE_EVENT_DESC(bw_in_port0, "event=0xff,umask=0x20"), + INTEL_UNCORE_EVENT_DESC(bw_in_port0.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port0.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_in_port1, "event=0xff,umask=0x21"), + INTEL_UNCORE_EVENT_DESC(bw_in_port1.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port1.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_in_port2, "event=0xff,umask=0x22"), + INTEL_UNCORE_EVENT_DESC(bw_in_port2.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port2.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_in_port3, "event=0xff,umask=0x23"), + INTEL_UNCORE_EVENT_DESC(bw_in_port3.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port3.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_out_port0, "event=0xff,umask=0x24"), + INTEL_UNCORE_EVENT_DESC(bw_out_port0.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_out_port0.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_out_port1, "event=0xff,umask=0x25"), + INTEL_UNCORE_EVENT_DESC(bw_out_port1.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_out_port1.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_out_port2, "event=0xff,umask=0x26"), + INTEL_UNCORE_EVENT_DESC(bw_out_port2.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_out_port2.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_out_port3, "event=0xff,umask=0x27"), + INTEL_UNCORE_EVENT_DESC(bw_out_port3.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_out_port3.unit, "MiB"), + /* Free-running IIO UTILIZATION Counters */ + INTEL_UNCORE_EVENT_DESC(util_in_port0, "event=0xff,umask=0x30"), + INTEL_UNCORE_EVENT_DESC(util_out_port0, "event=0xff,umask=0x31"), + INTEL_UNCORE_EVENT_DESC(util_in_port1, "event=0xff,umask=0x32"), + INTEL_UNCORE_EVENT_DESC(util_out_port1, "event=0xff,umask=0x33"), + INTEL_UNCORE_EVENT_DESC(util_in_port2, "event=0xff,umask=0x34"), + INTEL_UNCORE_EVENT_DESC(util_out_port2, "event=0xff,umask=0x35"), + INTEL_UNCORE_EVENT_DESC(util_in_port3, "event=0xff,umask=0x36"), + INTEL_UNCORE_EVENT_DESC(util_out_port3, "event=0xff,umask=0x37"), + { /* end: all zeroes */ }, +}; + +static struct intel_uncore_ops skx_uncore_iio_freerunning_ops = { + .read_counter = uncore_msr_read_counter, +}; + +static struct attribute *skx_uncore_iio_freerunning_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + NULL, +}; + +static const struct attribute_group skx_uncore_iio_freerunning_format_group = { + .name = "format", + .attrs = skx_uncore_iio_freerunning_formats_attr, +}; + +static struct intel_uncore_type skx_uncore_iio_free_running = { + .name = "iio_free_running", + .num_counters = 17, + .num_boxes = 6, + .num_freerunning_types = SKX_IIO_FREERUNNING_TYPE_MAX, + .freerunning = skx_iio_freerunning, + .ops = &skx_uncore_iio_freerunning_ops, + .event_descs = skx_uncore_iio_freerunning_events, + .format_group = &skx_uncore_iio_freerunning_format_group, +}; + static struct attribute *skx_uncore_formats_attr[] = { &format_attr_event.attr, &format_attr_umask.attr, @@ -3595,6 +3676,7 @@ static struct intel_uncore_type *skx_msr_uncores[] = { &skx_uncore_ubox, &skx_uncore_chabox, &skx_uncore_iio, + &skx_uncore_iio_free_running, &skx_uncore_irp, &skx_uncore_pcu, NULL, diff --git a/arch/x86/include/asm/cacheinfo.h b/arch/x86/include/asm/cacheinfo.h new file mode 100644 index 000000000000..e958e28f7ab5 --- /dev/null +++ b/arch/x86/include/asm/cacheinfo.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_CACHEINFO_H +#define _ASM_X86_CACHEINFO_H + +void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id); + +#endif /* _ASM_X86_CACHEINFO_H */ diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index e1c8dab86670..fb97cf7c4137 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -17,7 +17,6 @@ typedef u32 compat_size_t; typedef s32 compat_ssize_t; -typedef s32 compat_time_t; typedef s32 compat_clock_t; typedef s32 compat_pid_t; typedef u16 __compat_uid_t; @@ -46,16 +45,6 @@ typedef u32 compat_u32; typedef u64 __attribute__((aligned(4))) compat_u64; typedef u32 compat_uptr_t; -struct compat_timespec { - compat_time_t tv_sec; - s32 tv_nsec; -}; - -struct compat_timeval { - compat_time_t tv_sec; - s32 tv_usec; -}; - struct compat_stat { compat_dev_t st_dev; u16 __pad1; @@ -145,10 +134,10 @@ struct compat_ipc64_perm { struct compat_semid64_ds { struct compat_ipc64_perm sem_perm; - compat_time_t sem_otime; - compat_ulong_t __unused1; - compat_time_t sem_ctime; - compat_ulong_t __unused2; + compat_ulong_t sem_otime; + compat_ulong_t sem_otime_high; + compat_ulong_t sem_ctime; + compat_ulong_t sem_ctime_high; compat_ulong_t sem_nsems; compat_ulong_t __unused3; compat_ulong_t __unused4; @@ -156,12 +145,12 @@ struct compat_semid64_ds { struct compat_msqid64_ds { struct compat_ipc64_perm msg_perm; - compat_time_t msg_stime; - compat_ulong_t __unused1; - compat_time_t msg_rtime; - compat_ulong_t __unused2; - compat_time_t msg_ctime; - compat_ulong_t __unused3; + compat_ulong_t msg_stime; + compat_ulong_t msg_stime_high; + compat_ulong_t msg_rtime; + compat_ulong_t msg_rtime_high; + compat_ulong_t msg_ctime; + compat_ulong_t msg_ctime_high; compat_ulong_t msg_cbytes; compat_ulong_t msg_qnum; compat_ulong_t msg_qbytes; @@ -174,12 +163,12 @@ struct compat_msqid64_ds { struct compat_shmid64_ds { struct compat_ipc64_perm shm_perm; compat_size_t shm_segsz; - compat_time_t shm_atime; - compat_ulong_t __unused1; - compat_time_t shm_dtime; - compat_ulong_t __unused2; - compat_time_t shm_ctime; - compat_ulong_t __unused3; + compat_ulong_t shm_atime; + compat_ulong_t shm_atime_high; + compat_ulong_t shm_dtime; + compat_ulong_t shm_dtime_high; + compat_ulong_t shm_ctime; + compat_ulong_t shm_ctime_high; compat_pid_t shm_cpid; compat_pid_t shm_lpid; compat_ulong_t shm_nattch; diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index b27da9602a6d..aced6c9290d6 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -140,6 +140,20 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit); #define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit) +#if defined(__clang__) && !defined(CC_HAVE_ASM_GOTO) + +/* + * Workaround for the sake of BPF compilation which utilizes kernel + * headers, but clang does not support ASM GOTO and fails the build. + */ +#ifndef __BPF_TRACING__ +#warning "Compiler lacks ASM_GOTO support. Add -D __BPF_TRACING__ to your compiler arguments" +#endif + +#define static_cpu_has(bit) boot_cpu_has(bit) + +#else + /* * Static testing of CPU features. Used the same as boot_cpu_has(). * These will statically patch the target code for additional @@ -195,6 +209,7 @@ t_no: boot_cpu_has(bit) : \ _static_cpu_has(bit) \ ) +#endif #define cpu_has_bug(c, bit) cpu_has(c, (bit)) #define set_cpu_bug(c, bit) set_cpu_cap(c, (bit)) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 578793e97431..fb00a2fca990 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -198,7 +198,6 @@ #define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */ #define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */ #define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */ - #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ #define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ @@ -207,13 +206,19 @@ #define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* "" AMD Retpoline mitigation for Spectre variant 2 */ #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ #define X86_FEATURE_CDP_L2 ( 7*32+15) /* Code and Data Prioritization L2 */ - +#define X86_FEATURE_MSR_SPEC_CTRL ( 7*32+16) /* "" MSR SPEC_CTRL is implemented */ +#define X86_FEATURE_SSBD ( 7*32+17) /* Speculative Store Bypass Disable */ #define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */ #define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* "" Fill RSB on context switches */ #define X86_FEATURE_SEV ( 7*32+20) /* AMD Secure Encrypted Virtualization */ - #define X86_FEATURE_USE_IBPB ( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */ #define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* "" Use IBRS during runtime firmware calls */ +#define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* "" Disable Speculative Store Bypass. */ +#define X86_FEATURE_LS_CFG_SSBD ( 7*32+24) /* "" AMD SSBD implementation via LS_CFG MSR */ +#define X86_FEATURE_IBRS ( 7*32+25) /* Indirect Branch Restricted Speculation */ +#define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */ +#define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */ +#define X86_FEATURE_ZEN ( 7*32+28) /* "" CPU is AMD family 0x17 (Zen) */ /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ @@ -274,9 +279,10 @@ #define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ #define X86_FEATURE_IRPERF (13*32+ 1) /* Instructions Retired Count */ #define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* Always save/restore FP error pointers */ -#define X86_FEATURE_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */ -#define X86_FEATURE_IBRS (13*32+14) /* Indirect Branch Restricted Speculation */ -#define X86_FEATURE_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors */ +#define X86_FEATURE_AMD_IBPB (13*32+12) /* "" Indirect Branch Prediction Barrier */ +#define X86_FEATURE_AMD_IBRS (13*32+14) /* "" Indirect Branch Restricted Speculation */ +#define X86_FEATURE_AMD_STIBP (13*32+15) /* "" Single Thread Indirect Branch Predictors */ +#define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */ /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */ #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ @@ -334,6 +340,7 @@ #define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */ #define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */ #define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */ +#define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* "" Speculative Store Bypass Disable */ /* * BUG word(s) @@ -363,5 +370,6 @@ #define X86_BUG_CPU_MELTDOWN X86_BUG(14) /* CPU is affected by meltdown attack and needs kernel page table isolation */ #define X86_BUG_SPECTRE_V1 X86_BUG(15) /* CPU is affected by Spectre variant 1 attack with conditional branches */ #define X86_BUG_SPECTRE_V2 X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */ +#define X86_BUG_SPEC_STORE_BYPASS X86_BUG(17) /* CPU is affected by speculative store bypass attack */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 89ce4bfd241f..ce4d176b3d13 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -30,10 +30,7 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) return dma_ops; } -int arch_dma_supported(struct device *dev, u64 mask); -#define arch_dma_supported arch_dma_supported - -bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp); +bool arch_dma_alloc_attrs(struct device **dev); #define arch_dma_alloc_attrs arch_dma_alloc_attrs #endif diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index cc8f8fcf9b4a..c18ed65287d5 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -63,7 +63,7 @@ static inline bool arch_syscall_match_sym_name(const char *sym, const char *name #ifndef COMPILE_OFFSETS #if defined(CONFIG_FTRACE_SYSCALLS) && defined(CONFIG_IA32_EMULATION) -#include <asm/compat.h> +#include <linux/compat.h> /* * Because ia32 syscalls do not map to x86_64 syscall numbers diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 5ea2afd4c871..740a428acf1e 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -50,14 +50,6 @@ DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); #define inc_irq_stat(member) this_cpu_inc(irq_stat.member) -#define local_softirq_pending() this_cpu_read(irq_stat.__softirq_pending) - -#define __ARCH_SET_SOFTIRQ_PENDING - -#define set_softirq_pending(x) \ - this_cpu_write(irq_stat.__softirq_pending, (x)) -#define or_softirq_pending(x) this_cpu_or(irq_stat.__softirq_pending, (x)) - extern void ack_bad_irq(unsigned int irq); extern u64 arch_irq_stat_cpu(unsigned int cpu); diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h index b3e32b010ab1..c2c01f84df75 100644 --- a/arch/x86/include/asm/insn.h +++ b/arch/x86/include/asm/insn.h @@ -208,4 +208,22 @@ static inline int insn_offset_immediate(struct insn *insn) return insn_offset_displacement(insn) + insn->displacement.nbytes; } +#define POP_SS_OPCODE 0x1f +#define MOV_SREG_OPCODE 0x8e + +/* + * Intel SDM Vol.3A 6.8.3 states; + * "Any single-step trap that would be delivered following the MOV to SS + * instruction or POP to SS instruction (because EFLAGS.TF is 1) is + * suppressed." + * This function returns true if @insn is MOV SS or POP SS. On these + * instructions, single stepping is suppressed. + */ +static inline int insn_masking_exception(struct insn *insn) +{ + return insn->opcode.bytes[0] == POP_SS_OPCODE || + (insn->opcode.bytes[0] == MOV_SREG_OPCODE && + X86_MODRM_REG(insn->modrm.bytes[0]) == 2); +} + #endif /* _ASM_X86_INSN_H */ diff --git a/arch/x86/include/asm/intel_mid_vrtc.h b/arch/x86/include/asm/intel_mid_vrtc.h index 35555016b1be..0b44b1abe4d9 100644 --- a/arch/x86/include/asm/intel_mid_vrtc.h +++ b/arch/x86/include/asm/intel_mid_vrtc.h @@ -4,7 +4,7 @@ extern unsigned char vrtc_cmos_read(unsigned char reg); extern void vrtc_cmos_write(unsigned char val, unsigned char reg); -extern void vrtc_get_time(struct timespec *now); -extern int vrtc_set_mmss(const struct timespec *now); +extern void vrtc_get_time(struct timespec64 *now); +extern int vrtc_set_mmss(const struct timespec64 *now); #endif diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index f6e5b9375d8c..6de64840dd22 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -94,10 +94,10 @@ build_mmio_write(__writel, "l", unsigned int, "r", ) #ifdef CONFIG_X86_64 -build_mmio_read(readq, "q", unsigned long, "=r", :"memory") -build_mmio_read(__readq, "q", unsigned long, "=r", ) -build_mmio_write(writeq, "q", unsigned long, "r", :"memory") -build_mmio_write(__writeq, "q", unsigned long, "r", ) +build_mmio_read(readq, "q", u64, "=r", :"memory") +build_mmio_read(__readq, "q", u64, "=r", ) +build_mmio_write(writeq, "q", u64, "r", :"memory") +build_mmio_write(__writeq, "q", u64, "r", ) #define readq_relaxed(a) __readq(a) #define writeq_relaxed(v, a) __writeq(v, a) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c25775fad4ed..f4b2588865e9 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -924,7 +924,7 @@ struct kvm_x86_ops { int (*hardware_setup)(void); /* __init */ void (*hardware_unsetup)(void); /* __exit */ bool (*cpu_has_accelerated_tpr)(void); - bool (*cpu_has_high_real_mode_segbase)(void); + bool (*has_emulated_msr)(int index); void (*cpuid_update)(struct kvm_vcpu *vcpu); struct kvm *(*vm_alloc)(void); diff --git a/arch/x86/include/asm/mc146818rtc.h b/arch/x86/include/asm/mc146818rtc.h index 1775a32f7ea6..97198001e567 100644 --- a/arch/x86/include/asm/mc146818rtc.h +++ b/arch/x86/include/asm/mc146818rtc.h @@ -95,8 +95,8 @@ static inline unsigned char current_lock_cmos_reg(void) unsigned char rtc_cmos_read(unsigned char addr); void rtc_cmos_write(unsigned char val, unsigned char addr); -extern int mach_set_rtc_mmss(const struct timespec *now); -extern void mach_get_cmos_time(struct timespec *now); +extern int mach_set_rtc_mmss(const struct timespec64 *now); +extern void mach_get_cmos_time(struct timespec64 *now); #define RTC_IRQ 8 diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 57e3785d0d26..cf9911b5a53c 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -193,7 +193,7 @@ static inline int init_new_context(struct task_struct *tsk, #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS if (cpu_feature_enabled(X86_FEATURE_OSPKE)) { - /* pkey 0 is the default and always allocated */ + /* pkey 0 is the default and allocated implicitly */ mm->context.pkey_allocation_map = 0x1; /* -1 means unallocated or invalid */ mm->context.execute_only_pkey = -1; diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 53d5b1b9255e..fda2114197b3 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -42,6 +42,8 @@ #define MSR_IA32_SPEC_CTRL 0x00000048 /* Speculation Control */ #define SPEC_CTRL_IBRS (1 << 0) /* Indirect Branch Restricted Speculation */ #define SPEC_CTRL_STIBP (1 << 1) /* Single Thread Indirect Branch Predictors */ +#define SPEC_CTRL_SSBD_SHIFT 2 /* Speculative Store Bypass Disable bit */ +#define SPEC_CTRL_SSBD (1 << SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */ #define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */ #define PRED_CMD_IBPB (1 << 0) /* Indirect Branch Prediction Barrier */ @@ -68,6 +70,11 @@ #define MSR_IA32_ARCH_CAPABILITIES 0x0000010a #define ARCH_CAP_RDCL_NO (1 << 0) /* Not susceptible to Meltdown */ #define ARCH_CAP_IBRS_ALL (1 << 1) /* Enhanced IBRS support */ +#define ARCH_CAP_SSB_NO (1 << 4) /* + * Not susceptible to Speculative Store Bypass + * attack, so no Speculative Store Bypass + * control required. + */ #define MSR_IA32_BBL_CR_CTL 0x00000119 #define MSR_IA32_BBL_CR_CTL3 0x0000011e @@ -340,6 +347,8 @@ #define MSR_AMD64_SEV_ENABLED_BIT 0 #define MSR_AMD64_SEV_ENABLED BIT_ULL(MSR_AMD64_SEV_ENABLED_BIT) +#define MSR_AMD64_VIRT_SPEC_CTRL 0xc001011f + /* Fam 17h MSRs */ #define MSR_F17H_IRPERF 0xc00000e9 diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index f928ad9b143f..8b38df98548e 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -217,6 +217,14 @@ enum spectre_v2_mitigation { SPECTRE_V2_IBRS, }; +/* The Speculative Store Bypass disable variants */ +enum ssb_mitigation { + SPEC_STORE_BYPASS_NONE, + SPEC_STORE_BYPASS_DISABLE, + SPEC_STORE_BYPASS_PRCTL, + SPEC_STORE_BYPASS_SECCOMP, +}; + extern char __indirect_thunk_start[]; extern char __indirect_thunk_end[]; @@ -241,22 +249,27 @@ static inline void vmexit_fill_RSB(void) #endif } -#define alternative_msr_write(_msr, _val, _feature) \ - asm volatile(ALTERNATIVE("", \ - "movl %[msr], %%ecx\n\t" \ - "movl %[val], %%eax\n\t" \ - "movl $0, %%edx\n\t" \ - "wrmsr", \ - _feature) \ - : : [msr] "i" (_msr), [val] "i" (_val) \ - : "eax", "ecx", "edx", "memory") +static __always_inline +void alternative_msr_write(unsigned int msr, u64 val, unsigned int feature) +{ + asm volatile(ALTERNATIVE("", "wrmsr", %c[feature]) + : : "c" (msr), + "a" ((u32)val), + "d" ((u32)(val >> 32)), + [feature] "i" (feature) + : "memory"); +} static inline void indirect_branch_prediction_barrier(void) { - alternative_msr_write(MSR_IA32_PRED_CMD, PRED_CMD_IBPB, - X86_FEATURE_USE_IBPB); + u64 val = PRED_CMD_IBPB; + + alternative_msr_write(MSR_IA32_PRED_CMD, val, X86_FEATURE_USE_IBPB); } +/* The Intel SPEC CTRL MSR base value cache */ +extern u64 x86_spec_ctrl_base; + /* * With retpoline, we must use IBRS to restrict branch prediction * before calling into firmware. @@ -265,14 +278,18 @@ static inline void indirect_branch_prediction_barrier(void) */ #define firmware_restrict_branch_speculation_start() \ do { \ + u64 val = x86_spec_ctrl_base | SPEC_CTRL_IBRS; \ + \ preempt_disable(); \ - alternative_msr_write(MSR_IA32_SPEC_CTRL, SPEC_CTRL_IBRS, \ + alternative_msr_write(MSR_IA32_SPEC_CTRL, val, \ X86_FEATURE_USE_IBRS_FW); \ } while (0) #define firmware_restrict_branch_speculation_end() \ do { \ - alternative_msr_write(MSR_IA32_SPEC_CTRL, 0, \ + u64 val = x86_spec_ctrl_base; \ + \ + alternative_msr_write(MSR_IA32_SPEC_CTRL, val, \ X86_FEATURE_USE_IBRS_FW); \ preempt_enable(); \ } while (0) diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 2c5a966dc222..6afac386a434 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -53,7 +53,7 @@ #define __PHYSICAL_MASK_SHIFT 52 #ifdef CONFIG_X86_5LEVEL -#define __VIRTUAL_MASK_SHIFT (pgtable_l5_enabled ? 56 : 47) +#define __VIRTUAL_MASK_SHIFT (pgtable_l5_enabled() ? 56 : 47) #else #define __VIRTUAL_MASK_SHIFT 47 #endif diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 9be2bf13825b..d49bbf4bb5c8 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -574,14 +574,14 @@ static inline void __set_pgd(pgd_t *pgdp, pgd_t pgd) } #define set_pgd(pgdp, pgdval) do { \ - if (pgtable_l5_enabled) \ + if (pgtable_l5_enabled()) \ __set_pgd(pgdp, pgdval); \ else \ set_p4d((p4d_t *)(pgdp), (p4d_t) { (pgdval).pgd }); \ } while (0) #define pgd_clear(pgdp) do { \ - if (pgtable_l5_enabled) \ + if (pgtable_l5_enabled()) \ set_pgd(pgdp, __pgd(0)); \ } while (0) diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index d32175e30259..662963681ea6 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -117,9 +117,6 @@ void native_restore_msi_irqs(struct pci_dev *dev); #define native_setup_msi_irqs NULL #define native_teardown_msi_irq NULL #endif - -#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys) - #endif /* __KERNEL__ */ #ifdef CONFIG_X86_64 diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index 263c142a6a6c..ada6410fd2ec 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -167,7 +167,7 @@ static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, #if CONFIG_PGTABLE_LEVELS > 4 static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d) { - if (!pgtable_l5_enabled) + if (!pgtable_l5_enabled()) return; paravirt_alloc_p4d(mm, __pa(p4d) >> PAGE_SHIFT); set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(p4d))); @@ -193,7 +193,7 @@ extern void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d); static inline void __p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d, unsigned long address) { - if (pgtable_l5_enabled) + if (pgtable_l5_enabled()) ___p4d_free_tlb(tlb, p4d); } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index f1633de5a675..99ecde23c3ec 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -65,7 +65,7 @@ extern pmdval_t early_pmd_flags; #ifndef __PAGETABLE_P4D_FOLDED #define set_pgd(pgdp, pgd) native_set_pgd(pgdp, pgd) -#define pgd_clear(pgd) (pgtable_l5_enabled ? native_pgd_clear(pgd) : 0) +#define pgd_clear(pgd) (pgtable_l5_enabled() ? native_pgd_clear(pgd) : 0) #endif #ifndef set_p4d @@ -881,7 +881,7 @@ static inline unsigned long p4d_index(unsigned long address) #if CONFIG_PGTABLE_LEVELS > 4 static inline int pgd_present(pgd_t pgd) { - if (!pgtable_l5_enabled) + if (!pgtable_l5_enabled()) return 1; return pgd_flags(pgd) & _PAGE_PRESENT; } @@ -898,9 +898,9 @@ static inline unsigned long pgd_page_vaddr(pgd_t pgd) #define pgd_page(pgd) pfn_to_page(pgd_pfn(pgd)) /* to find an entry in a page-table-directory. */ -static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address) +static __always_inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address) { - if (!pgtable_l5_enabled) + if (!pgtable_l5_enabled()) return (p4d_t *)pgd; return (p4d_t *)pgd_page_vaddr(*pgd) + p4d_index(address); } @@ -909,7 +909,7 @@ static inline int pgd_bad(pgd_t pgd) { unsigned long ignore_flags = _PAGE_USER; - if (!pgtable_l5_enabled) + if (!pgtable_l5_enabled()) return 0; if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) @@ -920,7 +920,7 @@ static inline int pgd_bad(pgd_t pgd) static inline int pgd_none(pgd_t pgd) { - if (!pgtable_l5_enabled) + if (!pgtable_l5_enabled()) return 0; /* * There is no need to do a workaround for the KNL stray diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h index e3225e83db7d..d9a001a4a872 100644 --- a/arch/x86/include/asm/pgtable_32_types.h +++ b/arch/x86/include/asm/pgtable_32_types.h @@ -15,7 +15,7 @@ # include <asm/pgtable-2level_types.h> #endif -#define pgtable_l5_enabled 0 +#define pgtable_l5_enabled() 0 #define PGDIR_SIZE (1UL << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE - 1)) diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 877bc27718ae..3c5385f9a88f 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -220,7 +220,7 @@ static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d) { pgd_t pgd; - if (pgtable_l5_enabled || !IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) { + if (pgtable_l5_enabled() || !IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) { *p4dp = p4d; return; } diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index adb47552e6bb..054765ab2da2 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -22,12 +22,23 @@ typedef struct { pteval_t pte; } pte_t; #ifdef CONFIG_X86_5LEVEL extern unsigned int __pgtable_l5_enabled; -#ifndef pgtable_l5_enabled -#define pgtable_l5_enabled cpu_feature_enabled(X86_FEATURE_LA57) -#endif + +#ifdef USE_EARLY_PGTABLE_L5 +/* + * cpu_feature_enabled() is not available in early boot code. + * Use variable instead. + */ +static inline bool pgtable_l5_enabled(void) +{ + return __pgtable_l5_enabled; +} #else -#define pgtable_l5_enabled 0 -#endif +#define pgtable_l5_enabled() cpu_feature_enabled(X86_FEATURE_LA57) +#endif /* USE_EARLY_PGTABLE_L5 */ + +#else +#define pgtable_l5_enabled() 0 +#endif /* CONFIG_X86_5LEVEL */ extern unsigned int pgdir_shift; extern unsigned int ptrs_per_p4d; @@ -102,7 +113,7 @@ extern unsigned int ptrs_per_p4d; #define LDT_PGD_ENTRY_L4 -3UL #define LDT_PGD_ENTRY_L5 -112UL -#define LDT_PGD_ENTRY (pgtable_l5_enabled ? LDT_PGD_ENTRY_L5 : LDT_PGD_ENTRY_L4) +#define LDT_PGD_ENTRY (pgtable_l5_enabled() ? LDT_PGD_ENTRY_L5 : LDT_PGD_ENTRY_L4) #define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT) #define __VMALLOC_BASE_L4 0xffffc90000000000UL @@ -116,7 +127,7 @@ extern unsigned int ptrs_per_p4d; #ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT # define VMALLOC_START vmalloc_base -# define VMALLOC_SIZE_TB (pgtable_l5_enabled ? VMALLOC_SIZE_TB_L5 : VMALLOC_SIZE_TB_L4) +# define VMALLOC_SIZE_TB (pgtable_l5_enabled() ? VMALLOC_SIZE_TB_L5 : VMALLOC_SIZE_TB_L4) # define VMEMMAP_START vmemmap_base #else # define VMALLOC_START __VMALLOC_BASE_L4 diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h index a0ba1ffda0df..851c04b7a092 100644 --- a/arch/x86/include/asm/pkeys.h +++ b/arch/x86/include/asm/pkeys.h @@ -2,6 +2,8 @@ #ifndef _ASM_X86_PKEYS_H #define _ASM_X86_PKEYS_H +#define ARCH_DEFAULT_PKEY 0 + #define arch_max_pkey() (boot_cpu_has(X86_FEATURE_OSPKE) ? 16 : 1) extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, @@ -15,7 +17,7 @@ extern int __execute_only_pkey(struct mm_struct *mm); static inline int execute_only_pkey(struct mm_struct *mm) { if (!boot_cpu_has(X86_FEATURE_OSPKE)) - return 0; + return ARCH_DEFAULT_PKEY; return __execute_only_pkey(mm); } @@ -49,13 +51,21 @@ bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey) { /* * "Allocated" pkeys are those that have been returned - * from pkey_alloc(). pkey 0 is special, and never - * returned from pkey_alloc(). + * from pkey_alloc() or pkey 0 which is allocated + * implicitly when the mm is created. */ - if (pkey <= 0) + if (pkey < 0) return false; if (pkey >= arch_max_pkey()) return false; + /* + * The exec-only pkey is set in the allocation map, but + * is not available to any of the user interfaces like + * mprotect_pkey(). + */ + if (pkey == mm->context.execute_only_pkey) + return false; + return mm_pkey_allocation_map(mm) & (1U << pkey); } diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 21a114914ba4..e28add6b791f 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -186,15 +186,6 @@ extern void identify_boot_cpu(void); extern void identify_secondary_cpu(struct cpuinfo_x86 *); extern void print_cpu_info(struct cpuinfo_x86 *); void print_cpu_msr(struct cpuinfo_x86 *); -extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c); -extern u32 get_scattered_cpuid_leaf(unsigned int level, - unsigned int sub_leaf, - enum cpuid_regs_idx reg); -extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); -extern void init_amd_cacheinfo(struct cpuinfo_x86 *c); - -extern void detect_extended_topology(struct cpuinfo_x86 *c); -extern void detect_ht(struct cpuinfo_x86 *c); #ifdef CONFIG_X86_32 extern int have_cpuid_p(void); diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index a7471dcd2205..b6033680d458 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -12,7 +12,7 @@ void pvclock_set_flags(u8 flags); unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src); void pvclock_read_wallclock(struct pvclock_wall_clock *wall, struct pvclock_vcpu_time_info *vcpu, - struct timespec *ts); + struct timespec64 *ts); void pvclock_resume(void); void pvclock_touch_watchdogs(void); diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h index 5e16b5d40d32..3e70bed8a978 100644 --- a/arch/x86/include/asm/qspinlock.h +++ b/arch/x86/include/asm/qspinlock.h @@ -7,6 +7,14 @@ #include <asm-generic/qspinlock_types.h> #include <asm/paravirt.h> +#define _Q_PENDING_LOOPS (1 << 9) + +#ifdef CONFIG_PARAVIRT_SPINLOCKS +extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); +extern void __pv_init_lock_hash(void); +extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); +extern void __raw_callee_save___pv_queued_spin_unlock(struct qspinlock *lock); + #define queued_spin_unlock queued_spin_unlock /** * queued_spin_unlock - release a queued spinlock @@ -16,15 +24,9 @@ */ static inline void native_queued_spin_unlock(struct qspinlock *lock) { - smp_store_release((u8 *)lock, 0); + smp_store_release(&lock->locked, 0); } -#ifdef CONFIG_PARAVIRT_SPINLOCKS -extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); -extern void __pv_init_lock_hash(void); -extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); -extern void __raw_callee_save___pv_queued_spin_unlock(struct qspinlock *lock); - static inline void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) { pv_queued_spin_lock_slowpath(lock, val); @@ -40,11 +42,6 @@ static inline bool vcpu_is_preempted(long cpu) { return pv_vcpu_is_preempted(cpu); } -#else -static inline void queued_spin_unlock(struct qspinlock *lock) -{ - native_queued_spin_unlock(lock); -} #endif #ifdef CONFIG_PARAVIRT diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h index 923307ea11c7..9ef5ee03d2d7 100644 --- a/arch/x86/include/asm/qspinlock_paravirt.h +++ b/arch/x86/include/asm/qspinlock_paravirt.h @@ -22,8 +22,7 @@ PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath); * * void __pv_queued_spin_unlock(struct qspinlock *lock) * { - * struct __qspinlock *l = (void *)lock; - * u8 lockval = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0); + * u8 lockval = cmpxchg(&lock->locked, _Q_LOCKED_VAL, 0); * * if (likely(lockval == _Q_LOCKED_VAL)) * return; diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index f75bff8f9d82..547c4fe50711 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -171,7 +171,6 @@ static inline int wbinvd_on_all_cpus(void) wbinvd(); return 0; } -#define smp_num_siblings 1 #endif /* CONFIG_SMP */ extern unsigned disabled_cpus; diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h index 4617a2bf123c..199218719a86 100644 --- a/arch/x86/include/asm/sparsemem.h +++ b/arch/x86/include/asm/sparsemem.h @@ -27,8 +27,8 @@ # endif #else /* CONFIG_X86_32 */ # define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */ -# define MAX_PHYSADDR_BITS (pgtable_l5_enabled ? 52 : 44) -# define MAX_PHYSMEM_BITS (pgtable_l5_enabled ? 52 : 46) +# define MAX_PHYSADDR_BITS (pgtable_l5_enabled() ? 52 : 44) +# define MAX_PHYSMEM_BITS (pgtable_l5_enabled() ? 52 : 46) #endif #endif /* CONFIG_SPARSEMEM */ diff --git a/arch/x86/include/asm/spec-ctrl.h b/arch/x86/include/asm/spec-ctrl.h new file mode 100644 index 000000000000..ae7c2c5cd7f0 --- /dev/null +++ b/arch/x86/include/asm/spec-ctrl.h @@ -0,0 +1,80 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_SPECCTRL_H_ +#define _ASM_X86_SPECCTRL_H_ + +#include <linux/thread_info.h> +#include <asm/nospec-branch.h> + +/* + * On VMENTER we must preserve whatever view of the SPEC_CTRL MSR + * the guest has, while on VMEXIT we restore the host view. This + * would be easier if SPEC_CTRL were architecturally maskable or + * shadowable for guests but this is not (currently) the case. + * Takes the guest view of SPEC_CTRL MSR as a parameter and also + * the guest's version of VIRT_SPEC_CTRL, if emulated. + */ +extern void x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool guest); + +/** + * x86_spec_ctrl_set_guest - Set speculation control registers for the guest + * @guest_spec_ctrl: The guest content of MSR_SPEC_CTRL + * @guest_virt_spec_ctrl: The guest controlled bits of MSR_VIRT_SPEC_CTRL + * (may get translated to MSR_AMD64_LS_CFG bits) + * + * Avoids writing to the MSR if the content/bits are the same + */ +static inline +void x86_spec_ctrl_set_guest(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl) +{ + x86_virt_spec_ctrl(guest_spec_ctrl, guest_virt_spec_ctrl, true); +} + +/** + * x86_spec_ctrl_restore_host - Restore host speculation control registers + * @guest_spec_ctrl: The guest content of MSR_SPEC_CTRL + * @guest_virt_spec_ctrl: The guest controlled bits of MSR_VIRT_SPEC_CTRL + * (may get translated to MSR_AMD64_LS_CFG bits) + * + * Avoids writing to the MSR if the content/bits are the same + */ +static inline +void x86_spec_ctrl_restore_host(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl) +{ + x86_virt_spec_ctrl(guest_spec_ctrl, guest_virt_spec_ctrl, false); +} + +/* AMD specific Speculative Store Bypass MSR data */ +extern u64 x86_amd_ls_cfg_base; +extern u64 x86_amd_ls_cfg_ssbd_mask; + +static inline u64 ssbd_tif_to_spec_ctrl(u64 tifn) +{ + BUILD_BUG_ON(TIF_SSBD < SPEC_CTRL_SSBD_SHIFT); + return (tifn & _TIF_SSBD) >> (TIF_SSBD - SPEC_CTRL_SSBD_SHIFT); +} + +static inline unsigned long ssbd_spec_ctrl_to_tif(u64 spec_ctrl) +{ + BUILD_BUG_ON(TIF_SSBD < SPEC_CTRL_SSBD_SHIFT); + return (spec_ctrl & SPEC_CTRL_SSBD) << (TIF_SSBD - SPEC_CTRL_SSBD_SHIFT); +} + +static inline u64 ssbd_tif_to_amd_ls_cfg(u64 tifn) +{ + return (tifn & _TIF_SSBD) ? x86_amd_ls_cfg_ssbd_mask : 0ULL; +} + +#ifdef CONFIG_SMP +extern void speculative_store_bypass_ht_init(void); +#else +static inline void speculative_store_bypass_ht_init(void) { } +#endif + +extern void speculative_store_bypass_update(unsigned long tif); + +static inline void speculative_store_bypass_update_current(void) +{ + speculative_store_bypass_update(current_thread_info()->flags); +} + +#endif diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index 133d9425fced..b6dc698f992a 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -111,4 +111,6 @@ static inline unsigned long caller_frame_pointer(void) return (unsigned long)frame; } +void show_opcodes(u8 *rip, const char *loglvl); +void show_ip(struct pt_regs *regs, const char *loglvl); #endif /* _ASM_X86_STACKTRACE_H */ diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index 533f74c300c2..d33f92b9fa22 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -116,7 +116,8 @@ int strcmp(const char *cs, const char *ct); #endif #define __HAVE_ARCH_MEMCPY_MCSAFE 1 -__must_check int memcpy_mcsafe_unrolled(void *dst, const void *src, size_t cnt); +__must_check unsigned long __memcpy_mcsafe(void *dst, const void *src, + size_t cnt); DECLARE_STATIC_KEY_FALSE(mcsafe_key); /** @@ -131,14 +132,15 @@ DECLARE_STATIC_KEY_FALSE(mcsafe_key); * actually do machine check recovery. Everyone else can just * use memcpy(). * - * Return 0 for success, -EFAULT for fail + * Return 0 for success, or number of bytes not copied if there was an + * exception. */ -static __always_inline __must_check int +static __always_inline __must_check unsigned long memcpy_mcsafe(void *dst, const void *src, size_t cnt) { #ifdef CONFIG_X86_MCE if (static_branch_unlikely(&mcsafe_key)) - return memcpy_mcsafe_unrolled(dst, src, cnt); + return __memcpy_mcsafe(dst, src, cnt); else #endif memcpy(dst, src, cnt); diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index a5d9521bb2cb..2ff2a30a264f 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -79,6 +79,7 @@ struct thread_info { #define TIF_SIGPENDING 2 /* signal pending */ #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ #define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/ +#define TIF_SSBD 5 /* Reduced data speculation */ #define TIF_SYSCALL_EMU 6 /* syscall emulation active */ #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SECCOMP 8 /* secure computing */ @@ -105,6 +106,7 @@ struct thread_info { #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) #define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) +#define _TIF_SSBD (1 << TIF_SSBD) #define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) @@ -144,7 +146,7 @@ struct thread_info { /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW \ - (_TIF_IO_BITMAP|_TIF_NOCPUID|_TIF_NOTSC|_TIF_BLOCKSTEP) + (_TIF_IO_BITMAP|_TIF_NOCPUID|_TIF_NOTSC|_TIF_BLOCKSTEP|_TIF_SSBD) #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY) #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW) diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 62546b3a398e..62acb613114b 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -47,6 +47,17 @@ copy_user_generic(void *to, const void *from, unsigned len) } static __always_inline __must_check unsigned long +copy_to_user_mcsafe(void *to, const void *from, unsigned len) +{ + unsigned long ret; + + __uaccess_begin(); + ret = memcpy_mcsafe(to, from, len); + __uaccess_end(); + return ret; +} + +static __always_inline __must_check unsigned long raw_copy_from_user(void *dst, const void __user *src, unsigned long size) { int ret = 0; @@ -194,4 +205,7 @@ __copy_from_user_flushcache(void *dst, const void __user *src, unsigned size) unsigned long copy_user_handle_tail(char *to, char *from, unsigned len); +unsigned long +mcsafe_handle_tail(char *to, char *from, unsigned len); + #endif /* _ASM_X86_UACCESS_64_H */ diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index ce8b4da07e35..2d27236c16a3 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -170,7 +170,7 @@ struct x86_cpuinit_ops { void (*fixup_cpu_id)(struct cpuinfo_x86 *c, int node); }; -struct timespec; +struct timespec64; /** * struct x86_legacy_devices - legacy x86 devices @@ -264,8 +264,8 @@ struct x86_hyper_runtime { struct x86_platform_ops { unsigned long (*calibrate_cpu)(void); unsigned long (*calibrate_tsc)(void); - void (*get_wallclock)(struct timespec *ts); - int (*set_wallclock)(const struct timespec *ts); + void (*get_wallclock)(struct timespec64 *ts); + int (*set_wallclock)(const struct timespec64 *ts); void (*iommu_shutdown)(void); bool (*is_untracked_pat_range)(u64 start, u64 end); void (*nmi_init)(void); diff --git a/arch/x86/include/uapi/asm/sembuf.h b/arch/x86/include/uapi/asm/sembuf.h index cabd7476bd6c..89de6cd9f0a7 100644 --- a/arch/x86/include/uapi/asm/sembuf.h +++ b/arch/x86/include/uapi/asm/sembuf.h @@ -8,15 +8,24 @@ * between kernel and user space. * * Pad space is left for: - * - 64-bit time_t to solve y2038 problem * - 2 miscellaneous 32-bit values + * + * x86_64 and x32 incorrectly added padding here, so the structures + * are still incompatible with the padding on x86. */ struct semid64_ds { struct ipc64_perm sem_perm; /* permissions .. see ipc.h */ +#ifdef __i386__ + unsigned long sem_otime; /* last semop time */ + unsigned long sem_otime_high; + unsigned long sem_ctime; /* last change time */ + unsigned long sem_ctime_high; +#else __kernel_time_t sem_otime; /* last semop time */ __kernel_ulong_t __unused1; __kernel_time_t sem_ctime; /* last change time */ __kernel_ulong_t __unused2; +#endif __kernel_ulong_t sem_nsems; /* no. of semaphores in array */ __kernel_ulong_t __unused3; __kernel_ulong_t __unused4; diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 8b04234e010b..7685444a106b 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -116,6 +116,7 @@ static void init_x2apic_ldr(void) goto update; } cmsk = cluster_hotplug_mask; + cmsk->clusterid = cluster; cluster_hotplug_mask = NULL; update: this_cpu_write(cluster_masks, cmsk); diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index dfcbe6924eaf..5d0de79fdab0 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -1715,19 +1715,6 @@ static int proc_apm_show(struct seq_file *m, void *v) return 0; } -static int proc_apm_open(struct inode *inode, struct file *file) -{ - return single_open(file, proc_apm_show, NULL); -} - -static const struct file_operations apm_file_ops = { - .owner = THIS_MODULE, - .open = proc_apm_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - static int apm(void *unused) { unsigned short bx; @@ -2360,7 +2347,7 @@ static int __init apm_init(void) set_desc_base(&gdt[APM_DS >> 3], (unsigned long)__va((unsigned long)apm_info.bios.dseg << 4)); - proc_create("apm", 0, NULL, &apm_file_ops); + proc_create_single("apm", 0, NULL, proc_apm_show); kapmd_task = kthread_create(apm, NULL, "kapmd"); if (IS_ERR(kapmd_task)) { @@ -2446,7 +2433,7 @@ MODULE_PARM_DESC(idle_threshold, "System idle percentage above which to make APM BIOS idle calls"); module_param(idle_period, int, 0444); MODULE_PARM_DESC(idle_period, - "Period (in sec/100) over which to caculate the idle percentage"); + "Period (in sec/100) over which to calculate the idle percentage"); module_param(smp, bool, 0444); MODULE_PARM_DESC(smp, "Set this to enable APM use on an SMP platform. Use with caution on older systems"); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index a66229f51b12..7a40196967cb 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -17,7 +17,7 @@ KCOV_INSTRUMENT_perf_event.o := n nostackp := $(call cc-option, -fno-stack-protector) CFLAGS_common.o := $(nostackp) -obj-y := intel_cacheinfo.o scattered.o topology.o +obj-y := cacheinfo.o scattered.o topology.o obj-y += common.o obj-y += rdrand.o obj-y += match.o diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 12bc0a1139da..082d7875cef8 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -9,7 +9,9 @@ #include <linux/random.h> #include <asm/processor.h> #include <asm/apic.h> +#include <asm/cacheinfo.h> #include <asm/cpu.h> +#include <asm/spec-ctrl.h> #include <asm/smp.h> #include <asm/pci-direct.h> #include <asm/delay.h> @@ -297,7 +299,6 @@ static int nearby_node(int apicid) } #endif -#ifdef CONFIG_SMP /* * Fix up cpu_core_id for pre-F17h systems to be in the * [0 .. cores_per_node - 1] range. Not really needed but @@ -327,6 +328,7 @@ static void amd_get_topology(struct cpuinfo_x86 *c) /* get information required for multi-node processors */ if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { + int err; u32 eax, ebx, ecx, edx; cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); @@ -345,21 +347,15 @@ static void amd_get_topology(struct cpuinfo_x86 *c) } /* - * We may have multiple LLCs if L3 caches exist, so check if we - * have an L3 cache by looking at the L3 cache CPUID leaf. + * In case leaf B is available, use it to derive + * topology information. */ - if (cpuid_edx(0x80000006)) { - if (c->x86 == 0x17) { - /* - * LLC is at the core complex level. - * Core complex id is ApicId[3]. - */ - per_cpu(cpu_llc_id, cpu) = c->apicid >> 3; - } else { - /* LLC is at the node level. */ - per_cpu(cpu_llc_id, cpu) = node_id; - } - } + err = detect_extended_topology(c); + if (!err) + c->x86_coreid_bits = get_count_order(c->x86_max_cores); + + cacheinfo_amd_init_llc_id(c, cpu, node_id); + } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { u64 value; @@ -375,7 +371,6 @@ static void amd_get_topology(struct cpuinfo_x86 *c) legacy_fixup_core_id(c); } } -#endif /* * On a AMD dual core setup the lower bits of the APIC id distinguish the cores. @@ -383,7 +378,6 @@ static void amd_get_topology(struct cpuinfo_x86 *c) */ static void amd_detect_cmp(struct cpuinfo_x86 *c) { -#ifdef CONFIG_SMP unsigned bits; int cpu = smp_processor_id(); @@ -394,17 +388,11 @@ static void amd_detect_cmp(struct cpuinfo_x86 *c) c->phys_proc_id = c->initial_apicid >> bits; /* use socket ID also for last level cache */ per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; - amd_get_topology(c); -#endif } u16 amd_get_nb_id(int cpu) { - u16 id = 0; -#ifdef CONFIG_SMP - id = per_cpu(cpu_llc_id, cpu); -#endif - return id; + return per_cpu(cpu_llc_id, cpu); } EXPORT_SYMBOL_GPL(amd_get_nb_id); @@ -554,6 +542,26 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) rdmsrl(MSR_FAM10H_NODE_ID, value); nodes_per_socket = ((value >> 3) & 7) + 1; } + + if (c->x86 >= 0x15 && c->x86 <= 0x17) { + unsigned int bit; + + switch (c->x86) { + case 0x15: bit = 54; break; + case 0x16: bit = 33; break; + case 0x17: bit = 10; break; + default: return; + } + /* + * Try to cache the base value so further operations can + * avoid RMW. If that faults, do not enable SSBD. + */ + if (!rdmsrl_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) { + setup_force_cpu_cap(X86_FEATURE_LS_CFG_SSBD); + setup_force_cpu_cap(X86_FEATURE_SSBD); + x86_amd_ls_cfg_ssbd_mask = 1ULL << bit; + } + } } static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) @@ -791,6 +799,7 @@ static void init_amd_bd(struct cpuinfo_x86 *c) static void init_amd_zn(struct cpuinfo_x86 *c) { + set_cpu_cap(c, X86_FEATURE_ZEN); /* * Fix erratum 1076: CPB feature bit not being set in CPUID. It affects * all up to and including B1. @@ -842,6 +851,7 @@ static void init_amd(struct cpuinfo_x86 *c) /* Multi core CPU? */ if (c->extended_cpuid_level >= 0x80000008) { amd_detect_cmp(c); + amd_get_topology(c); srat_detect_node(c); } diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index bfca937bdcc3..7416fc206b4a 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -12,8 +12,10 @@ #include <linux/utsname.h> #include <linux/cpu.h> #include <linux/module.h> +#include <linux/nospec.h> +#include <linux/prctl.h> -#include <asm/nospec-branch.h> +#include <asm/spec-ctrl.h> #include <asm/cmdline.h> #include <asm/bugs.h> #include <asm/processor.h> @@ -27,6 +29,27 @@ #include <asm/intel-family.h> static void __init spectre_v2_select_mitigation(void); +static void __init ssb_select_mitigation(void); + +/* + * Our boot-time value of the SPEC_CTRL MSR. We read it once so that any + * writes to SPEC_CTRL contain whatever reserved bits have been set. + */ +u64 __ro_after_init x86_spec_ctrl_base; +EXPORT_SYMBOL_GPL(x86_spec_ctrl_base); + +/* + * The vendor and possibly platform specific bits which can be modified in + * x86_spec_ctrl_base. + */ +static u64 __ro_after_init x86_spec_ctrl_mask = SPEC_CTRL_IBRS; + +/* + * AMD specific MSR info for Speculative Store Bypass control. + * x86_amd_ls_cfg_ssbd_mask is initialized in identify_boot_cpu(). + */ +u64 __ro_after_init x86_amd_ls_cfg_base; +u64 __ro_after_init x86_amd_ls_cfg_ssbd_mask; void __init check_bugs(void) { @@ -37,9 +60,27 @@ void __init check_bugs(void) print_cpu_info(&boot_cpu_data); } + /* + * Read the SPEC_CTRL MSR to account for reserved bits which may + * have unknown values. AMD64_LS_CFG MSR is cached in the early AMD + * init code as it is not enumerated and depends on the family. + */ + if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) + rdmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + + /* Allow STIBP in MSR_SPEC_CTRL if supported */ + if (boot_cpu_has(X86_FEATURE_STIBP)) + x86_spec_ctrl_mask |= SPEC_CTRL_STIBP; + /* Select the proper spectre mitigation before patching alternatives */ spectre_v2_select_mitigation(); + /* + * Select proper mitigation for any exposure to the Speculative Store + * Bypass vulnerability. + */ + ssb_select_mitigation(); + #ifdef CONFIG_X86_32 /* * Check whether we are able to run this kernel safely on SMP. @@ -93,7 +134,76 @@ static const char *spectre_v2_strings[] = { #undef pr_fmt #define pr_fmt(fmt) "Spectre V2 : " fmt -static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE; +static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = + SPECTRE_V2_NONE; + +void +x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest) +{ + u64 msrval, guestval, hostval = x86_spec_ctrl_base; + struct thread_info *ti = current_thread_info(); + + /* Is MSR_SPEC_CTRL implemented ? */ + if (static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) { + /* + * Restrict guest_spec_ctrl to supported values. Clear the + * modifiable bits in the host base value and or the + * modifiable bits from the guest value. + */ + guestval = hostval & ~x86_spec_ctrl_mask; + guestval |= guest_spec_ctrl & x86_spec_ctrl_mask; + + /* SSBD controlled in MSR_SPEC_CTRL */ + if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD)) + hostval |= ssbd_tif_to_spec_ctrl(ti->flags); + + if (hostval != guestval) { + msrval = setguest ? guestval : hostval; + wrmsrl(MSR_IA32_SPEC_CTRL, msrval); + } + } + + /* + * If SSBD is not handled in MSR_SPEC_CTRL on AMD, update + * MSR_AMD64_L2_CFG or MSR_VIRT_SPEC_CTRL if supported. + */ + if (!static_cpu_has(X86_FEATURE_LS_CFG_SSBD) && + !static_cpu_has(X86_FEATURE_VIRT_SSBD)) + return; + + /* + * If the host has SSBD mitigation enabled, force it in the host's + * virtual MSR value. If its not permanently enabled, evaluate + * current's TIF_SSBD thread flag. + */ + if (static_cpu_has(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE)) + hostval = SPEC_CTRL_SSBD; + else + hostval = ssbd_tif_to_spec_ctrl(ti->flags); + + /* Sanitize the guest value */ + guestval = guest_virt_spec_ctrl & SPEC_CTRL_SSBD; + + if (hostval != guestval) { + unsigned long tif; + + tif = setguest ? ssbd_spec_ctrl_to_tif(guestval) : + ssbd_spec_ctrl_to_tif(hostval); + + speculative_store_bypass_update(tif); + } +} +EXPORT_SYMBOL_GPL(x86_virt_spec_ctrl); + +static void x86_amd_ssb_disable(void) +{ + u64 msrval = x86_amd_ls_cfg_base | x86_amd_ls_cfg_ssbd_mask; + + if (boot_cpu_has(X86_FEATURE_VIRT_SSBD)) + wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, SPEC_CTRL_SSBD); + else if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD)) + wrmsrl(MSR_AMD64_LS_CFG, msrval); +} #ifdef RETPOLINE static bool spectre_v2_bad_module; @@ -312,32 +422,289 @@ retpoline_auto: } #undef pr_fmt +#define pr_fmt(fmt) "Speculative Store Bypass: " fmt + +static enum ssb_mitigation ssb_mode __ro_after_init = SPEC_STORE_BYPASS_NONE; + +/* The kernel command line selection */ +enum ssb_mitigation_cmd { + SPEC_STORE_BYPASS_CMD_NONE, + SPEC_STORE_BYPASS_CMD_AUTO, + SPEC_STORE_BYPASS_CMD_ON, + SPEC_STORE_BYPASS_CMD_PRCTL, + SPEC_STORE_BYPASS_CMD_SECCOMP, +}; + +static const char *ssb_strings[] = { + [SPEC_STORE_BYPASS_NONE] = "Vulnerable", + [SPEC_STORE_BYPASS_DISABLE] = "Mitigation: Speculative Store Bypass disabled", + [SPEC_STORE_BYPASS_PRCTL] = "Mitigation: Speculative Store Bypass disabled via prctl", + [SPEC_STORE_BYPASS_SECCOMP] = "Mitigation: Speculative Store Bypass disabled via prctl and seccomp", +}; + +static const struct { + const char *option; + enum ssb_mitigation_cmd cmd; +} ssb_mitigation_options[] = { + { "auto", SPEC_STORE_BYPASS_CMD_AUTO }, /* Platform decides */ + { "on", SPEC_STORE_BYPASS_CMD_ON }, /* Disable Speculative Store Bypass */ + { "off", SPEC_STORE_BYPASS_CMD_NONE }, /* Don't touch Speculative Store Bypass */ + { "prctl", SPEC_STORE_BYPASS_CMD_PRCTL }, /* Disable Speculative Store Bypass via prctl */ + { "seccomp", SPEC_STORE_BYPASS_CMD_SECCOMP }, /* Disable Speculative Store Bypass via prctl and seccomp */ +}; + +static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void) +{ + enum ssb_mitigation_cmd cmd = SPEC_STORE_BYPASS_CMD_AUTO; + char arg[20]; + int ret, i; + + if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable")) { + return SPEC_STORE_BYPASS_CMD_NONE; + } else { + ret = cmdline_find_option(boot_command_line, "spec_store_bypass_disable", + arg, sizeof(arg)); + if (ret < 0) + return SPEC_STORE_BYPASS_CMD_AUTO; + + for (i = 0; i < ARRAY_SIZE(ssb_mitigation_options); i++) { + if (!match_option(arg, ret, ssb_mitigation_options[i].option)) + continue; + + cmd = ssb_mitigation_options[i].cmd; + break; + } + + if (i >= ARRAY_SIZE(ssb_mitigation_options)) { + pr_err("unknown option (%s). Switching to AUTO select\n", arg); + return SPEC_STORE_BYPASS_CMD_AUTO; + } + } + + return cmd; +} + +static enum ssb_mitigation __init __ssb_select_mitigation(void) +{ + enum ssb_mitigation mode = SPEC_STORE_BYPASS_NONE; + enum ssb_mitigation_cmd cmd; + + if (!boot_cpu_has(X86_FEATURE_SSBD)) + return mode; + + cmd = ssb_parse_cmdline(); + if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS) && + (cmd == SPEC_STORE_BYPASS_CMD_NONE || + cmd == SPEC_STORE_BYPASS_CMD_AUTO)) + return mode; + + switch (cmd) { + case SPEC_STORE_BYPASS_CMD_AUTO: + case SPEC_STORE_BYPASS_CMD_SECCOMP: + /* + * Choose prctl+seccomp as the default mode if seccomp is + * enabled. + */ + if (IS_ENABLED(CONFIG_SECCOMP)) + mode = SPEC_STORE_BYPASS_SECCOMP; + else + mode = SPEC_STORE_BYPASS_PRCTL; + break; + case SPEC_STORE_BYPASS_CMD_ON: + mode = SPEC_STORE_BYPASS_DISABLE; + break; + case SPEC_STORE_BYPASS_CMD_PRCTL: + mode = SPEC_STORE_BYPASS_PRCTL; + break; + case SPEC_STORE_BYPASS_CMD_NONE: + break; + } + + /* + * We have three CPU feature flags that are in play here: + * - X86_BUG_SPEC_STORE_BYPASS - CPU is susceptible. + * - X86_FEATURE_SSBD - CPU is able to turn off speculative store bypass + * - X86_FEATURE_SPEC_STORE_BYPASS_DISABLE - engage the mitigation + */ + if (mode == SPEC_STORE_BYPASS_DISABLE) { + setup_force_cpu_cap(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE); + /* + * Intel uses the SPEC CTRL MSR Bit(2) for this, while AMD uses + * a completely different MSR and bit dependent on family. + */ + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + x86_spec_ctrl_base |= SPEC_CTRL_SSBD; + x86_spec_ctrl_mask |= SPEC_CTRL_SSBD; + wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + break; + case X86_VENDOR_AMD: + x86_amd_ssb_disable(); + break; + } + } + + return mode; +} + +static void ssb_select_mitigation(void) +{ + ssb_mode = __ssb_select_mitigation(); + + if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) + pr_info("%s\n", ssb_strings[ssb_mode]); +} + +#undef pr_fmt +#define pr_fmt(fmt) "Speculation prctl: " fmt + +static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl) +{ + bool update; + + if (ssb_mode != SPEC_STORE_BYPASS_PRCTL && + ssb_mode != SPEC_STORE_BYPASS_SECCOMP) + return -ENXIO; + + switch (ctrl) { + case PR_SPEC_ENABLE: + /* If speculation is force disabled, enable is not allowed */ + if (task_spec_ssb_force_disable(task)) + return -EPERM; + task_clear_spec_ssb_disable(task); + update = test_and_clear_tsk_thread_flag(task, TIF_SSBD); + break; + case PR_SPEC_DISABLE: + task_set_spec_ssb_disable(task); + update = !test_and_set_tsk_thread_flag(task, TIF_SSBD); + break; + case PR_SPEC_FORCE_DISABLE: + task_set_spec_ssb_disable(task); + task_set_spec_ssb_force_disable(task); + update = !test_and_set_tsk_thread_flag(task, TIF_SSBD); + break; + default: + return -ERANGE; + } + + /* + * If being set on non-current task, delay setting the CPU + * mitigation until it is next scheduled. + */ + if (task == current && update) + speculative_store_bypass_update_current(); + + return 0; +} + +int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which, + unsigned long ctrl) +{ + switch (which) { + case PR_SPEC_STORE_BYPASS: + return ssb_prctl_set(task, ctrl); + default: + return -ENODEV; + } +} + +#ifdef CONFIG_SECCOMP +void arch_seccomp_spec_mitigate(struct task_struct *task) +{ + if (ssb_mode == SPEC_STORE_BYPASS_SECCOMP) + ssb_prctl_set(task, PR_SPEC_FORCE_DISABLE); +} +#endif + +static int ssb_prctl_get(struct task_struct *task) +{ + switch (ssb_mode) { + case SPEC_STORE_BYPASS_DISABLE: + return PR_SPEC_DISABLE; + case SPEC_STORE_BYPASS_SECCOMP: + case SPEC_STORE_BYPASS_PRCTL: + if (task_spec_ssb_force_disable(task)) + return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE; + if (task_spec_ssb_disable(task)) + return PR_SPEC_PRCTL | PR_SPEC_DISABLE; + return PR_SPEC_PRCTL | PR_SPEC_ENABLE; + default: + if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) + return PR_SPEC_ENABLE; + return PR_SPEC_NOT_AFFECTED; + } +} + +int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) +{ + switch (which) { + case PR_SPEC_STORE_BYPASS: + return ssb_prctl_get(task); + default: + return -ENODEV; + } +} + +void x86_spec_ctrl_setup_ap(void) +{ + if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) + wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + + if (ssb_mode == SPEC_STORE_BYPASS_DISABLE) + x86_amd_ssb_disable(); +} #ifdef CONFIG_SYSFS -ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf) + +static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, + char *buf, unsigned int bug) { - if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) + if (!boot_cpu_has_bug(bug)) return sprintf(buf, "Not affected\n"); - if (boot_cpu_has(X86_FEATURE_PTI)) - return sprintf(buf, "Mitigation: PTI\n"); + + switch (bug) { + case X86_BUG_CPU_MELTDOWN: + if (boot_cpu_has(X86_FEATURE_PTI)) + return sprintf(buf, "Mitigation: PTI\n"); + + break; + + case X86_BUG_SPECTRE_V1: + return sprintf(buf, "Mitigation: __user pointer sanitization\n"); + + case X86_BUG_SPECTRE_V2: + return sprintf(buf, "%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], + boot_cpu_has(X86_FEATURE_USE_IBPB) ? ", IBPB" : "", + boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", + spectre_v2_module_string()); + + case X86_BUG_SPEC_STORE_BYPASS: + return sprintf(buf, "%s\n", ssb_strings[ssb_mode]); + + default: + break; + } + return sprintf(buf, "Vulnerable\n"); } +ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_CPU_MELTDOWN); +} + ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, char *buf) { - if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1)) - return sprintf(buf, "Not affected\n"); - return sprintf(buf, "Mitigation: __user pointer sanitization\n"); + return cpu_show_common(dev, attr, buf, X86_BUG_SPECTRE_V1); } ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, char *buf) { - if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) - return sprintf(buf, "Not affected\n"); + return cpu_show_common(dev, attr, buf, X86_BUG_SPECTRE_V2); +} - return sprintf(buf, "%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], - boot_cpu_has(X86_FEATURE_USE_IBPB) ? ", IBPB" : "", - boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", - spectre_v2_module_string()); +ssize_t cpu_show_spec_store_bypass(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_SPEC_STORE_BYPASS); } #endif diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index 54d04d574148..38354c66df81 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -20,6 +20,8 @@ #include <asm/amd_nb.h> #include <asm/smp.h> +#include "cpu.h" + #define LVL_1_INST 1 #define LVL_1_DATA 2 #define LVL_2 3 @@ -637,6 +639,45 @@ static int find_num_cache_leaves(struct cpuinfo_x86 *c) return i; } +void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id) +{ + /* + * We may have multiple LLCs if L3 caches exist, so check if we + * have an L3 cache by looking at the L3 cache CPUID leaf. + */ + if (!cpuid_edx(0x80000006)) + return; + + if (c->x86 < 0x17) { + /* LLC is at the node level. */ + per_cpu(cpu_llc_id, cpu) = node_id; + } else if (c->x86 == 0x17 && + c->x86_model >= 0 && c->x86_model <= 0x1F) { + /* + * LLC is at the core complex level. + * Core complex ID is ApicId[3] for these processors. + */ + per_cpu(cpu_llc_id, cpu) = c->apicid >> 3; + } else { + /* + * LLC ID is calculated from the number of threads sharing the + * cache. + * */ + u32 eax, ebx, ecx, edx, num_sharing_cache = 0; + u32 llc_index = find_num_cache_leaves(c) - 1; + + cpuid_count(0x8000001d, llc_index, &eax, &ebx, &ecx, &edx); + if (eax) + num_sharing_cache = ((eax >> 14) & 0xfff) + 1; + + if (num_sharing_cache) { + int bits = get_count_order(num_sharing_cache) - 1; + + per_cpu(cpu_llc_id, cpu) = c->apicid >> bits; + } + } +} + void init_amd_cacheinfo(struct cpuinfo_x86 *c) { @@ -650,7 +691,7 @@ void init_amd_cacheinfo(struct cpuinfo_x86 *c) } } -unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c) +void init_intel_cacheinfo(struct cpuinfo_x86 *c) { /* Cache sizes */ unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0; @@ -802,7 +843,8 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c) c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d)); - return l2; + if (!l2) + cpu_detect_cache_sizes(c); } static int __cache_amd_cpumap_setup(unsigned int cpu, int index, diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index e5ec0f11c0de..14433ff5b828 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -18,6 +18,13 @@ #define RNG_ENABLED (1 << 3) #define RNG_ENABLE (1 << 6) /* MSR_VIA_RNG */ +#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000 +#define X86_VMX_FEATURE_PROC_CTLS_VNMI 0x00400000 +#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS 0x80000000 +#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC 0x00000001 +#define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002 +#define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020 + static void init_c3(struct cpuinfo_x86 *c) { u32 lo, hi; @@ -112,6 +119,31 @@ static void early_init_centaur(struct cpuinfo_x86 *c) } } +static void centaur_detect_vmx_virtcap(struct cpuinfo_x86 *c) +{ + u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2; + + rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high); + msr_ctl = vmx_msr_high | vmx_msr_low; + + if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW) + set_cpu_cap(c, X86_FEATURE_TPR_SHADOW); + if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI) + set_cpu_cap(c, X86_FEATURE_VNMI); + if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) { + rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, + vmx_msr_low, vmx_msr_high); + msr_ctl2 = vmx_msr_high | vmx_msr_low; + if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) && + (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)) + set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY); + if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT) + set_cpu_cap(c, X86_FEATURE_EPT); + if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID) + set_cpu_cap(c, X86_FEATURE_VPID); + } +} + static void init_centaur(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_32 @@ -128,6 +160,24 @@ static void init_centaur(struct cpuinfo_x86 *c) clear_cpu_cap(c, 0*32+31); #endif early_init_centaur(c); + init_intel_cacheinfo(c); + detect_num_cpu_cores(c); +#ifdef CONFIG_X86_32 + detect_ht(c); +#endif + + if (c->cpuid_level > 9) { + unsigned int eax = cpuid_eax(10); + + /* + * Check for version and the number of counters + * Version(eax[7:0]) can't be 0; + * Counters(eax[15:8]) should be greater than 1; + */ + if ((eax & 0xff) && (((eax >> 8) & 0xff) > 1)) + set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); + } + switch (c->x86) { #ifdef CONFIG_X86_32 case 5: @@ -199,6 +249,9 @@ static void init_centaur(struct cpuinfo_x86 *c) #ifdef CONFIG_X86_64 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); #endif + + if (cpu_has(c, X86_FEATURE_VMX)) + centaur_detect_vmx_virtcap(c); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index ce243f7d2d4e..95c8e507580d 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -66,6 +66,13 @@ cpumask_var_t cpu_callin_mask; /* representing cpus for which sibling maps can be computed */ cpumask_var_t cpu_sibling_setup_mask; +/* Number of siblings per CPU package */ +int smp_num_siblings = 1; +EXPORT_SYMBOL(smp_num_siblings); + +/* Last level cache ID of each logical CPU */ +DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id) = BAD_APICID; + /* correctly size the local cpu masks */ void __init setup_cpu_local_masks(void) { @@ -577,6 +584,19 @@ static void get_model_name(struct cpuinfo_x86 *c) *(s + 1) = '\0'; } +void detect_num_cpu_cores(struct cpuinfo_x86 *c) +{ + unsigned int eax, ebx, ecx, edx; + + c->x86_max_cores = 1; + if (!IS_ENABLED(CONFIG_SMP) || c->cpuid_level < 4) + return; + + cpuid_count(4, 0, &eax, &ebx, &ecx, &edx); + if (eax & 0x1f) + c->x86_max_cores = (eax >> 26) + 1; +} + void cpu_detect_cache_sizes(struct cpuinfo_x86 *c) { unsigned int n, dummy, ebx, ecx, edx, l2size; @@ -757,17 +777,32 @@ static void init_speculation_control(struct cpuinfo_x86 *c) * and they also have a different bit for STIBP support. Also, * a hypervisor might have set the individual AMD bits even on * Intel CPUs, for finer-grained selection of what's available. - * - * We use the AMD bits in 0x8000_0008 EBX as the generic hardware - * features, which are visible in /proc/cpuinfo and used by the - * kernel. So set those accordingly from the Intel bits. */ if (cpu_has(c, X86_FEATURE_SPEC_CTRL)) { set_cpu_cap(c, X86_FEATURE_IBRS); set_cpu_cap(c, X86_FEATURE_IBPB); + set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL); } + if (cpu_has(c, X86_FEATURE_INTEL_STIBP)) set_cpu_cap(c, X86_FEATURE_STIBP); + + if (cpu_has(c, X86_FEATURE_SPEC_CTRL_SSBD) || + cpu_has(c, X86_FEATURE_VIRT_SSBD)) + set_cpu_cap(c, X86_FEATURE_SSBD); + + if (cpu_has(c, X86_FEATURE_AMD_IBRS)) { + set_cpu_cap(c, X86_FEATURE_IBRS); + set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL); + } + + if (cpu_has(c, X86_FEATURE_AMD_IBPB)) + set_cpu_cap(c, X86_FEATURE_IBPB); + + if (cpu_has(c, X86_FEATURE_AMD_STIBP)) { + set_cpu_cap(c, X86_FEATURE_STIBP); + set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL); + } } void get_cpu_cap(struct cpuinfo_x86 *c) @@ -927,21 +962,47 @@ static const __initconst struct x86_cpu_id cpu_no_meltdown[] = { {} }; -static bool __init cpu_vulnerable_to_meltdown(struct cpuinfo_x86 *c) +/* Only list CPUs which speculate but are non susceptible to SSB */ +static const __initconst struct x86_cpu_id cpu_no_spec_store_bypass[] = { + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1 }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT2 }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MERRIFIELD }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_CORE_YONAH }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNL }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNM }, + { X86_VENDOR_AMD, 0x12, }, + { X86_VENDOR_AMD, 0x11, }, + { X86_VENDOR_AMD, 0x10, }, + { X86_VENDOR_AMD, 0xf, }, + {} +}; + +static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) { u64 ia32_cap = 0; - if (x86_match_cpu(cpu_no_meltdown)) - return false; + if (x86_match_cpu(cpu_no_speculation)) + return; + + setup_force_cpu_bug(X86_BUG_SPECTRE_V1); + setup_force_cpu_bug(X86_BUG_SPECTRE_V2); if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES)) rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); + if (!x86_match_cpu(cpu_no_spec_store_bypass) && + !(ia32_cap & ARCH_CAP_SSB_NO)) + setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); + + if (x86_match_cpu(cpu_no_meltdown)) + return; + /* Rogue Data Cache Load? No! */ if (ia32_cap & ARCH_CAP_RDCL_NO) - return false; + return; - return true; + setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); } /* @@ -992,12 +1053,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) setup_force_cpu_cap(X86_FEATURE_ALWAYS); - if (!x86_match_cpu(cpu_no_speculation)) { - if (cpu_vulnerable_to_meltdown(c)) - setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); - setup_force_cpu_bug(X86_BUG_SPECTRE_V1); - setup_force_cpu_bug(X86_BUG_SPECTRE_V2); - } + cpu_set_bug_bits(c); fpu__init_system(c); @@ -1008,6 +1064,21 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) */ setup_clear_cpu_cap(X86_FEATURE_PCID); #endif + + /* + * Later in the boot process pgtable_l5_enabled() relies on + * cpu_feature_enabled(X86_FEATURE_LA57). If 5-level paging is not + * enabled by this point we need to clear the feature bit to avoid + * false-positives at the later stage. + * + * pgtable_l5_enabled() can be false here for several reasons: + * - 5-level paging is disabled compile-time; + * - it's 32-bit kernel; + * - machine doesn't support 5-level paging; + * - user specified 'no5lvl' in kernel command line. + */ + if (!pgtable_l5_enabled()) + setup_clear_cpu_cap(X86_FEATURE_LA57); } void __init early_cpu_init(void) @@ -1359,6 +1430,7 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c) #endif mtrr_ap_init(); validate_apic_and_package_id(c); + x86_spec_ctrl_setup_ap(); } static __init int setup_noclflush(char *arg) diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index e806b11a99af..38216f678fc3 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -47,7 +47,19 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[], extern void get_cpu_cap(struct cpuinfo_x86 *c); extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); +extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c); +extern u32 get_scattered_cpuid_leaf(unsigned int level, + unsigned int sub_leaf, + enum cpuid_regs_idx reg); +extern void init_intel_cacheinfo(struct cpuinfo_x86 *c); +extern void init_amd_cacheinfo(struct cpuinfo_x86 *c); + +extern void detect_num_cpu_cores(struct cpuinfo_x86 *c); +extern int detect_extended_topology(struct cpuinfo_x86 *c); +extern void detect_ht(struct cpuinfo_x86 *c); unsigned int aperfmperf_get_khz(int cpu); +extern void x86_spec_ctrl_setup_ap(void); + #endif /* ARCH_X86_CPU_H */ diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 60d1897041da..eb75564f2d25 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -188,7 +188,10 @@ static void early_init_intel(struct cpuinfo_x86 *c) setup_clear_cpu_cap(X86_FEATURE_IBPB); setup_clear_cpu_cap(X86_FEATURE_STIBP); setup_clear_cpu_cap(X86_FEATURE_SPEC_CTRL); + setup_clear_cpu_cap(X86_FEATURE_MSR_SPEC_CTRL); setup_clear_cpu_cap(X86_FEATURE_INTEL_STIBP); + setup_clear_cpu_cap(X86_FEATURE_SSBD); + setup_clear_cpu_cap(X86_FEATURE_SPEC_CTRL_SSBD); } /* @@ -453,24 +456,6 @@ static void srat_detect_node(struct cpuinfo_x86 *c) #endif } -/* - * find out the number of processor cores on the die - */ -static int intel_num_cpu_cores(struct cpuinfo_x86 *c) -{ - unsigned int eax, ebx, ecx, edx; - - if (!IS_ENABLED(CONFIG_SMP) || c->cpuid_level < 4) - return 1; - - /* Intel has a non-standard dependency on %ecx for this CPUID level. */ - cpuid_count(4, 0, &eax, &ebx, &ecx, &edx); - if (eax & 0x1f) - return (eax >> 26) + 1; - else - return 1; -} - static void detect_vmx_virtcap(struct cpuinfo_x86 *c) { /* Intel VMX MSR indicated features */ @@ -653,8 +638,6 @@ static void init_intel_misc_features(struct cpuinfo_x86 *c) static void init_intel(struct cpuinfo_x86 *c) { - unsigned int l2 = 0; - early_init_intel(c); intel_workarounds(c); @@ -671,19 +654,13 @@ static void init_intel(struct cpuinfo_x86 *c) * let's use the legacy cpuid vector 0x1 and 0x4 for topology * detection. */ - c->x86_max_cores = intel_num_cpu_cores(c); + detect_num_cpu_cores(c); #ifdef CONFIG_X86_32 detect_ht(c); #endif } - l2 = init_intel_cacheinfo(c); - - /* Detect legacy cache sizes if init_intel_cacheinfo did not */ - if (l2 == 0) { - cpu_detect_cache_sizes(c); - l2 = c->x86_cache_size; - } + init_intel_cacheinfo(c); if (c->cpuid_level > 9) { unsigned eax = cpuid_eax(10); @@ -696,7 +673,8 @@ static void init_intel(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); if (boot_cpu_has(X86_FEATURE_DS)) { - unsigned int l1; + unsigned int l1, l2; + rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); if (!(l1 & (1<<11))) set_cpu_cap(c, X86_FEATURE_BTS); @@ -724,6 +702,7 @@ static void init_intel(struct cpuinfo_x86 *c) * Dixon is NOT a Celeron. */ if (c->x86 == 6) { + unsigned int l2 = c->x86_cache_size; char *p = NULL; switch (c->x86_model) { diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 475cb4f5f14f..c805a06e14c3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -48,7 +48,7 @@ static struct dentry *dfs_inj; static u8 n_banks; -#define MAX_FLAG_OPT_SIZE 3 +#define MAX_FLAG_OPT_SIZE 4 #define NBCFG 0x44 enum injection_type { diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 42cf2880d0ed..cd76380af79f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1727,6 +1727,21 @@ static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c) } } +static void mce_centaur_feature_init(struct cpuinfo_x86 *c) +{ + struct mca_config *cfg = &mca_cfg; + + /* + * All newer Centaur CPUs support MCE broadcasting. Enable + * synchronization with a one second timeout. + */ + if ((c->x86 == 6 && c->x86_model == 0xf && c->x86_stepping >= 0xe) || + c->x86 > 6) { + if (cfg->monarch_timeout < 0) + cfg->monarch_timeout = USEC_PER_SEC; + } +} + static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) { switch (c->x86_vendor) { @@ -1739,6 +1754,9 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) mce_amd_feature_init(c); break; } + case X86_VENDOR_CENTAUR: + mce_centaur_feature_init(c); + break; default: break; diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index f7666eef4a87..f591b01930db 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -94,6 +94,11 @@ static struct smca_bank_name smca_names[] = { [SMCA_SMU] = { "smu", "System Management Unit" }, }; +static u32 smca_bank_addrs[MAX_NR_BANKS][NR_BLOCKS] __ro_after_init = +{ + [0 ... MAX_NR_BANKS - 1] = { [0 ... NR_BLOCKS - 1] = -1 } +}; + const char *smca_get_name(enum smca_bank_types t) { if (t >= N_SMCA_BANK_TYPES) @@ -431,8 +436,7 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c) wrmsr(MSR_CU_DEF_ERR, low, high); } -static u32 smca_get_block_address(unsigned int cpu, unsigned int bank, - unsigned int block) +static u32 smca_get_block_address(unsigned int bank, unsigned int block) { u32 low, high; u32 addr = 0; @@ -443,24 +447,30 @@ static u32 smca_get_block_address(unsigned int cpu, unsigned int bank, if (!block) return MSR_AMD64_SMCA_MCx_MISC(bank); + /* Check our cache first: */ + if (smca_bank_addrs[bank][block] != -1) + return smca_bank_addrs[bank][block]; + /* * For SMCA enabled processors, BLKPTR field of the first MISC register * (MCx_MISC0) indicates presence of additional MISC regs set (MISC1-4). */ - if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high)) - return addr; + if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high)) + goto out; if (!(low & MCI_CONFIG_MCAX)) - return addr; + goto out; - if (!rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) && + if (!rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) && (low & MASK_BLKPTR_LO)) - return MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1); + addr = MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1); +out: + smca_bank_addrs[bank][block] = addr; return addr; } -static u32 get_block_address(unsigned int cpu, u32 current_addr, u32 low, u32 high, +static u32 get_block_address(u32 current_addr, u32 low, u32 high, unsigned int bank, unsigned int block) { u32 addr = 0, offset = 0; @@ -468,20 +478,8 @@ static u32 get_block_address(unsigned int cpu, u32 current_addr, u32 low, u32 hi if ((bank >= mca_cfg.banks) || (block >= NR_BLOCKS)) return addr; - /* Get address from already initialized block. */ - if (per_cpu(threshold_banks, cpu)) { - struct threshold_bank *bankp = per_cpu(threshold_banks, cpu)[bank]; - - if (bankp && bankp->blocks) { - struct threshold_block *blockp = &bankp->blocks[block]; - - if (blockp) - return blockp->address; - } - } - if (mce_flags.smca) - return smca_get_block_address(cpu, bank, block); + return smca_get_block_address(bank, block); /* Fall back to method we used for older processors: */ switch (block) { @@ -559,7 +557,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) smca_configure(bank, cpu); for (block = 0; block < NR_BLOCKS; ++block) { - address = get_block_address(cpu, address, low, high, bank, block); + address = get_block_address(address, low, high, bank, block); if (!address) break; @@ -1176,7 +1174,7 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank, if (err) goto out_free; recurse: - address = get_block_address(cpu, address, low, high, bank, ++block); + address = get_block_address(address, low, high, bank, ++block); if (!address) return 0; diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile index ad9e5ed81181..2ad9107ee980 100644 --- a/arch/x86/kernel/cpu/mtrr/Makefile +++ b/arch/x86/kernel/cpu/mtrr/Makefile @@ -1,3 +1,3 @@ -obj-y := main.o if.o generic.o cleanup.o +obj-y := mtrr.o if.o generic.o cleanup.o obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/mtrr.c index 7468de429087..9a19c800fe40 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/mtrr.c @@ -46,6 +46,7 @@ #include <linux/pci.h> #include <linux/smp.h> #include <linux/syscore_ops.h> +#include <linux/rcupdate.h> #include <asm/cpufeature.h> #include <asm/e820/api.h> @@ -100,7 +101,7 @@ static int have_wrcomb(void) if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS && dev->device == PCI_DEVICE_ID_SERVERWORKS_LE && dev->revision <= 5) { - pr_info("mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n"); + pr_info("Serverworks LE rev < 6 detected. Write-combining disabled.\n"); pci_dev_put(dev); return 0; } @@ -110,7 +111,7 @@ static int have_wrcomb(void) */ if (dev->vendor == PCI_VENDOR_ID_INTEL && dev->device == PCI_DEVICE_ID_INTEL_82451NX) { - pr_info("mtrr: Intel 450NX MMC detected. Write-combining disabled.\n"); + pr_info("Intel 450NX MMC detected. Write-combining disabled.\n"); pci_dev_put(dev); return 0; } @@ -312,24 +313,24 @@ int mtrr_add_page(unsigned long base, unsigned long size, return error; if (type >= MTRR_NUM_TYPES) { - pr_warn("mtrr: type: %u invalid\n", type); + pr_warn("type: %u invalid\n", type); return -EINVAL; } /* If the type is WC, check that this processor supports it */ if ((type == MTRR_TYPE_WRCOMB) && !have_wrcomb()) { - pr_warn("mtrr: your processor doesn't support write-combining\n"); + pr_warn("your processor doesn't support write-combining\n"); return -ENOSYS; } if (!size) { - pr_warn("mtrr: zero sized request\n"); + pr_warn("zero sized request\n"); return -EINVAL; } if ((base | (base + size - 1)) >> (boot_cpu_data.x86_phys_bits - PAGE_SHIFT)) { - pr_warn("mtrr: base or size exceeds the MTRR width\n"); + pr_warn("base or size exceeds the MTRR width\n"); return -EINVAL; } @@ -360,8 +361,7 @@ int mtrr_add_page(unsigned long base, unsigned long size, } else if (types_compatible(type, ltype)) continue; } - pr_warn("mtrr: 0x%lx000,0x%lx000 overlaps existing" - " 0x%lx000,0x%lx000\n", base, size, lbase, + pr_warn("0x%lx000,0x%lx000 overlaps existing 0x%lx000,0x%lx000\n", base, size, lbase, lsize); goto out; } @@ -369,7 +369,7 @@ int mtrr_add_page(unsigned long base, unsigned long size, if (ltype != type) { if (types_compatible(type, ltype)) continue; - pr_warn("mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n", + pr_warn("type mismatch for %lx000,%lx000 old: %s new: %s\n", base, size, mtrr_attrib_to_str(ltype), mtrr_attrib_to_str(type)); goto out; @@ -395,7 +395,7 @@ int mtrr_add_page(unsigned long base, unsigned long size, } } } else { - pr_info("mtrr: no more MTRRs available\n"); + pr_info("no more MTRRs available\n"); } error = i; out: @@ -407,8 +407,8 @@ int mtrr_add_page(unsigned long base, unsigned long size, static int mtrr_check(unsigned long base, unsigned long size) { if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { - pr_warn("mtrr: size and base must be multiples of 4 kiB\n"); - pr_debug("mtrr: size: 0x%lx base: 0x%lx\n", size, base); + pr_warn("size and base must be multiples of 4 kiB\n"); + pr_debug("size: 0x%lx base: 0x%lx\n", size, base); dump_stack(); return -1; } @@ -499,22 +499,22 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) } } if (reg < 0) { - pr_debug("mtrr: no MTRR for %lx000,%lx000 found\n", + pr_debug("no MTRR for %lx000,%lx000 found\n", base, size); goto out; } } if (reg >= max) { - pr_warn("mtrr: register: %d too big\n", reg); + pr_warn("register: %d too big\n", reg); goto out; } mtrr_if->get(reg, &lbase, &lsize, <ype); if (lsize < 1) { - pr_warn("mtrr: MTRR %d not used\n", reg); + pr_warn("MTRR %d not used\n", reg); goto out; } if (mtrr_usage_table[reg] < 1) { - pr_warn("mtrr: reg: %d has count=0\n", reg); + pr_warn("reg: %d has count=0\n", reg); goto out; } if (--mtrr_usage_table[reg] < 1) @@ -775,7 +775,7 @@ void __init mtrr_bp_init(void) } if (!mtrr_enabled()) { - pr_info("MTRR: Disabled\n"); + pr_info("Disabled\n"); /* * PAT initialization relies on MTRR's rendezvous handler. @@ -793,6 +793,9 @@ void mtrr_ap_init(void) if (!use_intel() || mtrr_aps_delayed_init) return; + + rcu_cpu_starting(smp_processor_id()); + /* * Ideally we should hold mtrr_mutex here to avoid mtrr entries * changed, but this routine will be called in cpu boot time, diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index b099024d339c..81c0afb39d0a 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -27,7 +27,7 @@ * exists, use it for populating initial_apicid and cpu topology * detection. */ -void detect_extended_topology(struct cpuinfo_x86 *c) +int detect_extended_topology(struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP unsigned int eax, ebx, ecx, edx, sub_index; @@ -36,7 +36,7 @@ void detect_extended_topology(struct cpuinfo_x86 *c) static bool printed; if (c->cpuid_level < 0xb) - return; + return -1; cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx); @@ -44,7 +44,7 @@ void detect_extended_topology(struct cpuinfo_x86 *c) * check if the cpuid leaf 0xb is actually implemented. */ if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE)) - return; + return -1; set_cpu_cap(c, X86_FEATURE_XTOPOLOGY); @@ -95,6 +95,6 @@ void detect_extended_topology(struct cpuinfo_x86 *c) c->cpu_core_id); printed = 1; } - return; #endif + return 0; } diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 18fa9d74c182..666a284116ac 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -22,11 +22,14 @@ #include <asm/stacktrace.h> #include <asm/unwind.h> +#define OPCODE_BUFSIZE 64 + int panic_on_unrecovered_nmi; int panic_on_io_nmi; -static unsigned int code_bytes = 64; static int die_counter; +static struct pt_regs exec_summary_regs; + bool in_task_stack(unsigned long *stack, struct task_struct *task, struct stack_info *info) { @@ -69,9 +72,62 @@ static void printk_stack_address(unsigned long address, int reliable, printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address); } +/* + * There are a couple of reasons for the 2/3rd prologue, courtesy of Linus: + * + * In case where we don't have the exact kernel image (which, if we did, we can + * simply disassemble and navigate to the RIP), the purpose of the bigger + * prologue is to have more context and to be able to correlate the code from + * the different toolchains better. + * + * In addition, it helps in recreating the register allocation of the failing + * kernel and thus make sense of the register dump. + * + * What is more, the additional complication of a variable length insn arch like + * x86 warrants having longer byte sequence before rIP so that the disassembler + * can "sync" up properly and find instruction boundaries when decoding the + * opcode bytes. + * + * Thus, the 2/3rds prologue and 64 byte OPCODE_BUFSIZE is just a random + * guesstimate in attempt to achieve all of the above. + */ +void show_opcodes(u8 *rip, const char *loglvl) +{ + unsigned int code_prologue = OPCODE_BUFSIZE * 2 / 3; + u8 opcodes[OPCODE_BUFSIZE]; + u8 *ip; + int i; + + printk("%sCode: ", loglvl); + + ip = (u8 *)rip - code_prologue; + if (probe_kernel_read(opcodes, ip, OPCODE_BUFSIZE)) { + pr_cont("Bad RIP value.\n"); + return; + } + + for (i = 0; i < OPCODE_BUFSIZE; i++, ip++) { + if (ip == rip) + pr_cont("<%02x> ", opcodes[i]); + else + pr_cont("%02x ", opcodes[i]); + } + pr_cont("\n"); +} + +void show_ip(struct pt_regs *regs, const char *loglvl) +{ +#ifdef CONFIG_X86_32 + printk("%sEIP: %pS\n", loglvl, (void *)regs->ip); +#else + printk("%sRIP: %04x:%pS\n", loglvl, (int)regs->cs, (void *)regs->ip); +#endif + show_opcodes((u8 *)regs->ip, loglvl); +} + void show_iret_regs(struct pt_regs *regs) { - printk(KERN_DEFAULT "RIP: %04x:%pS\n", (int)regs->cs, (void *)regs->ip); + show_ip(regs, KERN_DEFAULT); printk(KERN_DEFAULT "RSP: %04x:%016lx EFLAGS: %08lx", (int)regs->ss, regs->sp, regs->flags); } @@ -267,7 +323,6 @@ unsigned long oops_begin(void) bust_spinlocks(1); return flags; } -EXPORT_SYMBOL_GPL(oops_begin); NOKPROBE_SYMBOL(oops_begin); void __noreturn rewind_stack_do_exit(int signr); @@ -287,6 +342,9 @@ void oops_end(unsigned long flags, struct pt_regs *regs, int signr) raw_local_irq_restore(flags); oops_exit(); + /* Executive summary in case the oops scrolled away */ + __show_regs(&exec_summary_regs, true); + if (!signr) return; if (in_interrupt()) @@ -305,10 +363,10 @@ NOKPROBE_SYMBOL(oops_end); int __die(const char *str, struct pt_regs *regs, long err) { -#ifdef CONFIG_X86_32 - unsigned short ss; - unsigned long sp; -#endif + /* Save the regs of the first oops for the executive summary later. */ + if (!die_counter) + exec_summary_regs = *regs; + printk(KERN_DEFAULT "%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter, IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "", @@ -318,26 +376,13 @@ int __die(const char *str, struct pt_regs *regs, long err) IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION) ? (boot_cpu_has(X86_FEATURE_PTI) ? " PTI" : " NOPTI") : ""); + show_regs(regs); + print_modules(); + if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP) return 1; - print_modules(); - show_regs(regs); -#ifdef CONFIG_X86_32 - if (user_mode(regs)) { - sp = regs->sp; - ss = regs->ss; - } else { - sp = kernel_stack_pointer(regs); - savesegment(ss, ss); - } - printk(KERN_EMERG "EIP: %pS SS:ESP: %04x:%08lx\n", - (void *)regs->ip, ss, sp); -#else - /* Executive summary in case the oops scrolled away */ - printk(KERN_ALERT "RIP: %pS RSP: %016lx\n", (void *)regs->ip, regs->sp); -#endif return 0; } NOKPROBE_SYMBOL(__die); @@ -356,30 +401,9 @@ void die(const char *str, struct pt_regs *regs, long err) oops_end(flags, regs, sig); } -static int __init code_bytes_setup(char *s) -{ - ssize_t ret; - unsigned long val; - - if (!s) - return -EINVAL; - - ret = kstrtoul(s, 0, &val); - if (ret) - return ret; - - code_bytes = val; - if (code_bytes > 8192) - code_bytes = 8192; - - return 1; -} -__setup("code_bytes=", code_bytes_setup); - void show_regs(struct pt_regs *regs) { bool all = true; - int i; show_regs_print_info(KERN_DEFAULT); @@ -389,36 +413,8 @@ void show_regs(struct pt_regs *regs) __show_regs(regs, all); /* - * When in-kernel, we also print out the stack and code at the - * time of the fault.. + * When in-kernel, we also print out the stack at the time of the fault.. */ - if (!user_mode(regs)) { - unsigned int code_prologue = code_bytes * 43 / 64; - unsigned int code_len = code_bytes; - unsigned char c; - u8 *ip; - + if (!user_mode(regs)) show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT); - - printk(KERN_DEFAULT "Code: "); - - ip = (u8 *)regs->ip - code_prologue; - if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { - /* try starting at IP */ - ip = (u8 *)regs->ip; - code_len = code_len - code_prologue + 1; - } - for (i = 0; i < code_len; i++, ip++) { - if (ip < (u8 *)PAGE_OFFSET || - probe_kernel_address(ip, c)) { - pr_cont(" Bad RIP value."); - break; - } - if (ip == (u8 *)regs->ip) - pr_cont("<%02x> ", c); - else - pr_cont("%02x ", c); - } - } - pr_cont("\n"); } diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 6a2cb1442e05..d1f25c831447 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -155,7 +155,8 @@ static void __init __e820__range_add(struct e820_table *table, u64 start, u64 si int x = table->nr_entries; if (x >= ARRAY_SIZE(table->entries)) { - pr_err("e820: too many entries; ignoring [mem %#010llx-%#010llx]\n", start, start + size - 1); + pr_err("too many entries; ignoring [mem %#010llx-%#010llx]\n", + start, start + size - 1); return; } @@ -190,9 +191,10 @@ void __init e820__print_table(char *who) int i; for (i = 0; i < e820_table->nr_entries; i++) { - pr_info("%s: [mem %#018Lx-%#018Lx] ", who, - e820_table->entries[i].addr, - e820_table->entries[i].addr + e820_table->entries[i].size - 1); + pr_info("%s: [mem %#018Lx-%#018Lx] ", + who, + e820_table->entries[i].addr, + e820_table->entries[i].addr + e820_table->entries[i].size - 1); e820_print_type(e820_table->entries[i].type); pr_cont("\n"); @@ -574,7 +576,7 @@ void __init e820__update_table_print(void) if (e820__update_table(e820_table)) return; - pr_info("e820: modified physical RAM map:\n"); + pr_info("modified physical RAM map:\n"); e820__print_table("modified"); } @@ -636,9 +638,8 @@ __init void e820__setup_pci_gap(void) if (!found) { #ifdef CONFIG_X86_64 gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024; - pr_err( - "e820: Cannot find an available gap in the 32-bit address range\n" - "e820: PCI devices with unassigned 32-bit BARs may not work!\n"); + pr_err("Cannot find an available gap in the 32-bit address range\n"); + pr_err("PCI devices with unassigned 32-bit BARs may not work!\n"); #else gapstart = 0x10000000; #endif @@ -649,7 +650,8 @@ __init void e820__setup_pci_gap(void) */ pci_mem_start = gapstart; - pr_info("e820: [mem %#010lx-%#010lx] available for PCI devices\n", gapstart, gapstart + gapsize - 1); + pr_info("[mem %#010lx-%#010lx] available for PCI devices\n", + gapstart, gapstart + gapsize - 1); } /* @@ -711,7 +713,7 @@ void __init e820__memory_setup_extended(u64 phys_addr, u32 data_len) memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware)); early_memunmap(sdata, data_len); - pr_info("e820: extended physical RAM map:\n"); + pr_info("extended physical RAM map:\n"); e820__print_table("extended"); } @@ -780,7 +782,7 @@ u64 __init e820__memblock_alloc_reserved(u64 size, u64 align) addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); if (addr) { e820__range_update_kexec(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED); - pr_info("e820: update e820_table_kexec for e820__memblock_alloc_reserved()\n"); + pr_info("update e820_table_kexec for e820__memblock_alloc_reserved()\n"); e820__update_table_kexec(); } @@ -830,8 +832,8 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, enum e820_type if (last_pfn > max_arch_pfn) last_pfn = max_arch_pfn; - pr_info("e820: last_pfn = %#lx max_arch_pfn = %#lx\n", - last_pfn, max_arch_pfn); + pr_info("last_pfn = %#lx max_arch_pfn = %#lx\n", + last_pfn, max_arch_pfn); return last_pfn; } @@ -1005,7 +1007,7 @@ void __init e820__finish_early_params(void) if (e820__update_table(e820_table) < 0) early_panic("Invalid user supplied memory map"); - pr_info("e820: user-defined physical RAM map:\n"); + pr_info("user-defined physical RAM map:\n"); e820__print_table("user"); } } @@ -1238,7 +1240,7 @@ void __init e820__memory_setup(void) memcpy(e820_table_kexec, e820_table, sizeof(*e820_table_kexec)); memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware)); - pr_info("e820: BIOS-provided physical RAM map:\n"); + pr_info("BIOS-provided physical RAM map:\n"); e820__print_table(who); } diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index bae0d32e327b..da5d8ac60062 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -28,8 +28,6 @@ #include <asm/irq_remapping.h> #include <asm/early_ioremap.h> -#define dev_err(msg) pr_err("pci 0000:%02x:%02x.%d: %s", bus, slot, func, msg) - static void __init fix_hypertransport_config(int num, int slot, int func) { u32 htcfg; @@ -617,7 +615,8 @@ static void __init apple_airport_reset(int bus, int slot, int func) pmcsr = read_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL); if ((pmcsr & PCI_PM_CTRL_STATE_MASK) != PCI_D0) { - dev_err("Cannot power up Apple AirPort card\n"); + pr_err("pci 0000:%02x:%02x.%d: Cannot power up Apple AirPort card\n", + bus, slot, func); return; } } @@ -628,7 +627,8 @@ static void __init apple_airport_reset(int bus, int slot, int func) mmio = early_ioremap(addr, BCM4331_MMIO_SIZE); if (!mmio) { - dev_err("Cannot iomap Apple AirPort card\n"); + pr_err("pci 0000:%02x:%02x.%d: Cannot iomap Apple AirPort card\n", + bus, slot, func); return; } diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 0c408f8c4ed4..a21d6ace648e 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -6,6 +6,10 @@ */ #define DISABLE_BRANCH_PROFILING + +/* cpu_feature_enabled() cannot be used this early */ +#define USE_EARLY_PGTABLE_L5 + #include <linux/init.h> #include <linux/linkage.h> #include <linux/types.h> @@ -32,11 +36,6 @@ #include <asm/microcode.h> #include <asm/kasan.h> -#ifdef CONFIG_X86_5LEVEL -#undef pgtable_l5_enabled -#define pgtable_l5_enabled __pgtable_l5_enabled -#endif - /* * Manage page tables very early on. */ @@ -45,8 +44,7 @@ static unsigned int __initdata next_early_pgt; pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); #ifdef CONFIG_X86_5LEVEL -unsigned int __pgtable_l5_enabled __ro_after_init; -EXPORT_SYMBOL(__pgtable_l5_enabled); +unsigned int __pgtable_l5_enabled __initdata; unsigned int pgdir_shift __ro_after_init = 39; EXPORT_SYMBOL(pgdir_shift); unsigned int ptrs_per_p4d __ro_after_init = 1; @@ -82,13 +80,14 @@ static unsigned int __head *fixup_int(void *ptr, unsigned long physaddr) static bool __head check_la57_support(unsigned long physaddr) { - if (native_cpuid_eax(0) < 7) - return false; - - if (!(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)))) + /* + * 5-level paging is detected and enabled at kernel decomression + * stage. Only check if it has been enabled there. + */ + if (!(native_read_cr4() & X86_CR4_LA57)) return false; - *fixup_int(&pgtable_l5_enabled, physaddr) = 1; + *fixup_int(&__pgtable_l5_enabled, physaddr) = 1; *fixup_int(&pgdir_shift, physaddr) = 48; *fixup_int(&ptrs_per_p4d, physaddr) = 512; *fixup_long(&page_offset_base, physaddr) = __PAGE_OFFSET_BASE_L5; @@ -104,6 +103,12 @@ static bool __head check_la57_support(unsigned long physaddr) } #endif +/* Code in __startup_64() can be relocated during execution, but the compiler + * doesn't have to generate PC-relative relocations when accessing globals from + * that function. Clang actually does not generate them, which leads to + * boot-time crashes. To work around this problem, every global pointer must + * be adjusted using fixup_pointer(). + */ unsigned long __head __startup_64(unsigned long physaddr, struct boot_params *bp) { @@ -113,6 +118,7 @@ unsigned long __head __startup_64(unsigned long physaddr, p4dval_t *p4d; pudval_t *pud; pmdval_t *pmd, pmd_entry; + pteval_t *mask_ptr; bool la57; int i; unsigned int *next_pgt_ptr; @@ -196,7 +202,8 @@ unsigned long __head __startup_64(unsigned long physaddr, pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL; /* Filter out unsupported __PAGE_KERNEL_* bits: */ - pmd_entry &= __supported_pte_mask; + mask_ptr = fixup_pointer(&__supported_pte_mask, physaddr); + pmd_entry &= *mask_ptr; pmd_entry += sme_get_me_mask(); pmd_entry += physaddr; @@ -273,7 +280,7 @@ again: * critical -- __PAGE_OFFSET would point us back into the dynamic * range and we might end up looping forever... */ - if (!pgtable_l5_enabled) + if (!pgtable_l5_enabled()) p4d_p = pgd_p; else if (pgd) p4d_p = (p4dval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base); diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 8ce4212e2b8d..b6be34ee88e9 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -975,8 +975,7 @@ int __init hpet_enable(void) cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY); hpet_writel(cfg, HPET_CFG); if (cfg) - pr_warn("HPET: Unrecognized bits %#x set in global cfg\n", - cfg); + pr_warn("Unrecognized bits %#x set in global cfg\n", cfg); for (i = 0; i <= last; ++i) { cfg = hpet_readl(HPET_Tn_CFG(i)); @@ -988,7 +987,7 @@ int __init hpet_enable(void) | HPET_TN_64BIT_CAP | HPET_TN_32BIT | HPET_TN_ROUTE | HPET_TN_FSB | HPET_TN_FSB_CAP); if (cfg) - pr_warn("HPET: Unrecognized bits %#x set in cfg#%u\n", + pr_warn("Unrecognized bits %#x set in cfg#%u\n", cfg, i); } hpet_print_config(); diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c index a15fe0e92cf9..108c48d0d40e 100644 --- a/arch/x86/kernel/jailhouse.c +++ b/arch/x86/kernel/jailhouse.c @@ -37,7 +37,7 @@ static uint32_t __init jailhouse_detect(void) return jailhouse_cpuid_base(); } -static void jailhouse_get_wallclock(struct timespec *now) +static void jailhouse_get_wallclock(struct timespec64 *now) { memset(now, 0, sizeof(*now)); } diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 0715f827607c..6f4d42377fe5 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -370,6 +370,10 @@ int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn) if (insn->opcode.bytes[0] == BREAKPOINT_INSTRUCTION) return 0; + /* We should not singlestep on the exception masking instructions */ + if (insn_masking_exception(insn)) + return 0; + #ifdef CONFIG_X86_64 /* Only x86_64 has RIP relative instructions */ if (insn_rip_relative(insn)) { diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 8b26c9e01cc4..bf8d1eb7fca3 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -53,7 +53,7 @@ static struct pvclock_wall_clock *wall_clock; * have elapsed since the hypervisor wrote the data. So we try to account for * that with system time */ -static void kvm_get_wallclock(struct timespec *now) +static void kvm_get_wallclock(struct timespec64 *now) { struct pvclock_vcpu_time_info *vcpu_time; int low, high; @@ -72,7 +72,7 @@ static void kvm_get_wallclock(struct timespec *now) put_cpu(); } -static int kvm_set_wallclock(const struct timespec *now) +static int kvm_set_wallclock(const struct timespec64 *now) { return -ENODEV; } diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 60cdec6628b0..d1ab07ec8c9a 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -57,12 +57,17 @@ static void load_segments(void) static void machine_kexec_free_page_tables(struct kimage *image) { free_page((unsigned long)image->arch.pgd); + image->arch.pgd = NULL; #ifdef CONFIG_X86_PAE free_page((unsigned long)image->arch.pmd0); + image->arch.pmd0 = NULL; free_page((unsigned long)image->arch.pmd1); + image->arch.pmd1 = NULL; #endif free_page((unsigned long)image->arch.pte0); + image->arch.pte0 = NULL; free_page((unsigned long)image->arch.pte1); + image->arch.pte1 = NULL; } static int machine_kexec_alloc_page_tables(struct kimage *image) @@ -79,7 +84,6 @@ static int machine_kexec_alloc_page_tables(struct kimage *image) !image->arch.pmd0 || !image->arch.pmd1 || #endif !image->arch.pte0 || !image->arch.pte1) { - machine_kexec_free_page_tables(image); return -ENOMEM; } return 0; diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index a5e55d832d0a..4c8acdfdc5a7 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -39,9 +39,13 @@ const struct kexec_file_ops * const kexec_file_loaders[] = { static void free_transition_pgtable(struct kimage *image) { free_page((unsigned long)image->arch.p4d); + image->arch.p4d = NULL; free_page((unsigned long)image->arch.pud); + image->arch.pud = NULL; free_page((unsigned long)image->arch.pmd); + image->arch.pmd = NULL; free_page((unsigned long)image->arch.pte); + image->arch.pte = NULL; } static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) @@ -91,7 +95,6 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC_NOENC)); return 0; err: - free_transition_pgtable(image); return result; } @@ -351,7 +354,8 @@ void arch_crash_save_vmcoreinfo(void) { VMCOREINFO_NUMBER(phys_base); VMCOREINFO_SYMBOL(init_top_pgt); - VMCOREINFO_NUMBER(pgtable_l5_enabled); + vmcoreinfo_append_str("NUMBER(pgtable_l5_enabled)=%d\n", + pgtable_l5_enabled()); #ifdef CONFIG_NUMA VMCOREINFO_SYMBOL(node_data); diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 77625b60a510..ab5d9dd668d2 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -15,13 +15,11 @@ #include <asm/x86_init.h> #include <asm/iommu_table.h> -static int forbid_dac __read_mostly; +static bool disable_dac_quirk __read_mostly; const struct dma_map_ops *dma_ops = &dma_direct_ops; EXPORT_SYMBOL(dma_ops); -static int iommu_sac_force __read_mostly; - #ifdef CONFIG_IOMMU_DEBUG int panic_on_overflow __read_mostly = 1; int force_iommu __read_mostly = 1; @@ -55,9 +53,6 @@ struct device x86_dma_fallback_dev = { }; EXPORT_SYMBOL(x86_dma_fallback_dev); -/* Number of entries preallocated for DMA-API debugging */ -#define PREALLOC_DMA_DEBUG_ENTRIES 65536 - void __init pci_iommu_alloc(void) { struct iommu_table_entry *p; @@ -76,7 +71,7 @@ void __init pci_iommu_alloc(void) } } -bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp) +bool arch_dma_alloc_attrs(struct device **dev) { if (!*dev) *dev = &x86_dma_fallback_dev; @@ -125,13 +120,13 @@ static __init int iommu_setup(char *p) if (!strncmp(p, "nomerge", 7)) iommu_merge = 0; if (!strncmp(p, "forcesac", 8)) - iommu_sac_force = 1; + pr_warn("forcesac option ignored.\n"); if (!strncmp(p, "allowdac", 8)) - forbid_dac = 0; + pr_warn("allowdac option ignored.\n"); if (!strncmp(p, "nodac", 5)) - forbid_dac = 1; + pr_warn("nodac option ignored.\n"); if (!strncmp(p, "usedac", 6)) { - forbid_dac = -1; + disable_dac_quirk = true; return 1; } #ifdef CONFIG_SWIOTLB @@ -156,40 +151,9 @@ static __init int iommu_setup(char *p) } early_param("iommu", iommu_setup); -int arch_dma_supported(struct device *dev, u64 mask) -{ -#ifdef CONFIG_PCI - if (mask > 0xffffffff && forbid_dac > 0) { - dev_info(dev, "PCI: Disallowing DAC for device\n"); - return 0; - } -#endif - - /* Tell the device to use SAC when IOMMU force is on. This - allows the driver to use cheaper accesses in some cases. - - Problem with this is that if we overflow the IOMMU area and - return DAC as fallback address the device may not handle it - correctly. - - As a special case some controllers have a 39bit address - mode that is as efficient as 32bit (aic79xx). Don't force - SAC for these. Assume all masks <= 40 bits are of this - type. Normally this doesn't make any difference, but gives - more gentle handling of IOMMU overflow. */ - if (iommu_sac_force && (mask >= DMA_BIT_MASK(40))) { - dev_info(dev, "Force SAC with mask %Lx\n", mask); - return 0; - } - - return 1; -} -EXPORT_SYMBOL(arch_dma_supported); - static int __init pci_iommu_init(void) { struct iommu_table_entry *p; - dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES); #ifdef CONFIG_PCI dma_debug_add_bus(&pci_bus_type); @@ -209,11 +173,17 @@ rootfs_initcall(pci_iommu_init); #ifdef CONFIG_PCI /* Many VIA bridges seem to corrupt data for DAC. Disable it here */ +static int via_no_dac_cb(struct pci_dev *pdev, void *data) +{ + pdev->dev.dma_32bit_limit = true; + return 0; +} + static void via_no_dac(struct pci_dev *dev) { - if (forbid_dac == 0) { + if (!disable_dac_quirk) { dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n"); - forbid_dac = 1; + pci_walk_bus(dev->subordinate, via_no_dac_cb, NULL); } } DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c index e47b2dbbdef3..c06c4c16c6b6 100644 --- a/arch/x86/kernel/perf_regs.c +++ b/arch/x86/kernel/perf_regs.c @@ -151,17 +151,19 @@ void perf_get_regs_user(struct perf_regs *regs_user, regs_user_copy->sp = user_regs->sp; regs_user_copy->cs = user_regs->cs; regs_user_copy->ss = user_regs->ss; - /* - * Most system calls don't save these registers, don't report them. + * Store user space frame-pointer value on sample + * to facilitate stack unwinding for cases when + * user space executable code has such support + * enabled at compile time: */ + regs_user_copy->bp = user_regs->bp; + regs_user_copy->bx = -1; - regs_user_copy->bp = -1; regs_user_copy->r12 = -1; regs_user_copy->r13 = -1; regs_user_copy->r14 = -1; regs_user_copy->r15 = -1; - /* * For this to be at all useful, we need a reasonable guess for * the ABI. Be careful: we're in NMI context, and we're diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 03408b942adb..30ca2d1a9231 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -38,6 +38,7 @@ #include <asm/switch_to.h> #include <asm/desc.h> #include <asm/prctl.h> +#include <asm/spec-ctrl.h> /* * per-CPU TSS segments. Threads are completely 'soft' on Linux, @@ -278,6 +279,148 @@ static inline void switch_to_bitmap(struct tss_struct *tss, } } +#ifdef CONFIG_SMP + +struct ssb_state { + struct ssb_state *shared_state; + raw_spinlock_t lock; + unsigned int disable_state; + unsigned long local_state; +}; + +#define LSTATE_SSB 0 + +static DEFINE_PER_CPU(struct ssb_state, ssb_state); + +void speculative_store_bypass_ht_init(void) +{ + struct ssb_state *st = this_cpu_ptr(&ssb_state); + unsigned int this_cpu = smp_processor_id(); + unsigned int cpu; + + st->local_state = 0; + + /* + * Shared state setup happens once on the first bringup + * of the CPU. It's not destroyed on CPU hotunplug. + */ + if (st->shared_state) + return; + + raw_spin_lock_init(&st->lock); + + /* + * Go over HT siblings and check whether one of them has set up the + * shared state pointer already. + */ + for_each_cpu(cpu, topology_sibling_cpumask(this_cpu)) { + if (cpu == this_cpu) + continue; + + if (!per_cpu(ssb_state, cpu).shared_state) + continue; + + /* Link it to the state of the sibling: */ + st->shared_state = per_cpu(ssb_state, cpu).shared_state; + return; + } + + /* + * First HT sibling to come up on the core. Link shared state of + * the first HT sibling to itself. The siblings on the same core + * which come up later will see the shared state pointer and link + * themself to the state of this CPU. + */ + st->shared_state = st; +} + +/* + * Logic is: First HT sibling enables SSBD for both siblings in the core + * and last sibling to disable it, disables it for the whole core. This how + * MSR_SPEC_CTRL works in "hardware": + * + * CORE_SPEC_CTRL = THREAD0_SPEC_CTRL | THREAD1_SPEC_CTRL + */ +static __always_inline void amd_set_core_ssb_state(unsigned long tifn) +{ + struct ssb_state *st = this_cpu_ptr(&ssb_state); + u64 msr = x86_amd_ls_cfg_base; + + if (!static_cpu_has(X86_FEATURE_ZEN)) { + msr |= ssbd_tif_to_amd_ls_cfg(tifn); + wrmsrl(MSR_AMD64_LS_CFG, msr); + return; + } + + if (tifn & _TIF_SSBD) { + /* + * Since this can race with prctl(), block reentry on the + * same CPU. + */ + if (__test_and_set_bit(LSTATE_SSB, &st->local_state)) + return; + + msr |= x86_amd_ls_cfg_ssbd_mask; + + raw_spin_lock(&st->shared_state->lock); + /* First sibling enables SSBD: */ + if (!st->shared_state->disable_state) + wrmsrl(MSR_AMD64_LS_CFG, msr); + st->shared_state->disable_state++; + raw_spin_unlock(&st->shared_state->lock); + } else { + if (!__test_and_clear_bit(LSTATE_SSB, &st->local_state)) + return; + + raw_spin_lock(&st->shared_state->lock); + st->shared_state->disable_state--; + if (!st->shared_state->disable_state) + wrmsrl(MSR_AMD64_LS_CFG, msr); + raw_spin_unlock(&st->shared_state->lock); + } +} +#else +static __always_inline void amd_set_core_ssb_state(unsigned long tifn) +{ + u64 msr = x86_amd_ls_cfg_base | ssbd_tif_to_amd_ls_cfg(tifn); + + wrmsrl(MSR_AMD64_LS_CFG, msr); +} +#endif + +static __always_inline void amd_set_ssb_virt_state(unsigned long tifn) +{ + /* + * SSBD has the same definition in SPEC_CTRL and VIRT_SPEC_CTRL, + * so ssbd_tif_to_spec_ctrl() just works. + */ + wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, ssbd_tif_to_spec_ctrl(tifn)); +} + +static __always_inline void intel_set_ssb_state(unsigned long tifn) +{ + u64 msr = x86_spec_ctrl_base | ssbd_tif_to_spec_ctrl(tifn); + + wrmsrl(MSR_IA32_SPEC_CTRL, msr); +} + +static __always_inline void __speculative_store_bypass_update(unsigned long tifn) +{ + if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) + amd_set_ssb_virt_state(tifn); + else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) + amd_set_core_ssb_state(tifn); + else + intel_set_ssb_state(tifn); +} + +void speculative_store_bypass_update(unsigned long tif) +{ + preempt_disable(); + __speculative_store_bypass_update(tif); + preempt_enable(); +} + void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, struct tss_struct *tss) { @@ -309,6 +452,9 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, if ((tifp ^ tifn) & _TIF_NOCPUID) set_cpuid_faulting(!!(tifn & _TIF_NOCPUID)); + + if ((tifp ^ tifn) & _TIF_SSBD) + __speculative_store_bypass_update(tifn); } /* diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 5224c6099184..0ae659de21eb 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -76,16 +76,14 @@ void __show_regs(struct pt_regs *regs, int all) savesegment(gs, gs); } - printk(KERN_DEFAULT "EIP: %pS\n", (void *)regs->ip); - printk(KERN_DEFAULT "EFLAGS: %08lx CPU: %d\n", regs->flags, - raw_smp_processor_id()); + show_ip(regs, KERN_DEFAULT); printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", regs->ax, regs->bx, regs->cx, regs->dx); printk(KERN_DEFAULT "ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", regs->si, regs->di, regs->bp, sp); - printk(KERN_DEFAULT " DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n", - (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss); + printk(KERN_DEFAULT "DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x EFLAGS: %08lx\n", + (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss, regs->flags); if (!all) return; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 4b100fe0f508..12bb445fb98d 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -542,6 +542,7 @@ void set_personality_64bit(void) clear_thread_flag(TIF_X32); /* Pretend that this comes from a 64bit execve */ task_pt_regs(current)->orig_ax = __NR_execve; + current_thread_info()->status &= ~TS_COMPAT; /* Ensure the corresponding mm is not marked. */ if (current->mm) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index ed5c4cdf0a34..e2ee403865eb 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1377,7 +1377,6 @@ static void fill_sigtrap_info(struct task_struct *tsk, tsk->thread.trap_nr = X86_TRAP_DB; tsk->thread.error_code = error_code; - memset(info, 0, sizeof(*info)); info->si_signo = SIGTRAP; info->si_code = si_code; info->si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL; @@ -1395,6 +1394,7 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, { struct siginfo info; + clear_siginfo(&info); fill_sigtrap_info(tsk, regs, error_code, si_code, &info); /* Send us the fake SIGTRAP */ force_sig_info(SIGTRAP, &info, tsk); diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 761f6af6efa5..637982efecd8 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -123,28 +123,35 @@ u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock, struct pvclock_vcpu_time_info *vcpu_time, - struct timespec *ts) + struct timespec64 *ts) { u32 version; u64 delta; - struct timespec now; + struct timespec64 now; /* get wallclock at system boot */ do { version = wall_clock->version; rmb(); /* fetch version before time */ + /* + * Note: wall_clock->sec is a u32 value, so it can + * only store dates between 1970 and 2106. To allow + * times beyond that, we need to create a new hypercall + * interface with an extended pvclock_wall_clock structure + * like ARM has. + */ now.tv_sec = wall_clock->sec; now.tv_nsec = wall_clock->nsec; rmb(); /* fetch time before checking version */ } while ((wall_clock->version & 1) || (version != wall_clock->version)); delta = pvclock_clocksource_read(vcpu_time); /* time since system boot */ - delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec; + delta += now.tv_sec * NSEC_PER_SEC + now.tv_nsec; now.tv_nsec = do_div(delta, NSEC_PER_SEC); now.tv_sec = delta; - set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); + set_normalized_timespec64(ts, now.tv_sec, now.tv_nsec); } void pvclock_set_pvti_cpu0_va(struct pvclock_vsyscall_time_info *pvti) diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index f7b82ed7b5b5..586f718b8e95 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -39,7 +39,7 @@ EXPORT_SYMBOL(rtc_lock); * jump to the next second precisely 500 ms later. Check the Motorola * MC146818A or Dallas DS12887 data sheet for details. */ -int mach_set_rtc_mmss(const struct timespec *now) +int mach_set_rtc_mmss(const struct timespec64 *now) { unsigned long long nowtime = now->tv_sec; struct rtc_time tm; @@ -60,7 +60,7 @@ int mach_set_rtc_mmss(const struct timespec *now) return retval; } -void mach_get_cmos_time(struct timespec *now) +void mach_get_cmos_time(struct timespec64 *now) { unsigned int status, year, mon, day, hour, min, sec, century = 0; unsigned long flags; @@ -118,7 +118,7 @@ void mach_get_cmos_time(struct timespec *now) } else year += CMOS_YEARS_OFFS; - now->tv_sec = mktime(year, mon, day, hour, min, sec); + now->tv_sec = mktime64(year, mon, day, hour, min, sec); now->tv_nsec = 0; } @@ -145,13 +145,13 @@ void rtc_cmos_write(unsigned char val, unsigned char addr) } EXPORT_SYMBOL(rtc_cmos_write); -int update_persistent_clock(struct timespec now) +int update_persistent_clock64(struct timespec64 now) { return x86_platform.set_wallclock(&now); } /* not static: needed by APM */ -void read_persistent_clock(struct timespec *ts) +void read_persistent_clock64(struct timespec64 *ts) { x86_platform.get_wallclock(ts); } diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index 14c057f29979..9ccbf0576cd0 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -29,7 +29,7 @@ static inline void signal_compat_build_tests(void) BUILD_BUG_ON(NSIGFPE != 15); BUILD_BUG_ON(NSIGSEGV != 7); BUILD_BUG_ON(NSIGBUS != 5); - BUILD_BUG_ON(NSIGTRAP != 4); + BUILD_BUG_ON(NSIGTRAP != 5); BUILD_BUG_ON(NSIGCHLD != 6); BUILD_BUG_ON(NSIGSYS != 1); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 0f1cbb042f49..c2f7d1d2a5c3 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -79,13 +79,7 @@ #include <asm/qspinlock.h> #include <asm/intel-family.h> #include <asm/cpu_device_id.h> - -/* Number of siblings per CPU package */ -int smp_num_siblings = 1; -EXPORT_SYMBOL(smp_num_siblings); - -/* Last level cache ID of each logical CPU */ -DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id) = BAD_APICID; +#include <asm/spec-ctrl.h> /* representing HT siblings of each logical CPU */ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map); @@ -244,6 +238,8 @@ static void notrace start_secondary(void *unused) */ check_tsc_sync_target(); + speculative_store_bypass_ht_init(); + /* * Lock vector_lock, set CPU online and bring the vector * allocator online. Online must be set with vector_lock held @@ -1292,6 +1288,8 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) set_mtrr_aps_delayed_init(); smp_quirk_init_udelay(); + + speculative_store_bypass_ht_init(); } void arch_enable_nonboot_cpus_begin(void) diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index a3f15ed545b5..6a78d4b36a79 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/compat.h> #include <linux/errno.h> #include <linux/sched.h> #include <linux/sched/mm.h> @@ -19,7 +20,6 @@ #include <linux/elf.h> #include <asm/elf.h> -#include <asm/compat.h> #include <asm/ia32.h> #include <asm/syscalls.h> #include <asm/mpx.h> diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 03f3d7695dac..a535dd64de63 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -299,6 +299,7 @@ static void do_error_trap(struct pt_regs *regs, long error_code, char *str, if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) != NOTIFY_STOP) { cond_local_irq_enable(regs); + clear_siginfo(&info); do_trap(trapnr, signr, str, regs, error_code, fill_trap_info(regs, signr, trapnr, &info)); } @@ -854,6 +855,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) task->thread.trap_nr = trapnr; task->thread.error_code = error_code; + clear_siginfo(&info); info.si_signo = SIGFPE; info.si_errno = 0; info.si_addr = (void __user *)uprobe_get_trap_addr(regs); @@ -929,6 +931,7 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); local_irq_enable(); + clear_siginfo(&info); info.si_signo = SIGILL; info.si_errno = 0; info.si_code = ILL_BADSTK; diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c index f44ce0fb3583..ff20b35e98dd 100644 --- a/arch/x86/kernel/umip.c +++ b/arch/x86/kernel/umip.c @@ -278,6 +278,7 @@ static void force_sig_info_umip_fault(void __user *addr, struct pt_regs *regs) tsk->thread.error_code = X86_PF_USER | X86_PF_WRITE; tsk->thread.trap_nr = X86_TRAP_PF; + clear_siginfo(&info); info.si_signo = SIGSEGV; info.si_errno = 0; info.si_code = SEGV_MAPERR; diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 85c7ef23d99f..58d8d800875d 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -299,6 +299,10 @@ static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool if (is_prefix_bad(insn)) return -ENOTSUPP; + /* We should not singlestep on the exception masking instructions */ + if (insn_masking_exception(insn)) + return -ENOTSUPP; + if (x86_64) good_insns = good_insns_64; else @@ -1079,8 +1083,8 @@ arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs return orig_ret_vaddr; if (nleft != rasize) { - pr_err("uprobe: return address clobbered: pid=%d, %%sp=%#lx, " - "%%ip=%#lx\n", current->pid, regs->sp, regs->ip); + pr_err("return address clobbered: pid=%d, %%sp=%#lx, %%ip=%#lx\n", + current->pid, regs->sp, regs->ip); force_sig_info(SIGSEGV, SEND_SIG_FORCED, current); } diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 795f3a80e576..5e1458f609a1 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -117,11 +117,11 @@ SECTIONS #ifdef CONFIG_X86_64 . = ALIGN(PAGE_SIZE); - VMLINUX_SYMBOL(__entry_trampoline_start) = .; + __entry_trampoline_start = .; _entry_trampoline = .; *(.entry_trampoline) . = ALIGN(PAGE_SIZE); - VMLINUX_SYMBOL(__entry_trampoline_end) = .; + __entry_trampoline_end = .; ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big"); #endif diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 82055b90a8b3..92bf2f2e7cdd 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -379,7 +379,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 0x80000008.ebx */ const u32 kvm_cpuid_8000_0008_ebx_x86_features = - F(IBPB) | F(IBRS); + F(AMD_IBPB) | F(AMD_IBRS) | F(VIRT_SSBD); /* cpuid 0xC0000001.edx */ const u32 kvm_cpuid_C000_0001_edx_x86_features = @@ -408,7 +408,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 7.0.edx*/ const u32 kvm_cpuid_7_0_edx_x86_features = F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) | - F(ARCH_CAPABILITIES); + F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES); /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); @@ -495,6 +495,11 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->ecx &= ~F(PKU); entry->edx &= kvm_cpuid_7_0_edx_x86_features; cpuid_mask(&entry->edx, CPUID_7_EDX); + /* + * We emulate ARCH_CAPABILITIES in software even + * if the host doesn't support it. + */ + entry->edx |= F(ARCH_CAPABILITIES); } else { entry->ebx = 0; entry->ecx = 0; @@ -647,13 +652,20 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, g_phys_as = phys_as; entry->eax = g_phys_as | (virt_as << 8); entry->edx = 0; - /* IBRS and IBPB aren't necessarily present in hardware cpuid */ - if (boot_cpu_has(X86_FEATURE_IBPB)) - entry->ebx |= F(IBPB); - if (boot_cpu_has(X86_FEATURE_IBRS)) - entry->ebx |= F(IBRS); + /* + * IBRS, IBPB and VIRT_SSBD aren't necessarily present in + * hardware cpuid + */ + if (boot_cpu_has(X86_FEATURE_AMD_IBPB)) + entry->ebx |= F(AMD_IBPB); + if (boot_cpu_has(X86_FEATURE_AMD_IBRS)) + entry->ebx |= F(AMD_IBRS); + if (boot_cpu_has(X86_FEATURE_VIRT_SSBD)) + entry->ebx |= F(VIRT_SSBD); entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features; cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX); + if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD)) + entry->ebx |= F(VIRT_SSBD); break; } case 0x80000019: diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 5708e951a5c6..46ff64da44ca 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1260,14 +1260,18 @@ static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result) } } -static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu) +static int kvm_hv_hypercall_complete(struct kvm_vcpu *vcpu, u64 result) { - struct kvm_run *run = vcpu->run; - - kvm_hv_hypercall_set_result(vcpu, run->hyperv.u.hcall.result); + kvm_hv_hypercall_set_result(vcpu, result); + ++vcpu->stat.hypercalls; return kvm_skip_emulated_instruction(vcpu); } +static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu) +{ + return kvm_hv_hypercall_complete(vcpu, vcpu->run->hyperv.u.hcall.result); +} + static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, bool fast, u64 param) { struct eventfd_ctx *eventfd; @@ -1350,7 +1354,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) /* Hypercall continuation is not supported yet */ if (rep_cnt || rep_idx) { ret = HV_STATUS_INVALID_HYPERCALL_CODE; - goto set_result; + goto out; } switch (code) { @@ -1381,9 +1385,8 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) break; } -set_result: - kvm_hv_hypercall_set_result(vcpu, ret); - return 1; +out: + return kvm_hv_hypercall_complete(vcpu, ret); } void kvm_hv_init_vm(struct kvm *kvm) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index b74c9c1405b9..3773c4625114 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1522,11 +1522,23 @@ static bool set_target_expiration(struct kvm_lapic *apic) static void advance_periodic_target_expiration(struct kvm_lapic *apic) { - apic->lapic_timer.tscdeadline += - nsec_to_cycles(apic->vcpu, apic->lapic_timer.period); + ktime_t now = ktime_get(); + u64 tscl = rdtsc(); + ktime_t delta; + + /* + * Synchronize both deadlines to the same time source or + * differences in the periods (caused by differences in the + * underlying clocks or numerical approximation errors) will + * cause the two to drift apart over time as the errors + * accumulate. + */ apic->lapic_timer.target_expiration = ktime_add_ns(apic->lapic_timer.target_expiration, apic->lapic_timer.period); + delta = ktime_sub(apic->lapic_timer.target_expiration, now); + apic->lapic_timer.tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) + + nsec_to_cycles(apic->vcpu, delta); } static void start_sw_period(struct kvm_lapic *apic) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 8494dbae41b9..d634f0332c0f 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3007,6 +3007,7 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct * { siginfo_t info; + clear_siginfo(&info); info.si_signo = SIGBUS; info.si_errno = 0; info.si_code = BUS_MCEERR_AR; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 1fc05e428aba..26110c202b19 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -49,7 +49,7 @@ #include <asm/debugreg.h> #include <asm/kvm_para.h> #include <asm/irq_remapping.h> -#include <asm/nospec-branch.h> +#include <asm/spec-ctrl.h> #include <asm/virtext.h> #include "trace.h" @@ -213,6 +213,12 @@ struct vcpu_svm { } host; u64 spec_ctrl; + /* + * Contains guest-controlled bits of VIRT_SPEC_CTRL, which will be + * translated into the appropriate L2_CFG bits on the host to + * perform speculative control. + */ + u64 virt_spec_ctrl; u32 *msrpm; @@ -2060,6 +2066,7 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.microcode_version = 0x01000065; svm->spec_ctrl = 0; + svm->virt_spec_ctrl = 0; if (!init_event) { svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE | @@ -4108,11 +4115,18 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_SPEC_CTRL: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_IBRS)) + !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS)) return 1; msr_info->data = svm->spec_ctrl; break; + case MSR_AMD64_VIRT_SPEC_CTRL: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD)) + return 1; + + msr_info->data = svm->virt_spec_ctrl; + break; case MSR_F15H_IC_CFG: { int family, model; @@ -4203,7 +4217,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) break; case MSR_IA32_SPEC_CTRL: if (!msr->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_IBRS)) + !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS)) return 1; /* The STIBP bit doesn't fault even if it's not advertised */ @@ -4230,7 +4244,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) break; case MSR_IA32_PRED_CMD: if (!msr->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_IBPB)) + !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB)) return 1; if (data & ~PRED_CMD_IBPB) @@ -4244,6 +4258,16 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) break; set_msr_interception(svm->msrpm, MSR_IA32_PRED_CMD, 0, 1); break; + case MSR_AMD64_VIRT_SPEC_CTRL: + if (!msr->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD)) + return 1; + + if (data & ~SPEC_CTRL_SSBD) + return 1; + + svm->virt_spec_ctrl = data; + break; case MSR_STAR: svm->vmcb->save.star = data; break; @@ -5557,8 +5581,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) * is no need to worry about the conditional branch over the wrmsr * being speculatively taken. */ - if (svm->spec_ctrl) - native_wrmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl); + x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl); asm volatile ( "push %%" _ASM_BP "; \n\t" @@ -5652,6 +5675,18 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) #endif ); + /* Eliminate branch target predictions from guest mode */ + vmexit_fill_RSB(); + +#ifdef CONFIG_X86_64 + wrmsrl(MSR_GS_BASE, svm->host.gs_base); +#else + loadsegment(fs, svm->host.fs); +#ifndef CONFIG_X86_32_LAZY_GS + loadsegment(gs, svm->host.gs); +#endif +#endif + /* * We do not use IBRS in the kernel. If this vCPU has used the * SPEC_CTRL MSR it may have left it on; save the value and @@ -5670,20 +5705,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))) svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); - if (svm->spec_ctrl) - native_wrmsrl(MSR_IA32_SPEC_CTRL, 0); - - /* Eliminate branch target predictions from guest mode */ - vmexit_fill_RSB(); - -#ifdef CONFIG_X86_64 - wrmsrl(MSR_GS_BASE, svm->host.gs_base); -#else - loadsegment(fs, svm->host.fs); -#ifndef CONFIG_X86_32_LAZY_GS - loadsegment(gs, svm->host.gs); -#endif -#endif + x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl); reload_tss(vcpu); @@ -5786,7 +5808,7 @@ static bool svm_cpu_has_accelerated_tpr(void) return false; } -static bool svm_has_high_real_mode_segbase(void) +static bool svm_has_emulated_msr(int index) { return true; } @@ -7012,7 +7034,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .hardware_enable = svm_hardware_enable, .hardware_disable = svm_hardware_disable, .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr, - .cpu_has_high_real_mode_segbase = svm_has_high_real_mode_segbase, + .has_emulated_msr = svm_has_emulated_msr, .vcpu_create = svm_create_vcpu, .vcpu_free = svm_free_vcpu, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 3f1696570b41..40aa29204baf 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -51,7 +51,7 @@ #include <asm/apic.h> #include <asm/irq_remapping.h> #include <asm/mmu_context.h> -#include <asm/nospec-branch.h> +#include <asm/spec-ctrl.h> #include <asm/mshyperv.h> #include "trace.h" @@ -3529,7 +3529,6 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return kvm_get_msr_common(vcpu, msr_info); case MSR_IA32_SPEC_CTRL: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) && !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) return 1; @@ -3648,12 +3647,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_SPEC_CTRL: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) && !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) return 1; /* The STIBP bit doesn't fault even if it's not advertised */ - if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP)) + if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD)) return 1; vmx->spec_ctrl = data; @@ -3679,7 +3677,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_PRED_CMD: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_IBPB) && !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) return 1; @@ -9488,9 +9485,21 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) } STACK_FRAME_NON_STANDARD(vmx_handle_external_intr); -static bool vmx_has_high_real_mode_segbase(void) +static bool vmx_has_emulated_msr(int index) { - return enable_unrestricted_guest || emulate_invalid_guest_state; + switch (index) { + case MSR_IA32_SMBASE: + /* + * We cannot do SMM unless we can run the guest in big + * real mode. + */ + return enable_unrestricted_guest || emulate_invalid_guest_state; + case MSR_AMD64_VIRT_SPEC_CTRL: + /* This is AMD only. */ + return false; + default: + return true; + } } static bool vmx_mpx_supported(void) @@ -9722,8 +9731,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) * is no need to worry about the conditional branch over the wrmsr * being speculatively taken. */ - if (vmx->spec_ctrl) - native_wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl); + x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0); vmx->__launched = vmx->loaded_vmcs->launched; @@ -9871,8 +9879,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))) vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); - if (vmx->spec_ctrl) - native_wrmsrl(MSR_IA32_SPEC_CTRL, 0); + x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0); /* Eliminate branch target predictions from guest mode */ vmexit_fill_RSB(); @@ -12632,7 +12639,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .hardware_enable = hardware_enable, .hardware_disable = hardware_disable, .cpu_has_accelerated_tpr = report_flexpriority, - .cpu_has_high_real_mode_segbase = vmx_has_high_real_mode_segbase, + .has_emulated_msr = vmx_has_emulated_msr, .vm_init = vmx_vm_init, .vm_alloc = vmx_vm_alloc, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 59371de5d722..71e7cda6d014 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1061,6 +1061,7 @@ static u32 emulated_msrs[] = { MSR_SMI_COUNT, MSR_PLATFORM_INFO, MSR_MISC_FEATURES_ENABLES, + MSR_AMD64_VIRT_SPEC_CTRL, }; static unsigned num_emulated_msrs; @@ -2906,7 +2907,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) * fringe case that is not enabled except via specific settings * of the module parameters. */ - r = kvm_x86_ops->cpu_has_high_real_mode_segbase(); + r = kvm_x86_ops->has_emulated_msr(MSR_IA32_SMBASE); break; case KVM_CAP_VAPIC: r = !kvm_x86_ops->cpu_has_accelerated_tpr(); @@ -4606,14 +4607,8 @@ static void kvm_init_msr_list(void) num_msrs_to_save = j; for (i = j = 0; i < ARRAY_SIZE(emulated_msrs); i++) { - switch (emulated_msrs[i]) { - case MSR_IA32_SMBASE: - if (!kvm_x86_ops->cpu_has_high_real_mode_segbase()) - continue; - break; - default: - break; - } + if (!kvm_x86_ops->has_emulated_msr(emulated_msrs[i])) + continue; if (j < i) emulated_msrs[j] = emulated_msrs[i]; @@ -6676,11 +6671,8 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) unsigned long nr, a0, a1, a2, a3, ret; int op_64_bit; - if (kvm_hv_hypercall_enabled(vcpu->kvm)) { - if (!kvm_hv_hypercall(vcpu)) - return 0; - goto out; - } + if (kvm_hv_hypercall_enabled(vcpu->kvm)) + return kvm_hv_hypercall(vcpu); nr = kvm_register_read(vcpu, VCPU_REGS_RAX); a0 = kvm_register_read(vcpu, VCPU_REGS_RBX); @@ -6701,7 +6693,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) if (kvm_x86_ops->get_cpl(vcpu) != 0) { ret = -KVM_EPERM; - goto out_error; + goto out; } switch (nr) { @@ -6721,12 +6713,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) ret = -KVM_ENOSYS; break; } -out_error: +out: if (!op_64_bit) ret = (u32)ret; kvm_register_write(vcpu, VCPU_REGS_RAX, ret); -out: ++vcpu->stat.hypercalls; return kvm_skip_emulated_instruction(vcpu); } @@ -7985,6 +7976,7 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { struct msr_data apic_base_msr; int mmu_reset_needed = 0; + int cpuid_update_needed = 0; int pending_vec, max_bits, idx; struct desc_ptr dt; int ret = -EINVAL; @@ -8023,8 +8015,10 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) vcpu->arch.cr0 = sregs->cr0; mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; + cpuid_update_needed |= ((kvm_read_cr4(vcpu) ^ sregs->cr4) & + (X86_CR4_OSXSAVE | X86_CR4_PKE)); kvm_x86_ops->set_cr4(vcpu, sregs->cr4); - if (sregs->cr4 & (X86_CR4_OSXSAVE | X86_CR4_PKE)) + if (cpuid_update_needed) kvm_update_cpuid(vcpu); idx = srcu_read_lock(&vcpu->kvm->srcu); diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 9a53a06e5a3e..c3b527a9f95d 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -184,11 +184,11 @@ ENDPROC(memcpy_orig) #ifndef CONFIG_UML /* - * memcpy_mcsafe_unrolled - memory copy with machine check exception handling + * __memcpy_mcsafe - memory copy with machine check exception handling * Note that we only catch machine checks when reading the source addresses. * Writes to target are posted and don't generate machine checks. */ -ENTRY(memcpy_mcsafe_unrolled) +ENTRY(__memcpy_mcsafe) cmpl $8, %edx /* Less than 8 bytes? Go to byte copy loop */ jb .L_no_whole_words @@ -204,58 +204,29 @@ ENTRY(memcpy_mcsafe_unrolled) subl $8, %ecx negl %ecx subl %ecx, %edx -.L_copy_leading_bytes: +.L_read_leading_bytes: movb (%rsi), %al +.L_write_leading_bytes: movb %al, (%rdi) incq %rsi incq %rdi decl %ecx - jnz .L_copy_leading_bytes + jnz .L_read_leading_bytes .L_8byte_aligned: - /* Figure out how many whole cache lines (64-bytes) to copy */ - movl %edx, %ecx - andl $63, %edx - shrl $6, %ecx - jz .L_no_whole_cache_lines - - /* Loop copying whole cache lines */ -.L_cache_w0: movq (%rsi), %r8 -.L_cache_w1: movq 1*8(%rsi), %r9 -.L_cache_w2: movq 2*8(%rsi), %r10 -.L_cache_w3: movq 3*8(%rsi), %r11 - movq %r8, (%rdi) - movq %r9, 1*8(%rdi) - movq %r10, 2*8(%rdi) - movq %r11, 3*8(%rdi) -.L_cache_w4: movq 4*8(%rsi), %r8 -.L_cache_w5: movq 5*8(%rsi), %r9 -.L_cache_w6: movq 6*8(%rsi), %r10 -.L_cache_w7: movq 7*8(%rsi), %r11 - movq %r8, 4*8(%rdi) - movq %r9, 5*8(%rdi) - movq %r10, 6*8(%rdi) - movq %r11, 7*8(%rdi) - leaq 64(%rsi), %rsi - leaq 64(%rdi), %rdi - decl %ecx - jnz .L_cache_w0 - - /* Are there any trailing 8-byte words? */ -.L_no_whole_cache_lines: movl %edx, %ecx andl $7, %edx shrl $3, %ecx jz .L_no_whole_words - /* Copy trailing words */ -.L_copy_trailing_words: +.L_read_words: movq (%rsi), %r8 - mov %r8, (%rdi) - leaq 8(%rsi), %rsi - leaq 8(%rdi), %rdi +.L_write_words: + movq %r8, (%rdi) + addq $8, %rsi + addq $8, %rdi decl %ecx - jnz .L_copy_trailing_words + jnz .L_read_words /* Any trailing bytes? */ .L_no_whole_words: @@ -264,38 +235,53 @@ ENTRY(memcpy_mcsafe_unrolled) /* Copy trailing bytes */ movl %edx, %ecx -.L_copy_trailing_bytes: +.L_read_trailing_bytes: movb (%rsi), %al +.L_write_trailing_bytes: movb %al, (%rdi) incq %rsi incq %rdi decl %ecx - jnz .L_copy_trailing_bytes + jnz .L_read_trailing_bytes /* Copy successful. Return zero */ .L_done_memcpy_trap: xorq %rax, %rax ret -ENDPROC(memcpy_mcsafe_unrolled) -EXPORT_SYMBOL_GPL(memcpy_mcsafe_unrolled) +ENDPROC(__memcpy_mcsafe) +EXPORT_SYMBOL_GPL(__memcpy_mcsafe) .section .fixup, "ax" - /* Return -EFAULT for any failure */ -.L_memcpy_mcsafe_fail: - mov $-EFAULT, %rax + /* + * Return number of bytes not copied for any failure. Note that + * there is no "tail" handling since the source buffer is 8-byte + * aligned and poison is cacheline aligned. + */ +.E_read_words: + shll $3, %ecx +.E_leading_bytes: + addl %edx, %ecx +.E_trailing_bytes: + mov %ecx, %eax ret + /* + * For write fault handling, given the destination is unaligned, + * we handle faults on multi-byte writes with a byte-by-byte + * copy up to the write-protected page. + */ +.E_write_words: + shll $3, %ecx + addl %edx, %ecx + movl %ecx, %edx + jmp mcsafe_handle_tail + .previous - _ASM_EXTABLE_FAULT(.L_copy_leading_bytes, .L_memcpy_mcsafe_fail) - _ASM_EXTABLE_FAULT(.L_cache_w0, .L_memcpy_mcsafe_fail) - _ASM_EXTABLE_FAULT(.L_cache_w1, .L_memcpy_mcsafe_fail) - _ASM_EXTABLE_FAULT(.L_cache_w2, .L_memcpy_mcsafe_fail) - _ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail) - _ASM_EXTABLE_FAULT(.L_cache_w4, .L_memcpy_mcsafe_fail) - _ASM_EXTABLE_FAULT(.L_cache_w5, .L_memcpy_mcsafe_fail) - _ASM_EXTABLE_FAULT(.L_cache_w6, .L_memcpy_mcsafe_fail) - _ASM_EXTABLE_FAULT(.L_cache_w7, .L_memcpy_mcsafe_fail) - _ASM_EXTABLE_FAULT(.L_copy_trailing_words, .L_memcpy_mcsafe_fail) - _ASM_EXTABLE_FAULT(.L_copy_trailing_bytes, .L_memcpy_mcsafe_fail) + _ASM_EXTABLE_FAULT(.L_read_leading_bytes, .E_leading_bytes) + _ASM_EXTABLE_FAULT(.L_read_words, .E_read_words) + _ASM_EXTABLE_FAULT(.L_read_trailing_bytes, .E_trailing_bytes) + _ASM_EXTABLE(.L_write_leading_bytes, .E_leading_bytes) + _ASM_EXTABLE(.L_write_words, .E_write_words) + _ASM_EXTABLE(.L_write_trailing_bytes, .E_trailing_bytes) #endif diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index 75d3776123cc..9c5606d88f61 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -23,13 +23,13 @@ unsigned long __clear_user(void __user *addr, unsigned long size) asm volatile( " testq %[size8],%[size8]\n" " jz 4f\n" - "0: movq %[zero],(%[dst])\n" - " addq %[eight],%[dst]\n" + "0: movq $0,(%[dst])\n" + " addq $8,%[dst]\n" " decl %%ecx ; jnz 0b\n" "4: movq %[size1],%%rcx\n" " testl %%ecx,%%ecx\n" " jz 2f\n" - "1: movb %b[zero],(%[dst])\n" + "1: movb $0,(%[dst])\n" " incq %[dst]\n" " decl %%ecx ; jnz 1b\n" "2:\n" @@ -40,8 +40,7 @@ unsigned long __clear_user(void __user *addr, unsigned long size) _ASM_EXTABLE(0b,3b) _ASM_EXTABLE(1b,2b) : [size8] "=&c"(size), [dst] "=&D" (__d0) - : [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr), - [zero] "r" (0UL), [eight] "r" (8UL)); + : [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr)); clac(); return size; } @@ -75,6 +74,27 @@ copy_user_handle_tail(char *to, char *from, unsigned len) return len; } +/* + * Similar to copy_user_handle_tail, probe for the write fault point, + * but reuse __memcpy_mcsafe in case a new read error is encountered. + * clac() is handled in _copy_to_iter_mcsafe(). + */ +__visible unsigned long +mcsafe_handle_tail(char *to, char *from, unsigned len) +{ + for (; len; --len, to++, from++) { + /* + * Call the assembly routine back directly since + * memcpy_mcsafe() may silently fallback to memcpy. + */ + unsigned long rem = __memcpy_mcsafe(to, from, 1); + + if (rem) + break; + } + return len; +} + #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE /** * clean_cache_range - write back a cache range with CLWB diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index cc7ff5957194..2f3c9196b834 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -360,7 +360,7 @@ static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st, void *pt) { if (__pa(pt) == __pa(kasan_zero_pmd) || - (pgtable_l5_enabled && __pa(pt) == __pa(kasan_zero_p4d)) || + (pgtable_l5_enabled() && __pa(pt) == __pa(kasan_zero_p4d)) || __pa(pt) == __pa(kasan_zero_pud)) { pgprotval_t prot = pte_flags(kasan_zero_pte[0]); note_page(m, st, __pgprot(prot), 0, 5); @@ -476,8 +476,8 @@ static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, } } -#define pgd_large(a) (pgtable_l5_enabled ? pgd_large(a) : p4d_large(__p4d(pgd_val(a)))) -#define pgd_none(a) (pgtable_l5_enabled ? pgd_none(a) : p4d_none(__p4d(pgd_val(a)))) +#define pgd_large(a) (pgtable_l5_enabled() ? pgd_large(a) : p4d_large(__p4d(pgd_val(a)))) +#define pgd_none(a) (pgtable_l5_enabled() ? pgd_none(a) : p4d_none(__p4d(pgd_val(a)))) static inline bool is_hypervisor_range(int idx) { diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 73bd8c95ac71..9a84a0d08727 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -209,6 +209,7 @@ force_sig_info_fault(int si_signo, int si_code, unsigned long address, unsigned lsb = 0; siginfo_t info; + clear_siginfo(&info); info.si_signo = si_signo; info.si_errno = 0; info.si_code = si_code; @@ -439,7 +440,7 @@ static noinline int vmalloc_fault(unsigned long address) if (pgd_none(*pgd_k)) return -1; - if (pgtable_l5_enabled) { + if (pgtable_l5_enabled()) { if (pgd_none(*pgd)) { set_pgd(pgd, *pgd_k); arch_flush_lazy_mmu_mode(); @@ -454,7 +455,7 @@ static noinline int vmalloc_fault(unsigned long address) if (p4d_none(*p4d_k)) return -1; - if (p4d_none(*p4d) && !pgtable_l5_enabled) { + if (p4d_none(*p4d) && !pgtable_l5_enabled()) { set_p4d(p4d, *p4d_k); arch_flush_lazy_mmu_mode(); } else { @@ -828,6 +829,8 @@ static inline void show_signal_msg(struct pt_regs *regs, unsigned long error_code, unsigned long address, struct task_struct *tsk) { + const char *loglvl = task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG; + if (!unhandled_signal(tsk, SIGSEGV)) return; @@ -835,13 +838,14 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code, return; printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx", - task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, - tsk->comm, task_pid_nr(tsk), address, + loglvl, tsk->comm, task_pid_nr(tsk), address, (void *)regs->ip, (void *)regs->sp, error_code); print_vma_addr(KERN_CONT " in ", regs->ip); printk(KERN_CONT "\n"); + + show_opcodes((u8 *)regs->ip, loglvl); } static void diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c index a2f0c7e20fb0..fe7a12599d8e 100644 --- a/arch/x86/mm/ident_map.c +++ b/arch/x86/mm/ident_map.c @@ -123,7 +123,7 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, result = ident_p4d_init(info, p4d, addr, next); if (result) return result; - if (pgtable_l5_enabled) { + if (pgtable_l5_enabled()) { set_pgd(pgd, __pgd(__pa(p4d) | info->kernpg_flag)); } else { /* diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 0a400606dea0..17383f9677fa 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -180,7 +180,7 @@ static void sync_global_pgds_l4(unsigned long start, unsigned long end) */ void sync_global_pgds(unsigned long start, unsigned long end) { - if (pgtable_l5_enabled) + if (pgtable_l5_enabled()) sync_global_pgds_l5(start, end); else sync_global_pgds_l4(start, end); @@ -643,7 +643,7 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end, unsigned long vaddr = (unsigned long)__va(paddr); int i = p4d_index(vaddr); - if (!pgtable_l5_enabled) + if (!pgtable_l5_enabled()) return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end, page_size_mask); for (; i < PTRS_PER_P4D; i++, paddr = paddr_next) { @@ -723,7 +723,7 @@ kernel_physical_mapping_init(unsigned long paddr_start, page_size_mask); spin_lock(&init_mm.page_table_lock); - if (pgtable_l5_enabled) + if (pgtable_l5_enabled()) pgd_populate(&init_mm, pgd, p4d); else p4d_populate(&init_mm, p4d_offset(pgd, vaddr), (pud_t *) p4d); @@ -1100,7 +1100,7 @@ remove_p4d_table(p4d_t *p4d_start, unsigned long addr, unsigned long end, * 5-level case we should free them. This code will have to change * to adapt for boot-time switching between 4 and 5 level page tables. */ - if (pgtable_l5_enabled) + if (pgtable_l5_enabled()) free_pud_table(pud_base, p4d); } diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 980dbebd0ca7..e3e77527f8df 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -2,10 +2,8 @@ #define DISABLE_BRANCH_PROFILING #define pr_fmt(fmt) "kasan: " fmt -#ifdef CONFIG_X86_5LEVEL -/* Too early to use cpu_feature_enabled() */ -#define pgtable_l5_enabled __pgtable_l5_enabled -#endif +/* cpu_feature_enabled() cannot be used this early */ +#define USE_EARLY_PGTABLE_L5 #include <linux/bootmem.h> #include <linux/kasan.h> @@ -182,7 +180,7 @@ static void __init clear_pgds(unsigned long start, * With folded p4d, pgd_clear() is nop, use p4d_clear() * instead. */ - if (pgtable_l5_enabled) + if (pgtable_l5_enabled()) pgd_clear(pgd); else p4d_clear(p4d_offset(pgd, start)); @@ -197,7 +195,7 @@ static inline p4d_t *early_p4d_offset(pgd_t *pgd, unsigned long addr) { unsigned long p4d; - if (!pgtable_l5_enabled) + if (!pgtable_l5_enabled()) return (p4d_t *)pgd; p4d = __pa_nodebug(pgd_val(*pgd)) & PTE_PFN_MASK; @@ -284,7 +282,7 @@ void __init kasan_early_init(void) for (i = 0; i < PTRS_PER_PUD; i++) kasan_zero_pud[i] = __pud(pud_val); - for (i = 0; pgtable_l5_enabled && i < PTRS_PER_P4D; i++) + for (i = 0; pgtable_l5_enabled() && i < PTRS_PER_P4D; i++) kasan_zero_p4d[i] = __p4d(p4d_val); kasan_map_early_shadow(early_top_pgt); @@ -315,7 +313,7 @@ void __init kasan_init(void) * bunch of things like kernel code, modules, EFI mapping, etc. * We need to take extra steps to not overwrite them. */ - if (pgtable_l5_enabled) { + if (pgtable_l5_enabled()) { void *ptr; ptr = (void *)pgd_page_vaddr(*pgd_offset_k(KASAN_SHADOW_END)); diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index 615cc03ced84..61db77b0eda9 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -78,7 +78,7 @@ void __init kernel_randomize_memory(void) struct rnd_state rand_state; unsigned long remain_entropy; - vaddr_start = pgtable_l5_enabled ? __PAGE_OFFSET_BASE_L5 : __PAGE_OFFSET_BASE_L4; + vaddr_start = pgtable_l5_enabled() ? __PAGE_OFFSET_BASE_L5 : __PAGE_OFFSET_BASE_L4; vaddr = vaddr_start; /* @@ -124,7 +124,7 @@ void __init kernel_randomize_memory(void) */ entropy = remain_entropy / (ARRAY_SIZE(kaslr_regions) - i); prandom_bytes_state(&rand_state, &rand, sizeof(rand)); - if (pgtable_l5_enabled) + if (pgtable_l5_enabled()) entropy = (rand % (entropy + 1)) & P4D_MASK; else entropy = (rand % (entropy + 1)) & PUD_MASK; @@ -136,7 +136,7 @@ void __init kernel_randomize_memory(void) * randomization alignment. */ vaddr += get_padding(&kaslr_regions[i]); - if (pgtable_l5_enabled) + if (pgtable_l5_enabled()) vaddr = round_up(vaddr + 1, P4D_SIZE); else vaddr = round_up(vaddr + 1, PUD_SIZE); @@ -212,7 +212,7 @@ void __meminit init_trampoline(void) return; } - if (pgtable_l5_enabled) + if (pgtable_l5_enabled()) init_trampoline_p4d(); else init_trampoline_pud(); diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 25504d5aa816..fa150855647c 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -136,13 +136,13 @@ static int __init numa_add_memblk_to(int nid, u64 start, u64 end, /* whine about and ignore invalid blks */ if (start > end || nid < 0 || nid >= MAX_NUMNODES) { - pr_warning("NUMA: Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n", - nid, start, end - 1); + pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n", + nid, start, end - 1); return 0; } if (mi->nr_blks >= NR_NODE_MEMBLKS) { - pr_err("NUMA: too many memblk ranges\n"); + pr_err("too many memblk ranges\n"); return -EINVAL; } @@ -267,14 +267,14 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi) */ if (bi->end > bj->start && bi->start < bj->end) { if (bi->nid != bj->nid) { - pr_err("NUMA: node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n", + pr_err("node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n", bi->nid, bi->start, bi->end - 1, bj->nid, bj->start, bj->end - 1); return -EINVAL; } - pr_warning("NUMA: Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n", - bi->nid, bi->start, bi->end - 1, - bj->start, bj->end - 1); + pr_warn("Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n", + bi->nid, bi->start, bi->end - 1, + bj->start, bj->end - 1); } /* @@ -364,7 +364,7 @@ static int __init numa_alloc_distance(void) phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), size, PAGE_SIZE); if (!phys) { - pr_warning("NUMA: Warning: can't allocate distance table!\n"); + pr_warn("Warning: can't allocate distance table!\n"); /* don't retry until explicitly reset */ numa_distance = (void *)1LU; return -ENOMEM; @@ -410,14 +410,14 @@ void __init numa_set_distance(int from, int to, int distance) if (from >= numa_distance_cnt || to >= numa_distance_cnt || from < 0 || to < 0) { - pr_warn_once("NUMA: Warning: node ids are out of bound, from=%d to=%d distance=%d\n", - from, to, distance); + pr_warn_once("Warning: node ids are out of bound, from=%d to=%d distance=%d\n", + from, to, distance); return; } if ((u8)distance != distance || (from == to && distance != LOCAL_DISTANCE)) { - pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n", + pr_warn_once("Warning: invalid distance parameter, from=%d to=%d distance=%d\n", from, to, distance); return; } diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index d7bc0eea20a5..6e98e0a7c923 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -94,26 +94,27 @@ int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey */ if (pkey != -1) return pkey; - /* - * Look for a protection-key-drive execute-only mapping - * which is now being given permissions that are not - * execute-only. Move it back to the default pkey. - */ - if (vma_is_pkey_exec_only(vma) && - (prot & (PROT_READ|PROT_WRITE))) { - return 0; - } + /* * The mapping is execute-only. Go try to get the * execute-only protection key. If we fail to do that, * fall through as if we do not have execute-only - * support. + * support in this mm. */ if (prot == PROT_EXEC) { pkey = execute_only_pkey(vma->vm_mm); if (pkey > 0) return pkey; + } else if (vma_is_pkey_exec_only(vma)) { + /* + * Protections are *not* PROT_EXEC, but the mapping + * is using the exec-only pkey. This mapping was + * PROT_EXEC and will no longer be. Move back to + * the default pkey. + */ + return ARCH_DEFAULT_PKEY; } + /* * This is a vanilla, non-pkey mprotect (or we failed to * setup execute-only), inherit the pkey from the VMA we diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index e055d1a06699..6eb1f34c3c85 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -157,7 +157,7 @@ static void sync_current_stack_to_mm(struct mm_struct *mm) unsigned long sp = current_stack_pointer; pgd_t *pgd = pgd_offset(mm, sp); - if (pgtable_l5_enabled) { + if (pgtable_l5_enabled()) { if (unlikely(pgd_none(*pgd))) { pgd_t *pgd_ref = pgd_offset_k(sp); diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 263c8453815e..d765acedc05c 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1,4 +1,5 @@ -/* bpf_jit_comp.c : BPF JIT compiler +/* + * bpf_jit_comp.c: BPF JIT compiler * * Copyright (C) 2011-2013 Eric Dumazet (eric.dumazet@gmail.com) * Internal BPF Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com @@ -17,7 +18,7 @@ #include <asm/nospec-branch.h> /* - * assembly code in arch/x86/net/bpf_jit.S + * Assembly code in arch/x86/net/bpf_jit.S */ extern u8 sk_load_word[], sk_load_half[], sk_load_byte[]; extern u8 sk_load_word_positive_offset[], sk_load_half_positive_offset[]; @@ -45,14 +46,15 @@ static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len) #define EMIT2(b1, b2) EMIT((b1) + ((b2) << 8), 2) #define EMIT3(b1, b2, b3) EMIT((b1) + ((b2) << 8) + ((b3) << 16), 3) #define EMIT4(b1, b2, b3, b4) EMIT((b1) + ((b2) << 8) + ((b3) << 16) + ((b4) << 24), 4) + #define EMIT1_off32(b1, off) \ - do {EMIT1(b1); EMIT(off, 4); } while (0) + do { EMIT1(b1); EMIT(off, 4); } while (0) #define EMIT2_off32(b1, b2, off) \ - do {EMIT2(b1, b2); EMIT(off, 4); } while (0) + do { EMIT2(b1, b2); EMIT(off, 4); } while (0) #define EMIT3_off32(b1, b2, b3, off) \ - do {EMIT3(b1, b2, b3); EMIT(off, 4); } while (0) + do { EMIT3(b1, b2, b3); EMIT(off, 4); } while (0) #define EMIT4_off32(b1, b2, b3, b4, off) \ - do {EMIT4(b1, b2, b3, b4); EMIT(off, 4); } while (0) + do { EMIT4(b1, b2, b3, b4); EMIT(off, 4); } while (0) static bool is_imm8(int value) { @@ -70,9 +72,10 @@ static bool is_uimm32(u64 value) } /* mov dst, src */ -#define EMIT_mov(DST, SRC) \ - do {if (DST != SRC) \ - EMIT3(add_2mod(0x48, DST, SRC), 0x89, add_2reg(0xC0, DST, SRC)); \ +#define EMIT_mov(DST, SRC) \ + do { \ + if (DST != SRC) \ + EMIT3(add_2mod(0x48, DST, SRC), 0x89, add_2reg(0xC0, DST, SRC)); \ } while (0) static int bpf_size_to_x86_bytes(int bpf_size) @@ -89,7 +92,8 @@ static int bpf_size_to_x86_bytes(int bpf_size) return 0; } -/* list of x86 cond jumps opcodes (. + s8) +/* + * List of x86 cond jumps opcodes (. + s8) * Add 0x10 (and an extra 0x0f) to generate far jumps (. + s32) */ #define X86_JB 0x72 @@ -106,35 +110,37 @@ static int bpf_size_to_x86_bytes(int bpf_size) #define CHOOSE_LOAD_FUNC(K, func) \ ((int)K < 0 ? ((int)K >= SKF_LL_OFF ? func##_negative_offset : func) : func##_positive_offset) -/* pick a register outside of BPF range for JIT internal work */ +/* Pick a register outside of BPF range for JIT internal work */ #define AUX_REG (MAX_BPF_JIT_REG + 1) -/* The following table maps BPF registers to x64 registers. +/* + * The following table maps BPF registers to x86-64 registers. * - * x64 register r12 is unused, since if used as base address + * x86-64 register R12 is unused, since if used as base address * register in load/store instructions, it always needs an * extra byte of encoding and is callee saved. * - * r9 caches skb->len - skb->data_len - * r10 caches skb->data, and used for blinding (if enabled) + * R9 caches skb->len - skb->data_len + * R10 caches skb->data, and used for blinding (if enabled) */ static const int reg2hex[] = { - [BPF_REG_0] = 0, /* rax */ - [BPF_REG_1] = 7, /* rdi */ - [BPF_REG_2] = 6, /* rsi */ - [BPF_REG_3] = 2, /* rdx */ - [BPF_REG_4] = 1, /* rcx */ - [BPF_REG_5] = 0, /* r8 */ - [BPF_REG_6] = 3, /* rbx callee saved */ - [BPF_REG_7] = 5, /* r13 callee saved */ - [BPF_REG_8] = 6, /* r14 callee saved */ - [BPF_REG_9] = 7, /* r15 callee saved */ - [BPF_REG_FP] = 5, /* rbp readonly */ - [BPF_REG_AX] = 2, /* r10 temp register */ - [AUX_REG] = 3, /* r11 temp register */ + [BPF_REG_0] = 0, /* RAX */ + [BPF_REG_1] = 7, /* RDI */ + [BPF_REG_2] = 6, /* RSI */ + [BPF_REG_3] = 2, /* RDX */ + [BPF_REG_4] = 1, /* RCX */ + [BPF_REG_5] = 0, /* R8 */ + [BPF_REG_6] = 3, /* RBX callee saved */ + [BPF_REG_7] = 5, /* R13 callee saved */ + [BPF_REG_8] = 6, /* R14 callee saved */ + [BPF_REG_9] = 7, /* R15 callee saved */ + [BPF_REG_FP] = 5, /* RBP readonly */ + [BPF_REG_AX] = 2, /* R10 temp register */ + [AUX_REG] = 3, /* R11 temp register */ }; -/* is_ereg() == true if BPF register 'reg' maps to x64 r8..r15 +/* + * is_ereg() == true if BPF register 'reg' maps to x86-64 r8..r15 * which need extra byte of encoding. * rax,rcx,...,rbp have simpler encoding */ @@ -153,7 +159,7 @@ static bool is_axreg(u32 reg) return reg == BPF_REG_0; } -/* add modifiers if 'reg' maps to x64 registers r8..r15 */ +/* Add modifiers if 'reg' maps to x86-64 registers R8..R15 */ static u8 add_1mod(u8 byte, u32 reg) { if (is_ereg(reg)) @@ -170,13 +176,13 @@ static u8 add_2mod(u8 byte, u32 r1, u32 r2) return byte; } -/* encode 'dst_reg' register into x64 opcode 'byte' */ +/* Encode 'dst_reg' register into x86-64 opcode 'byte' */ static u8 add_1reg(u8 byte, u32 dst_reg) { return byte + reg2hex[dst_reg]; } -/* encode 'dst_reg' and 'src_reg' registers into x64 opcode 'byte' */ +/* Encode 'dst_reg' and 'src_reg' registers into x86-64 opcode 'byte' */ static u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg) { return byte + reg2hex[dst_reg] + (reg2hex[src_reg] << 3); @@ -184,27 +190,28 @@ static u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg) static void jit_fill_hole(void *area, unsigned int size) { - /* fill whole space with int3 instructions */ + /* Fill whole space with INT3 instructions */ memset(area, 0xcc, size); } struct jit_context { - int cleanup_addr; /* epilogue code offset */ + int cleanup_addr; /* Epilogue code offset */ bool seen_ld_abs; bool seen_ax_reg; }; -/* maximum number of bytes emitted while JITing one eBPF insn */ +/* Maximum number of bytes emitted while JITing one eBPF insn */ #define BPF_MAX_INSN_SIZE 128 #define BPF_INSN_SAFETY 64 #define AUX_STACK_SPACE \ - (32 /* space for rbx, r13, r14, r15 */ + \ - 8 /* space for skb_copy_bits() buffer */) + (32 /* Space for RBX, R13, R14, R15 */ + \ + 8 /* Space for skb_copy_bits() buffer */) #define PROLOGUE_SIZE 37 -/* emit x64 prologue code for BPF program and check it's size. +/* + * Emit x86-64 prologue code for BPF program and check its size. * bpf_tail_call helper will skip it while jumping into another program */ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf) @@ -212,8 +219,11 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf) u8 *prog = *pprog; int cnt = 0; - EMIT1(0x55); /* push rbp */ - EMIT3(0x48, 0x89, 0xE5); /* mov rbp,rsp */ + /* push rbp */ + EMIT1(0x55); + + /* mov rbp,rsp */ + EMIT3(0x48, 0x89, 0xE5); /* sub rsp, rounded_stack_depth + AUX_STACK_SPACE */ EMIT3_off32(0x48, 0x81, 0xEC, @@ -222,14 +232,15 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf) /* sub rbp, AUX_STACK_SPACE */ EMIT4(0x48, 0x83, 0xED, AUX_STACK_SPACE); - /* all classic BPF filters use R6(rbx) save it */ + /* All classic BPF filters use R6(rbx) save it */ /* mov qword ptr [rbp+0],rbx */ EMIT4(0x48, 0x89, 0x5D, 0); - /* bpf_convert_filter() maps classic BPF register X to R7 and uses R8 - * as temporary, so all tcpdump filters need to spill/fill R7(r13) and - * R8(r14). R9(r15) spill could be made conditional, but there is only + /* + * bpf_convert_filter() maps classic BPF register X to R7 and uses R8 + * as temporary, so all tcpdump filters need to spill/fill R7(R13) and + * R8(R14). R9(R15) spill could be made conditional, but there is only * one 'bpf_error' return path out of helper functions inside bpf_jit.S * The overhead of extra spill is negligible for any filter other * than synthetic ones. Therefore not worth adding complexity. @@ -243,9 +254,10 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf) EMIT4(0x4C, 0x89, 0x7D, 24); if (!ebpf_from_cbpf) { - /* Clear the tail call counter (tail_call_cnt): for eBPF tail + /* + * Clear the tail call counter (tail_call_cnt): for eBPF tail * calls we need to reset the counter to 0. It's done in two - * instructions, resetting rax register to 0, and moving it + * instructions, resetting RAX register to 0, and moving it * to the counter location. */ @@ -260,7 +272,9 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf) *pprog = prog; } -/* generate the following code: +/* + * Generate the following code: + * * ... bpf_tail_call(void *ctx, struct bpf_array *array, u64 index) ... * if (index >= array->map.max_entries) * goto out; @@ -278,23 +292,26 @@ static void emit_bpf_tail_call(u8 **pprog) int label1, label2, label3; int cnt = 0; - /* rdi - pointer to ctx + /* + * rdi - pointer to ctx * rsi - pointer to bpf_array * rdx - index in bpf_array */ - /* if (index >= array->map.max_entries) - * goto out; + /* + * if (index >= array->map.max_entries) + * goto out; */ EMIT2(0x89, 0xD2); /* mov edx, edx */ EMIT3(0x39, 0x56, /* cmp dword ptr [rsi + 16], edx */ offsetof(struct bpf_array, map.max_entries)); -#define OFFSET1 (41 + RETPOLINE_RAX_BPF_JIT_SIZE) /* number of bytes to jump */ +#define OFFSET1 (41 + RETPOLINE_RAX_BPF_JIT_SIZE) /* Number of bytes to jump */ EMIT2(X86_JBE, OFFSET1); /* jbe out */ label1 = cnt; - /* if (tail_call_cnt > MAX_TAIL_CALL_CNT) - * goto out; + /* + * if (tail_call_cnt > MAX_TAIL_CALL_CNT) + * goto out; */ EMIT2_off32(0x8B, 0x85, 36); /* mov eax, dword ptr [rbp + 36] */ EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */ @@ -308,8 +325,9 @@ static void emit_bpf_tail_call(u8 **pprog) EMIT4_off32(0x48, 0x8B, 0x84, 0xD6, /* mov rax, [rsi + rdx * 8 + offsetof(...)] */ offsetof(struct bpf_array, ptrs)); - /* if (prog == NULL) - * goto out; + /* + * if (prog == NULL) + * goto out; */ EMIT3(0x48, 0x85, 0xC0); /* test rax,rax */ #define OFFSET3 (8 + RETPOLINE_RAX_BPF_JIT_SIZE) @@ -321,7 +339,8 @@ static void emit_bpf_tail_call(u8 **pprog) offsetof(struct bpf_prog, bpf_func)); EMIT4(0x48, 0x83, 0xC0, PROLOGUE_SIZE); /* add rax, prologue_size */ - /* now we're ready to jump into next BPF program + /* + * Wow we're ready to jump into next BPF program * rdi == ctx (1st arg) * rax == prog->bpf_func + prologue_size */ @@ -340,7 +359,8 @@ static void emit_load_skb_data_hlen(u8 **pprog) u8 *prog = *pprog; int cnt = 0; - /* r9d = skb->len - skb->data_len (headlen) + /* + * r9d = skb->len - skb->data_len (headlen) * r10 = skb->data */ /* mov %r9d, off32(%rdi) */ @@ -361,7 +381,8 @@ static void emit_mov_imm32(u8 **pprog, bool sign_propagate, u8 b1, b2, b3; int cnt = 0; - /* optimization: if imm32 is positive, use 'mov %eax, imm32' + /* + * Optimization: if imm32 is positive, use 'mov %eax, imm32' * (which zero-extends imm32) to save 2 bytes. */ if (sign_propagate && (s32)imm32 < 0) { @@ -373,7 +394,8 @@ static void emit_mov_imm32(u8 **pprog, bool sign_propagate, goto done; } - /* optimization: if imm32 is zero, use 'xor %eax, %eax' + /* + * Optimization: if imm32 is zero, use 'xor %eax, %eax' * to save 3 bytes. */ if (imm32 == 0) { @@ -400,7 +422,8 @@ static void emit_mov_imm64(u8 **pprog, u32 dst_reg, int cnt = 0; if (is_uimm32(((u64)imm32_hi << 32) | (u32)imm32_lo)) { - /* For emitting plain u32, where sign bit must not be + /* + * For emitting plain u32, where sign bit must not be * propagated LLVM tends to load imm64 over mov32 * directly, so save couple of bytes by just doing * 'mov %eax, imm32' instead. @@ -525,7 +548,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, else if (is_ereg(dst_reg)) EMIT1(add_1mod(0x40, dst_reg)); - /* b3 holds 'normal' opcode, b2 short form only valid + /* + * b3 holds 'normal' opcode, b2 short form only valid * in case dst is eax/rax. */ switch (BPF_OP(insn->code)) { @@ -593,7 +617,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, /* mov rax, dst_reg */ EMIT_mov(BPF_REG_0, dst_reg); - /* xor edx, edx + /* + * xor edx, edx * equivalent to 'xor rdx, rdx', but one byte less */ EMIT2(0x31, 0xd2); @@ -655,7 +680,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, } break; } - /* shifts */ + /* Shifts */ case BPF_ALU | BPF_LSH | BPF_K: case BPF_ALU | BPF_RSH | BPF_K: case BPF_ALU | BPF_ARSH | BPF_K: @@ -686,7 +711,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, case BPF_ALU64 | BPF_RSH | BPF_X: case BPF_ALU64 | BPF_ARSH | BPF_X: - /* check for bad case when dst_reg == rcx */ + /* Check for bad case when dst_reg == rcx */ if (dst_reg == BPF_REG_4) { /* mov r11, dst_reg */ EMIT_mov(AUX_REG, dst_reg); @@ -724,13 +749,13 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, case BPF_ALU | BPF_END | BPF_FROM_BE: switch (imm32) { case 16: - /* emit 'ror %ax, 8' to swap lower 2 bytes */ + /* Emit 'ror %ax, 8' to swap lower 2 bytes */ EMIT1(0x66); if (is_ereg(dst_reg)) EMIT1(0x41); EMIT3(0xC1, add_1reg(0xC8, dst_reg), 8); - /* emit 'movzwl eax, ax' */ + /* Emit 'movzwl eax, ax' */ if (is_ereg(dst_reg)) EMIT3(0x45, 0x0F, 0xB7); else @@ -738,7 +763,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, EMIT1(add_2reg(0xC0, dst_reg, dst_reg)); break; case 32: - /* emit 'bswap eax' to swap lower 4 bytes */ + /* Emit 'bswap eax' to swap lower 4 bytes */ if (is_ereg(dst_reg)) EMIT2(0x41, 0x0F); else @@ -746,7 +771,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, EMIT1(add_1reg(0xC8, dst_reg)); break; case 64: - /* emit 'bswap rax' to swap 8 bytes */ + /* Emit 'bswap rax' to swap 8 bytes */ EMIT3(add_1mod(0x48, dst_reg), 0x0F, add_1reg(0xC8, dst_reg)); break; @@ -756,7 +781,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, case BPF_ALU | BPF_END | BPF_FROM_LE: switch (imm32) { case 16: - /* emit 'movzwl eax, ax' to zero extend 16-bit + /* + * Emit 'movzwl eax, ax' to zero extend 16-bit * into 64 bit */ if (is_ereg(dst_reg)) @@ -766,7 +792,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, EMIT1(add_2reg(0xC0, dst_reg, dst_reg)); break; case 32: - /* emit 'mov eax, eax' to clear upper 32-bits */ + /* Emit 'mov eax, eax' to clear upper 32-bits */ if (is_ereg(dst_reg)) EMIT1(0x45); EMIT2(0x89, add_2reg(0xC0, dst_reg, dst_reg)); @@ -809,9 +835,9 @@ st: if (is_imm8(insn->off)) /* STX: *(u8*)(dst_reg + off) = src_reg */ case BPF_STX | BPF_MEM | BPF_B: - /* emit 'mov byte ptr [rax + off], al' */ + /* Emit 'mov byte ptr [rax + off], al' */ if (is_ereg(dst_reg) || is_ereg(src_reg) || - /* have to add extra byte for x86 SIL, DIL regs */ + /* We have to add extra byte for x86 SIL, DIL regs */ src_reg == BPF_REG_1 || src_reg == BPF_REG_2) EMIT2(add_2mod(0x40, dst_reg, src_reg), 0x88); else @@ -840,25 +866,26 @@ stx: if (is_imm8(insn->off)) /* LDX: dst_reg = *(u8*)(src_reg + off) */ case BPF_LDX | BPF_MEM | BPF_B: - /* emit 'movzx rax, byte ptr [rax + off]' */ + /* Emit 'movzx rax, byte ptr [rax + off]' */ EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB6); goto ldx; case BPF_LDX | BPF_MEM | BPF_H: - /* emit 'movzx rax, word ptr [rax + off]' */ + /* Emit 'movzx rax, word ptr [rax + off]' */ EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB7); goto ldx; case BPF_LDX | BPF_MEM | BPF_W: - /* emit 'mov eax, dword ptr [rax+0x14]' */ + /* Emit 'mov eax, dword ptr [rax+0x14]' */ if (is_ereg(dst_reg) || is_ereg(src_reg)) EMIT2(add_2mod(0x40, src_reg, dst_reg), 0x8B); else EMIT1(0x8B); goto ldx; case BPF_LDX | BPF_MEM | BPF_DW: - /* emit 'mov rax, qword ptr [rax+0x14]' */ + /* Emit 'mov rax, qword ptr [rax+0x14]' */ EMIT2(add_2mod(0x48, src_reg, dst_reg), 0x8B); -ldx: /* if insn->off == 0 we can save one extra byte, but - * special case of x86 r13 which always needs an offset +ldx: /* + * If insn->off == 0 we can save one extra byte, but + * special case of x86 R13 which always needs an offset * is not worth the hassle */ if (is_imm8(insn->off)) @@ -870,7 +897,7 @@ ldx: /* if insn->off == 0 we can save one extra byte, but /* STX XADD: lock *(u32*)(dst_reg + off) += src_reg */ case BPF_STX | BPF_XADD | BPF_W: - /* emit 'lock add dword ptr [rax + off], eax' */ + /* Emit 'lock add dword ptr [rax + off], eax' */ if (is_ereg(dst_reg) || is_ereg(src_reg)) EMIT3(0xF0, add_2mod(0x40, dst_reg, src_reg), 0x01); else @@ -897,14 +924,15 @@ xadd: if (is_imm8(insn->off)) } else { EMIT2(0x41, 0x52); /* push %r10 */ EMIT2(0x41, 0x51); /* push %r9 */ - /* need to adjust jmp offset, since + /* + * We need to adjust jmp offset, since * pop %r9, pop %r10 take 4 bytes after call insn */ jmp_offset += 4; } } if (!imm32 || !is_simm32(jmp_offset)) { - pr_err("unsupported bpf func %d addr %p image %p\n", + pr_err("unsupported BPF func %d addr %p image %p\n", imm32, func, image); return -EINVAL; } @@ -970,7 +998,7 @@ xadd: if (is_imm8(insn->off)) else EMIT2_off32(0x81, add_1reg(0xF8, dst_reg), imm32); -emit_cond_jmp: /* convert BPF opcode to x86 */ +emit_cond_jmp: /* Convert BPF opcode to x86 */ switch (BPF_OP(insn->code)) { case BPF_JEQ: jmp_cond = X86_JE; @@ -996,22 +1024,22 @@ emit_cond_jmp: /* convert BPF opcode to x86 */ jmp_cond = X86_JBE; break; case BPF_JSGT: - /* signed '>', GT in x86 */ + /* Signed '>', GT in x86 */ jmp_cond = X86_JG; break; case BPF_JSLT: - /* signed '<', LT in x86 */ + /* Signed '<', LT in x86 */ jmp_cond = X86_JL; break; case BPF_JSGE: - /* signed '>=', GE in x86 */ + /* Signed '>=', GE in x86 */ jmp_cond = X86_JGE; break; case BPF_JSLE: - /* signed '<=', LE in x86 */ + /* Signed '<=', LE in x86 */ jmp_cond = X86_JLE; break; - default: /* to silence gcc warning */ + default: /* to silence GCC warning */ return -EFAULT; } jmp_offset = addrs[i + insn->off] - addrs[i]; @@ -1039,7 +1067,7 @@ emit_cond_jmp: /* convert BPF opcode to x86 */ jmp_offset = addrs[i + insn->off] - addrs[i]; if (!jmp_offset) - /* optimize out nop jumps */ + /* Optimize out nop jumps */ break; emit_jmp: if (is_imm8(jmp_offset)) { @@ -1061,7 +1089,7 @@ common_load: ctx->seen_ld_abs = seen_ld_abs = true; jmp_offset = func - (image + addrs[i]); if (!func || !is_simm32(jmp_offset)) { - pr_err("unsupported bpf func %d addr %p image %p\n", + pr_err("unsupported BPF func %d addr %p image %p\n", imm32, func, image); return -EINVAL; } @@ -1080,7 +1108,8 @@ common_load: EMIT2_off32(0x81, 0xC6, imm32); } } - /* skb pointer is in R6 (%rbx), it will be copied into + /* + * skb pointer is in R6 (%rbx), it will be copied into * %rdi if skb_copy_bits() call is necessary. * sk_load_* helpers also use %r10 and %r9d. * See bpf_jit.S @@ -1111,7 +1140,7 @@ common_load: goto emit_jmp; } seen_exit = true; - /* update cleanup_addr */ + /* Update cleanup_addr */ ctx->cleanup_addr = proglen; /* mov rbx, qword ptr [rbp+0] */ EMIT4(0x48, 0x8B, 0x5D, 0); @@ -1129,10 +1158,11 @@ common_load: break; default: - /* By design x64 JIT should support all BPF instructions + /* + * By design x86-64 JIT should support all BPF instructions. * This error will be seen if new instruction was added - * to interpreter, but not to JIT - * or if there is junk in bpf_prog + * to the interpreter, but not to the JIT, or if there is + * junk in bpf_prog. */ pr_err("bpf_jit: unknown opcode %02x\n", insn->code); return -EINVAL; @@ -1184,7 +1214,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) return orig_prog; tmp = bpf_jit_blind_constants(prog); - /* If blinding was requested and we failed during blinding, + /* + * If blinding was requested and we failed during blinding, * we must fall back to the interpreter. */ if (IS_ERR(tmp)) @@ -1218,8 +1249,9 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) goto out_addrs; } - /* Before first pass, make a rough estimation of addrs[] - * each bpf instruction is translated to less than 64 bytes + /* + * Before first pass, make a rough estimation of addrs[] + * each BPF instruction is translated to less than 64 bytes */ for (proglen = 0, i = 0; i < prog->len; i++) { proglen += 64; @@ -1228,10 +1260,11 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) ctx.cleanup_addr = proglen; skip_init_addrs: - /* JITed image shrinks with every pass and the loop iterates - * until the image stops shrinking. Very large bpf programs + /* + * JITed image shrinks with every pass and the loop iterates + * until the image stops shrinking. Very large BPF programs * may converge on the last pass. In such case do one more - * pass to emit the final image + * pass to emit the final image. */ for (pass = 0; pass < 20 || image; pass++) { proglen = do_jit(prog, addrs, image, oldproglen, &ctx); diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index bed7e7f4e44c..e01f7ceb9e7a 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -225,7 +225,7 @@ int __init efi_alloc_page_tables(void) pud = pud_alloc(&init_mm, p4d, EFI_VA_END); if (!pud) { - if (pgtable_l5_enabled) + if (pgtable_l5_enabled()) free_page((unsigned long) pgd_page_vaddr(*pgd)); free_pages((unsigned long)efi_pgd, PGD_ALLOCATION_ORDER); return -ENOMEM; diff --git a/arch/x86/platform/intel-mid/intel_mid_vrtc.c b/arch/x86/platform/intel-mid/intel_mid_vrtc.c index 58024862a7eb..a52914aa3b6c 100644 --- a/arch/x86/platform/intel-mid/intel_mid_vrtc.c +++ b/arch/x86/platform/intel-mid/intel_mid_vrtc.c @@ -57,7 +57,7 @@ void vrtc_cmos_write(unsigned char val, unsigned char reg) } EXPORT_SYMBOL_GPL(vrtc_cmos_write); -void vrtc_get_time(struct timespec *now) +void vrtc_get_time(struct timespec64 *now) { u8 sec, min, hour, mday, mon; unsigned long flags; @@ -83,18 +83,18 @@ void vrtc_get_time(struct timespec *now) pr_info("vRTC: sec: %d min: %d hour: %d day: %d " "mon: %d year: %d\n", sec, min, hour, mday, mon, year); - now->tv_sec = mktime(year, mon, mday, hour, min, sec); + now->tv_sec = mktime64(year, mon, mday, hour, min, sec); now->tv_nsec = 0; } -int vrtc_set_mmss(const struct timespec *now) +int vrtc_set_mmss(const struct timespec64 *now) { unsigned long flags; struct rtc_time tm; int year; int retval = 0; - rtc_time_to_tm(now->tv_sec, &tm); + rtc_time64_to_tm(now->tv_sec, &tm); if (!rtc_valid_tm(&tm) && tm.tm_year >= 72) { /* * tm.year is the number of years since 1900, and the @@ -110,8 +110,8 @@ int vrtc_set_mmss(const struct timespec *now) vrtc_cmos_write(tm.tm_sec, RTC_SECONDS); spin_unlock_irqrestore(&rtc_lock, flags); } else { - pr_err("%s: Invalid vRTC value: write of %lx to vRTC failed\n", - __func__, now->tv_sec); + pr_err("%s: Invalid vRTC value: write of %llx to vRTC failed\n", + __func__, (s64)now->tv_sec); retval = -EINVAL; } return retval; diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index ccf4a49bb065..67ccf64c8bd8 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -72,7 +72,7 @@ static int set_up_temporary_text_mapping(pgd_t *pgd) * tables used by the image kernel. */ - if (pgtable_l5_enabled) { + if (pgtable_l5_enabled()) { p4d = (p4d_t *)get_safe_page(GFP_ATOMIC); if (!p4d) return -ENOMEM; diff --git a/arch/x86/um/vdso/Makefile b/arch/x86/um/vdso/Makefile index 10003359e633..b2d6967262b2 100644 --- a/arch/x86/um/vdso/Makefile +++ b/arch/x86/um/vdso/Makefile @@ -23,14 +23,14 @@ $(obj)/vdso.o: $(obj)/vdso.so targets += vdso.so vdso.so.dbg vdso.lds $(vobjs-y) -export CPPFLAGS_vdso.lds += -P -C +CPPFLAGS_vdso.lds += -P -C VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \ -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 $(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so -$(obj)/vdso.so.dbg: $(src)/vdso.lds $(vobjs) FORCE +$(obj)/vdso.so.dbg: $(obj)/vdso.lds $(vobjs) FORCE $(call if_changed,vdso) $(obj)/%.so: OBJCOPYFLAGS := -S diff --git a/arch/x86/xen/efi.c b/arch/x86/xen/efi.c index a18703be9ead..1804b27f9632 100644 --- a/arch/x86/xen/efi.c +++ b/arch/x86/xen/efi.c @@ -115,6 +115,61 @@ static efi_system_table_t __init *xen_efi_probe(void) return &efi_systab_xen; } +/* + * Determine whether we're in secure boot mode. + * + * Please keep the logic in sync with + * drivers/firmware/efi/libstub/secureboot.c:efi_get_secureboot(). + */ +static enum efi_secureboot_mode xen_efi_get_secureboot(void) +{ + static efi_guid_t efi_variable_guid = EFI_GLOBAL_VARIABLE_GUID; + static efi_guid_t shim_guid = EFI_SHIM_LOCK_GUID; + efi_status_t status; + u8 moksbstate, secboot, setupmode; + unsigned long size; + + size = sizeof(secboot); + status = efi.get_variable(L"SecureBoot", &efi_variable_guid, + NULL, &size, &secboot); + + if (status == EFI_NOT_FOUND) + return efi_secureboot_mode_disabled; + + if (status != EFI_SUCCESS) + goto out_efi_err; + + size = sizeof(setupmode); + status = efi.get_variable(L"SetupMode", &efi_variable_guid, + NULL, &size, &setupmode); + + if (status != EFI_SUCCESS) + goto out_efi_err; + + if (secboot == 0 || setupmode == 1) + return efi_secureboot_mode_disabled; + + /* See if a user has put the shim into insecure mode. */ + size = sizeof(moksbstate); + status = efi.get_variable(L"MokSBStateRT", &shim_guid, + NULL, &size, &moksbstate); + + /* If it fails, we don't care why. Default to secure. */ + if (status != EFI_SUCCESS) + goto secure_boot_enabled; + + if (moksbstate == 1) + return efi_secureboot_mode_disabled; + + secure_boot_enabled: + pr_info("UEFI Secure Boot is enabled.\n"); + return efi_secureboot_mode_enabled; + + out_efi_err: + pr_err("Could not determine UEFI Secure Boot status.\n"); + return efi_secureboot_mode_unknown; +} + void __init xen_efi_init(void) { efi_system_table_t *efi_systab_xen; @@ -129,6 +184,8 @@ void __init xen_efi_init(void) boot_params.efi_info.efi_systab = (__u32)__pa(efi_systab_xen); boot_params.efi_info.efi_systab_hi = (__u32)(__pa(efi_systab_xen) >> 32); + boot_params.secure_boot = xen_efi_get_secureboot(); + set_bit(EFI_BOOT, &efi.flags); set_bit(EFI_PARAVIRT, &efi.flags); set_bit(EFI_64BIT, &efi.flags); diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 29163c43ebbd..e0f1bcf01d63 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -57,7 +57,7 @@ static u64 xen_clocksource_get_cycles(struct clocksource *cs) return xen_clocksource_read(); } -static void xen_read_wallclock(struct timespec *ts) +static void xen_read_wallclock(struct timespec64 *ts) { struct shared_info *s = HYPERVISOR_shared_info; struct pvclock_wall_clock *wall_clock = &(s->wc); @@ -68,12 +68,12 @@ static void xen_read_wallclock(struct timespec *ts) put_cpu_var(xen_vcpu); } -static void xen_get_wallclock(struct timespec *now) +static void xen_get_wallclock(struct timespec64 *now) { xen_read_wallclock(now); } -static int xen_set_wallclock(const struct timespec *now) +static int xen_set_wallclock(const struct timespec64 *now) { return -ENODEV; } @@ -461,7 +461,7 @@ static void __init xen_time_init(void) { struct pvclock_vcpu_time_info *pvti; int cpu = smp_processor_id(); - struct timespec tp; + struct timespec64 tp; /* As Dom0 is never moved, no penalty on using TSC there */ if (xen_initial_domain()) @@ -479,7 +479,7 @@ static void __init xen_time_init(void) /* Set initial system time with full resolution */ xen_read_wallclock(&tp); - do_settimeofday(&tp); + do_settimeofday64(&tp); setup_force_cpu_cap(X86_FEATURE_TSC); |