diff options
Diffstat (limited to 'arch/x86')
311 files changed, 18040 insertions, 12581 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 7101ac64bb20..f6946b81f74a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -75,7 +75,7 @@ config X86 select ARCH_HAS_PTE_DEVMAP if X86_64 select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 - select ARCH_HAS_UACCESS_MCSAFE if X86_64 && X86_MCE + select ARCH_HAS_COPY_MC if X86_64 select ARCH_HAS_SET_MEMORY select ARCH_HAS_SET_DIRECT_MAP select ARCH_HAS_STRICT_KERNEL_RWX @@ -215,6 +215,8 @@ config X86 select HAVE_FUNCTION_ARG_ACCESS_API select HAVE_STACKPROTECTOR if CC_HAS_SANE_STACKPROTECTOR select HAVE_STACK_VALIDATION if X86_64 + select HAVE_STATIC_CALL + select HAVE_STATIC_CALL_INLINE if HAVE_STACK_VALIDATION select HAVE_RSEQ select HAVE_SYSCALL_TRACEPOINTS select HAVE_UNSTABLE_SCHED_CLOCK @@ -230,6 +232,7 @@ config X86 select RTC_MC146818_LIB select SPARSE_IRQ select SRCU + select STACK_VALIDATION if HAVE_STACK_VALIDATION && (HAVE_STATIC_CALL_INLINE || RETPOLINE) select SYSCTL_EXCEPTION_TRACE select THREAD_INFO_IN_TASK select USER_STACKTRACE_SUPPORT @@ -451,7 +454,6 @@ config GOLDFISH config RETPOLINE bool "Avoid speculative indirect branches in kernel" default y - select STACK_VALIDATION if HAVE_STACK_VALIDATION help Compile kernel with the retpoline compiler options to guard against kernel-to-user data leaks by avoiding speculative indirect @@ -1521,6 +1523,7 @@ config AMD_MEM_ENCRYPT select DYNAMIC_PHYSICAL_MASK select ARCH_USE_MEMREMAP_PROT select ARCH_HAS_FORCE_DMA_UNENCRYPTED + select INSTRUCTION_DECODER help Say yes to enable support for the encryption of system memory. This requires an AMD processor that supports Secure Memory @@ -1968,22 +1971,6 @@ config EFI_MIXED If unsure, say N. -config SECCOMP - def_bool y - prompt "Enable seccomp to safely compute untrusted bytecode" - help - This kernel feature is useful for number crunching applications - that may need to compute untrusted bytecode during their - execution. By using pipes or other transports made available to - the process as file descriptors supporting the read/write - syscalls, it's possible to isolate those applications in - their own address space using seccomp. Once seccomp is - enabled via prctl(PR_SET_SECCOMP), it cannot be disabled - and the task is only allowed to execute a few safe syscalls - defined by each seccomp mode. - - If unsure, say Y. Only embedded should say N here. - source "kernel/Kconfig.hz" config KEXEC diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index ee1d3c5834c6..27b5e2bc6a01 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -62,7 +62,7 @@ config EARLY_PRINTK_USB_XDBC You should normally say N here, unless you want to debug early crashes or need a very simple printk logging facility. -config MCSAFE_TEST +config COPY_MC_TEST def_bool n config EFI_PGT_DUMP diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 4346ffb2e39f..154259f18b8b 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -209,6 +209,10 @@ ifdef CONFIG_X86_64 LDFLAGS_vmlinux += -z max-page-size=0x200000 endif +# We never want expected sections to be placed heuristically by the +# linker. All sections should be explicitly named in the linker script. +LDFLAGS_vmlinux += $(call ld-option, --orphan-handling=warn) + archscripts: scripts_basic $(Q)$(MAKE) $(build)=arch/x86/tools relocs diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 3962f592633d..ee249088cbfe 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -29,10 +29,10 @@ targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \ vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4 vmlinux.bin.zst KBUILD_CFLAGS := -m$(BITS) -O2 -KBUILD_CFLAGS += -fno-strict-aliasing $(call cc-option, -fPIE, -fPIC) +KBUILD_CFLAGS += -fno-strict-aliasing -fPIE KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING cflags-$(CONFIG_X86_32) := -march=i386 -cflags-$(CONFIG_X86_64) := -mcmodel=small +cflags-$(CONFIG_X86_64) := -mcmodel=small -mno-red-zone KBUILD_CFLAGS += $(cflags-y) KBUILD_CFLAGS += -mno-mmx -mno-sse KBUILD_CFLAGS += -ffreestanding @@ -43,24 +43,26 @@ KBUILD_CFLAGS += -Wno-pointer-sign KBUILD_CFLAGS += $(call cc-option,-fmacro-prefix-map=$(srctree)/=) KBUILD_CFLAGS += -fno-asynchronous-unwind-tables KBUILD_CFLAGS += -D__DISABLE_EXPORTS +# Disable relocation relaxation in case the link is not PIE. +KBUILD_CFLAGS += $(call as-option,-Wa$(comma)-mrelax-relocations=no) +KBUILD_CFLAGS += -include $(srctree)/include/linux/hidden.h + +# sev-es.c indirectly inludes inat-table.h which is generated during +# compilation and stored in $(objtree). Add the directory to the includes so +# that the compiler finds it even with out-of-tree builds (make O=/some/path). +CFLAGS_sev-es.o += -I$(objtree)/arch/x86/lib/ KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ GCOV_PROFILE := n UBSAN_SANITIZE :=n KBUILD_LDFLAGS := -m elf_$(UTS_MACHINE) +KBUILD_LDFLAGS += $(call ld-option,--no-ld-generated-unwind-info) # Compressed kernel should be built as PIE since it may be loaded at any # address by the bootloader. -ifeq ($(CONFIG_X86_32),y) -KBUILD_LDFLAGS += $(call ld-option, -pie) $(call ld-option, --no-dynamic-linker) -else -# To build 64-bit compressed kernel as PIE, we disable relocation -# overflow check to avoid relocation overflow error with a new linker -# command-line option, -z noreloc-overflow. -KBUILD_LDFLAGS += $(shell $(LD) --help 2>&1 | grep -q "\-z noreloc-overflow" \ - && echo "-z noreloc-overflow -pie --no-dynamic-linker") -endif -LDFLAGS_vmlinux := -T +LDFLAGS_vmlinux := -pie $(call ld-option, --no-dynamic-linker) +LDFLAGS_vmlinux += $(call ld-option, --orphan-handling=warn) +LDFLAGS_vmlinux += -T hostprogs := mkpiggy HOST_EXTRACFLAGS += -I$(srctree)/tools/include @@ -84,9 +86,11 @@ vmlinux-objs-y := $(obj)/vmlinux.lds $(obj)/kernel_info.o $(obj)/head_$(BITS).o vmlinux-objs-$(CONFIG_EARLY_PRINTK) += $(obj)/early_serial_console.o vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr.o ifdef CONFIG_X86_64 - vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr_64.o + vmlinux-objs-y += $(obj)/ident_map_64.o + vmlinux-objs-y += $(obj)/idt_64.o $(obj)/idt_handlers_64.o vmlinux-objs-y += $(obj)/mem_encrypt.o vmlinux-objs-y += $(obj)/pgtable_64.o + vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/sev-es.o endif vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o @@ -94,30 +98,8 @@ vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_thunk_$(BITS).o efi-obj-$(CONFIG_EFI_STUB) = $(objtree)/drivers/firmware/efi/libstub/lib.a -# The compressed kernel is built with -fPIC/-fPIE so that a boot loader -# can place it anywhere in memory and it will still run. However, since -# it is executed as-is without any ELF relocation processing performed -# (and has already had all relocation sections stripped from the binary), -# none of the code can use data relocations (e.g. static assignments of -# pointer values), since they will be meaningless at runtime. This check -# will refuse to link the vmlinux if any of these relocations are found. -quiet_cmd_check_data_rel = DATAREL $@ -define cmd_check_data_rel - for obj in $(filter %.o,$^); do \ - $(READELF) -S $$obj | grep -qF .rel.local && { \ - echo "error: $$obj has data relocations!" >&2; \ - exit 1; \ - } || true; \ - done -endef - -# We need to run two commands under "if_changed", so merge them into a -# single invocation. -quiet_cmd_check-and-link-vmlinux = LD $@ - cmd_check-and-link-vmlinux = $(cmd_check_data_rel); $(cmd_ld) - $(obj)/vmlinux: $(vmlinux-objs-y) $(efi-obj-y) FORCE - $(call if_changed,check-and-link-vmlinux) + $(call if_changed,ld) OBJCOPYFLAGS_vmlinux.bin := -R .comment -S $(obj)/vmlinux.bin: vmlinux FORCE diff --git a/arch/x86/boot/compressed/cpuflags.c b/arch/x86/boot/compressed/cpuflags.c index 6448a8196d32..0cc1323896d1 100644 --- a/arch/x86/boot/compressed/cpuflags.c +++ b/arch/x86/boot/compressed/cpuflags.c @@ -1,6 +1,4 @@ // SPDX-License-Identifier: GPL-2.0 -#ifdef CONFIG_RANDOMIZE_BASE - #include "../cpuflags.c" bool has_cpuflag(int flag) @@ -9,5 +7,3 @@ bool has_cpuflag(int flag) return test_bit(flag, cpu.flags); } - -#endif diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 03557f2174bf..659fad53ca82 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -33,32 +33,13 @@ #include <asm/bootparam.h> /* - * The 32-bit x86 assembler in binutils 2.26 will generate R_386_GOT32X - * relocation to get the symbol address in PIC. When the compressed x86 - * kernel isn't built as PIC, the linker optimizes R_386_GOT32X - * relocations to their fixed symbol addresses. However, when the - * compressed x86 kernel is loaded at a different address, it leads - * to the following load failure: - * - * Failed to allocate space for phdrs - * - * during the decompression stage. - * - * If the compressed x86 kernel is relocatable at run-time, it should be - * compiled with -fPIE, instead of -fPIC, if possible and should be built as - * Position Independent Executable (PIE) so that linker won't optimize - * R_386_GOT32X relocation to its fixed symbol address. Older - * linkers generate R_386_32 relocations against locally defined symbols, - * _bss, _ebss, _got, _egot and _end, in PIE. It isn't wrong, just less - * optimal than R_386_RELATIVE. But the x86 kernel fails to properly handle - * R_386_32 relocations when relocating the kernel. To generate - * R_386_RELATIVE relocations, we mark _bss, _ebss, _got, _egot and _end as - * hidden: + * These symbols needed to be marked as .hidden to prevent the BFD linker from + * generating R_386_32 (rather than R_386_RELATIVE) relocations for them when + * the 32-bit compressed kernel is linked as PIE. This is no longer necessary, + * but it doesn't hurt to keep them .hidden. */ .hidden _bss .hidden _ebss - .hidden _got - .hidden _egot .hidden _end __HEAD @@ -77,10 +58,10 @@ SYM_FUNC_START(startup_32) leal (BP_scratch+4)(%esi), %esp call 1f 1: popl %edx - subl $1b, %edx + addl $_GLOBAL_OFFSET_TABLE_+(.-1b), %edx /* Load new GDT */ - leal gdt(%edx), %eax + leal gdt@GOTOFF(%edx), %eax movl %eax, 2(%eax) lgdt (%eax) @@ -93,14 +74,16 @@ SYM_FUNC_START(startup_32) movl %eax, %ss /* - * %edx contains the address we are loaded at by the boot loader and %ebx - * contains the address where we should move the kernel image temporarily - * for safe in-place decompression. %ebp contains the address that the kernel - * will be decompressed to. + * %edx contains the address we are loaded at by the boot loader (plus the + * offset to the GOT). The below code calculates %ebx to be the address where + * we should move the kernel image temporarily for safe in-place decompression + * (again, plus the offset to the GOT). + * + * %ebp is calculated to be the address that the kernel will be decompressed to. */ #ifdef CONFIG_RELOCATABLE - movl %edx, %ebx + leal startup_32@GOTOFF(%edx), %ebx #ifdef CONFIG_EFI_STUB /* @@ -111,7 +94,7 @@ SYM_FUNC_START(startup_32) * image_offset = startup_32 - image_base * Otherwise image_offset will be zero and has no effect on the calculations. */ - subl image_offset(%edx), %ebx + subl image_offset@GOTOFF(%edx), %ebx #endif movl BP_kernel_alignment(%esi), %eax @@ -128,10 +111,10 @@ SYM_FUNC_START(startup_32) movl %ebx, %ebp // Save the output address for later /* Target address to relocate to for decompression */ addl BP_init_size(%esi), %ebx - subl $_end, %ebx + subl $_end@GOTOFF, %ebx /* Set up the stack */ - leal boot_stack_end(%ebx), %esp + leal boot_stack_end@GOTOFF(%ebx), %esp /* Zero EFLAGS */ pushl $0 @@ -142,8 +125,8 @@ SYM_FUNC_START(startup_32) * where decompression in place becomes safe. */ pushl %esi - leal (_bss-4)(%edx), %esi - leal (_bss-4)(%ebx), %edi + leal (_bss@GOTOFF-4)(%edx), %esi + leal (_bss@GOTOFF-4)(%ebx), %edi movl $(_bss - startup_32), %ecx shrl $2, %ecx std @@ -156,14 +139,14 @@ SYM_FUNC_START(startup_32) * during extract_kernel below. To avoid any issues, repoint the GDTR * to the new copy of the GDT. */ - leal gdt(%ebx), %eax + leal gdt@GOTOFF(%ebx), %eax movl %eax, 2(%eax) lgdt (%eax) /* * Jump to the relocated address. */ - leal .Lrelocated(%ebx), %eax + leal .Lrelocated@GOTOFF(%ebx), %eax jmp *%eax SYM_FUNC_END(startup_32) @@ -173,7 +156,7 @@ SYM_FUNC_START_ALIAS(efi_stub_entry) add $0x4, %esp movl 8(%esp), %esi /* save boot_params pointer */ call efi_main - leal startup_32(%eax), %eax + /* efi_main returns the possibly relocated address of startup_32 */ jmp *%eax SYM_FUNC_END(efi32_stub_entry) SYM_FUNC_END_ALIAS(efi_stub_entry) @@ -186,40 +169,26 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated) * Clear BSS (stack is currently empty) */ xorl %eax, %eax - leal _bss(%ebx), %edi - leal _ebss(%ebx), %ecx + leal _bss@GOTOFF(%ebx), %edi + leal _ebss@GOTOFF(%ebx), %ecx subl %edi, %ecx shrl $2, %ecx rep stosl /* - * Adjust our own GOT - */ - leal _got(%ebx), %edx - leal _egot(%ebx), %ecx -1: - cmpl %ecx, %edx - jae 2f - addl %ebx, (%edx) - addl $4, %edx - jmp 1b -2: - -/* * Do the extraction, and jump to the new kernel.. */ - /* push arguments for extract_kernel: */ - pushl $z_output_len /* decompressed length, end of relocs */ - - pushl %ebp /* output address */ - - pushl $z_input_len /* input_len */ - leal input_data(%ebx), %eax - pushl %eax /* input_data */ - leal boot_heap(%ebx), %eax - pushl %eax /* heap area */ - pushl %esi /* real mode pointer */ - call extract_kernel /* returns kernel location in %eax */ + /* push arguments for extract_kernel: */ + + pushl output_len@GOTOFF(%ebx) /* decompressed length, end of relocs */ + pushl %ebp /* output address */ + pushl input_len@GOTOFF(%ebx) /* input_len */ + leal input_data@GOTOFF(%ebx), %eax + pushl %eax /* input_data */ + leal boot_heap@GOTOFF(%ebx), %eax + pushl %eax /* heap area */ + pushl %esi /* real mode pointer */ + call extract_kernel /* returns kernel location in %eax */ addl $24, %esp /* diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 97d37f0a34f5..017de6cc87dc 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -33,6 +33,7 @@ #include <asm/processor-flags.h> #include <asm/asm-offsets.h> #include <asm/bootparam.h> +#include <asm/desc_defs.h> #include "pgtable.h" /* @@ -40,11 +41,35 @@ */ .hidden _bss .hidden _ebss - .hidden _got - .hidden _egot .hidden _end __HEAD + +/* + * This macro gives the relative virtual address of X, i.e. the offset of X + * from startup_32. This is the same as the link-time virtual address of X, + * since startup_32 is at 0, but defining it this way tells the + * assembler/linker that we do not want the actual run-time address of X. This + * prevents the linker from trying to create unwanted run-time relocation + * entries for the reference when the compressed kernel is linked as PIE. + * + * A reference X(%reg) will result in the link-time VA of X being stored with + * the instruction, and a run-time R_X86_64_RELATIVE relocation entry that + * adds the 64-bit base address where the kernel is loaded. + * + * Replacing it with (X-startup_32)(%reg) results in the offset being stored, + * and no run-time relocation. + * + * The macro should be used as a displacement with a base register containing + * the run-time address of startup_32 [i.e. rva(X)(%reg)], or as an immediate + * [$ rva(X)]. + * + * This macro can only be used from within the .head.text section, since the + * expression requires startup_32 to be in the same section as the code being + * assembled. + */ +#define rva(X) ((X) - startup_32) + .code32 SYM_FUNC_START(startup_32) /* @@ -67,10 +92,10 @@ SYM_FUNC_START(startup_32) leal (BP_scratch+4)(%esi), %esp call 1f 1: popl %ebp - subl $1b, %ebp + subl $ rva(1b), %ebp /* Load new GDT with the 64bit segments using 32bit descriptor */ - leal gdt(%ebp), %eax + leal rva(gdt)(%ebp), %eax movl %eax, 2(%eax) lgdt (%eax) @@ -83,7 +108,7 @@ SYM_FUNC_START(startup_32) movl %eax, %ss /* setup a stack and make sure cpu supports long mode. */ - leal boot_stack_end(%ebp), %esp + leal rva(boot_stack_end)(%ebp), %esp call verify_cpu testl %eax, %eax @@ -110,7 +135,7 @@ SYM_FUNC_START(startup_32) * image_offset = startup_32 - image_base * Otherwise image_offset will be zero and has no effect on the calculations. */ - subl image_offset(%ebp), %ebx + subl rva(image_offset)(%ebp), %ebx #endif movl BP_kernel_alignment(%esi), %eax @@ -126,7 +151,7 @@ SYM_FUNC_START(startup_32) /* Target address to relocate to for decompression */ addl BP_init_size(%esi), %ebx - subl $_end, %ebx + subl $ rva(_end), %ebx /* * Prepare for entering 64 bit mode @@ -154,19 +179,19 @@ SYM_FUNC_START(startup_32) 1: /* Initialize Page tables to 0 */ - leal pgtable(%ebx), %edi + leal rva(pgtable)(%ebx), %edi xorl %eax, %eax movl $(BOOT_INIT_PGT_SIZE/4), %ecx rep stosl /* Build Level 4 */ - leal pgtable + 0(%ebx), %edi + leal rva(pgtable + 0)(%ebx), %edi leal 0x1007 (%edi), %eax movl %eax, 0(%edi) addl %edx, 4(%edi) /* Build Level 3 */ - leal pgtable + 0x1000(%ebx), %edi + leal rva(pgtable + 0x1000)(%ebx), %edi leal 0x1007(%edi), %eax movl $4, %ecx 1: movl %eax, 0x00(%edi) @@ -177,7 +202,7 @@ SYM_FUNC_START(startup_32) jnz 1b /* Build Level 2 */ - leal pgtable + 0x2000(%ebx), %edi + leal rva(pgtable + 0x2000)(%ebx), %edi movl $0x00000183, %eax movl $2048, %ecx 1: movl %eax, 0(%edi) @@ -188,7 +213,7 @@ SYM_FUNC_START(startup_32) jnz 1b /* Enable the boot page tables */ - leal pgtable(%ebx), %eax + leal rva(pgtable)(%ebx), %eax movl %eax, %cr3 /* Enable Long mode in EFER (Extended Feature Enable Register) */ @@ -213,14 +238,14 @@ SYM_FUNC_START(startup_32) * We place all of the values on our mini stack so lret can * used to perform that far jump. */ - leal startup_64(%ebp), %eax + leal rva(startup_64)(%ebp), %eax #ifdef CONFIG_EFI_MIXED - movl efi32_boot_args(%ebp), %edi + movl rva(efi32_boot_args)(%ebp), %edi cmp $0, %edi jz 1f - leal efi64_stub_entry(%ebp), %eax - movl efi32_boot_args+4(%ebp), %esi - movl efi32_boot_args+8(%ebp), %edx // saved bootparams pointer + leal rva(efi64_stub_entry)(%ebp), %eax + movl rva(efi32_boot_args+4)(%ebp), %esi + movl rva(efi32_boot_args+8)(%ebp), %edx // saved bootparams pointer cmpl $0, %edx jnz 1f /* @@ -231,7 +256,7 @@ SYM_FUNC_START(startup_32) * the correct stack alignment for entry. */ subl $40, %esp - leal efi_pe_entry(%ebp), %eax + leal rva(efi_pe_entry)(%ebp), %eax movl %edi, %ecx // MS calling convention movl %esi, %edx 1: @@ -257,18 +282,18 @@ SYM_FUNC_START(efi32_stub_entry) call 1f 1: pop %ebp - subl $1b, %ebp + subl $ rva(1b), %ebp - movl %esi, efi32_boot_args+8(%ebp) + movl %esi, rva(efi32_boot_args+8)(%ebp) SYM_INNER_LABEL(efi32_pe_stub_entry, SYM_L_LOCAL) - movl %ecx, efi32_boot_args(%ebp) - movl %edx, efi32_boot_args+4(%ebp) - movb $0, efi_is64(%ebp) + movl %ecx, rva(efi32_boot_args)(%ebp) + movl %edx, rva(efi32_boot_args+4)(%ebp) + movb $0, rva(efi_is64)(%ebp) /* Save firmware GDTR and code/data selectors */ - sgdtl efi32_boot_gdt(%ebp) - movw %cs, efi32_boot_cs(%ebp) - movw %ds, efi32_boot_ds(%ebp) + sgdtl rva(efi32_boot_gdt)(%ebp) + movw %cs, rva(efi32_boot_cs)(%ebp) + movw %ds, rva(efi32_boot_ds)(%ebp) /* Disable paging */ movl %cr0, %eax @@ -347,30 +372,11 @@ SYM_CODE_START(startup_64) /* Target address to relocate to for decompression */ movl BP_init_size(%rsi), %ebx - subl $_end, %ebx + subl $ rva(_end), %ebx addq %rbp, %rbx /* Set up the stack */ - leaq boot_stack_end(%rbx), %rsp - - /* - * paging_prepare() and cleanup_trampoline() below can have GOT - * references. Adjust the table with address we are running at. - * - * Zero RAX for adjust_got: the GOT was not adjusted before; - * there's no adjustment to undo. - */ - xorq %rax, %rax - - /* - * Calculate the address the binary is loaded at and use it as - * a GOT adjustment. - */ - call 1f -1: popq %rdi - subq $1b, %rdi - - call .Ladjust_got + leaq rva(boot_stack_end)(%rbx), %rsp /* * At this point we are in long mode with 4-level paging enabled, @@ -410,6 +416,10 @@ SYM_CODE_START(startup_64) .Lon_kernel_cs: + pushq %rsi + call load_stage1_idt + popq %rsi + /* * paging_prepare() sets up the trampoline and checks if we need to * enable 5-level paging. @@ -444,7 +454,7 @@ SYM_CODE_START(startup_64) lretq trampoline_return: /* Restore the stack, the 32-bit trampoline uses its own stack */ - leaq boot_stack_end(%rbx), %rsp + leaq rva(boot_stack_end)(%rbx), %rsp /* * cleanup_trampoline() would restore trampoline memory. @@ -456,7 +466,7 @@ trampoline_return: * this function call. */ pushq %rsi - leaq top_pgtable(%rbx), %rdi + leaq rva(top_pgtable)(%rbx), %rdi call cleanup_trampoline popq %rsi @@ -464,30 +474,15 @@ trampoline_return: pushq $0 popfq - /* - * Previously we've adjusted the GOT with address the binary was - * loaded at. Now we need to re-adjust for relocation address. - * - * Calculate the address the binary is loaded at, so that we can - * undo the previous GOT adjustment. - */ - call 1f -1: popq %rax - subq $1b, %rax - - /* The new adjustment is the relocation address */ - movq %rbx, %rdi - call .Ladjust_got - /* * Copy the compressed kernel to the end of our buffer * where decompression in place becomes safe. */ pushq %rsi leaq (_bss-8)(%rip), %rsi - leaq (_bss-8)(%rbx), %rdi - movq $_bss /* - $startup_32 */, %rcx - shrq $3, %rcx + leaq rva(_bss-8)(%rbx), %rdi + movl $(_bss - startup_32), %ecx + shrl $3, %ecx std rep movsq cld @@ -498,15 +493,15 @@ trampoline_return: * during extract_kernel below. To avoid any issues, repoint the GDTR * to the new copy of the GDT. */ - leaq gdt64(%rbx), %rax - leaq gdt(%rbx), %rdx + leaq rva(gdt64)(%rbx), %rax + leaq rva(gdt)(%rbx), %rdx movq %rdx, 2(%rax) lgdt (%rax) /* * Jump to the relocated address. */ - leaq .Lrelocated(%rbx), %rax + leaq rva(.Lrelocated)(%rbx), %rax jmp *%rax SYM_CODE_END(startup_64) @@ -518,7 +513,7 @@ SYM_FUNC_START_ALIAS(efi_stub_entry) movq %rdx, %rbx /* save boot_params pointer */ call efi_main movq %rbx,%rsi - leaq startup_64(%rax), %rax + leaq rva(startup_64)(%rax), %rax jmp *%rax SYM_FUNC_END(efi64_stub_entry) SYM_FUNC_END_ALIAS(efi_stub_entry) @@ -538,15 +533,33 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated) rep stosq /* + * If running as an SEV guest, the encryption mask is required in the + * page-table setup code below. When the guest also has SEV-ES enabled + * set_sev_encryption_mask() will cause #VC exceptions, but the stage2 + * handler can't map its GHCB because the page-table is not set up yet. + * So set up the encryption mask here while still on the stage1 #VC + * handler. Then load stage2 IDT and switch to the kernel's own + * page-table. + */ + pushq %rsi + call set_sev_encryption_mask + call load_stage2_idt + + /* Pass boot_params to initialize_identity_maps() */ + movq (%rsp), %rdi + call initialize_identity_maps + popq %rsi + +/* * Do the extraction, and jump to the new kernel.. */ pushq %rsi /* Save the real mode argument */ movq %rsi, %rdi /* real mode address */ leaq boot_heap(%rip), %rsi /* malloc area for uncompression */ leaq input_data(%rip), %rdx /* input_data */ - movl $z_input_len, %ecx /* input_len */ + movl input_len(%rip), %ecx /* input_len */ movq %rbp, %r8 /* output target address */ - movl $z_output_len, %r9d /* decompressed length, end of relocs */ + movl output_len(%rip), %r9d /* decompressed length, end of relocs */ call extract_kernel /* returns kernel location in %rax */ popq %rsi @@ -556,27 +569,6 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated) jmp *%rax SYM_FUNC_END(.Lrelocated) -/* - * Adjust the global offset table - * - * RAX is the previous adjustment of the table to undo (use 0 if it's the - * first time we touch GOT). - * RDI is the new adjustment to apply. - */ -.Ladjust_got: - /* Walk through the GOT adding the address to the entries */ - leaq _got(%rip), %rdx - leaq _egot(%rip), %rcx -1: - cmpq %rcx, %rdx - jae 2f - subq %rax, (%rdx) /* Undo previous adjustment */ - addq %rdi, (%rdx) /* Apply the new adjustment */ - addq $8, %rdx - jmp 1b -2: - ret - .code32 /* * This is the 32-bit trampoline that will be copied over to low memory. @@ -690,10 +682,21 @@ SYM_DATA_START_LOCAL(gdt) .quad 0x0000000000000000 /* TS continued */ SYM_DATA_END_LABEL(gdt, SYM_L_LOCAL, gdt_end) +SYM_DATA_START(boot_idt_desc) + .word boot_idt_end - boot_idt - 1 + .quad 0 +SYM_DATA_END(boot_idt_desc) + .balign 8 +SYM_DATA_START(boot_idt) + .rept BOOT_IDT_ENTRIES + .quad 0 + .quad 0 + .endr +SYM_DATA_END_LABEL(boot_idt, SYM_L_GLOBAL, boot_idt_end) + #ifdef CONFIG_EFI_STUB SYM_DATA(image_offset, .long 0) #endif - #ifdef CONFIG_EFI_MIXED SYM_DATA_LOCAL(efi32_boot_args, .long 0, 0, 0) SYM_DATA(efi_is64, .byte 1) @@ -702,7 +705,7 @@ SYM_DATA(efi_is64, .byte 1) #define BS32_handle_protocol 88 // offsetof(efi_boot_services_32_t, handle_protocol) #define LI32_image_base 32 // offsetof(efi_loaded_image_32_t, image_base) - .text + __HEAD .code32 SYM_FUNC_START(efi32_pe_entry) /* @@ -724,12 +727,12 @@ SYM_FUNC_START(efi32_pe_entry) call 1f 1: pop %ebx - subl $1b, %ebx + subl $ rva(1b), %ebx /* Get the loaded image protocol pointer from the image handle */ leal -4(%ebp), %eax pushl %eax // &loaded_image - leal loaded_image_proto(%ebx), %eax + leal rva(loaded_image_proto)(%ebx), %eax pushl %eax // pass the GUID address pushl 8(%ebp) // pass the image handle @@ -764,7 +767,7 @@ SYM_FUNC_START(efi32_pe_entry) * use it before we get to the 64-bit efi_pe_entry() in C code. */ subl %esi, %ebx - movl %ebx, image_offset(%ebp) // save image_offset + movl %ebx, rva(image_offset)(%ebp) // save image_offset jmp efi32_pe_stub_entry 2: popl %edi // restore callee-save registers diff --git a/arch/x86/boot/compressed/ident_map_64.c b/arch/x86/boot/compressed/ident_map_64.c new file mode 100644 index 000000000000..a5e5db6ada3c --- /dev/null +++ b/arch/x86/boot/compressed/ident_map_64.c @@ -0,0 +1,360 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This code is used on x86_64 to create page table identity mappings on + * demand by building up a new set of page tables (or appending to the + * existing ones), and then switching over to them when ready. + * + * Copyright (C) 2015-2016 Yinghai Lu + * Copyright (C) 2016 Kees Cook + */ + +/* + * Since we're dealing with identity mappings, physical and virtual + * addresses are the same, so override these defines which are ultimately + * used by the headers in misc.h. + */ +#define __pa(x) ((unsigned long)(x)) +#define __va(x) ((void *)((unsigned long)(x))) + +/* No PAGE_TABLE_ISOLATION support needed either: */ +#undef CONFIG_PAGE_TABLE_ISOLATION + +#include "error.h" +#include "misc.h" + +/* These actually do the work of building the kernel identity maps. */ +#include <linux/pgtable.h> +#include <asm/cmpxchg.h> +#include <asm/trap_pf.h> +#include <asm/trapnr.h> +#include <asm/init.h> +/* Use the static base for this part of the boot process */ +#undef __PAGE_OFFSET +#define __PAGE_OFFSET __PAGE_OFFSET_BASE +#include "../../mm/ident_map.c" + +#define _SETUP +#include <asm/setup.h> /* For COMMAND_LINE_SIZE */ +#undef _SETUP + +extern unsigned long get_cmd_line_ptr(void); + +/* Used by PAGE_KERN* macros: */ +pteval_t __default_kernel_pte_mask __read_mostly = ~0; + +/* Used to track our page table allocation area. */ +struct alloc_pgt_data { + unsigned char *pgt_buf; + unsigned long pgt_buf_size; + unsigned long pgt_buf_offset; +}; + +/* + * Allocates space for a page table entry, using struct alloc_pgt_data + * above. Besides the local callers, this is used as the allocation + * callback in mapping_info below. + */ +static void *alloc_pgt_page(void *context) +{ + struct alloc_pgt_data *pages = (struct alloc_pgt_data *)context; + unsigned char *entry; + + /* Validate there is space available for a new page. */ + if (pages->pgt_buf_offset >= pages->pgt_buf_size) { + debug_putstr("out of pgt_buf in " __FILE__ "!?\n"); + debug_putaddr(pages->pgt_buf_offset); + debug_putaddr(pages->pgt_buf_size); + return NULL; + } + + entry = pages->pgt_buf + pages->pgt_buf_offset; + pages->pgt_buf_offset += PAGE_SIZE; + + return entry; +} + +/* Used to track our allocated page tables. */ +static struct alloc_pgt_data pgt_data; + +/* The top level page table entry pointer. */ +static unsigned long top_level_pgt; + +phys_addr_t physical_mask = (1ULL << __PHYSICAL_MASK_SHIFT) - 1; + +/* + * Mapping information structure passed to kernel_ident_mapping_init(). + * Due to relocation, pointers must be assigned at run time not build time. + */ +static struct x86_mapping_info mapping_info; + +/* + * Adds the specified range to the identity mappings. + */ +static void add_identity_map(unsigned long start, unsigned long end) +{ + int ret; + + /* Align boundary to 2M. */ + start = round_down(start, PMD_SIZE); + end = round_up(end, PMD_SIZE); + if (start >= end) + return; + + /* Build the mapping. */ + ret = kernel_ident_mapping_init(&mapping_info, (pgd_t *)top_level_pgt, start, end); + if (ret) + error("Error: kernel_ident_mapping_init() failed\n"); +} + +/* Locates and clears a region for a new top level page table. */ +void initialize_identity_maps(void *rmode) +{ + unsigned long cmdline; + + /* Exclude the encryption mask from __PHYSICAL_MASK */ + physical_mask &= ~sme_me_mask; + + /* Init mapping_info with run-time function/buffer pointers. */ + mapping_info.alloc_pgt_page = alloc_pgt_page; + mapping_info.context = &pgt_data; + mapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sme_me_mask; + mapping_info.kernpg_flag = _KERNPG_TABLE; + + /* + * It should be impossible for this not to already be true, + * but since calling this a second time would rewind the other + * counters, let's just make sure this is reset too. + */ + pgt_data.pgt_buf_offset = 0; + + /* + * If we came here via startup_32(), cr3 will be _pgtable already + * and we must append to the existing area instead of entirely + * overwriting it. + * + * With 5-level paging, we use '_pgtable' to allocate the p4d page table, + * the top-level page table is allocated separately. + * + * p4d_offset(top_level_pgt, 0) would cover both the 4- and 5-level + * cases. On 4-level paging it's equal to 'top_level_pgt'. + */ + top_level_pgt = read_cr3_pa(); + if (p4d_offset((pgd_t *)top_level_pgt, 0) == (p4d_t *)_pgtable) { + pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE; + pgt_data.pgt_buf_size = BOOT_PGT_SIZE - BOOT_INIT_PGT_SIZE; + memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size); + } else { + pgt_data.pgt_buf = _pgtable; + pgt_data.pgt_buf_size = BOOT_PGT_SIZE; + memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size); + top_level_pgt = (unsigned long)alloc_pgt_page(&pgt_data); + } + + /* + * New page-table is set up - map the kernel image, boot_params and the + * command line. The uncompressed kernel requires boot_params and the + * command line to be mapped in the identity mapping. Map them + * explicitly here in case the compressed kernel does not touch them, + * or does not touch all the pages covering them. + */ + add_identity_map((unsigned long)_head, (unsigned long)_end); + boot_params = rmode; + add_identity_map((unsigned long)boot_params, (unsigned long)(boot_params + 1)); + cmdline = get_cmd_line_ptr(); + add_identity_map(cmdline, cmdline + COMMAND_LINE_SIZE); + + /* Load the new page-table. */ + write_cr3(top_level_pgt); +} + +/* + * This switches the page tables to the new level4 that has been built + * via calls to add_identity_map() above. If booted via startup_32(), + * this is effectively a no-op. + */ +void finalize_identity_maps(void) +{ + write_cr3(top_level_pgt); +} + +static pte_t *split_large_pmd(struct x86_mapping_info *info, + pmd_t *pmdp, unsigned long __address) +{ + unsigned long page_flags; + unsigned long address; + pte_t *pte; + pmd_t pmd; + int i; + + pte = (pte_t *)info->alloc_pgt_page(info->context); + if (!pte) + return NULL; + + address = __address & PMD_MASK; + /* No large page - clear PSE flag */ + page_flags = info->page_flag & ~_PAGE_PSE; + + /* Populate the PTEs */ + for (i = 0; i < PTRS_PER_PMD; i++) { + set_pte(&pte[i], __pte(address | page_flags)); + address += PAGE_SIZE; + } + + /* + * Ideally we need to clear the large PMD first and do a TLB + * flush before we write the new PMD. But the 2M range of the + * PMD might contain the code we execute and/or the stack + * we are on, so we can't do that. But that should be safe here + * because we are going from large to small mappings and we are + * also the only user of the page-table, so there is no chance + * of a TLB multihit. + */ + pmd = __pmd((unsigned long)pte | info->kernpg_flag); + set_pmd(pmdp, pmd); + /* Flush TLB to establish the new PMD */ + write_cr3(top_level_pgt); + + return pte + pte_index(__address); +} + +static void clflush_page(unsigned long address) +{ + unsigned int flush_size; + char *cl, *start, *end; + + /* + * Hardcode cl-size to 64 - CPUID can't be used here because that might + * cause another #VC exception and the GHCB is not ready to use yet. + */ + flush_size = 64; + start = (char *)(address & PAGE_MASK); + end = start + PAGE_SIZE; + + /* + * First make sure there are no pending writes on the cache-lines to + * flush. + */ + asm volatile("mfence" : : : "memory"); + + for (cl = start; cl != end; cl += flush_size) + clflush(cl); +} + +static int set_clr_page_flags(struct x86_mapping_info *info, + unsigned long address, + pteval_t set, pteval_t clr) +{ + pgd_t *pgdp = (pgd_t *)top_level_pgt; + p4d_t *p4dp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep, pte; + + /* + * First make sure there is a PMD mapping for 'address'. + * It should already exist, but keep things generic. + * + * To map the page just read from it and fault it in if there is no + * mapping yet. add_identity_map() can't be called here because that + * would unconditionally map the address on PMD level, destroying any + * PTE-level mappings that might already exist. Use assembly here so + * the access won't be optimized away. + */ + asm volatile("mov %[address], %%r9" + :: [address] "g" (*(unsigned long *)address) + : "r9", "memory"); + + /* + * The page is mapped at least with PMD size - so skip checks and walk + * directly to the PMD. + */ + p4dp = p4d_offset(pgdp, address); + pudp = pud_offset(p4dp, address); + pmdp = pmd_offset(pudp, address); + + if (pmd_large(*pmdp)) + ptep = split_large_pmd(info, pmdp, address); + else + ptep = pte_offset_kernel(pmdp, address); + + if (!ptep) + return -ENOMEM; + + /* + * Changing encryption attributes of a page requires to flush it from + * the caches. + */ + if ((set | clr) & _PAGE_ENC) + clflush_page(address); + + /* Update PTE */ + pte = *ptep; + pte = pte_set_flags(pte, set); + pte = pte_clear_flags(pte, clr); + set_pte(ptep, pte); + + /* Flush TLB after changing encryption attribute */ + write_cr3(top_level_pgt); + + return 0; +} + +int set_page_decrypted(unsigned long address) +{ + return set_clr_page_flags(&mapping_info, address, 0, _PAGE_ENC); +} + +int set_page_encrypted(unsigned long address) +{ + return set_clr_page_flags(&mapping_info, address, _PAGE_ENC, 0); +} + +int set_page_non_present(unsigned long address) +{ + return set_clr_page_flags(&mapping_info, address, 0, _PAGE_PRESENT); +} + +static void do_pf_error(const char *msg, unsigned long error_code, + unsigned long address, unsigned long ip) +{ + error_putstr(msg); + + error_putstr("\nError Code: "); + error_puthex(error_code); + error_putstr("\nCR2: 0x"); + error_puthex(address); + error_putstr("\nRIP relative to _head: 0x"); + error_puthex(ip - (unsigned long)_head); + error_putstr("\n"); + + error("Stopping.\n"); +} + +void do_boot_page_fault(struct pt_regs *regs, unsigned long error_code) +{ + unsigned long address = native_read_cr2(); + unsigned long end; + bool ghcb_fault; + + ghcb_fault = sev_es_check_ghcb_fault(address); + + address &= PMD_MASK; + end = address + PMD_SIZE; + + /* + * Check for unexpected error codes. Unexpected are: + * - Faults on present pages + * - User faults + * - Reserved bits set + */ + if (error_code & (X86_PF_PROT | X86_PF_USER | X86_PF_RSVD)) + do_pf_error("Unexpected page-fault:", error_code, address, regs->ip); + else if (ghcb_fault) + do_pf_error("Page-fault on GHCB page:", error_code, address, regs->ip); + + /* + * Error code is sane - now identity map the 2M region around + * the faulting address. + */ + add_identity_map(address, end); +} diff --git a/arch/x86/boot/compressed/idt_64.c b/arch/x86/boot/compressed/idt_64.c new file mode 100644 index 000000000000..804a502ee0d2 --- /dev/null +++ b/arch/x86/boot/compressed/idt_64.c @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <asm/trap_pf.h> +#include <asm/segment.h> +#include <asm/trapnr.h> +#include "misc.h" + +static void set_idt_entry(int vector, void (*handler)(void)) +{ + unsigned long address = (unsigned long)handler; + gate_desc entry; + + memset(&entry, 0, sizeof(entry)); + + entry.offset_low = (u16)(address & 0xffff); + entry.segment = __KERNEL_CS; + entry.bits.type = GATE_TRAP; + entry.bits.p = 1; + entry.offset_middle = (u16)((address >> 16) & 0xffff); + entry.offset_high = (u32)(address >> 32); + + memcpy(&boot_idt[vector], &entry, sizeof(entry)); +} + +/* Have this here so we don't need to include <asm/desc.h> */ +static void load_boot_idt(const struct desc_ptr *dtr) +{ + asm volatile("lidt %0"::"m" (*dtr)); +} + +/* Setup IDT before kernel jumping to .Lrelocated */ +void load_stage1_idt(void) +{ + boot_idt_desc.address = (unsigned long)boot_idt; + + + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) + set_idt_entry(X86_TRAP_VC, boot_stage1_vc); + + load_boot_idt(&boot_idt_desc); +} + +/* Setup IDT after kernel jumping to .Lrelocated */ +void load_stage2_idt(void) +{ + boot_idt_desc.address = (unsigned long)boot_idt; + + set_idt_entry(X86_TRAP_PF, boot_page_fault); + +#ifdef CONFIG_AMD_MEM_ENCRYPT + set_idt_entry(X86_TRAP_VC, boot_stage2_vc); +#endif + + load_boot_idt(&boot_idt_desc); +} diff --git a/arch/x86/boot/compressed/idt_handlers_64.S b/arch/x86/boot/compressed/idt_handlers_64.S new file mode 100644 index 000000000000..22890e199f5b --- /dev/null +++ b/arch/x86/boot/compressed/idt_handlers_64.S @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Early IDT handler entry points + * + * Copyright (C) 2019 SUSE + * + * Author: Joerg Roedel <jroedel@suse.de> + */ + +#include <asm/segment.h> + +/* For ORIG_RAX */ +#include "../../entry/calling.h" + +.macro EXCEPTION_HANDLER name function error_code=0 +SYM_FUNC_START(\name) + + /* Build pt_regs */ + .if \error_code == 0 + pushq $0 + .endif + + pushq %rdi + pushq %rsi + pushq %rdx + pushq %rcx + pushq %rax + pushq %r8 + pushq %r9 + pushq %r10 + pushq %r11 + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + /* Call handler with pt_regs */ + movq %rsp, %rdi + /* Error code is second parameter */ + movq ORIG_RAX(%rsp), %rsi + call \function + + /* Restore regs */ + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + popq %r11 + popq %r10 + popq %r9 + popq %r8 + popq %rax + popq %rcx + popq %rdx + popq %rsi + popq %rdi + + /* Remove error code and return */ + addq $8, %rsp + + iretq +SYM_FUNC_END(\name) + .endm + + .text + .code64 + +EXCEPTION_HANDLER boot_page_fault do_boot_page_fault error_code=1 + +#ifdef CONFIG_AMD_MEM_ENCRYPT +EXCEPTION_HANDLER boot_stage1_vc do_vc_no_ghcb error_code=1 +EXCEPTION_HANDLER boot_stage2_vc do_boot_stage2_vc error_code=1 +#endif diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index dde7cb3724df..b92fffbe761f 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -36,17 +36,12 @@ #define STATIC #include <linux/decompress/mm.h> -#ifdef CONFIG_X86_5LEVEL -unsigned int __pgtable_l5_enabled; -unsigned int pgdir_shift __ro_after_init = 39; -unsigned int ptrs_per_p4d __ro_after_init = 1; -#endif +#define _SETUP +#include <asm/setup.h> /* For COMMAND_LINE_SIZE */ +#undef _SETUP extern unsigned long get_cmd_line_ptr(void); -/* Used by PAGE_KERN* macros: */ -pteval_t __default_kernel_pte_mask __read_mostly = ~0; - /* Simplified build-specific string for starting entropy. */ static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@" LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION; @@ -87,8 +82,11 @@ static unsigned long get_boot_seed(void) static bool memmap_too_large; -/* Store memory limit specified by "mem=nn[KMG]" or "memmap=nn[KMG]" */ -static unsigned long long mem_limit = ULLONG_MAX; +/* + * Store memory limit: MAXMEM on 64-bit and KERNEL_IMAGE_SIZE on 32-bit. + * It may be reduced by "mem=nn[KMG]" or "memmap=nn[KMG]" command line options. + */ +static u64 mem_limit; /* Number of immovable memory regions */ static int num_immovable_mem; @@ -131,8 +129,7 @@ enum parse_mode { }; static int -parse_memmap(char *p, unsigned long long *start, unsigned long long *size, - enum parse_mode mode) +parse_memmap(char *p, u64 *start, u64 *size, enum parse_mode mode) { char *oldp; @@ -162,7 +159,7 @@ parse_memmap(char *p, unsigned long long *start, unsigned long long *size, */ *size = 0; } else { - unsigned long long flags; + u64 flags; /* * efi_fake_mem=nn@ss:attr the attr specifies @@ -201,7 +198,7 @@ static void mem_avoid_memmap(enum parse_mode mode, char *str) while (str && (i < MAX_MEMMAP_REGIONS)) { int rc; - unsigned long long start, size; + u64 start, size; char *k = strchr(str, ','); if (k) @@ -214,7 +211,7 @@ static void mem_avoid_memmap(enum parse_mode mode, char *str) if (start == 0) { /* Store the specified memory limit if size > 0 */ - if (size > 0) + if (size > 0 && size < mem_limit) mem_limit = size; continue; @@ -261,15 +258,15 @@ static void parse_gb_huge_pages(char *param, char *val) static void handle_mem_options(void) { char *args = (char *)get_cmd_line_ptr(); - size_t len = strlen((char *)args); + size_t len; char *tmp_cmdline; char *param, *val; u64 mem_size; - if (!strstr(args, "memmap=") && !strstr(args, "mem=") && - !strstr(args, "hugepages")) + if (!args) return; + len = strnlen(args, COMMAND_LINE_SIZE-1); tmp_cmdline = malloc(len + 1); if (!tmp_cmdline) error("Failed to allocate space for tmp_cmdline"); @@ -284,14 +281,12 @@ static void handle_mem_options(void) while (*args) { args = next_arg(args, ¶m, &val); /* Stop at -- */ - if (!val && strcmp(param, "--") == 0) { - warn("Only '--' specified in cmdline"); - goto out; - } + if (!val && strcmp(param, "--") == 0) + break; if (!strcmp(param, "memmap")) { mem_avoid_memmap(PARSE_MEMMAP, val); - } else if (strstr(param, "hugepages")) { + } else if (IS_ENABLED(CONFIG_X86_64) && strstr(param, "hugepages")) { parse_gb_huge_pages(param, val); } else if (!strcmp(param, "mem")) { char *p = val; @@ -300,21 +295,23 @@ static void handle_mem_options(void) continue; mem_size = memparse(p, &p); if (mem_size == 0) - goto out; + break; - mem_limit = mem_size; + if (mem_size < mem_limit) + mem_limit = mem_size; } else if (!strcmp(param, "efi_fake_mem")) { mem_avoid_memmap(PARSE_EFI, val); } } -out: free(tmp_cmdline); return; } /* - * In theory, KASLR can put the kernel anywhere in the range of [16M, 64T). + * In theory, KASLR can put the kernel anywhere in the range of [16M, MAXMEM) + * on 64-bit, and [16M, KERNEL_IMAGE_SIZE) on 32-bit. + * * The mem_avoid array is used to store the ranges that need to be avoided * when KASLR searches for an appropriate random address. We must avoid any * regions that are unsafe to overlap with during decompression, and other @@ -392,8 +389,7 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size, { unsigned long init_size = boot_params->hdr.init_size; u64 initrd_start, initrd_size; - u64 cmd_line, cmd_line_size; - char *ptr; + unsigned long cmd_line, cmd_line_size; /* * Avoid the region that is unsafe to overlap during @@ -401,8 +397,6 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size, */ mem_avoid[MEM_AVOID_ZO_RANGE].start = input; mem_avoid[MEM_AVOID_ZO_RANGE].size = (output + init_size) - input; - add_identity_map(mem_avoid[MEM_AVOID_ZO_RANGE].start, - mem_avoid[MEM_AVOID_ZO_RANGE].size); /* Avoid initrd. */ initrd_start = (u64)boot_params->ext_ramdisk_image << 32; @@ -414,22 +408,17 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size, /* No need to set mapping for initrd, it will be handled in VO. */ /* Avoid kernel command line. */ - cmd_line = (u64)boot_params->ext_cmd_line_ptr << 32; - cmd_line |= boot_params->hdr.cmd_line_ptr; + cmd_line = get_cmd_line_ptr(); /* Calculate size of cmd_line. */ - ptr = (char *)(unsigned long)cmd_line; - for (cmd_line_size = 0; ptr[cmd_line_size++];) - ; - mem_avoid[MEM_AVOID_CMDLINE].start = cmd_line; - mem_avoid[MEM_AVOID_CMDLINE].size = cmd_line_size; - add_identity_map(mem_avoid[MEM_AVOID_CMDLINE].start, - mem_avoid[MEM_AVOID_CMDLINE].size); + if (cmd_line) { + cmd_line_size = strnlen((char *)cmd_line, COMMAND_LINE_SIZE-1) + 1; + mem_avoid[MEM_AVOID_CMDLINE].start = cmd_line; + mem_avoid[MEM_AVOID_CMDLINE].size = cmd_line_size; + } /* Avoid boot parameters. */ mem_avoid[MEM_AVOID_BOOTPARAMS].start = (unsigned long)boot_params; mem_avoid[MEM_AVOID_BOOTPARAMS].size = sizeof(*boot_params); - add_identity_map(mem_avoid[MEM_AVOID_BOOTPARAMS].start, - mem_avoid[MEM_AVOID_BOOTPARAMS].size); /* We don't need to set a mapping for setup_data. */ @@ -438,11 +427,6 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size, /* Enumerate the immovable memory regions */ num_immovable_mem = count_immovable_mem_regions(); - -#ifdef CONFIG_X86_VERBOSE_BOOTUP - /* Make sure video RAM can be used. */ - add_identity_map(0, PMD_SIZE); -#endif } /* @@ -454,7 +438,7 @@ static bool mem_avoid_overlap(struct mem_vector *img, { int i; struct setup_data *ptr; - unsigned long earliest = img->start + img->size; + u64 earliest = img->start + img->size; bool is_overlapping = false; for (i = 0; i < MEM_AVOID_MAX; i++) { @@ -499,18 +483,16 @@ static bool mem_avoid_overlap(struct mem_vector *img, } struct slot_area { - unsigned long addr; - int num; + u64 addr; + unsigned long num; }; #define MAX_SLOT_AREA 100 static struct slot_area slot_areas[MAX_SLOT_AREA]; - +static unsigned int slot_area_index; static unsigned long slot_max; -static unsigned long slot_area_index; - static void store_slot_info(struct mem_vector *region, unsigned long image_size) { struct slot_area slot_area; @@ -519,13 +501,10 @@ static void store_slot_info(struct mem_vector *region, unsigned long image_size) return; slot_area.addr = region->start; - slot_area.num = (region->size - image_size) / - CONFIG_PHYSICAL_ALIGN + 1; + slot_area.num = 1 + (region->size - image_size) / CONFIG_PHYSICAL_ALIGN; - if (slot_area.num > 0) { - slot_areas[slot_area_index++] = slot_area; - slot_max += slot_area.num; - } + slot_areas[slot_area_index++] = slot_area; + slot_max += slot_area.num; } /* @@ -535,57 +514,53 @@ static void store_slot_info(struct mem_vector *region, unsigned long image_size) static void process_gb_huge_pages(struct mem_vector *region, unsigned long image_size) { - unsigned long addr, size = 0; + u64 pud_start, pud_end; + unsigned long gb_huge_pages; struct mem_vector tmp; - int i = 0; - if (!max_gb_huge_pages) { + if (!IS_ENABLED(CONFIG_X86_64) || !max_gb_huge_pages) { store_slot_info(region, image_size); return; } - addr = ALIGN(region->start, PUD_SIZE); - /* Did we raise the address above the passed in memory entry? */ - if (addr < region->start + region->size) - size = region->size - (addr - region->start); - - /* Check how many 1GB huge pages can be filtered out: */ - while (size > PUD_SIZE && max_gb_huge_pages) { - size -= PUD_SIZE; - max_gb_huge_pages--; - i++; - } + /* Are there any 1GB pages in the region? */ + pud_start = ALIGN(region->start, PUD_SIZE); + pud_end = ALIGN_DOWN(region->start + region->size, PUD_SIZE); /* No good 1GB huge pages found: */ - if (!i) { + if (pud_start >= pud_end) { store_slot_info(region, image_size); return; } - /* - * Skip those 'i'*1GB good huge pages, and continue checking and - * processing the remaining head or tail part of the passed region - * if available. - */ - - if (addr >= region->start + image_size) { + /* Check if the head part of the region is usable. */ + if (pud_start >= region->start + image_size) { tmp.start = region->start; - tmp.size = addr - region->start; + tmp.size = pud_start - region->start; store_slot_info(&tmp, image_size); } - size = region->size - (addr - region->start) - i * PUD_SIZE; - if (size >= image_size) { - tmp.start = addr + i * PUD_SIZE; - tmp.size = size; + /* Skip the good 1GB pages. */ + gb_huge_pages = (pud_end - pud_start) >> PUD_SHIFT; + if (gb_huge_pages > max_gb_huge_pages) { + pud_end = pud_start + (max_gb_huge_pages << PUD_SHIFT); + max_gb_huge_pages = 0; + } else { + max_gb_huge_pages -= gb_huge_pages; + } + + /* Check if the tail part of the region is usable. */ + if (region->start + region->size >= pud_end + image_size) { + tmp.start = pud_end; + tmp.size = region->start + region->size - pud_end; store_slot_info(&tmp, image_size); } } -static unsigned long slots_fetch_random(void) +static u64 slots_fetch_random(void) { unsigned long slot; - int i; + unsigned int i; /* Handle case of no slots stored. */ if (slot_max == 0) @@ -598,7 +573,7 @@ static unsigned long slots_fetch_random(void) slot -= slot_areas[i].num; continue; } - return slot_areas[i].addr + slot * CONFIG_PHYSICAL_ALIGN; + return slot_areas[i].addr + ((u64)slot * CONFIG_PHYSICAL_ALIGN); } if (i == slot_area_index) @@ -611,49 +586,23 @@ static void __process_mem_region(struct mem_vector *entry, unsigned long image_size) { struct mem_vector region, overlap; - unsigned long start_orig, end; - struct mem_vector cur_entry; - - /* On 32-bit, ignore entries entirely above our maximum. */ - if (IS_ENABLED(CONFIG_X86_32) && entry->start >= KERNEL_IMAGE_SIZE) - return; + u64 region_end; - /* Ignore entries entirely below our minimum. */ - if (entry->start + entry->size < minimum) - return; - - /* Ignore entries above memory limit */ - end = min(entry->size + entry->start, mem_limit); - if (entry->start >= end) - return; - cur_entry.start = entry->start; - cur_entry.size = end - entry->start; - - region.start = cur_entry.start; - region.size = cur_entry.size; + /* Enforce minimum and memory limit. */ + region.start = max_t(u64, entry->start, minimum); + region_end = min(entry->start + entry->size, mem_limit); /* Give up if slot area array is full. */ while (slot_area_index < MAX_SLOT_AREA) { - start_orig = region.start; - - /* Potentially raise address to minimum location. */ - if (region.start < minimum) - region.start = minimum; - /* Potentially raise address to meet alignment needs. */ region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN); /* Did we raise the address above the passed in memory entry? */ - if (region.start > cur_entry.start + cur_entry.size) + if (region.start > region_end) return; /* Reduce size by any delta from the original address. */ - region.size -= region.start - start_orig; - - /* On 32-bit, reduce region size to fit within max size. */ - if (IS_ENABLED(CONFIG_X86_32) && - region.start + region.size > KERNEL_IMAGE_SIZE) - region.size = KERNEL_IMAGE_SIZE - region.start; + region.size = region_end - region.start; /* Return if region can't contain decompressed kernel */ if (region.size < image_size) @@ -666,27 +615,19 @@ static void __process_mem_region(struct mem_vector *entry, } /* Store beginning of region if holds at least image_size. */ - if (overlap.start > region.start + image_size) { - struct mem_vector beginning; - - beginning.start = region.start; - beginning.size = overlap.start - region.start; - process_gb_huge_pages(&beginning, image_size); + if (overlap.start >= region.start + image_size) { + region.size = overlap.start - region.start; + process_gb_huge_pages(®ion, image_size); } - /* Return if overlap extends to or past end of region. */ - if (overlap.start + overlap.size >= region.start + region.size) - return; - /* Clip off the overlapping region and start over. */ - region.size -= overlap.start - region.start + overlap.size; region.start = overlap.start + overlap.size; } } static bool process_mem_region(struct mem_vector *region, - unsigned long long minimum, - unsigned long long image_size) + unsigned long minimum, + unsigned long image_size) { int i; /* @@ -709,7 +650,7 @@ static bool process_mem_region(struct mem_vector *region, * immovable memory and @region. */ for (i = 0; i < num_immovable_mem; i++) { - unsigned long long start, end, entry_end, region_end; + u64 start, end, entry_end, region_end; struct mem_vector entry; if (!mem_overlaps(region, &immovable_mem[i])) @@ -736,8 +677,8 @@ static bool process_mem_region(struct mem_vector *region, #ifdef CONFIG_EFI /* - * Returns true if mirror region found (and must have been processed - * for slots adding) + * Returns true if we processed the EFI memmap, which we prefer over the E820 + * table if it is available. */ static bool process_efi_entries(unsigned long minimum, unsigned long image_size) @@ -839,20 +780,30 @@ static void process_e820_entries(unsigned long minimum, static unsigned long find_random_phys_addr(unsigned long minimum, unsigned long image_size) { + u64 phys_addr; + + /* Bail out early if it's impossible to succeed. */ + if (minimum + image_size > mem_limit) + return 0; + /* Check if we had too many memmaps. */ if (memmap_too_large) { debug_putstr("Aborted memory entries scan (more than 4 memmap= args)!\n"); return 0; } - /* Make sure minimum is aligned. */ - minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN); + if (!process_efi_entries(minimum, image_size)) + process_e820_entries(minimum, image_size); - if (process_efi_entries(minimum, image_size)) - return slots_fetch_random(); + phys_addr = slots_fetch_random(); - process_e820_entries(minimum, image_size); - return slots_fetch_random(); + /* Perform a final check to make sure the address is in range. */ + if (phys_addr < minimum || phys_addr + image_size > mem_limit) { + warn("Invalid physical address chosen!\n"); + return 0; + } + + return (unsigned long)phys_addr; } static unsigned long find_random_virt_addr(unsigned long minimum, @@ -860,18 +811,12 @@ static unsigned long find_random_virt_addr(unsigned long minimum, { unsigned long slots, random_addr; - /* Make sure minimum is aligned. */ - minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN); - /* Align image_size for easy slot calculations. */ - image_size = ALIGN(image_size, CONFIG_PHYSICAL_ALIGN); - /* * There are how many CONFIG_PHYSICAL_ALIGN-sized slots * that can hold image_size within the range of minimum to * KERNEL_IMAGE_SIZE? */ - slots = (KERNEL_IMAGE_SIZE - minimum - image_size) / - CONFIG_PHYSICAL_ALIGN + 1; + slots = 1 + (KERNEL_IMAGE_SIZE - minimum - image_size) / CONFIG_PHYSICAL_ALIGN; random_addr = kaslr_get_random_long("Virtual") % slots; @@ -895,18 +840,12 @@ void choose_random_location(unsigned long input, return; } -#ifdef CONFIG_X86_5LEVEL - if (__read_cr4() & X86_CR4_LA57) { - __pgtable_l5_enabled = 1; - pgdir_shift = 48; - ptrs_per_p4d = 512; - } -#endif - boot_params->hdr.loadflags |= KASLR_FLAG; - /* Prepare to add new identity pagetables on demand. */ - initialize_identity_maps(); + if (IS_ENABLED(CONFIG_X86_32)) + mem_limit = KERNEL_IMAGE_SIZE; + else + mem_limit = MAXMEM; /* Record the various known unsafe memory ranges. */ mem_avoid_init(input, input_size, *output); @@ -917,6 +856,8 @@ void choose_random_location(unsigned long input, * location: */ min_addr = min(*output, 512UL << 20); + /* Make sure minimum is aligned. */ + min_addr = ALIGN(min_addr, CONFIG_PHYSICAL_ALIGN); /* Walk available memory entries to find a random address. */ random_addr = find_random_phys_addr(min_addr, output_size); @@ -924,19 +865,8 @@ void choose_random_location(unsigned long input, warn("Physical KASLR disabled: no suitable memory region!"); } else { /* Update the new physical address location. */ - if (*output != random_addr) { - add_identity_map(random_addr, output_size); + if (*output != random_addr) *output = random_addr; - } - - /* - * This loads the identity mapping page table. - * This should only be done if a new physical address - * is found for the kernel, otherwise we should keep - * the old page table to make it be like the "nokaslr" - * case. - */ - finalize_identity_maps(); } diff --git a/arch/x86/boot/compressed/kaslr_64.c b/arch/x86/boot/compressed/kaslr_64.c deleted file mode 100644 index f9c5c13d979b..000000000000 --- a/arch/x86/boot/compressed/kaslr_64.c +++ /dev/null @@ -1,153 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * This code is used on x86_64 to create page table identity mappings on - * demand by building up a new set of page tables (or appending to the - * existing ones), and then switching over to them when ready. - * - * Copyright (C) 2015-2016 Yinghai Lu - * Copyright (C) 2016 Kees Cook - */ - -/* - * Since we're dealing with identity mappings, physical and virtual - * addresses are the same, so override these defines which are ultimately - * used by the headers in misc.h. - */ -#define __pa(x) ((unsigned long)(x)) -#define __va(x) ((void *)((unsigned long)(x))) - -/* No PAGE_TABLE_ISOLATION support needed either: */ -#undef CONFIG_PAGE_TABLE_ISOLATION - -#include "misc.h" - -/* These actually do the work of building the kernel identity maps. */ -#include <linux/pgtable.h> -#include <asm/init.h> -/* Use the static base for this part of the boot process */ -#undef __PAGE_OFFSET -#define __PAGE_OFFSET __PAGE_OFFSET_BASE -#include "../../mm/ident_map.c" - -/* Used to track our page table allocation area. */ -struct alloc_pgt_data { - unsigned char *pgt_buf; - unsigned long pgt_buf_size; - unsigned long pgt_buf_offset; -}; - -/* - * Allocates space for a page table entry, using struct alloc_pgt_data - * above. Besides the local callers, this is used as the allocation - * callback in mapping_info below. - */ -static void *alloc_pgt_page(void *context) -{ - struct alloc_pgt_data *pages = (struct alloc_pgt_data *)context; - unsigned char *entry; - - /* Validate there is space available for a new page. */ - if (pages->pgt_buf_offset >= pages->pgt_buf_size) { - debug_putstr("out of pgt_buf in " __FILE__ "!?\n"); - debug_putaddr(pages->pgt_buf_offset); - debug_putaddr(pages->pgt_buf_size); - return NULL; - } - - entry = pages->pgt_buf + pages->pgt_buf_offset; - pages->pgt_buf_offset += PAGE_SIZE; - - return entry; -} - -/* Used to track our allocated page tables. */ -static struct alloc_pgt_data pgt_data; - -/* The top level page table entry pointer. */ -static unsigned long top_level_pgt; - -phys_addr_t physical_mask = (1ULL << __PHYSICAL_MASK_SHIFT) - 1; - -/* - * Mapping information structure passed to kernel_ident_mapping_init(). - * Due to relocation, pointers must be assigned at run time not build time. - */ -static struct x86_mapping_info mapping_info; - -/* Locates and clears a region for a new top level page table. */ -void initialize_identity_maps(void) -{ - /* If running as an SEV guest, the encryption mask is required. */ - set_sev_encryption_mask(); - - /* Exclude the encryption mask from __PHYSICAL_MASK */ - physical_mask &= ~sme_me_mask; - - /* Init mapping_info with run-time function/buffer pointers. */ - mapping_info.alloc_pgt_page = alloc_pgt_page; - mapping_info.context = &pgt_data; - mapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sme_me_mask; - mapping_info.kernpg_flag = _KERNPG_TABLE; - - /* - * It should be impossible for this not to already be true, - * but since calling this a second time would rewind the other - * counters, let's just make sure this is reset too. - */ - pgt_data.pgt_buf_offset = 0; - - /* - * If we came here via startup_32(), cr3 will be _pgtable already - * and we must append to the existing area instead of entirely - * overwriting it. - * - * With 5-level paging, we use '_pgtable' to allocate the p4d page table, - * the top-level page table is allocated separately. - * - * p4d_offset(top_level_pgt, 0) would cover both the 4- and 5-level - * cases. On 4-level paging it's equal to 'top_level_pgt'. - */ - top_level_pgt = read_cr3_pa(); - if (p4d_offset((pgd_t *)top_level_pgt, 0) == (p4d_t *)_pgtable) { - debug_putstr("booted via startup_32()\n"); - pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE; - pgt_data.pgt_buf_size = BOOT_PGT_SIZE - BOOT_INIT_PGT_SIZE; - memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size); - } else { - debug_putstr("booted via startup_64()\n"); - pgt_data.pgt_buf = _pgtable; - pgt_data.pgt_buf_size = BOOT_PGT_SIZE; - memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size); - top_level_pgt = (unsigned long)alloc_pgt_page(&pgt_data); - } -} - -/* - * Adds the specified range to what will become the new identity mappings. - * Once all ranges have been added, the new mapping is activated by calling - * finalize_identity_maps() below. - */ -void add_identity_map(unsigned long start, unsigned long size) -{ - unsigned long end = start + size; - - /* Align boundary to 2M. */ - start = round_down(start, PMD_SIZE); - end = round_up(end, PMD_SIZE); - if (start >= end) - return; - - /* Build the mapping. */ - kernel_ident_mapping_init(&mapping_info, (pgd_t *)top_level_pgt, - start, end); -} - -/* - * This switches the page tables to the new level4 that has been built - * via calls to add_identity_map() above. If booted via startup_32(), - * this is effectively a no-op. - */ -void finalize_identity_maps(void) -{ - write_cr3(top_level_pgt); -} diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index e478e40fbe5a..267e7f93050e 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -442,6 +442,13 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, parse_elf(output); handle_relocations(output, output_len, virt_addr); debug_putstr("done.\nBooting the kernel.\n"); + + /* + * Flush GHCB from cache and map it encrypted again when running as + * SEV-ES guest. + */ + sev_es_shutdown_ghcb(); + return output; } diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 726e264410ff..6d31f1b4c4d1 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -23,6 +23,7 @@ #include <asm/page.h> #include <asm/boot.h> #include <asm/bootparam.h> +#include <asm/desc_defs.h> #define BOOT_CTYPE_H #include <linux/acpi.h> @@ -36,6 +37,9 @@ #define memptr unsigned #endif +/* boot/compressed/vmlinux start and end markers */ +extern char _head[], _end[]; + /* misc.c */ extern memptr free_mem_ptr; extern memptr free_mem_end_ptr; @@ -70,8 +74,8 @@ int cmdline_find_option(const char *option, char *buffer, int bufsize); int cmdline_find_option_bool(const char *option); struct mem_vector { - unsigned long long start; - unsigned long long size; + u64 start; + u64 size; }; #if CONFIG_RANDOMIZE_BASE @@ -81,8 +85,6 @@ void choose_random_location(unsigned long input, unsigned long *output, unsigned long output_size, unsigned long *virt_addr); -/* cpuflags.c */ -bool has_cpuflag(int flag); #else static inline void choose_random_location(unsigned long input, unsigned long input_size, @@ -93,18 +95,14 @@ static inline void choose_random_location(unsigned long input, } #endif +/* cpuflags.c */ +bool has_cpuflag(int flag); + #ifdef CONFIG_X86_64 -void initialize_identity_maps(void); -void add_identity_map(unsigned long start, unsigned long size); -void finalize_identity_maps(void); +extern int set_page_decrypted(unsigned long address); +extern int set_page_encrypted(unsigned long address); +extern int set_page_non_present(unsigned long address); extern unsigned char _pgtable[]; -#else -static inline void initialize_identity_maps(void) -{ } -static inline void add_identity_map(unsigned long start, unsigned long size) -{ } -static inline void finalize_identity_maps(void) -{ } #endif #ifdef CONFIG_EARLY_PRINTK @@ -119,6 +117,17 @@ static inline void console_init(void) void set_sev_encryption_mask(void); +#ifdef CONFIG_AMD_MEM_ENCRYPT +void sev_es_shutdown_ghcb(void); +extern bool sev_es_check_ghcb_fault(unsigned long address); +#else +static inline void sev_es_shutdown_ghcb(void) { } +static inline bool sev_es_check_ghcb_fault(unsigned long address) +{ + return false; +} +#endif + /* acpi.c */ #ifdef CONFIG_ACPI acpi_physical_address get_rsdp_addr(void); @@ -133,4 +142,21 @@ int count_immovable_mem_regions(void); static inline int count_immovable_mem_regions(void) { return 0; } #endif +/* ident_map_64.c */ +#ifdef CONFIG_X86_5LEVEL +extern unsigned int __pgtable_l5_enabled, pgdir_shift, ptrs_per_p4d; +#endif + +/* Used by PAGE_KERN* macros: */ +extern pteval_t __default_kernel_pte_mask; + +/* idt_64.c */ +extern gate_desc boot_idt[BOOT_IDT_ENTRIES]; +extern struct desc_ptr boot_idt_desc; + +/* IDT Entry Points */ +void boot_page_fault(void); +void boot_stage1_vc(void); +void boot_stage2_vc(void); + #endif /* BOOT_COMPRESSED_MISC_H */ diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c index 7e01248765b2..52aa56cdbacc 100644 --- a/arch/x86/boot/compressed/mkpiggy.c +++ b/arch/x86/boot/compressed/mkpiggy.c @@ -60,6 +60,12 @@ int main(int argc, char *argv[]) printf(".incbin \"%s\"\n", argv[1]); printf("input_data_end:\n"); + printf(".section \".rodata\",\"a\",@progbits\n"); + printf(".globl input_len\n"); + printf("input_len:\n\t.long %lu\n", ilen); + printf(".globl output_len\n"); + printf("output_len:\n\t.long %lu\n", (unsigned long)olen); + retval = 0; bail: if (f) diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c index c8862696a47b..2a78746f5a4c 100644 --- a/arch/x86/boot/compressed/pgtable_64.c +++ b/arch/x86/boot/compressed/pgtable_64.c @@ -5,18 +5,16 @@ #include "pgtable.h" #include "../string.h" -/* - * __force_order is used by special_insns.h asm code to force instruction - * serialization. - * - * It is not referenced from the code, but GCC < 5 with -fPIE would fail - * due to an undefined symbol. Define it to make these ancient GCCs work. - */ -unsigned long __force_order; - #define BIOS_START_MIN 0x20000U /* 128K, less than this is insane */ #define BIOS_START_MAX 0x9f000U /* 640K, absolute maximum */ +#ifdef CONFIG_X86_5LEVEL +/* __pgtable_l5_enabled needs to be in .data to avoid being cleared along with .bss */ +unsigned int __section(".data") __pgtable_l5_enabled; +unsigned int __section(".data") pgdir_shift = 39; +unsigned int __section(".data") ptrs_per_p4d = 1; +#endif + struct paging_config { unsigned long trampoline_start; unsigned long l5_required; @@ -32,7 +30,7 @@ static char trampoline_save[TRAMPOLINE_32BIT_SIZE]; * Avoid putting the pointer into .bss as it will be cleared between * paging_prepare() and extract_kernel(). */ -unsigned long *trampoline_32bit __section(.data); +unsigned long *trampoline_32bit __section(".data"); extern struct boot_params *boot_params; int cmdline_find_option_bool(const char *option); @@ -207,4 +205,13 @@ void cleanup_trampoline(void *pgtable) /* Restore trampoline memory */ memcpy(trampoline_32bit, trampoline_save, TRAMPOLINE_32BIT_SIZE); + + /* Initialize variables for 5-level paging */ +#ifdef CONFIG_X86_5LEVEL + if (__read_cr4() & X86_CR4_LA57) { + __pgtable_l5_enabled = 1; + pgdir_shift = 48; + ptrs_per_p4d = 512; + } +#endif } diff --git a/arch/x86/boot/compressed/sev-es.c b/arch/x86/boot/compressed/sev-es.c new file mode 100644 index 000000000000..954cb2702e23 --- /dev/null +++ b/arch/x86/boot/compressed/sev-es.c @@ -0,0 +1,214 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * AMD Encrypted Register State Support + * + * Author: Joerg Roedel <jroedel@suse.de> + */ + +/* + * misc.h needs to be first because it knows how to include the other kernel + * headers in the pre-decompression code in a way that does not break + * compilation. + */ +#include "misc.h" + +#include <asm/pgtable_types.h> +#include <asm/sev-es.h> +#include <asm/trapnr.h> +#include <asm/trap_pf.h> +#include <asm/msr-index.h> +#include <asm/fpu/xcr.h> +#include <asm/ptrace.h> +#include <asm/svm.h> + +#include "error.h" + +struct ghcb boot_ghcb_page __aligned(PAGE_SIZE); +struct ghcb *boot_ghcb; + +/* + * Copy a version of this function here - insn-eval.c can't be used in + * pre-decompression code. + */ +static bool insn_has_rep_prefix(struct insn *insn) +{ + int i; + + insn_get_prefixes(insn); + + for (i = 0; i < insn->prefixes.nbytes; i++) { + insn_byte_t p = insn->prefixes.bytes[i]; + + if (p == 0xf2 || p == 0xf3) + return true; + } + + return false; +} + +/* + * Only a dummy for insn_get_seg_base() - Early boot-code is 64bit only and + * doesn't use segments. + */ +static unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx) +{ + return 0UL; +} + +static inline u64 sev_es_rd_ghcb_msr(void) +{ + unsigned long low, high; + + asm volatile("rdmsr" : "=a" (low), "=d" (high) : + "c" (MSR_AMD64_SEV_ES_GHCB)); + + return ((high << 32) | low); +} + +static inline void sev_es_wr_ghcb_msr(u64 val) +{ + u32 low, high; + + low = val & 0xffffffffUL; + high = val >> 32; + + asm volatile("wrmsr" : : "c" (MSR_AMD64_SEV_ES_GHCB), + "a"(low), "d" (high) : "memory"); +} + +static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt) +{ + char buffer[MAX_INSN_SIZE]; + enum es_result ret; + + memcpy(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE); + + insn_init(&ctxt->insn, buffer, MAX_INSN_SIZE, 1); + insn_get_length(&ctxt->insn); + + ret = ctxt->insn.immediate.got ? ES_OK : ES_DECODE_FAILED; + + return ret; +} + +static enum es_result vc_write_mem(struct es_em_ctxt *ctxt, + void *dst, char *buf, size_t size) +{ + memcpy(dst, buf, size); + + return ES_OK; +} + +static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, + void *src, char *buf, size_t size) +{ + memcpy(buf, src, size); + + return ES_OK; +} + +#undef __init +#undef __pa +#define __init +#define __pa(x) ((unsigned long)(x)) + +#define __BOOT_COMPRESSED + +/* Basic instruction decoding support needed */ +#include "../../lib/inat.c" +#include "../../lib/insn.c" + +/* Include code for early handlers */ +#include "../../kernel/sev-es-shared.c" + +static bool early_setup_sev_es(void) +{ + if (!sev_es_negotiate_protocol()) + sev_es_terminate(GHCB_SEV_ES_REASON_PROTOCOL_UNSUPPORTED); + + if (set_page_decrypted((unsigned long)&boot_ghcb_page)) + return false; + + /* Page is now mapped decrypted, clear it */ + memset(&boot_ghcb_page, 0, sizeof(boot_ghcb_page)); + + boot_ghcb = &boot_ghcb_page; + + /* Initialize lookup tables for the instruction decoder */ + inat_init_tables(); + + return true; +} + +void sev_es_shutdown_ghcb(void) +{ + if (!boot_ghcb) + return; + + if (!sev_es_check_cpu_features()) + error("SEV-ES CPU Features missing."); + + /* + * GHCB Page must be flushed from the cache and mapped encrypted again. + * Otherwise the running kernel will see strange cache effects when + * trying to use that page. + */ + if (set_page_encrypted((unsigned long)&boot_ghcb_page)) + error("Can't map GHCB page encrypted"); + + /* + * GHCB page is mapped encrypted again and flushed from the cache. + * Mark it non-present now to catch bugs when #VC exceptions trigger + * after this point. + */ + if (set_page_non_present((unsigned long)&boot_ghcb_page)) + error("Can't unmap GHCB page"); +} + +bool sev_es_check_ghcb_fault(unsigned long address) +{ + /* Check whether the fault was on the GHCB page */ + return ((address & PAGE_MASK) == (unsigned long)&boot_ghcb_page); +} + +void do_boot_stage2_vc(struct pt_regs *regs, unsigned long exit_code) +{ + struct es_em_ctxt ctxt; + enum es_result result; + + if (!boot_ghcb && !early_setup_sev_es()) + sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST); + + vc_ghcb_invalidate(boot_ghcb); + result = vc_init_em_ctxt(&ctxt, regs, exit_code); + if (result != ES_OK) + goto finish; + + switch (exit_code) { + case SVM_EXIT_RDTSC: + case SVM_EXIT_RDTSCP: + result = vc_handle_rdtsc(boot_ghcb, &ctxt, exit_code); + break; + case SVM_EXIT_IOIO: + result = vc_handle_ioio(boot_ghcb, &ctxt); + break; + case SVM_EXIT_CPUID: + result = vc_handle_cpuid(boot_ghcb, &ctxt); + break; + default: + result = ES_UNSUPPORTED; + break; + } + +finish: + if (result == ES_OK) { + vc_finish_insn(&ctxt); + } else if (result != ES_RETRY) { + /* + * For now, just halt the machine. That makes debugging easier, + * later we just call sev_es_terminate() here. + */ + while (true) + asm volatile("hlt\n"); + } +} diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S index 8f1025d1f681..112b2375d021 100644 --- a/arch/x86/boot/compressed/vmlinux.lds.S +++ b/arch/x86/boot/compressed/vmlinux.lds.S @@ -42,12 +42,6 @@ SECTIONS *(.rodata.*) _erodata = . ; } - .got : { - _got = .; - KEEP(*(.got.plt)) - KEEP(*(.got)) - _egot = .; - } .data : { _data = . ; *(.data) @@ -75,5 +69,49 @@ SECTIONS . = ALIGN(PAGE_SIZE); /* keep ZO size page aligned */ _end = .; + STABS_DEBUG + DWARF_DEBUG + ELF_DETAILS + DISCARDS + /DISCARD/ : { + *(.dynamic) *(.dynsym) *(.dynstr) *(.dynbss) + *(.hash) *(.gnu.hash) + *(.note.*) + } + + .got.plt (INFO) : { + *(.got.plt) + } + ASSERT(SIZEOF(.got.plt) == 0 || +#ifdef CONFIG_X86_64 + SIZEOF(.got.plt) == 0x18, +#else + SIZEOF(.got.plt) == 0xc, +#endif + "Unexpected GOT/PLT entries detected!") + + /* + * Sections that should stay zero sized, which is safer to + * explicitly check instead of blindly discarding. + */ + .got : { + *(.got) + } + ASSERT(SIZEOF(.got) == 0, "Unexpected GOT entries detected!") + + .plt : { + *(.plt) *(.plt.*) + } + ASSERT(SIZEOF(.plt) == 0, "Unexpected run-time procedure linkages detected!") + + .rel.dyn : { + *(.rel.*) *(.rel_*) + } + ASSERT(SIZEOF(.rel.dyn) == 0, "Unexpected run-time relocations (.rel) detected!") + + .rela.dyn : { + *(.rela.*) *(.rela_*) + } + ASSERT(SIZEOF(.rela.dyn) == 0, "Unexpected run-time relocations (.rela) detected!") } diff --git a/arch/x86/boot/setup.ld b/arch/x86/boot/setup.ld index 24c95522f231..49546c247ae2 100644 --- a/arch/x86/boot/setup.ld +++ b/arch/x86/boot/setup.ld @@ -20,7 +20,7 @@ SECTIONS .initdata : { *(.initdata) } __end_init = .; - .text : { *(.text) } + .text : { *(.text .text.*) } .text32 : { *(.text32) } . = ALIGN(16); diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c index c8b8c1a8d1fc..a3725ad46c5a 100644 --- a/arch/x86/boot/tools/build.c +++ b/arch/x86/boot/tools/build.c @@ -416,8 +416,6 @@ int main(int argc, char ** argv) /* Set the default root device */ put_unaligned_le16(DEFAULT_ROOT_DEV, &buf[508]); - printf("Setup is %d bytes (padded to %d bytes).\n", c, i); - /* Open and stat the kernel file */ fd = open(argv[2], O_RDONLY); if (fd < 0) @@ -425,7 +423,6 @@ int main(int argc, char ** argv) if (fstat(fd, &sb)) die("Unable to stat `%s': %m", argv[2]); sz = sb.st_size; - printf("System is %d kB\n", (sz+1023)/1024); kernel = mmap(NULL, sz, PROT_READ, MAP_SHARED, fd, 0); if (kernel == MAP_FAILED) die("Unable to mmap '%s': %m", argv[2]); @@ -488,7 +485,6 @@ int main(int argc, char ** argv) } /* Write the CRC */ - printf("CRC %x\n", crc); put_unaligned_le32(crc, buf); if (fwrite(buf, 1, 4, dest) != 4) die("Writing CRC failed"); diff --git a/arch/x86/boot/tty.c b/arch/x86/boot/tty.c index 1fedabdb95ad..f7eb976b0a4b 100644 --- a/arch/x86/boot/tty.c +++ b/arch/x86/boot/tty.c @@ -25,7 +25,7 @@ int early_serial_base; * error during initialization. */ -static void __attribute__((section(".inittext"))) serial_putchar(int ch) +static void __section(".inittext") serial_putchar(int ch) { unsigned timeout = 0xffff; @@ -35,7 +35,7 @@ static void __attribute__((section(".inittext"))) serial_putchar(int ch) outb(ch, early_serial_base + TXR); } -static void __attribute__((section(".inittext"))) bios_putchar(int ch) +static void __section(".inittext") bios_putchar(int ch) { struct biosregs ireg; @@ -47,7 +47,7 @@ static void __attribute__((section(".inittext"))) bios_putchar(int ch) intcall(0x10, &ireg, NULL); } -void __attribute__((section(".inittext"))) putchar(int ch) +void __section(".inittext") putchar(int ch) { if (ch == '\n') putchar('\r'); /* \n -> \r\n */ @@ -58,7 +58,7 @@ void __attribute__((section(".inittext"))) putchar(int ch) serial_putchar(ch); } -void __attribute__((section(".inittext"))) puts(const char *str) +void __section(".inittext") puts(const char *str) { while (*str) putchar(*str++); diff --git a/arch/x86/boot/video.h b/arch/x86/boot/video.h index cbf7fed22441..04bde0bb2003 100644 --- a/arch/x86/boot/video.h +++ b/arch/x86/boot/video.h @@ -78,7 +78,7 @@ struct card_info { u16 xmode_n; /* Size of unprobed mode range */ }; -#define __videocard struct card_info __attribute__((used,section(".videocards"))) +#define __videocard struct card_info __section(".videocards") __attribute__((used)) extern struct card_info video_cards[], video_cards_end[]; int mode_defined(u16 mode); /* video.c */ diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index d7577fece9eb..78210793d357 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -19,6 +19,7 @@ CONFIG_CGROUP_CPUACCT=y CONFIG_BLK_DEV_INITRD=y # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y +# CONFIG_64BIT is not set CONFIG_SMP=y CONFIG_X86_GENERIC=y CONFIG_HPET_TIMER=y @@ -186,7 +187,6 @@ CONFIG_DRM_I915=y CONFIG_FB_MODE_HELPERS=y CONFIG_FB_TILEBLITTING=y CONFIG_FB_EFI=y -CONFIG_VGACON_SOFT_SCROLLBACK=y CONFIG_LOGO=y # CONFIG_LOGO_LINUX_MONO is not set # CONFIG_LOGO_LINUX_VGA16 is not set diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index f85600143747..9936528e1939 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -181,7 +181,6 @@ CONFIG_DRM_I915=y CONFIG_FB_MODE_HELPERS=y CONFIG_FB_TILEBLITTING=y CONFIG_FB_EFI=y -CONFIG_VGACON_SOFT_SCROLLBACK=y CONFIG_LOGO=y # CONFIG_LOGO_LINUX_MONO is not set # CONFIG_LOGO_LINUX_VGA16 is not set diff --git a/arch/x86/crypto/blake2s-glue.c b/arch/x86/crypto/blake2s-glue.c index 6737bcea1fa1..c025a01cf708 100644 --- a/arch/x86/crypto/blake2s-glue.c +++ b/arch/x86/crypto/blake2s-glue.c @@ -11,6 +11,7 @@ #include <linux/jump_label.h> #include <linux/kernel.h> #include <linux/module.h> +#include <linux/sizes.h> #include <asm/cpufeature.h> #include <asm/fpu/api.h> diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c index e67a59130025..7b3a1cf0984b 100644 --- a/arch/x86/crypto/chacha_glue.c +++ b/arch/x86/crypto/chacha_glue.c @@ -12,6 +12,7 @@ #include <crypto/internal/skcipher.h> #include <linux/kernel.h> #include <linux/module.h> +#include <linux/sizes.h> #include <asm/simd.h> asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c index d2d069bd459b..feccb5254c7e 100644 --- a/arch/x86/crypto/crc32c-intel_glue.c +++ b/arch/x86/crypto/crc32c-intel_glue.c @@ -28,9 +28,9 @@ #define SCALE_F sizeof(unsigned long) #ifdef CONFIG_X86_64 -#define REX_PRE "0x48, " +#define CRC32_INST "crc32q %1, %q0" #else -#define REX_PRE +#define CRC32_INST "crc32l %1, %0" #endif #ifdef CONFIG_X86_64 @@ -48,11 +48,8 @@ asmlinkage unsigned int crc_pcl(const u8 *buffer, int len, static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length) { while (length--) { - __asm__ __volatile__( - ".byte 0xf2, 0xf, 0x38, 0xf0, 0xf1" - :"=S"(crc) - :"0"(crc), "c"(*data) - ); + asm("crc32b %1, %0" + : "+r" (crc) : "rm" (*data)); data++; } @@ -66,11 +63,8 @@ static u32 __pure crc32c_intel_le_hw(u32 crc, unsigned char const *p, size_t len unsigned long *ptmp = (unsigned long *)p; while (iquotient--) { - __asm__ __volatile__( - ".byte 0xf2, " REX_PRE "0xf, 0x38, 0xf1, 0xf1;" - :"=S"(crc) - :"0"(crc), "c"(*ptmp) - ); + asm(CRC32_INST + : "+r" (crc) : "rm" (*ptmp)); ptmp++; } diff --git a/arch/x86/crypto/curve25519-x86_64.c b/arch/x86/crypto/curve25519-x86_64.c index 8acbb6584a37..5af8021b98ce 100644 --- a/arch/x86/crypto/curve25519-x86_64.c +++ b/arch/x86/crypto/curve25519-x86_64.c @@ -11,6 +11,7 @@ #include <linux/jump_label.h> #include <linux/kernel.h> #include <linux/module.h> +#include <linux/scatterlist.h> #include <asm/cpufeature.h> #include <asm/processor.h> @@ -45,11 +46,11 @@ static inline u64 add_scalar(u64 *out, const u64 *f1, u64 f2) asm volatile( /* Clear registers to propagate the carry bit */ - " xor %%r8, %%r8;" - " xor %%r9, %%r9;" - " xor %%r10, %%r10;" - " xor %%r11, %%r11;" - " xor %1, %1;" + " xor %%r8d, %%r8d;" + " xor %%r9d, %%r9d;" + " xor %%r10d, %%r10d;" + " xor %%r11d, %%r11d;" + " xor %k1, %k1;" /* Begin addition chain */ " addq 0(%3), %0;" @@ -93,7 +94,7 @@ static inline void fadd(u64 *out, const u64 *f1, const u64 *f2) " cmovc %0, %%rax;" /* Step 2: Add carry*38 to the original sum */ - " xor %%rcx, %%rcx;" + " xor %%ecx, %%ecx;" " add %%rax, %%r8;" " adcx %%rcx, %%r9;" " movq %%r9, 8(%1);" @@ -165,28 +166,28 @@ static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp) /* Compute src1[0] * src2 */ " movq 0(%1), %%rdx;" - " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " movq %%r8, 0(%0);" + " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " movq %%r8, 0(%0);" " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 8(%0);" " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;" " adox %%rdx, %%rax;" /* Compute src1[1] * src2 */ " movq 8(%1), %%rdx;" - " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);" + " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);" " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 16(%0);" " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;" " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;" " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" /* Compute src1[2] * src2 */ " movq 16(%1), %%rdx;" - " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);" + " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);" " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 24(%0);" " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;" " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;" " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" /* Compute src1[3] * src2 */ " movq 24(%1), %%rdx;" - " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);" + " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);" " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 32(%0);" " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 40(%0);" " mov $0, %%r8;" " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 48(%0);" " mov $0, %%rax;" @@ -200,7 +201,7 @@ static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp) /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */ " mov $38, %%rdx;" " mulxq 32(%1), %%r8, %%r13;" - " xor %3, %3;" + " xor %k3, %k3;" " adoxq 0(%1), %%r8;" " mulxq 40(%1), %%r9, %%rbx;" " adcx %%r13, %%r9;" @@ -246,28 +247,28 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp) /* Compute src1[0] * src2 */ " movq 0(%1), %%rdx;" - " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " movq %%r8, 0(%0);" + " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " movq %%r8, 0(%0);" " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 8(%0);" " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;" " adox %%rdx, %%rax;" /* Compute src1[1] * src2 */ " movq 8(%1), %%rdx;" - " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);" + " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);" " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 16(%0);" " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;" " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;" " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" /* Compute src1[2] * src2 */ " movq 16(%1), %%rdx;" - " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);" + " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);" " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 24(%0);" " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;" " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;" " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" /* Compute src1[3] * src2 */ " movq 24(%1), %%rdx;" - " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);" + " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);" " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 32(%0);" " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 40(%0);" " mov $0, %%r8;" " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 48(%0);" " mov $0, %%rax;" @@ -277,29 +278,29 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp) /* Compute src1[0] * src2 */ " movq 32(%1), %%rdx;" - " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " movq %%r8, 64(%0);" - " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 72(%0);" + " mulxq 32(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " movq %%r8, 64(%0);" + " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 72(%0);" " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;" " adox %%rdx, %%rax;" /* Compute src1[1] * src2 */ " movq 40(%1), %%rdx;" - " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 72(%0), %%r8;" " movq %%r8, 72(%0);" - " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 80(%0);" + " mulxq 32(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 72(%0), %%r8;" " movq %%r8, 72(%0);" + " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 80(%0);" " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;" " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;" " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" /* Compute src1[2] * src2 */ " movq 48(%1), %%rdx;" - " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 80(%0), %%r8;" " movq %%r8, 80(%0);" - " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 88(%0);" + " mulxq 32(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 80(%0), %%r8;" " movq %%r8, 80(%0);" + " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 88(%0);" " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;" " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;" " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" /* Compute src1[3] * src2 */ " movq 56(%1), %%rdx;" - " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 88(%0), %%r8;" " movq %%r8, 88(%0);" - " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 96(%0);" + " mulxq 32(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 88(%0), %%r8;" " movq %%r8, 88(%0);" + " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 96(%0);" " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 104(%0);" " mov $0, %%r8;" " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 112(%0);" " mov $0, %%rax;" " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 120(%0);" @@ -312,7 +313,7 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp) /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */ " mov $38, %%rdx;" " mulxq 32(%1), %%r8, %%r13;" - " xor %3, %3;" + " xor %k3, %k3;" " adoxq 0(%1), %%r8;" " mulxq 40(%1), %%r9, %%rbx;" " adcx %%r13, %%r9;" @@ -345,7 +346,7 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp) /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */ " mov $38, %%rdx;" " mulxq 96(%1), %%r8, %%r13;" - " xor %3, %3;" + " xor %k3, %k3;" " adoxq 64(%1), %%r8;" " mulxq 104(%1), %%r9, %%rbx;" " adcx %%r13, %%r9;" @@ -516,7 +517,7 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp) /* Step 1: Compute all partial products */ " movq 0(%1), %%rdx;" /* f[0] */ - " mulxq 8(%1), %%r8, %%r14;" " xor %%r15, %%r15;" /* f[1]*f[0] */ + " mulxq 8(%1), %%r8, %%r14;" " xor %%r15d, %%r15d;" /* f[1]*f[0] */ " mulxq 16(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */ " mulxq 24(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */ " movq 24(%1), %%rdx;" /* f[3] */ @@ -526,7 +527,7 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp) " mulxq 16(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */ /* Step 2: Compute two parallel carry chains */ - " xor %%r15, %%r15;" + " xor %%r15d, %%r15d;" " adox %%rax, %%r10;" " adcx %%r8, %%r8;" " adox %%rcx, %%r11;" @@ -563,7 +564,7 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp) /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */ " mov $38, %%rdx;" " mulxq 32(%1), %%r8, %%r13;" - " xor %%rcx, %%rcx;" + " xor %%ecx, %%ecx;" " adoxq 0(%1), %%r8;" " mulxq 40(%1), %%r9, %%rbx;" " adcx %%r13, %%r9;" @@ -607,7 +608,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp) asm volatile( /* Step 1: Compute all partial products */ " movq 0(%1), %%rdx;" /* f[0] */ - " mulxq 8(%1), %%r8, %%r14;" " xor %%r15, %%r15;" /* f[1]*f[0] */ + " mulxq 8(%1), %%r8, %%r14;" " xor %%r15d, %%r15d;" /* f[1]*f[0] */ " mulxq 16(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */ " mulxq 24(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */ " movq 24(%1), %%rdx;" /* f[3] */ @@ -617,7 +618,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp) " mulxq 16(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */ /* Step 2: Compute two parallel carry chains */ - " xor %%r15, %%r15;" + " xor %%r15d, %%r15d;" " adox %%rax, %%r10;" " adcx %%r8, %%r8;" " adox %%rcx, %%r11;" @@ -647,7 +648,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp) /* Step 1: Compute all partial products */ " movq 32(%1), %%rdx;" /* f[0] */ - " mulxq 40(%1), %%r8, %%r14;" " xor %%r15, %%r15;" /* f[1]*f[0] */ + " mulxq 40(%1), %%r8, %%r14;" " xor %%r15d, %%r15d;" /* f[1]*f[0] */ " mulxq 48(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */ " mulxq 56(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */ " movq 56(%1), %%rdx;" /* f[3] */ @@ -657,7 +658,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp) " mulxq 48(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */ /* Step 2: Compute two parallel carry chains */ - " xor %%r15, %%r15;" + " xor %%r15d, %%r15d;" " adox %%rax, %%r10;" " adcx %%r8, %%r8;" " adox %%rcx, %%r11;" @@ -692,7 +693,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp) /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */ " mov $38, %%rdx;" " mulxq 32(%1), %%r8, %%r13;" - " xor %%rcx, %%rcx;" + " xor %%ecx, %%ecx;" " adoxq 0(%1), %%r8;" " mulxq 40(%1), %%r9, %%rbx;" " adcx %%r13, %%r9;" @@ -725,7 +726,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp) /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */ " mov $38, %%rdx;" " mulxq 96(%1), %%r8, %%r13;" - " xor %%rcx, %%rcx;" + " xor %%ecx, %%ecx;" " adoxq 64(%1), %%r8;" " mulxq 104(%1), %%r9, %%rbx;" " adcx %%r13, %%r9;" diff --git a/arch/x86/crypto/nhpoly1305-avx2-glue.c b/arch/x86/crypto/nhpoly1305-avx2-glue.c index 80fcb85736e1..8ea5ab0f1ca7 100644 --- a/arch/x86/crypto/nhpoly1305-avx2-glue.c +++ b/arch/x86/crypto/nhpoly1305-avx2-glue.c @@ -10,6 +10,7 @@ #include <crypto/internal/simd.h> #include <crypto/nhpoly1305.h> #include <linux/module.h> +#include <linux/sizes.h> #include <asm/simd.h> asmlinkage void nh_avx2(const u32 *key, const u8 *message, size_t message_len, diff --git a/arch/x86/crypto/nhpoly1305-sse2-glue.c b/arch/x86/crypto/nhpoly1305-sse2-glue.c index cc6b7c1a2705..2b353d42ed13 100644 --- a/arch/x86/crypto/nhpoly1305-sse2-glue.c +++ b/arch/x86/crypto/nhpoly1305-sse2-glue.c @@ -10,6 +10,7 @@ #include <crypto/internal/simd.h> #include <crypto/nhpoly1305.h> #include <linux/module.h> +#include <linux/sizes.h> #include <asm/simd.h> asmlinkage void nh_sse2(const u32 *key, const u8 *message, size_t message_len, diff --git a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl index 137edcf038cb..7d568012cc15 100644 --- a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl +++ b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl @@ -246,7 +246,7 @@ $code.=<<___ if (!$kernel); ___ &declare_function("poly1305_init_x86_64", 32, 3); $code.=<<___; - xor %rax,%rax + xor %eax,%eax mov %rax,0($ctx) # initialize hash value mov %rax,8($ctx) mov %rax,16($ctx) @@ -2853,7 +2853,7 @@ $code.=<<___; .type poly1305_init_base2_44,\@function,3 .align 32 poly1305_init_base2_44: - xor %rax,%rax + xor %eax,%eax mov %rax,0($ctx) # initialize hash value mov %rax,8($ctx) mov %rax,16($ctx) @@ -3947,7 +3947,7 @@ xor128_decrypt_n_pad: mov \$16,$len sub %r10,$len xor %eax,%eax - xor %r11,%r11 + xor %r11d,%r11d .Loop_dec_byte: mov ($inp,$otp),%r11b mov ($otp),%al @@ -4085,7 +4085,7 @@ avx_handler: .long 0xa548f3fc # cld; rep movsq mov $disp,%rsi - xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + xor %ecx,%ecx # arg1, UNW_FLAG_NHANDLER mov 8(%rsi),%rdx # arg2, disp->ImageBase mov 0(%rsi),%r8 # arg3, disp->ControlPc mov 16(%rsi),%r9 # arg4, disp->FunctionEntry diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c index dfe921efa9b2..e508dbd91813 100644 --- a/arch/x86/crypto/poly1305_glue.c +++ b/arch/x86/crypto/poly1305_glue.c @@ -11,6 +11,7 @@ #include <linux/jump_label.h> #include <linux/kernel.h> #include <linux/module.h> +#include <linux/sizes.h> #include <asm/intel-family.h> #include <asm/simd.h> @@ -157,9 +158,6 @@ static unsigned int crypto_poly1305_setdctxkey(struct poly1305_desc_ctx *dctx, dctx->s[1] = get_unaligned_le32(&inp[4]); dctx->s[2] = get_unaligned_le32(&inp[8]); dctx->s[3] = get_unaligned_le32(&inp[12]); - inp += POLY1305_BLOCK_SIZE; - len -= POLY1305_BLOCK_SIZE; - acc += POLY1305_BLOCK_SIZE; dctx->sset = true; } } diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index ae9b0d4615b3..07a9331d55e7 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -6,7 +6,6 @@ #include <asm/percpu.h> #include <asm/asm-offsets.h> #include <asm/processor-flags.h> -#include <asm/inst.h> /* diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 2f84c7ca74ea..870efeec8bda 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -299,7 +299,7 @@ __visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs) old_regs = set_irq_regs(regs); instrumentation_begin(); - run_on_irqstack_cond(__xen_pv_evtchn_do_upcall, NULL, regs); + run_on_irqstack_cond(__xen_pv_evtchn_do_upcall, regs); instrumentation_begin(); set_irq_regs(old_regs); diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 70dea9337816..cad08703c4ad 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -46,13 +46,13 @@ .code64 .section .entry.text, "ax" -#ifdef CONFIG_PARAVIRT +#ifdef CONFIG_PARAVIRT_XXL SYM_CODE_START(native_usergs_sysret64) UNWIND_HINT_EMPTY swapgs sysretq SYM_CODE_END(native_usergs_sysret64) -#endif /* CONFIG_PARAVIRT */ +#endif /* CONFIG_PARAVIRT_XXL */ /* * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers. @@ -101,6 +101,8 @@ SYM_CODE_START(entry_SYSCALL_64) SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp +SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL) + /* Construct struct pt_regs on stack */ pushq $__USER_DS /* pt_regs->ss */ pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2) /* pt_regs->sp */ @@ -446,6 +448,84 @@ _ASM_NOKPROBE(\asmsym) SYM_CODE_END(\asmsym) .endm +#ifdef CONFIG_AMD_MEM_ENCRYPT +/** + * idtentry_vc - Macro to generate entry stub for #VC + * @vector: Vector number + * @asmsym: ASM symbol for the entry point + * @cfunc: C function to be called + * + * The macro emits code to set up the kernel context for #VC. The #VC handler + * runs on an IST stack and needs to be able to cause nested #VC exceptions. + * + * To make this work the #VC entry code tries its best to pretend it doesn't use + * an IST stack by switching to the task stack if coming from user-space (which + * includes early SYSCALL entry path) or back to the stack in the IRET frame if + * entered from kernel-mode. + * + * If entered from kernel-mode the return stack is validated first, and if it is + * not safe to use (e.g. because it points to the entry stack) the #VC handler + * will switch to a fall-back stack (VC2) and call a special handler function. + * + * The macro is only used for one vector, but it is planned to be extended in + * the future for the #HV exception. + */ +.macro idtentry_vc vector asmsym cfunc +SYM_CODE_START(\asmsym) + UNWIND_HINT_IRET_REGS + ASM_CLAC + + /* + * If the entry is from userspace, switch stacks and treat it as + * a normal entry. + */ + testb $3, CS-ORIG_RAX(%rsp) + jnz .Lfrom_usermode_switch_stack_\@ + + /* + * paranoid_entry returns SWAPGS flag for paranoid_exit in EBX. + * EBX == 0 -> SWAPGS, EBX == 1 -> no SWAPGS + */ + call paranoid_entry + + UNWIND_HINT_REGS + + /* + * Switch off the IST stack to make it free for nested exceptions. The + * vc_switch_off_ist() function will switch back to the interrupted + * stack if it is safe to do so. If not it switches to the VC fall-back + * stack. + */ + movq %rsp, %rdi /* pt_regs pointer */ + call vc_switch_off_ist + movq %rax, %rsp /* Switch to new stack */ + + UNWIND_HINT_REGS + + /* Update pt_regs */ + movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/ + movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ + + movq %rsp, %rdi /* pt_regs pointer */ + + call \cfunc + + /* + * No need to switch back to the IST stack. The current stack is either + * identical to the stack in the IRET frame or the VC fall-back stack, + * so it is definitly mapped even with PTI enabled. + */ + jmp paranoid_exit + + /* Switch to the regular task stack */ +.Lfrom_usermode_switch_stack_\@: + idtentry_body safe_stack_\cfunc, has_error_code=1 + +_ASM_NOKPROBE(\asmsym) +SYM_CODE_END(\asmsym) +.endm +#endif + /* * Double fault entry. Straight paranoid. No checks from which context * this comes because for the espfix induced #DF this would do the wrong @@ -682,6 +762,8 @@ SYM_CODE_END(.Lbad_gs) * rdx: Function argument (can be NULL if none) */ SYM_FUNC_START(asm_call_on_stack) +SYM_INNER_LABEL(asm_call_sysvec_on_stack, SYM_L_GLOBAL) +SYM_INNER_LABEL(asm_call_irq_on_stack, SYM_L_GLOBAL) /* * Save the frame pointer unconditionally. This allows the ORC * unwinder to handle the stack switch. @@ -840,8 +922,9 @@ SYM_CODE_START_LOCAL(paranoid_entry) * retrieve and set the current CPUs kernel GSBASE. The stored value * has to be restored in paranoid_exit unconditionally. * - * The MSR write ensures that no subsequent load is based on a - * mispredicted GSBASE. No extra FENCE required. + * The unconditional write to GS base below ensures that no subsequent + * loads based on a mispredicted GS base can happen, therefore no LFENCE + * is needed here. */ SAVE_AND_SET_GSBASE scratch_reg=%rax save_reg=%rbx ret diff --git a/arch/x86/entry/syscall_x32.c b/arch/x86/entry/syscall_x32.c index 1583831f61a9..f2fe0a33bcfd 100644 --- a/arch/x86/entry/syscall_x32.c +++ b/arch/x86/entry/syscall_x32.c @@ -12,8 +12,13 @@ * Reuse the 64-bit entry points for the x32 versions that occupy different * slots in the syscall table. */ +#define __x32_sys_readv __x64_sys_readv +#define __x32_sys_writev __x64_sys_writev #define __x32_sys_getsockopt __x64_sys_getsockopt #define __x32_sys_setsockopt __x64_sys_setsockopt +#define __x32_sys_vmsplice __x64_sys_vmsplice +#define __x32_sys_process_vm_readv __x64_sys_process_vm_readv +#define __x32_sys_process_vm_writev __x64_sys_process_vm_writev #define __SYSCALL_64(nr, sym) diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 9d1102873666..0d0667a9fbd7 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -32,7 +32,7 @@ 18 i386 oldstat sys_stat 19 i386 lseek sys_lseek compat_sys_lseek 20 i386 getpid sys_getpid -21 i386 mount sys_mount compat_sys_mount +21 i386 mount sys_mount 22 i386 umount sys_oldumount 23 i386 setuid sys_setuid16 24 i386 getuid sys_getuid16 @@ -142,7 +142,7 @@ 128 i386 init_module sys_init_module 129 i386 delete_module sys_delete_module 130 i386 get_kernel_syms -131 i386 quotactl sys_quotactl compat_sys_quotactl32 +131 i386 quotactl sys_quotactl 132 i386 getpgid sys_getpgid 133 i386 fchdir sys_fchdir 134 i386 bdflush sys_bdflush @@ -156,8 +156,8 @@ 142 i386 _newselect sys_select compat_sys_select 143 i386 flock sys_flock 144 i386 msync sys_msync -145 i386 readv sys_readv compat_sys_readv -146 i386 writev sys_writev compat_sys_writev +145 i386 readv sys_readv +146 i386 writev sys_writev 147 i386 getsid sys_getsid 148 i386 fdatasync sys_fdatasync 149 i386 _sysctl sys_ni_syscall @@ -327,7 +327,7 @@ 313 i386 splice sys_splice 314 i386 sync_file_range sys_ia32_sync_file_range 315 i386 tee sys_tee -316 i386 vmsplice sys_vmsplice compat_sys_vmsplice +316 i386 vmsplice sys_vmsplice 317 i386 move_pages sys_move_pages compat_sys_move_pages 318 i386 getcpu sys_getcpu 319 i386 epoll_pwait sys_epoll_pwait @@ -358,8 +358,8 @@ 344 i386 syncfs sys_syncfs 345 i386 sendmmsg sys_sendmmsg compat_sys_sendmmsg 346 i386 setns sys_setns -347 i386 process_vm_readv sys_process_vm_readv compat_sys_process_vm_readv -348 i386 process_vm_writev sys_process_vm_writev compat_sys_process_vm_writev +347 i386 process_vm_readv sys_process_vm_readv +348 i386 process_vm_writev sys_process_vm_writev 349 i386 kcmp sys_kcmp 350 i386 finit_module sys_finit_module 351 i386 sched_setattr sys_sched_setattr @@ -444,3 +444,4 @@ 437 i386 openat2 sys_openat2 438 i386 pidfd_getfd sys_pidfd_getfd 439 i386 faccessat2 sys_faccessat2 +440 i386 process_madvise sys_process_madvise diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index f30d6ae9a688..1f47e24fb65c 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -361,6 +361,7 @@ 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 +440 common process_madvise sys_process_madvise # # x32-specific system call numbers start at 512 to avoid cache impact @@ -371,8 +372,8 @@ 512 x32 rt_sigaction compat_sys_rt_sigaction 513 x32 rt_sigreturn compat_sys_x32_rt_sigreturn 514 x32 ioctl compat_sys_ioctl -515 x32 readv compat_sys_readv -516 x32 writev compat_sys_writev +515 x32 readv sys_readv +516 x32 writev sys_writev 517 x32 recvfrom compat_sys_recvfrom 518 x32 sendmsg compat_sys_sendmsg 519 x32 recvmsg compat_sys_recvmsg @@ -388,15 +389,15 @@ 529 x32 waitid compat_sys_waitid 530 x32 set_robust_list compat_sys_set_robust_list 531 x32 get_robust_list compat_sys_get_robust_list -532 x32 vmsplice compat_sys_vmsplice +532 x32 vmsplice sys_vmsplice 533 x32 move_pages compat_sys_move_pages 534 x32 preadv compat_sys_preadv64 535 x32 pwritev compat_sys_pwritev64 536 x32 rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo 537 x32 recvmmsg compat_sys_recvmmsg_time64 538 x32 sendmmsg compat_sys_sendmmsg -539 x32 process_vm_readv compat_sys_process_vm_readv -540 x32 process_vm_writev compat_sys_process_vm_writev +539 x32 process_vm_readv sys_process_vm_readv +540 x32 process_vm_writev sys_process_vm_writev 541 x32 setsockopt sys_setsockopt 542 x32 getsockopt sys_getsockopt 543 x32 io_setup compat_sys_io_setup diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 215376d975a2..21243747965d 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -9,8 +9,6 @@ ARCH_REL_TYPE_ABS := R_X86_64_JUMP_SLOT|R_X86_64_GLOB_DAT|R_X86_64_RELATIVE| ARCH_REL_TYPE_ABS += R_386_GLOB_DAT|R_386_JMP_SLOT|R_386_RELATIVE include $(srctree)/lib/vdso/Makefile -KBUILD_CFLAGS += $(DISABLE_LTO) - # Sanitizer runtimes are unavailable and cannot be linked here. KASAN_SANITIZE := n UBSAN_SANITIZE := n @@ -176,7 +174,7 @@ quiet_cmd_vdso = VDSO $@ -T $(filter %.lds,$^) $(filter %.o,$^) && \ sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@' -VDSO_LDFLAGS = -shared --hash-style=both --build-id \ +VDSO_LDFLAGS = -shared --hash-style=both --build-id=sha1 \ $(call ld-option, --eh-frame-hdr) -Bsymbolic GCOV_PROFILE := n diff --git a/arch/x86/entry/vdso/vdso32/vclock_gettime.c b/arch/x86/entry/vdso/vdso32/vclock_gettime.c index 84a4a73f77f7..283ed9d00426 100644 --- a/arch/x86/entry/vdso/vdso32/vclock_gettime.c +++ b/arch/x86/entry/vdso/vdso32/vclock_gettime.c @@ -14,6 +14,7 @@ #undef CONFIG_ILLEGAL_POINTER_VALUE #undef CONFIG_SPARSEMEM_VMEMMAP #undef CONFIG_NR_CPUS +#undef CONFIG_PARAVIRT_XXL #define CONFIG_X86_32 1 #define CONFIG_PGTABLE_LEVELS 2 diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 26c36357c4c9..40669eac9d6d 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -89,6 +89,7 @@ struct perf_ibs { u64 max_period; unsigned long offset_mask[1]; int offset_max; + unsigned int fetch_count_reset_broken : 1; struct cpu_perf_ibs __percpu *pcpu; struct attribute **format_attrs; @@ -334,11 +335,18 @@ static u64 get_ibs_op_count(u64 config) { u64 count = 0; - if (config & IBS_OP_VAL) - count += (config & IBS_OP_MAX_CNT) << 4; /* cnt rolled over */ - - if (ibs_caps & IBS_CAPS_RDWROPCNT) - count += (config & IBS_OP_CUR_CNT) >> 32; + /* + * If the internal 27-bit counter rolled over, the count is MaxCnt + * and the lower 7 bits of CurCnt are randomized. + * Otherwise CurCnt has the full 27-bit current counter value. + */ + if (config & IBS_OP_VAL) { + count = (config & IBS_OP_MAX_CNT) << 4; + if (ibs_caps & IBS_CAPS_OPCNTEXT) + count += config & IBS_OP_MAX_CNT_EXT_MASK; + } else if (ibs_caps & IBS_CAPS_RDWROPCNT) { + count = (config & IBS_OP_CUR_CNT) >> 32; + } return count; } @@ -363,7 +371,12 @@ perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event, static inline void perf_ibs_enable_event(struct perf_ibs *perf_ibs, struct hw_perf_event *hwc, u64 config) { - wrmsrl(hwc->config_base, hwc->config | config | perf_ibs->enable_mask); + u64 tmp = hwc->config | config; + + if (perf_ibs->fetch_count_reset_broken) + wrmsrl(hwc->config_base, tmp & ~perf_ibs->enable_mask); + + wrmsrl(hwc->config_base, tmp | perf_ibs->enable_mask); } /* @@ -394,7 +407,7 @@ static void perf_ibs_start(struct perf_event *event, int flags) struct hw_perf_event *hwc = &event->hw; struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); - u64 period; + u64 period, config = 0; if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) return; @@ -403,13 +416,19 @@ static void perf_ibs_start(struct perf_event *event, int flags) hwc->state = 0; perf_ibs_set_period(perf_ibs, hwc, &period); + if (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_OPCNTEXT)) { + config |= period & IBS_OP_MAX_CNT_EXT_MASK; + period &= ~IBS_OP_MAX_CNT_EXT_MASK; + } + config |= period >> 4; + /* * Set STARTED before enabling the hardware, such that a subsequent NMI * must observe it. */ set_bit(IBS_STARTED, pcpu->state); clear_bit(IBS_STOPPING, pcpu->state); - perf_ibs_enable_event(perf_ibs, hwc, period >> 4); + perf_ibs_enable_event(perf_ibs, hwc, config); perf_event_update_userpage(event); } @@ -577,7 +596,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) struct perf_ibs_data ibs_data; int offset, size, check_rip, offset_max, throttle = 0; unsigned int msr; - u64 *buf, *config, period; + u64 *buf, *config, period, new_config = 0; if (!test_bit(IBS_STARTED, pcpu->state)) { fail: @@ -626,18 +645,24 @@ fail: perf_ibs->offset_max, offset + 1); } while (offset < offset_max); + /* + * Read IbsBrTarget, IbsOpData4, and IbsExtdCtl separately + * depending on their availability. + * Can't add to offset_max as they are staggered + */ if (event->attr.sample_type & PERF_SAMPLE_RAW) { - /* - * Read IbsBrTarget and IbsOpData4 separately - * depending on their availability. - * Can't add to offset_max as they are staggered - */ - if (ibs_caps & IBS_CAPS_BRNTRGT) { - rdmsrl(MSR_AMD64_IBSBRTARGET, *buf++); - size++; + if (perf_ibs == &perf_ibs_op) { + if (ibs_caps & IBS_CAPS_BRNTRGT) { + rdmsrl(MSR_AMD64_IBSBRTARGET, *buf++); + size++; + } + if (ibs_caps & IBS_CAPS_OPDATA4) { + rdmsrl(MSR_AMD64_IBSOPDATA4, *buf++); + size++; + } } - if (ibs_caps & IBS_CAPS_OPDATA4) { - rdmsrl(MSR_AMD64_IBSOPDATA4, *buf++); + if (perf_ibs == &perf_ibs_fetch && (ibs_caps & IBS_CAPS_FETCHCTLEXTD)) { + rdmsrl(MSR_AMD64_ICIBSEXTDCTL, *buf++); size++; } } @@ -666,13 +691,17 @@ out: if (throttle) { perf_ibs_stop(event, 0); } else { - period >>= 4; - - if ((ibs_caps & IBS_CAPS_RDWROPCNT) && - (*config & IBS_OP_CNT_CTL)) - period |= *config & IBS_OP_CUR_CNT_RAND; + if (perf_ibs == &perf_ibs_op) { + if (ibs_caps & IBS_CAPS_OPCNTEXT) { + new_config = period & IBS_OP_MAX_CNT_EXT_MASK; + period &= ~IBS_OP_MAX_CNT_EXT_MASK; + } + if ((ibs_caps & IBS_CAPS_RDWROPCNT) && (*config & IBS_OP_CNT_CTL)) + new_config |= *config & IBS_OP_CUR_CNT_RAND; + } + new_config |= period >> 4; - perf_ibs_enable_event(perf_ibs, hwc, period); + perf_ibs_enable_event(perf_ibs, hwc, new_config); } perf_event_update_userpage(event); @@ -733,12 +762,26 @@ static __init void perf_event_ibs_init(void) { struct attribute **attr = ibs_op_format_attrs; + /* + * Some chips fail to reset the fetch count when it is written; instead + * they need a 0-1 transition of IbsFetchEn. + */ + if (boot_cpu_data.x86 >= 0x16 && boot_cpu_data.x86 <= 0x18) + perf_ibs_fetch.fetch_count_reset_broken = 1; + perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch"); if (ibs_caps & IBS_CAPS_OPCNT) { perf_ibs_op.config_mask |= IBS_OP_CNT_CTL; *attr++ = &format_attr_cnt_ctl.attr; } + + if (ibs_caps & IBS_CAPS_OPCNTEXT) { + perf_ibs_op.max_period |= IBS_OP_MAX_CNT_EXT_MASK; + perf_ibs_op.config_mask |= IBS_OP_MAX_CNT_EXT_MASK; + perf_ibs_op.cnt_mask |= IBS_OP_MAX_CNT_EXT_MASK; + } + perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs"); diff --git a/arch/x86/events/amd/iommu.c b/arch/x86/events/amd/iommu.c index fb616203ce42..be50ef8572cc 100644 --- a/arch/x86/events/amd/iommu.c +++ b/arch/x86/events/amd/iommu.c @@ -379,7 +379,7 @@ static __init int _init_events_attrs(void) while (amd_iommu_v2_event_descs[i].attr.attr.name) i++; - attrs = kcalloc(i + 1, sizeof(struct attribute **), GFP_KERNEL); + attrs = kcalloc(i + 1, sizeof(*attrs), GFP_KERNEL); if (!attrs) return -ENOMEM; diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index 76400c052b0e..7f014d450bc2 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -181,28 +181,28 @@ static void amd_uncore_del(struct perf_event *event, int flags) } /* - * Convert logical CPU number to L3 PMC Config ThreadMask format + * Return a full thread and slice mask unless user + * has provided them */ -static u64 l3_thread_slice_mask(int cpu) +static u64 l3_thread_slice_mask(u64 config) { - u64 thread_mask, core = topology_core_id(cpu); - unsigned int shift, thread = 0; + if (boot_cpu_data.x86 <= 0x18) + return ((config & AMD64_L3_SLICE_MASK) ? : AMD64_L3_SLICE_MASK) | + ((config & AMD64_L3_THREAD_MASK) ? : AMD64_L3_THREAD_MASK); - if (topology_smt_supported() && !topology_is_primary_thread(cpu)) - thread = 1; - - if (boot_cpu_data.x86 <= 0x18) { - shift = AMD64_L3_THREAD_SHIFT + 2 * (core % 4) + thread; - thread_mask = BIT_ULL(shift); - - return AMD64_L3_SLICE_MASK | thread_mask; - } - - core = (core << AMD64_L3_COREID_SHIFT) & AMD64_L3_COREID_MASK; - shift = AMD64_L3_THREAD_SHIFT + thread; - thread_mask = BIT_ULL(shift); + /* + * If the user doesn't specify a threadmask, they're not trying to + * count core 0, so we enable all cores & threads. + * We'll also assume that they want to count slice 0 if they specify + * a threadmask and leave sliceid and enallslices unpopulated. + */ + if (!(config & AMD64_L3_F19H_THREAD_MASK)) + return AMD64_L3_F19H_THREAD_MASK | AMD64_L3_EN_ALL_SLICES | + AMD64_L3_EN_ALL_CORES; - return AMD64_L3_EN_ALL_SLICES | core | thread_mask; + return config & (AMD64_L3_F19H_THREAD_MASK | AMD64_L3_SLICEID_MASK | + AMD64_L3_EN_ALL_CORES | AMD64_L3_EN_ALL_SLICES | + AMD64_L3_COREID_MASK); } static int amd_uncore_event_init(struct perf_event *event) @@ -232,7 +232,7 @@ static int amd_uncore_event_init(struct perf_event *event) * For other events, the two fields do not affect the count. */ if (l3_mask && is_llc_event(event)) - hwc->config |= l3_thread_slice_mask(event->cpu); + hwc->config |= l3_thread_slice_mask(event->attr.config); uncore = event_to_amd_uncore(event); if (!uncore) @@ -274,47 +274,72 @@ static struct attribute_group amd_uncore_attr_group = { .attrs = amd_uncore_attrs, }; -/* - * Similar to PMU_FORMAT_ATTR but allowing for format_attr to be assigned based - * on family - */ -#define AMD_FORMAT_ATTR(_dev, _name, _format) \ -static ssize_t \ -_dev##_show##_name(struct device *dev, \ - struct device_attribute *attr, \ - char *page) \ -{ \ - BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ - return sprintf(page, _format "\n"); \ -} \ -static struct device_attribute format_attr_##_dev##_name = __ATTR_RO(_dev); - -/* Used for each uncore counter type */ -#define AMD_ATTRIBUTE(_name) \ -static struct attribute *amd_uncore_format_attr_##_name[] = { \ - &format_attr_event_##_name.attr, \ - &format_attr_umask.attr, \ - NULL, \ -}; \ -static struct attribute_group amd_uncore_format_group_##_name = { \ - .name = "format", \ - .attrs = amd_uncore_format_attr_##_name, \ -}; \ -static const struct attribute_group *amd_uncore_attr_groups_##_name[] = { \ - &amd_uncore_attr_group, \ - &amd_uncore_format_group_##_name, \ - NULL, \ +#define DEFINE_UNCORE_FORMAT_ATTR(_var, _name, _format) \ +static ssize_t __uncore_##_var##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + char *page) \ +{ \ + BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ + return sprintf(page, _format "\n"); \ +} \ +static struct kobj_attribute format_attr_##_var = \ + __ATTR(_name, 0444, __uncore_##_var##_show, NULL) + +DEFINE_UNCORE_FORMAT_ATTR(event12, event, "config:0-7,32-35"); +DEFINE_UNCORE_FORMAT_ATTR(event14, event, "config:0-7,32-35,59-60"); /* F17h+ DF */ +DEFINE_UNCORE_FORMAT_ATTR(event8, event, "config:0-7"); /* F17h+ L3 */ +DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); +DEFINE_UNCORE_FORMAT_ATTR(coreid, coreid, "config:42-44"); /* F19h L3 */ +DEFINE_UNCORE_FORMAT_ATTR(slicemask, slicemask, "config:48-51"); /* F17h L3 */ +DEFINE_UNCORE_FORMAT_ATTR(threadmask8, threadmask, "config:56-63"); /* F17h L3 */ +DEFINE_UNCORE_FORMAT_ATTR(threadmask2, threadmask, "config:56-57"); /* F19h L3 */ +DEFINE_UNCORE_FORMAT_ATTR(enallslices, enallslices, "config:46"); /* F19h L3 */ +DEFINE_UNCORE_FORMAT_ATTR(enallcores, enallcores, "config:47"); /* F19h L3 */ +DEFINE_UNCORE_FORMAT_ATTR(sliceid, sliceid, "config:48-50"); /* F19h L3 */ + +static struct attribute *amd_uncore_df_format_attr[] = { + &format_attr_event12.attr, /* event14 if F17h+ */ + &format_attr_umask.attr, + NULL, +}; + +static struct attribute *amd_uncore_l3_format_attr[] = { + &format_attr_event12.attr, /* event8 if F17h+ */ + &format_attr_umask.attr, + NULL, /* slicemask if F17h, coreid if F19h */ + NULL, /* threadmask8 if F17h, enallslices if F19h */ + NULL, /* enallcores if F19h */ + NULL, /* sliceid if F19h */ + NULL, /* threadmask2 if F19h */ + NULL, +}; + +static struct attribute_group amd_uncore_df_format_group = { + .name = "format", + .attrs = amd_uncore_df_format_attr, }; -AMD_FORMAT_ATTR(event, , "config:0-7,32-35"); -AMD_FORMAT_ATTR(umask, , "config:8-15"); -AMD_FORMAT_ATTR(event, _df, "config:0-7,32-35,59-60"); -AMD_FORMAT_ATTR(event, _l3, "config:0-7"); -AMD_ATTRIBUTE(df); -AMD_ATTRIBUTE(l3); +static struct attribute_group amd_uncore_l3_format_group = { + .name = "format", + .attrs = amd_uncore_l3_format_attr, +}; + +static const struct attribute_group *amd_uncore_df_attr_groups[] = { + &amd_uncore_attr_group, + &amd_uncore_df_format_group, + NULL, +}; + +static const struct attribute_group *amd_uncore_l3_attr_groups[] = { + &amd_uncore_attr_group, + &amd_uncore_l3_format_group, + NULL, +}; static struct pmu amd_nb_pmu = { .task_ctx_nr = perf_invalid_context, + .attr_groups = amd_uncore_df_attr_groups, + .name = "amd_nb", .event_init = amd_uncore_event_init, .add = amd_uncore_add, .del = amd_uncore_del, @@ -326,6 +351,8 @@ static struct pmu amd_nb_pmu = { static struct pmu amd_llc_pmu = { .task_ctx_nr = perf_invalid_context, + .attr_groups = amd_uncore_l3_attr_groups, + .name = "amd_l2", .event_init = amd_uncore_event_init, .add = amd_uncore_add, .del = amd_uncore_del, @@ -529,6 +556,8 @@ static int amd_uncore_cpu_dead(unsigned int cpu) static int __init amd_uncore_init(void) { + struct attribute **df_attr = amd_uncore_df_format_attr; + struct attribute **l3_attr = amd_uncore_l3_format_attr; int ret = -ENODEV; if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && @@ -538,6 +567,8 @@ static int __init amd_uncore_init(void) if (!boot_cpu_has(X86_FEATURE_TOPOEXT)) return -ENODEV; + num_counters_nb = NUM_COUNTERS_NB; + num_counters_llc = NUM_COUNTERS_L2; if (boot_cpu_data.x86 >= 0x17) { /* * For F17h and above, the Northbridge counters are @@ -545,27 +576,16 @@ static int __init amd_uncore_init(void) * counters are supported too. The PMUs are exported * based on family as either L2 or L3 and NB or DF. */ - num_counters_nb = NUM_COUNTERS_NB; num_counters_llc = NUM_COUNTERS_L3; amd_nb_pmu.name = "amd_df"; amd_llc_pmu.name = "amd_l3"; - format_attr_event_df.show = &event_show_df; - format_attr_event_l3.show = &event_show_l3; l3_mask = true; - } else { - num_counters_nb = NUM_COUNTERS_NB; - num_counters_llc = NUM_COUNTERS_L2; - amd_nb_pmu.name = "amd_nb"; - amd_llc_pmu.name = "amd_l2"; - format_attr_event_df = format_attr_event; - format_attr_event_l3 = format_attr_event; - l3_mask = false; } - amd_nb_pmu.attr_groups = amd_uncore_attr_groups_df; - amd_llc_pmu.attr_groups = amd_uncore_attr_groups_l3; - if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) { + if (boot_cpu_data.x86 >= 0x17) + *df_attr = &format_attr_event14.attr; + amd_uncore_nb = alloc_percpu(struct amd_uncore *); if (!amd_uncore_nb) { ret = -ENOMEM; @@ -575,13 +595,29 @@ static int __init amd_uncore_init(void) if (ret) goto fail_nb; - pr_info("%s NB counters detected\n", - boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ? - "HYGON" : "AMD"); + pr_info("%d %s %s counters detected\n", num_counters_nb, + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ? "HYGON" : "", + amd_nb_pmu.name); + ret = 0; } if (boot_cpu_has(X86_FEATURE_PERFCTR_LLC)) { + if (boot_cpu_data.x86 >= 0x19) { + *l3_attr++ = &format_attr_event8.attr; + *l3_attr++ = &format_attr_umask.attr; + *l3_attr++ = &format_attr_coreid.attr; + *l3_attr++ = &format_attr_enallslices.attr; + *l3_attr++ = &format_attr_enallcores.attr; + *l3_attr++ = &format_attr_sliceid.attr; + *l3_attr++ = &format_attr_threadmask2.attr; + } else if (boot_cpu_data.x86 >= 0x17) { + *l3_attr++ = &format_attr_event8.attr; + *l3_attr++ = &format_attr_umask.attr; + *l3_attr++ = &format_attr_slicemask.attr; + *l3_attr++ = &format_attr_threadmask8.attr; + } + amd_uncore_llc = alloc_percpu(struct amd_uncore *); if (!amd_uncore_llc) { ret = -ENOMEM; @@ -591,9 +627,9 @@ static int __init amd_uncore_init(void) if (ret) goto fail_llc; - pr_info("%s LLC counters detected\n", - boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ? - "HYGON" : "AMD"); + pr_info("%d %s %s counters detected\n", num_counters_llc, + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ? "HYGON" : "", + amd_llc_pmu.name); ret = 0; } diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 1cbf57dc2ac8..a88c94d65693 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -28,6 +28,7 @@ #include <linux/bitops.h> #include <linux/device.h> #include <linux/nospec.h> +#include <linux/static_call.h> #include <asm/apic.h> #include <asm/stacktrace.h> @@ -52,6 +53,34 @@ DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { DEFINE_STATIC_KEY_FALSE(rdpmc_never_available_key); DEFINE_STATIC_KEY_FALSE(rdpmc_always_available_key); +/* + * This here uses DEFINE_STATIC_CALL_NULL() to get a static_call defined + * from just a typename, as opposed to an actual function. + */ +DEFINE_STATIC_CALL_NULL(x86_pmu_handle_irq, *x86_pmu.handle_irq); +DEFINE_STATIC_CALL_NULL(x86_pmu_disable_all, *x86_pmu.disable_all); +DEFINE_STATIC_CALL_NULL(x86_pmu_enable_all, *x86_pmu.enable_all); +DEFINE_STATIC_CALL_NULL(x86_pmu_enable, *x86_pmu.enable); +DEFINE_STATIC_CALL_NULL(x86_pmu_disable, *x86_pmu.disable); + +DEFINE_STATIC_CALL_NULL(x86_pmu_add, *x86_pmu.add); +DEFINE_STATIC_CALL_NULL(x86_pmu_del, *x86_pmu.del); +DEFINE_STATIC_CALL_NULL(x86_pmu_read, *x86_pmu.read); + +DEFINE_STATIC_CALL_NULL(x86_pmu_schedule_events, *x86_pmu.schedule_events); +DEFINE_STATIC_CALL_NULL(x86_pmu_get_event_constraints, *x86_pmu.get_event_constraints); +DEFINE_STATIC_CALL_NULL(x86_pmu_put_event_constraints, *x86_pmu.put_event_constraints); + +DEFINE_STATIC_CALL_NULL(x86_pmu_start_scheduling, *x86_pmu.start_scheduling); +DEFINE_STATIC_CALL_NULL(x86_pmu_commit_scheduling, *x86_pmu.commit_scheduling); +DEFINE_STATIC_CALL_NULL(x86_pmu_stop_scheduling, *x86_pmu.stop_scheduling); + +DEFINE_STATIC_CALL_NULL(x86_pmu_sched_task, *x86_pmu.sched_task); +DEFINE_STATIC_CALL_NULL(x86_pmu_swap_task_ctx, *x86_pmu.swap_task_ctx); + +DEFINE_STATIC_CALL_NULL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs); +DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases); + u64 __read_mostly hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] @@ -76,6 +105,9 @@ u64 x86_perf_event_update(struct perf_event *event) if (unlikely(!hwc->event_base)) return 0; + if (unlikely(is_topdown_count(event)) && x86_pmu.update_topdown_event) + return x86_pmu.update_topdown_event(event); + /* * Careful: an NMI might modify the previous event value. * @@ -660,7 +692,7 @@ static void x86_pmu_disable(struct pmu *pmu) cpuc->enabled = 0; barrier(); - x86_pmu.disable_all(); + static_call(x86_pmu_disable_all)(); } void x86_pmu_enable_all(int added) @@ -907,8 +939,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) if (cpuc->txn_flags & PERF_PMU_TXN_ADD) n0 -= cpuc->n_txn; - if (x86_pmu.start_scheduling) - x86_pmu.start_scheduling(cpuc); + static_call_cond(x86_pmu_start_scheduling)(cpuc); for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) { c = cpuc->event_constraint[i]; @@ -925,7 +956,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) * change due to external factors (sibling state, allow_tfa). */ if (!c || (c->flags & PERF_X86_EVENT_DYNAMIC)) { - c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]); + c = static_call(x86_pmu_get_event_constraints)(cpuc, i, cpuc->event_list[i]); cpuc->event_constraint[i] = c; } @@ -1008,8 +1039,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) if (!unsched && assign) { for (i = 0; i < n; i++) { e = cpuc->event_list[i]; - if (x86_pmu.commit_scheduling) - x86_pmu.commit_scheduling(cpuc, i, assign[i]); + static_call_cond(x86_pmu_commit_scheduling)(cpuc, i, assign[i]); } } else { for (i = n0; i < n; i++) { @@ -1018,19 +1048,56 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) /* * release events that failed scheduling */ - if (x86_pmu.put_event_constraints) - x86_pmu.put_event_constraints(cpuc, e); + static_call_cond(x86_pmu_put_event_constraints)(cpuc, e); cpuc->event_constraint[i] = NULL; } } - if (x86_pmu.stop_scheduling) - x86_pmu.stop_scheduling(cpuc); + static_call_cond(x86_pmu_stop_scheduling)(cpuc); return unsched ? -EINVAL : 0; } +static int add_nr_metric_event(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + if (is_metric_event(event)) { + if (cpuc->n_metric == INTEL_TD_METRIC_NUM) + return -EINVAL; + cpuc->n_metric++; + cpuc->n_txn_metric++; + } + + return 0; +} + +static void del_nr_metric_event(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + if (is_metric_event(event)) + cpuc->n_metric--; +} + +static int collect_event(struct cpu_hw_events *cpuc, struct perf_event *event, + int max_count, int n) +{ + + if (x86_pmu.intel_cap.perf_metrics && add_nr_metric_event(cpuc, event)) + return -EINVAL; + + if (n >= max_count + cpuc->n_metric) + return -EINVAL; + + cpuc->event_list[n] = event; + if (is_counter_pair(&event->hw)) { + cpuc->n_pair++; + cpuc->n_txn_pair++; + } + + return 0; +} + /* * dogrp: true if must collect siblings events (group) * returns total number of events and error code @@ -1067,28 +1134,22 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, } if (is_x86_event(leader)) { - if (n >= max_count) + if (collect_event(cpuc, leader, max_count, n)) return -EINVAL; - cpuc->event_list[n] = leader; n++; - if (is_counter_pair(&leader->hw)) - cpuc->n_pair++; } + if (!dogrp) return n; for_each_sibling_event(event, leader) { - if (!is_x86_event(event) || - event->state <= PERF_EVENT_STATE_OFF) + if (!is_x86_event(event) || event->state <= PERF_EVENT_STATE_OFF) continue; - if (n >= max_count) + if (collect_event(cpuc, event, max_count, n)) return -EINVAL; - cpuc->event_list[n] = event; n++; - if (is_counter_pair(&event->hw)) - cpuc->n_pair++; } return n; } @@ -1110,11 +1171,16 @@ static inline void x86_assign_hw_event(struct perf_event *event, hwc->event_base = 0; break; + case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END: + /* All the metric events are mapped onto the fixed counter 3. */ + idx = INTEL_PMC_IDX_FIXED_SLOTS; + /* fall through */ case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS-1: hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (idx - INTEL_PMC_IDX_FIXED); - hwc->event_base_rdpmc = (idx - INTEL_PMC_IDX_FIXED) | 1<<30; + hwc->event_base_rdpmc = (idx - INTEL_PMC_IDX_FIXED) | + INTEL_PMC_FIXED_RDPMC_BASE; break; default: @@ -1226,7 +1292,7 @@ static void x86_pmu_enable(struct pmu *pmu) cpuc->enabled = 1; barrier(); - x86_pmu.enable_all(added); + static_call(x86_pmu_enable_all)(added); } static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); @@ -1245,6 +1311,10 @@ int x86_perf_event_set_period(struct perf_event *event) if (unlikely(!hwc->event_base)) return 0; + if (unlikely(is_topdown_count(event)) && + x86_pmu.set_topdown_event_period) + return x86_pmu.set_topdown_event_period(event); + /* * If we are way outside a reasonable range then just skip forward: */ @@ -1284,11 +1354,11 @@ int x86_perf_event_set_period(struct perf_event *event) wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask); /* - * Clear the Merge event counter's upper 16 bits since + * Sign extend the Merge event counter's upper 16 bits since * we currently declare a 48-bit counter width */ if (is_counter_pair(hwc)) - wrmsrl(x86_pmu_event_addr(idx + 1), 0); + wrmsrl(x86_pmu_event_addr(idx + 1), 0xffff); /* * Due to erratum on certan cpu we need @@ -1347,7 +1417,7 @@ static int x86_pmu_add(struct perf_event *event, int flags) if (cpuc->txn_flags & PERF_PMU_TXN_ADD) goto done_collect; - ret = x86_pmu.schedule_events(cpuc, n, assign); + ret = static_call(x86_pmu_schedule_events)(cpuc, n, assign); if (ret) goto out; /* @@ -1365,13 +1435,11 @@ done_collect: cpuc->n_added += n - n0; cpuc->n_txn += n - n0; - if (x86_pmu.add) { - /* - * This is before x86_pmu_enable() will call x86_pmu_start(), - * so we enable LBRs before an event needs them etc.. - */ - x86_pmu.add(event); - } + /* + * This is before x86_pmu_enable() will call x86_pmu_start(), + * so we enable LBRs before an event needs them etc.. + */ + static_call_cond(x86_pmu_add)(event); ret = 0; out: @@ -1399,7 +1467,7 @@ static void x86_pmu_start(struct perf_event *event, int flags) cpuc->events[idx] = event; __set_bit(idx, cpuc->active_mask); __set_bit(idx, cpuc->running); - x86_pmu.enable(event); + static_call(x86_pmu_enable)(event); perf_event_update_userpage(event); } @@ -1469,7 +1537,7 @@ void x86_pmu_stop(struct perf_event *event, int flags) struct hw_perf_event *hwc = &event->hw; if (test_bit(hwc->idx, cpuc->active_mask)) { - x86_pmu.disable(event); + static_call(x86_pmu_disable)(event); __clear_bit(hwc->idx, cpuc->active_mask); cpuc->events[hwc->idx] = NULL; WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); @@ -1519,8 +1587,7 @@ static void x86_pmu_del(struct perf_event *event, int flags) if (i >= cpuc->n_events - cpuc->n_added) --cpuc->n_added; - if (x86_pmu.put_event_constraints) - x86_pmu.put_event_constraints(cpuc, event); + static_call_cond(x86_pmu_put_event_constraints)(cpuc, event); /* Delete the array entry. */ while (++i < cpuc->n_events) { @@ -1529,17 +1596,18 @@ static void x86_pmu_del(struct perf_event *event, int flags) } cpuc->event_constraint[i-1] = NULL; --cpuc->n_events; + if (x86_pmu.intel_cap.perf_metrics) + del_nr_metric_event(cpuc, event); perf_event_update_userpage(event); do_del: - if (x86_pmu.del) { - /* - * This is after x86_pmu_stop(); so we disable LBRs after any - * event can need them etc.. - */ - x86_pmu.del(event); - } + + /* + * This is after x86_pmu_stop(); so we disable LBRs after any + * event can need them etc.. + */ + static_call_cond(x86_pmu_del)(event); } int x86_pmu_handle_irq(struct pt_regs *regs) @@ -1617,7 +1685,7 @@ perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs) return NMI_DONE; start_clock = sched_clock(); - ret = x86_pmu.handle_irq(regs); + ret = static_call(x86_pmu_handle_irq)(regs); finish_clock = sched_clock(); perf_sample_event_took(finish_clock - start_clock); @@ -1830,6 +1898,38 @@ ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event) static struct attribute_group x86_pmu_attr_group; static struct attribute_group x86_pmu_caps_group; +static void x86_pmu_static_call_update(void) +{ + static_call_update(x86_pmu_handle_irq, x86_pmu.handle_irq); + static_call_update(x86_pmu_disable_all, x86_pmu.disable_all); + static_call_update(x86_pmu_enable_all, x86_pmu.enable_all); + static_call_update(x86_pmu_enable, x86_pmu.enable); + static_call_update(x86_pmu_disable, x86_pmu.disable); + + static_call_update(x86_pmu_add, x86_pmu.add); + static_call_update(x86_pmu_del, x86_pmu.del); + static_call_update(x86_pmu_read, x86_pmu.read); + + static_call_update(x86_pmu_schedule_events, x86_pmu.schedule_events); + static_call_update(x86_pmu_get_event_constraints, x86_pmu.get_event_constraints); + static_call_update(x86_pmu_put_event_constraints, x86_pmu.put_event_constraints); + + static_call_update(x86_pmu_start_scheduling, x86_pmu.start_scheduling); + static_call_update(x86_pmu_commit_scheduling, x86_pmu.commit_scheduling); + static_call_update(x86_pmu_stop_scheduling, x86_pmu.stop_scheduling); + + static_call_update(x86_pmu_sched_task, x86_pmu.sched_task); + static_call_update(x86_pmu_swap_task_ctx, x86_pmu.swap_task_ctx); + + static_call_update(x86_pmu_drain_pebs, x86_pmu.drain_pebs); + static_call_update(x86_pmu_pebs_aliases, x86_pmu.pebs_aliases); +} + +static void _x86_pmu_read(struct perf_event *event) +{ + x86_perf_event_update(event); +} + static int __init init_hw_perf_events(void) { struct x86_pmu_quirk *quirk; @@ -1898,6 +1998,11 @@ static int __init init_hw_perf_events(void) pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed); pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl); + if (!x86_pmu.read) + x86_pmu.read = _x86_pmu_read; + + x86_pmu_static_call_update(); + /* * Install callbacks. Core will call them for each online * cpu. @@ -1934,11 +2039,9 @@ out: } early_initcall(init_hw_perf_events); -static inline void x86_pmu_read(struct perf_event *event) +static void x86_pmu_read(struct perf_event *event) { - if (x86_pmu.read) - return x86_pmu.read(event); - x86_perf_event_update(event); + static_call(x86_pmu_read)(event); } /* @@ -1962,6 +2065,8 @@ static void x86_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags) perf_pmu_disable(pmu); __this_cpu_write(cpu_hw_events.n_txn, 0); + __this_cpu_write(cpu_hw_events.n_txn_pair, 0); + __this_cpu_write(cpu_hw_events.n_txn_metric, 0); } /* @@ -1987,6 +2092,8 @@ static void x86_pmu_cancel_txn(struct pmu *pmu) */ __this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn)); __this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn)); + __this_cpu_sub(cpu_hw_events.n_pair, __this_cpu_read(cpu_hw_events.n_txn_pair)); + __this_cpu_sub(cpu_hw_events.n_metric, __this_cpu_read(cpu_hw_events.n_txn_metric)); perf_pmu_enable(pmu); } @@ -2015,7 +2122,7 @@ static int x86_pmu_commit_txn(struct pmu *pmu) if (!x86_pmu_initialized()) return -EAGAIN; - ret = x86_pmu.schedule_events(cpuc, n, assign); + ret = static_call(x86_pmu_schedule_events)(cpuc, n, assign); if (ret) return ret; @@ -2208,17 +2315,15 @@ static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *m static int x86_pmu_event_idx(struct perf_event *event) { - int idx = event->hw.idx; + struct hw_perf_event *hwc = &event->hw; - if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) + if (!(hwc->flags & PERF_X86_EVENT_RDPMC_ALLOWED)) return 0; - if (x86_pmu.num_counters_fixed && idx >= INTEL_PMC_IDX_FIXED) { - idx -= INTEL_PMC_IDX_FIXED; - idx |= 1 << 30; - } - - return idx + 1; + if (is_metric_idx(hwc->idx)) + return INTEL_PMC_FIXED_RDPMC_METRICS + 1; + else + return hwc->event_base_rdpmc + 1; } static ssize_t get_attr_rdpmc(struct device *cdev, @@ -2308,15 +2413,13 @@ static const struct attribute_group *x86_pmu_attr_groups[] = { static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in) { - if (x86_pmu.sched_task) - x86_pmu.sched_task(ctx, sched_in); + static_call_cond(x86_pmu_sched_task)(ctx, sched_in); } static void x86_pmu_swap_task_ctx(struct perf_event_context *prev, struct perf_event_context *next) { - if (x86_pmu.swap_task_ctx) - x86_pmu.swap_task_ctx(prev, next); + static_call_cond(x86_pmu_swap_task_ctx)(prev, next); } void perf_check_microcode(void) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 31e6887d24f1..f1926e9f2143 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -243,10 +243,14 @@ static struct extra_reg intel_skl_extra_regs[] __read_mostly = { static struct event_constraint intel_icl_event_constraints[] = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ - INTEL_UEVENT_CONSTRAINT(0x1c0, 0), /* INST_RETIRED.PREC_DIST */ + FIXED_EVENT_CONSTRAINT(0x01c0, 0), /* INST_RETIRED.PREC_DIST */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ FIXED_EVENT_CONSTRAINT(0x0400, 3), /* SLOTS */ + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_RETIRING, 0), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BAD_SPEC, 1), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_FE_BOUND, 2), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BE_BOUND, 3), INTEL_EVENT_CONSTRAINT_RANGE(0x03, 0x0a, 0xf), INTEL_EVENT_CONSTRAINT_RANGE(0x1f, 0x28, 0xf), INTEL_EVENT_CONSTRAINT(0x32, 0xf), /* SW_PREFETCH_ACCESS.* */ @@ -309,6 +313,12 @@ EVENT_ATTR_STR_HT(topdown-recovery-bubbles, td_recovery_bubbles, EVENT_ATTR_STR_HT(topdown-recovery-bubbles.scale, td_recovery_bubbles_scale, "4", "2"); +EVENT_ATTR_STR(slots, slots, "event=0x00,umask=0x4"); +EVENT_ATTR_STR(topdown-retiring, td_retiring, "event=0x00,umask=0x80"); +EVENT_ATTR_STR(topdown-bad-spec, td_bad_spec, "event=0x00,umask=0x81"); +EVENT_ATTR_STR(topdown-fe-bound, td_fe_bound, "event=0x00,umask=0x82"); +EVENT_ATTR_STR(topdown-be-bound, td_be_bound, "event=0x00,umask=0x83"); + static struct attribute *snb_events_attrs[] = { EVENT_PTR(td_slots_issued), EVENT_PTR(td_slots_retired), @@ -2165,11 +2175,24 @@ static inline void intel_clear_masks(struct perf_event *event, int idx) static void intel_pmu_disable_fixed(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; - int idx = hwc->idx - INTEL_PMC_IDX_FIXED; u64 ctrl_val, mask; + int idx = hwc->idx; - mask = 0xfULL << (idx * 4); + if (is_topdown_idx(idx)) { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + /* + * When there are other active TopDown events, + * don't disable the fixed counter 3. + */ + if (*(u64 *)cpuc->active_mask & INTEL_PMC_OTHER_TOPDOWN_BITS(idx)) + return; + idx = INTEL_PMC_IDX_FIXED_SLOTS; + } + intel_clear_masks(event, idx); + + mask = 0xfULL << ((idx - INTEL_PMC_IDX_FIXED) * 4); rdmsrl(hwc->config_base, ctrl_val); ctrl_val &= ~mask; wrmsrl(hwc->config_base, ctrl_val); @@ -2180,17 +2203,28 @@ static void intel_pmu_disable_event(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; - if (idx < INTEL_PMC_IDX_FIXED) { + switch (idx) { + case 0 ... INTEL_PMC_IDX_FIXED - 1: intel_clear_masks(event, idx); x86_pmu_disable_event(event); - } else if (idx < INTEL_PMC_IDX_FIXED_BTS) { - intel_clear_masks(event, idx); + break; + case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS - 1: + case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END: intel_pmu_disable_fixed(event); - } else if (idx == INTEL_PMC_IDX_FIXED_BTS) { + break; + case INTEL_PMC_IDX_FIXED_BTS: intel_pmu_disable_bts(); intel_pmu_drain_bts_buffer(); - } else if (idx == INTEL_PMC_IDX_FIXED_VLBR) + return; + case INTEL_PMC_IDX_FIXED_VLBR: intel_clear_masks(event, idx); + break; + default: + intel_clear_masks(event, idx); + pr_warn("Failed to disable the event with invalid index %d\n", + idx); + return; + } /* * Needs to be called after x86_pmu_disable_event, @@ -2208,10 +2242,189 @@ static void intel_pmu_del_event(struct perf_event *event) intel_pmu_pebs_del(event); } +static int icl_set_topdown_event_period(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + s64 left = local64_read(&hwc->period_left); + + /* + * The values in PERF_METRICS MSR are derived from fixed counter 3. + * Software should start both registers, PERF_METRICS and fixed + * counter 3, from zero. + * Clear PERF_METRICS and Fixed counter 3 in initialization. + * After that, both MSRs will be cleared for each read. + * Don't need to clear them again. + */ + if (left == x86_pmu.max_period) { + wrmsrl(MSR_CORE_PERF_FIXED_CTR3, 0); + wrmsrl(MSR_PERF_METRICS, 0); + hwc->saved_slots = 0; + hwc->saved_metric = 0; + } + + if ((hwc->saved_slots) && is_slots_event(event)) { + wrmsrl(MSR_CORE_PERF_FIXED_CTR3, hwc->saved_slots); + wrmsrl(MSR_PERF_METRICS, hwc->saved_metric); + } + + perf_event_update_userpage(event); + + return 0; +} + +static inline u64 icl_get_metrics_event_value(u64 metric, u64 slots, int idx) +{ + u32 val; + + /* + * The metric is reported as an 8bit integer fraction + * suming up to 0xff. + * slots-in-metric = (Metric / 0xff) * slots + */ + val = (metric >> ((idx - INTEL_PMC_IDX_METRIC_BASE) * 8)) & 0xff; + return mul_u64_u32_div(slots, val, 0xff); +} + +static u64 icl_get_topdown_value(struct perf_event *event, + u64 slots, u64 metrics) +{ + int idx = event->hw.idx; + u64 delta; + + if (is_metric_idx(idx)) + delta = icl_get_metrics_event_value(metrics, slots, idx); + else + delta = slots; + + return delta; +} + +static void __icl_update_topdown_event(struct perf_event *event, + u64 slots, u64 metrics, + u64 last_slots, u64 last_metrics) +{ + u64 delta, last = 0; + + delta = icl_get_topdown_value(event, slots, metrics); + if (last_slots) + last = icl_get_topdown_value(event, last_slots, last_metrics); + + /* + * The 8bit integer fraction of metric may be not accurate, + * especially when the changes is very small. + * For example, if only a few bad_spec happens, the fraction + * may be reduced from 1 to 0. If so, the bad_spec event value + * will be 0 which is definitely less than the last value. + * Avoid update event->count for this case. + */ + if (delta > last) { + delta -= last; + local64_add(delta, &event->count); + } +} + +static void update_saved_topdown_regs(struct perf_event *event, + u64 slots, u64 metrics) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_event *other; + int idx; + + event->hw.saved_slots = slots; + event->hw.saved_metric = metrics; + + for_each_set_bit(idx, cpuc->active_mask, INTEL_PMC_IDX_TD_BE_BOUND + 1) { + if (!is_topdown_idx(idx)) + continue; + other = cpuc->events[idx]; + other->hw.saved_slots = slots; + other->hw.saved_metric = metrics; + } +} + +/* + * Update all active Topdown events. + * + * The PERF_METRICS and Fixed counter 3 are read separately. The values may be + * modify by a NMI. PMU has to be disabled before calling this function. + */ +static u64 icl_update_topdown_event(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_event *other; + u64 slots, metrics; + bool reset = true; + int idx; + + /* read Fixed counter 3 */ + rdpmcl((3 | INTEL_PMC_FIXED_RDPMC_BASE), slots); + if (!slots) + return 0; + + /* read PERF_METRICS */ + rdpmcl(INTEL_PMC_FIXED_RDPMC_METRICS, metrics); + + for_each_set_bit(idx, cpuc->active_mask, INTEL_PMC_IDX_TD_BE_BOUND + 1) { + if (!is_topdown_idx(idx)) + continue; + other = cpuc->events[idx]; + __icl_update_topdown_event(other, slots, metrics, + event ? event->hw.saved_slots : 0, + event ? event->hw.saved_metric : 0); + } + + /* + * Check and update this event, which may have been cleared + * in active_mask e.g. x86_pmu_stop() + */ + if (event && !test_bit(event->hw.idx, cpuc->active_mask)) { + __icl_update_topdown_event(event, slots, metrics, + event->hw.saved_slots, + event->hw.saved_metric); + + /* + * In x86_pmu_stop(), the event is cleared in active_mask first, + * then drain the delta, which indicates context switch for + * counting. + * Save metric and slots for context switch. + * Don't need to reset the PERF_METRICS and Fixed counter 3. + * Because the values will be restored in next schedule in. + */ + update_saved_topdown_regs(event, slots, metrics); + reset = false; + } + + if (reset) { + /* The fixed counter 3 has to be written before the PERF_METRICS. */ + wrmsrl(MSR_CORE_PERF_FIXED_CTR3, 0); + wrmsrl(MSR_PERF_METRICS, 0); + if (event) + update_saved_topdown_regs(event, 0, 0); + } + + return slots; +} + +static void intel_pmu_read_topdown_event(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + /* Only need to call update_topdown_event() once for group read. */ + if ((cpuc->txn_flags & PERF_PMU_TXN_READ) && + !is_slots_event(event)) + return; + + perf_pmu_disable(event->pmu); + x86_pmu.update_topdown_event(event); + perf_pmu_enable(event->pmu); +} + static void intel_pmu_read_event(struct perf_event *event) { if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD) intel_pmu_auto_reload_read(event); + else if (is_topdown_count(event) && x86_pmu.update_topdown_event) + intel_pmu_read_topdown_event(event); else x86_perf_event_update(event); } @@ -2219,8 +2432,22 @@ static void intel_pmu_read_event(struct perf_event *event) static void intel_pmu_enable_fixed(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; - int idx = hwc->idx - INTEL_PMC_IDX_FIXED; u64 ctrl_val, mask, bits = 0; + int idx = hwc->idx; + + if (is_topdown_idx(idx)) { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + /* + * When there are other active TopDown events, + * don't enable the fixed counter 3 again. + */ + if (*(u64 *)cpuc->active_mask & INTEL_PMC_OTHER_TOPDOWN_BITS(idx)) + return; + + idx = INTEL_PMC_IDX_FIXED_SLOTS; + } + + intel_set_masks(event, idx); /* * Enable IRQ generation (0x8), if not PEBS, @@ -2240,6 +2467,7 @@ static void intel_pmu_enable_fixed(struct perf_event *event) if (x86_pmu.version > 2 && hwc->config & ARCH_PERFMON_EVENTSEL_ANY) bits |= 0x4; + idx -= INTEL_PMC_IDX_FIXED; bits <<= (idx * 4); mask = 0xfULL << (idx * 4); @@ -2262,18 +2490,27 @@ static void intel_pmu_enable_event(struct perf_event *event) if (unlikely(event->attr.precise_ip)) intel_pmu_pebs_enable(event); - if (idx < INTEL_PMC_IDX_FIXED) { + switch (idx) { + case 0 ... INTEL_PMC_IDX_FIXED - 1: intel_set_masks(event, idx); __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); - } else if (idx < INTEL_PMC_IDX_FIXED_BTS) { - intel_set_masks(event, idx); + break; + case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS - 1: + case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END: intel_pmu_enable_fixed(event); - } else if (idx == INTEL_PMC_IDX_FIXED_BTS) { + break; + case INTEL_PMC_IDX_FIXED_BTS: if (!__this_cpu_read(cpu_hw_events.enabled)) return; intel_pmu_enable_bts(hwc->config); - } else if (idx == INTEL_PMC_IDX_FIXED_VLBR) + break; + case INTEL_PMC_IDX_FIXED_VLBR: intel_set_masks(event, idx); + break; + default: + pr_warn("Failed to enable the event with invalid index %d\n", + idx); + } } static void intel_pmu_add_event(struct perf_event *event) @@ -2389,7 +2626,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) /* * PEBS overflow sets bit 62 in the global status register */ - if (__test_and_clear_bit(62, (unsigned long *)&status)) { + if (__test_and_clear_bit(GLOBAL_STATUS_BUFFER_OVF_BIT, (unsigned long *)&status)) { u64 pebs_enabled = cpuc->pebs_enabled; handled++; @@ -2410,7 +2647,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) /* * Intel PT */ - if (__test_and_clear_bit(55, (unsigned long *)&status)) { + if (__test_and_clear_bit(GLOBAL_STATUS_TRACE_TOPAPMI_BIT, (unsigned long *)&status)) { handled++; if (unlikely(perf_guest_cbs && perf_guest_cbs->is_in_guest() && perf_guest_cbs->handle_intel_pt_intr)) @@ -2420,6 +2657,15 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) } /* + * Intel Perf mertrics + */ + if (__test_and_clear_bit(GLOBAL_STATUS_PERF_METRICS_OVF_BIT, (unsigned long *)&status)) { + handled++; + if (x86_pmu.update_topdown_event) + x86_pmu.update_topdown_event(NULL); + } + + /* * Checkpointed counters can lead to 'spurious' PMIs because the * rollback caused by the PMI will have cleared the overflow status * bit. Therefore always force probe these counters. @@ -3355,6 +3601,56 @@ static int intel_pmu_hw_config(struct perf_event *event) if (event->attr.type != PERF_TYPE_RAW) return 0; + /* + * Config Topdown slots and metric events + * + * The slots event on Fixed Counter 3 can support sampling, + * which will be handled normally in x86_perf_event_update(). + * + * Metric events don't support sampling and require being paired + * with a slots event as group leader. When the slots event + * is used in a metrics group, it too cannot support sampling. + */ + if (x86_pmu.intel_cap.perf_metrics && is_topdown_event(event)) { + if (event->attr.config1 || event->attr.config2) + return -EINVAL; + + /* + * The TopDown metrics events and slots event don't + * support any filters. + */ + if (event->attr.config & X86_ALL_EVENT_FLAGS) + return -EINVAL; + + if (is_metric_event(event)) { + struct perf_event *leader = event->group_leader; + + /* The metric events don't support sampling. */ + if (is_sampling_event(event)) + return -EINVAL; + + /* The metric events require a slots group leader. */ + if (!is_slots_event(leader)) + return -EINVAL; + + /* + * The leader/SLOTS must not be a sampling event for + * metric use; hardware requires it starts at 0 when used + * in conjunction with MSR_PERF_METRICS. + */ + if (is_sampling_event(leader)) + return -EINVAL; + + event->event_caps |= PERF_EV_CAP_SIBLING; + /* + * Only once we have a METRICs sibling do we + * need TopDown magic. + */ + leader->hw.flags |= PERF_X86_EVENT_TOPDOWN; + event->hw.flags |= PERF_X86_EVENT_TOPDOWN; + } + } + if (!(event->attr.config & ARCH_PERFMON_EVENTSEL_ANY)) return 0; @@ -3787,6 +4083,17 @@ static void intel_pmu_cpu_starting(int cpu) if (x86_pmu.counter_freezing) enable_counter_freeze(); + /* Disable perf metrics if any added CPU doesn't support it. */ + if (x86_pmu.intel_cap.perf_metrics) { + union perf_capabilities perf_cap; + + rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap.capabilities); + if (!perf_cap.perf_metrics) { + x86_pmu.intel_cap.perf_metrics = 0; + x86_pmu.intel_ctrl &= ~(1ULL << GLOBAL_CTRL_EN_PERF_METRICS); + } + } + if (!cpuc->shared_regs) return; @@ -4355,6 +4662,15 @@ static struct attribute *icl_events_attrs[] = { NULL, }; +static struct attribute *icl_td_events_attrs[] = { + EVENT_PTR(slots), + EVENT_PTR(td_retiring), + EVENT_PTR(td_bad_spec), + EVENT_PTR(td_fe_bound), + EVENT_PTR(td_be_bound), + NULL, +}; + static struct attribute *icl_tsx_events_attrs[] = { EVENT_PTR(tx_start), EVENT_PTR(tx_abort), @@ -4830,6 +5146,7 @@ __init int intel_pmu_init(void) case INTEL_FAM6_ATOM_TREMONT_D: case INTEL_FAM6_ATOM_TREMONT: + case INTEL_FAM6_ATOM_TREMONT_L: x86_pmu.late_ack = true; memcpy(hw_cache_event_ids, glp_hw_cache_event_ids, sizeof(hw_cache_event_ids)); @@ -5139,10 +5456,13 @@ __init int intel_pmu_init(void) hsw_format_attr : nhm_format_attr; extra_skl_attr = skl_format_attr; mem_attr = icl_events_attrs; + td_attr = icl_td_events_attrs; tsx_attr = icl_tsx_events_attrs; x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xca, .umask=0x02); x86_pmu.lbr_pt_coexist = true; intel_pmu_pebs_data_source_skl(pmem); + x86_pmu.update_topdown_event = icl_update_topdown_event; + x86_pmu.set_topdown_event_period = icl_set_topdown_event_period; pr_cont("Icelake events, "); name = "icelake"; break; @@ -5198,6 +5518,15 @@ __init int intel_pmu_init(void) * counter, so do not extend mask to generic counters */ for_each_event_constraint(c, x86_pmu.event_constraints) { + /* + * Don't extend the topdown slots and metrics + * events to the generic counters. + */ + if (c->idxmsk64 & INTEL_PMC_MSK_TOPDOWN) { + c->weight = hweight64(c->idxmsk64); + continue; + } + if (c->cmask == FIXED_EVENT_FLAGS && c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES) { c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; @@ -5253,6 +5582,9 @@ __init int intel_pmu_init(void) if (x86_pmu.counter_freezing) x86_pmu.handle_irq = intel_pmu_handle_irq_v4; + if (x86_pmu.intel_cap.perf_metrics) + x86_pmu.intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS; + return 0; } diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 86848c57b55e..404315df1e16 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -670,9 +670,7 @@ unlock: static inline void intel_pmu_drain_pebs_buffer(void) { - struct pt_regs regs; - - x86_pmu.drain_pebs(®s); + x86_pmu.drain_pebs(NULL); } /* @@ -1737,6 +1735,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event, struct x86_perf_regs perf_regs; struct pt_regs *regs = &perf_regs.regs; void *at = get_next_pebs_record_by_bit(base, top, bit); + struct pt_regs dummy_iregs; if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) { /* @@ -1749,6 +1748,9 @@ static void __intel_pmu_pebs_event(struct perf_event *event, } else if (!intel_pmu_save_and_restart(event)) return; + if (!iregs) + iregs = &dummy_iregs; + while (count > 1) { setup_sample(event, iregs, at, &data, regs); perf_event_output(event, &data, regs); @@ -1758,16 +1760,22 @@ static void __intel_pmu_pebs_event(struct perf_event *event, } setup_sample(event, iregs, at, &data, regs); - - /* - * All but the last records are processed. - * The last one is left to be able to call the overflow handler. - */ - if (perf_event_overflow(event, &data, regs)) { - x86_pmu_stop(event, 0); - return; + if (iregs == &dummy_iregs) { + /* + * The PEBS records may be drained in the non-overflow context, + * e.g., large PEBS + context switch. Perf should treat the + * last record the same as other PEBS records, and doesn't + * invoke the generic overflow handler. + */ + perf_event_output(event, &data, regs); + } else { + /* + * All but the last records are processed. + * The last one is left to be able to call the overflow handler. + */ + if (perf_event_overflow(event, &data, regs)) + x86_pmu_stop(event, 0); } - } static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index d5c6d3b340c5..86d012b3e0b4 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -12,6 +12,8 @@ struct intel_uncore_type **uncore_mmio_uncores = empty_uncore; static bool pcidrv_registered; struct pci_driver *uncore_pci_driver; +/* The PCI driver for the device which the uncore doesn't own. */ +struct pci_driver *uncore_pci_sub_driver; /* pci bus to socket mapping */ DEFINE_RAW_SPINLOCK(pci2phy_map_lock); struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head); @@ -989,65 +991,71 @@ uncore_types_init(struct intel_uncore_type **types, bool setid) } /* - * add a pci uncore device + * Get the die information of a PCI device. + * @pdev: The PCI device. + * @phys_id: The physical socket id which the device maps to. + * @die: The die id which the device maps to. */ -static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) +static int uncore_pci_get_dev_die_info(struct pci_dev *pdev, + int *phys_id, int *die) { - struct intel_uncore_type *type; - struct intel_uncore_pmu *pmu = NULL; - struct intel_uncore_box *box; - int phys_id, die, ret; - - phys_id = uncore_pcibus_to_physid(pdev->bus); - if (phys_id < 0) + *phys_id = uncore_pcibus_to_physid(pdev->bus); + if (*phys_id < 0) return -ENODEV; - die = (topology_max_die_per_package() > 1) ? phys_id : - topology_phys_to_logical_pkg(phys_id); - if (die < 0) + *die = (topology_max_die_per_package() > 1) ? *phys_id : + topology_phys_to_logical_pkg(*phys_id); + if (*die < 0) return -EINVAL; - if (UNCORE_PCI_DEV_TYPE(id->driver_data) == UNCORE_EXTRA_PCI_DEV) { - int idx = UNCORE_PCI_DEV_IDX(id->driver_data); - - uncore_extra_pci_dev[die].dev[idx] = pdev; - pci_set_drvdata(pdev, NULL); - return 0; - } - - type = uncore_pci_uncores[UNCORE_PCI_DEV_TYPE(id->driver_data)]; + return 0; +} - /* - * Some platforms, e.g. Knights Landing, use a common PCI device ID - * for multiple instances of an uncore PMU device type. We should check - * PCI slot and func to indicate the uncore box. - */ - if (id->driver_data & ~0xffff) { - struct pci_driver *pci_drv = pdev->driver; - const struct pci_device_id *ids = pci_drv->id_table; - unsigned int devfn; - - while (ids && ids->vendor) { - if ((ids->vendor == pdev->vendor) && - (ids->device == pdev->device)) { - devfn = PCI_DEVFN(UNCORE_PCI_DEV_DEV(ids->driver_data), - UNCORE_PCI_DEV_FUNC(ids->driver_data)); - if (devfn == pdev->devfn) { - pmu = &type->pmus[UNCORE_PCI_DEV_IDX(ids->driver_data)]; - break; - } +/* + * Find the PMU of a PCI device. + * @pdev: The PCI device. + * @ids: The ID table of the available PCI devices with a PMU. + */ +static struct intel_uncore_pmu * +uncore_pci_find_dev_pmu(struct pci_dev *pdev, const struct pci_device_id *ids) +{ + struct intel_uncore_pmu *pmu = NULL; + struct intel_uncore_type *type; + kernel_ulong_t data; + unsigned int devfn; + + while (ids && ids->vendor) { + if ((ids->vendor == pdev->vendor) && + (ids->device == pdev->device)) { + data = ids->driver_data; + devfn = PCI_DEVFN(UNCORE_PCI_DEV_DEV(data), + UNCORE_PCI_DEV_FUNC(data)); + if (devfn == pdev->devfn) { + type = uncore_pci_uncores[UNCORE_PCI_DEV_TYPE(data)]; + pmu = &type->pmus[UNCORE_PCI_DEV_IDX(data)]; + break; } - ids++; } - if (pmu == NULL) - return -ENODEV; - } else { - /* - * for performance monitoring unit with multiple boxes, - * each box has a different function id. - */ - pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)]; + ids++; } + return pmu; +} + +/* + * Register the PMU for a PCI device + * @pdev: The PCI device. + * @type: The corresponding PMU type of the device. + * @pmu: The corresponding PMU of the device. + * @phys_id: The physical socket id which the device maps to. + * @die: The die id which the device maps to. + */ +static int uncore_pci_pmu_register(struct pci_dev *pdev, + struct intel_uncore_type *type, + struct intel_uncore_pmu *pmu, + int phys_id, int die) +{ + struct intel_uncore_box *box; + int ret; if (WARN_ON_ONCE(pmu->boxes[die] != NULL)) return -EINVAL; @@ -1067,7 +1075,6 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id box->pci_dev = pdev; box->pmu = pmu; uncore_box_init(box); - pci_set_drvdata(pdev, box); pmu->boxes[die] = box; if (atomic_inc_return(&pmu->activeboxes) > 1) @@ -1076,7 +1083,6 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id /* First active box registers the pmu */ ret = uncore_pmu_register(pmu); if (ret) { - pci_set_drvdata(pdev, NULL); pmu->boxes[die] = NULL; uncore_box_exit(box); kfree(box); @@ -1084,18 +1090,87 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id return ret; } +/* + * add a pci uncore device + */ +static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu = NULL; + int phys_id, die, ret; + + ret = uncore_pci_get_dev_die_info(pdev, &phys_id, &die); + if (ret) + return ret; + + if (UNCORE_PCI_DEV_TYPE(id->driver_data) == UNCORE_EXTRA_PCI_DEV) { + int idx = UNCORE_PCI_DEV_IDX(id->driver_data); + + uncore_extra_pci_dev[die].dev[idx] = pdev; + pci_set_drvdata(pdev, NULL); + return 0; + } + + type = uncore_pci_uncores[UNCORE_PCI_DEV_TYPE(id->driver_data)]; + + /* + * Some platforms, e.g. Knights Landing, use a common PCI device ID + * for multiple instances of an uncore PMU device type. We should check + * PCI slot and func to indicate the uncore box. + */ + if (id->driver_data & ~0xffff) { + struct pci_driver *pci_drv = pdev->driver; + + pmu = uncore_pci_find_dev_pmu(pdev, pci_drv->id_table); + if (pmu == NULL) + return -ENODEV; + } else { + /* + * for performance monitoring unit with multiple boxes, + * each box has a different function id. + */ + pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)]; + } + + ret = uncore_pci_pmu_register(pdev, type, pmu, phys_id, die); + + pci_set_drvdata(pdev, pmu->boxes[die]); + + return ret; +} + +/* + * Unregister the PMU of a PCI device + * @pmu: The corresponding PMU is unregistered. + * @phys_id: The physical socket id which the device maps to. + * @die: The die id which the device maps to. + */ +static void uncore_pci_pmu_unregister(struct intel_uncore_pmu *pmu, + int phys_id, int die) +{ + struct intel_uncore_box *box = pmu->boxes[die]; + + if (WARN_ON_ONCE(phys_id != box->pci_phys_id)) + return; + + pmu->boxes[die] = NULL; + if (atomic_dec_return(&pmu->activeboxes) == 0) + uncore_pmu_unregister(pmu); + uncore_box_exit(box); + kfree(box); +} + static void uncore_pci_remove(struct pci_dev *pdev) { struct intel_uncore_box *box; struct intel_uncore_pmu *pmu; int i, phys_id, die; - phys_id = uncore_pcibus_to_physid(pdev->bus); + if (uncore_pci_get_dev_die_info(pdev, &phys_id, &die)) + return; box = pci_get_drvdata(pdev); if (!box) { - die = (topology_max_die_per_package() > 1) ? phys_id : - topology_phys_to_logical_pkg(phys_id); for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) { if (uncore_extra_pci_dev[die].dev[i] == pdev) { uncore_extra_pci_dev[die].dev[i] = NULL; @@ -1107,15 +1182,84 @@ static void uncore_pci_remove(struct pci_dev *pdev) } pmu = box->pmu; - if (WARN_ON_ONCE(phys_id != box->pci_phys_id)) - return; pci_set_drvdata(pdev, NULL); - pmu->boxes[box->dieid] = NULL; - if (atomic_dec_return(&pmu->activeboxes) == 0) - uncore_pmu_unregister(pmu); - uncore_box_exit(box); - kfree(box); + + uncore_pci_pmu_unregister(pmu, phys_id, die); +} + +static int uncore_bus_notify(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct device *dev = data; + struct pci_dev *pdev = to_pci_dev(dev); + struct intel_uncore_pmu *pmu; + int phys_id, die; + + /* Unregister the PMU when the device is going to be deleted. */ + if (action != BUS_NOTIFY_DEL_DEVICE) + return NOTIFY_DONE; + + pmu = uncore_pci_find_dev_pmu(pdev, uncore_pci_sub_driver->id_table); + if (!pmu) + return NOTIFY_DONE; + + if (uncore_pci_get_dev_die_info(pdev, &phys_id, &die)) + return NOTIFY_DONE; + + uncore_pci_pmu_unregister(pmu, phys_id, die); + + return NOTIFY_OK; +} + +static struct notifier_block uncore_notifier = { + .notifier_call = uncore_bus_notify, +}; + +static void uncore_pci_sub_driver_init(void) +{ + const struct pci_device_id *ids = uncore_pci_sub_driver->id_table; + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + struct pci_dev *pci_sub_dev; + bool notify = false; + unsigned int devfn; + int phys_id, die; + + while (ids && ids->vendor) { + pci_sub_dev = NULL; + type = uncore_pci_uncores[UNCORE_PCI_DEV_TYPE(ids->driver_data)]; + /* + * Search the available device, and register the + * corresponding PMU. + */ + while ((pci_sub_dev = pci_get_device(PCI_VENDOR_ID_INTEL, + ids->device, pci_sub_dev))) { + devfn = PCI_DEVFN(UNCORE_PCI_DEV_DEV(ids->driver_data), + UNCORE_PCI_DEV_FUNC(ids->driver_data)); + if (devfn != pci_sub_dev->devfn) + continue; + + pmu = &type->pmus[UNCORE_PCI_DEV_IDX(ids->driver_data)]; + if (!pmu) + continue; + + if (uncore_pci_get_dev_die_info(pci_sub_dev, + &phys_id, &die)) + continue; + + if (!uncore_pci_pmu_register(pci_sub_dev, type, pmu, + phys_id, die)) + notify = true; + } + ids++; + } + + if (notify && bus_register_notifier(&pci_bus_type, &uncore_notifier)) + notify = false; + + if (!notify) + uncore_pci_sub_driver = NULL; } static int __init uncore_pci_init(void) @@ -1141,6 +1285,9 @@ static int __init uncore_pci_init(void) if (ret) goto errtype; + if (uncore_pci_sub_driver) + uncore_pci_sub_driver_init(); + pcidrv_registered = true; return 0; @@ -1158,6 +1305,8 @@ static void uncore_pci_exit(void) { if (pcidrv_registered) { pcidrv_registered = false; + if (uncore_pci_sub_driver) + bus_unregister_notifier(&pci_bus_type, &uncore_notifier); pci_unregister_driver(uncore_pci_driver); uncore_types_exit(uncore_pci_uncores); kfree(uncore_extra_pci_dev); @@ -1478,12 +1627,12 @@ static const struct intel_uncore_init_fun icl_uncore_init __initconst = { }; static const struct intel_uncore_init_fun tgl_uncore_init __initconst = { - .cpu_init = icl_uncore_cpu_init, + .cpu_init = tgl_uncore_cpu_init, .mmio_init = tgl_uncore_mmio_init, }; static const struct intel_uncore_init_fun tgl_l_uncore_init __initconst = { - .cpu_init = icl_uncore_cpu_init, + .cpu_init = tgl_uncore_cpu_init, .mmio_init = tgl_l_uncore_mmio_init, }; diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 105fdc69825e..83d2a7d490e0 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -552,6 +552,7 @@ extern struct intel_uncore_type **uncore_msr_uncores; extern struct intel_uncore_type **uncore_pci_uncores; extern struct intel_uncore_type **uncore_mmio_uncores; extern struct pci_driver *uncore_pci_driver; +extern struct pci_driver *uncore_pci_sub_driver; extern raw_spinlock_t pci2phy_map_lock; extern struct list_head pci2phy_map_head; extern struct pci_extra_dev *uncore_extra_pci_dev; @@ -567,6 +568,7 @@ void snb_uncore_cpu_init(void); void nhm_uncore_cpu_init(void); void skl_uncore_cpu_init(void); void icl_uncore_cpu_init(void); +void tgl_uncore_cpu_init(void); void tgl_uncore_mmio_init(void); void tgl_l_uncore_mmio_init(void); int snb_pci2phy_map_init(int devid); diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index 6a4ca27b2c9e..39e632ed6ca9 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -126,6 +126,10 @@ #define ICL_UNC_CBO_0_PER_CTR0 0x702 #define ICL_UNC_CBO_MSR_OFFSET 0x8 +/* ICL ARB register */ +#define ICL_UNC_ARB_PER_CTR 0x3b1 +#define ICL_UNC_ARB_PERFEVTSEL 0x3b3 + DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); @@ -313,15 +317,21 @@ void skl_uncore_cpu_init(void) snb_uncore_arb.ops = &skl_uncore_msr_ops; } +static struct intel_uncore_ops icl_uncore_msr_ops = { + .disable_event = snb_uncore_msr_disable_event, + .enable_event = snb_uncore_msr_enable_event, + .read_counter = uncore_msr_read_counter, +}; + static struct intel_uncore_type icl_uncore_cbox = { .name = "cbox", - .num_counters = 4, + .num_counters = 2, .perf_ctr_bits = 44, .perf_ctr = ICL_UNC_CBO_0_PER_CTR0, .event_ctl = SNB_UNC_CBO_0_PERFEVTSEL0, .event_mask = SNB_UNC_RAW_EVENT_MASK, .msr_offset = ICL_UNC_CBO_MSR_OFFSET, - .ops = &skl_uncore_msr_ops, + .ops = &icl_uncore_msr_ops, .format_group = &snb_uncore_format_group, }; @@ -350,13 +360,25 @@ static struct intel_uncore_type icl_uncore_clockbox = { .single_fixed = 1, .event_mask = SNB_UNC_CTL_EV_SEL_MASK, .format_group = &icl_uncore_clock_format_group, - .ops = &skl_uncore_msr_ops, + .ops = &icl_uncore_msr_ops, .event_descs = icl_uncore_events, }; +static struct intel_uncore_type icl_uncore_arb = { + .name = "arb", + .num_counters = 1, + .num_boxes = 1, + .perf_ctr_bits = 44, + .perf_ctr = ICL_UNC_ARB_PER_CTR, + .event_ctl = ICL_UNC_ARB_PERFEVTSEL, + .event_mask = SNB_UNC_RAW_EVENT_MASK, + .ops = &icl_uncore_msr_ops, + .format_group = &snb_uncore_format_group, +}; + static struct intel_uncore_type *icl_msr_uncores[] = { &icl_uncore_cbox, - &snb_uncore_arb, + &icl_uncore_arb, &icl_uncore_clockbox, NULL, }; @@ -374,6 +396,21 @@ void icl_uncore_cpu_init(void) { uncore_msr_uncores = icl_msr_uncores; icl_uncore_cbox.num_boxes = icl_get_cbox_num(); +} + +static struct intel_uncore_type *tgl_msr_uncores[] = { + &icl_uncore_cbox, + &snb_uncore_arb, + &icl_uncore_clockbox, + NULL, +}; + +void tgl_uncore_cpu_init(void) +{ + uncore_msr_uncores = tgl_msr_uncores; + icl_uncore_cbox.num_boxes = icl_get_cbox_num(); + icl_uncore_cbox.ops = &skl_uncore_msr_ops; + icl_uncore_clockbox.ops = &skl_uncore_msr_ops; snb_uncore_arb.ops = &skl_uncore_msr_ops; } diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 62e88ad919ff..7bdb1821215d 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -393,6 +393,11 @@ #define SNR_M2M_PCI_PMON_BOX_CTL 0x438 #define SNR_M2M_PCI_PMON_UMASK_EXT 0xff +/* SNR PCIE3 */ +#define SNR_PCIE3_PCI_PMON_CTL0 0x508 +#define SNR_PCIE3_PCI_PMON_CTR0 0x4e8 +#define SNR_PCIE3_PCI_PMON_BOX_CTL 0x4e0 + /* SNR IMC */ #define SNR_IMC_MMIO_PMON_FIXED_CTL 0x54 #define SNR_IMC_MMIO_PMON_FIXED_CTR 0x38 @@ -3749,7 +3754,9 @@ static int skx_iio_set_mapping(struct intel_uncore_type *type) ret = skx_iio_get_topology(type); if (ret) - return ret; + goto clear_attr_update; + + ret = -ENOMEM; /* One more for NULL. */ attrs = kcalloc((uncore_max_dies() + 1), sizeof(*attrs), GFP_KERNEL); @@ -3781,8 +3788,9 @@ err: kfree(eas); kfree(attrs); kfree(type->topology); +clear_attr_update: type->attr_update = NULL; - return -ENOMEM; + return ret; } static void skx_iio_cleanup_mapping(struct intel_uncore_type *type) @@ -4551,12 +4559,46 @@ static struct intel_uncore_type snr_uncore_m2m = { .format_group = &snr_m2m_uncore_format_group, }; +static void snr_uncore_pci_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + pci_write_config_dword(pdev, hwc->config_base, (u32)(hwc->config | SNBEP_PMON_CTL_EN)); + pci_write_config_dword(pdev, hwc->config_base + 4, (u32)(hwc->config >> 32)); +} + +static struct intel_uncore_ops snr_pcie3_uncore_pci_ops = { + .init_box = snr_m2m_uncore_pci_init_box, + .disable_box = snbep_uncore_pci_disable_box, + .enable_box = snbep_uncore_pci_enable_box, + .disable_event = snbep_uncore_pci_disable_event, + .enable_event = snr_uncore_pci_enable_event, + .read_counter = snbep_uncore_pci_read_counter, +}; + +static struct intel_uncore_type snr_uncore_pcie3 = { + .name = "pcie3", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .perf_ctr = SNR_PCIE3_PCI_PMON_CTR0, + .event_ctl = SNR_PCIE3_PCI_PMON_CTL0, + .event_mask = SKX_IIO_PMON_RAW_EVENT_MASK, + .event_mask_ext = SKX_IIO_PMON_RAW_EVENT_MASK_EXT, + .box_ctl = SNR_PCIE3_PCI_PMON_BOX_CTL, + .ops = &snr_pcie3_uncore_pci_ops, + .format_group = &skx_uncore_iio_format_group, +}; + enum { SNR_PCI_UNCORE_M2M, + SNR_PCI_UNCORE_PCIE3, }; static struct intel_uncore_type *snr_pci_uncores[] = { [SNR_PCI_UNCORE_M2M] = &snr_uncore_m2m, + [SNR_PCI_UNCORE_PCIE3] = &snr_uncore_pcie3, NULL, }; @@ -4573,6 +4615,19 @@ static struct pci_driver snr_uncore_pci_driver = { .id_table = snr_uncore_pci_ids, }; +static const struct pci_device_id snr_uncore_pci_sub_ids[] = { + { /* PCIe3 RP */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x334a), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(4, 0, SNR_PCI_UNCORE_PCIE3, 0), + }, + { /* end: all zeroes */ } +}; + +static struct pci_driver snr_uncore_pci_sub_driver = { + .name = "snr_uncore_sub", + .id_table = snr_uncore_pci_sub_ids, +}; + int snr_uncore_pci_init(void) { /* SNR UBOX DID */ @@ -4584,6 +4639,7 @@ int snr_uncore_pci_init(void) uncore_pci_uncores = snr_pci_uncores; uncore_pci_driver = &snr_uncore_pci_driver; + uncore_pci_sub_driver = &snr_uncore_pci_sub_driver; return 0; } @@ -4751,10 +4807,10 @@ static struct uncore_event_desc snr_uncore_imc_freerunning_events[] = { INTEL_UNCORE_EVENT_DESC(dclk, "event=0xff,umask=0x10"), INTEL_UNCORE_EVENT_DESC(read, "event=0xff,umask=0x20"), - INTEL_UNCORE_EVENT_DESC(read.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(read.scale, "6.103515625e-5"), INTEL_UNCORE_EVENT_DESC(read.unit, "MiB"), INTEL_UNCORE_EVENT_DESC(write, "event=0xff,umask=0x21"), - INTEL_UNCORE_EVENT_DESC(write.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(write.scale, "6.103515625e-5"), INTEL_UNCORE_EVENT_DESC(write.unit, "MiB"), { /* end: all zeroes */ }, }; @@ -5212,17 +5268,17 @@ static struct uncore_event_desc icx_uncore_imc_freerunning_events[] = { INTEL_UNCORE_EVENT_DESC(dclk, "event=0xff,umask=0x10"), INTEL_UNCORE_EVENT_DESC(read, "event=0xff,umask=0x20"), - INTEL_UNCORE_EVENT_DESC(read.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(read.scale, "6.103515625e-5"), INTEL_UNCORE_EVENT_DESC(read.unit, "MiB"), INTEL_UNCORE_EVENT_DESC(write, "event=0xff,umask=0x21"), - INTEL_UNCORE_EVENT_DESC(write.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(write.scale, "6.103515625e-5"), INTEL_UNCORE_EVENT_DESC(write.unit, "MiB"), INTEL_UNCORE_EVENT_DESC(ddrt_read, "event=0xff,umask=0x30"), - INTEL_UNCORE_EVENT_DESC(ddrt_read.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(ddrt_read.scale, "6.103515625e-5"), INTEL_UNCORE_EVENT_DESC(ddrt_read.unit, "MiB"), INTEL_UNCORE_EVENT_DESC(ddrt_write, "event=0xff,umask=0x31"), - INTEL_UNCORE_EVENT_DESC(ddrt_write.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(ddrt_write.scale, "6.103515625e-5"), INTEL_UNCORE_EVENT_DESC(ddrt_write.unit, "MiB"), { /* end: all zeroes */ }, }; diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index a949f6f55991..4be8f9cabd07 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -78,6 +78,7 @@ static bool test_intel(int idx, void *data) case INTEL_FAM6_ATOM_GOLDMONT_PLUS: case INTEL_FAM6_ATOM_TREMONT_D: case INTEL_FAM6_ATOM_TREMONT: + case INTEL_FAM6_ATOM_TREMONT_L: case INTEL_FAM6_XEON_PHI_KNL: case INTEL_FAM6_XEON_PHI_KNM: diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 7b68ab5f19e7..ee2b9b9fc2a5 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -79,6 +79,31 @@ static inline bool constraint_match(struct event_constraint *c, u64 ecode) #define PERF_X86_EVENT_PEBS_VIA_PT 0x0800 /* use PT buffer for PEBS */ #define PERF_X86_EVENT_PAIR 0x1000 /* Large Increment per Cycle */ #define PERF_X86_EVENT_LBR_SELECT 0x2000 /* Save/Restore MSR_LBR_SELECT */ +#define PERF_X86_EVENT_TOPDOWN 0x4000 /* Count Topdown slots/metrics events */ + +static inline bool is_topdown_count(struct perf_event *event) +{ + return event->hw.flags & PERF_X86_EVENT_TOPDOWN; +} + +static inline bool is_metric_event(struct perf_event *event) +{ + u64 config = event->attr.config; + + return ((config & ARCH_PERFMON_EVENTSEL_EVENT) == 0) && + ((config & INTEL_ARCH_EVENT_MASK) >= INTEL_TD_METRIC_RETIRING) && + ((config & INTEL_ARCH_EVENT_MASK) <= INTEL_TD_METRIC_MAX); +} + +static inline bool is_slots_event(struct perf_event *event) +{ + return (event->attr.config & INTEL_ARCH_EVENT_MASK) == INTEL_TD_SLOTS; +} + +static inline bool is_topdown_event(struct perf_event *event) +{ + return is_metric_event(event) || is_slots_event(event); +} struct amd_nb { int nb_id; /* NorthBridge id */ @@ -210,6 +235,8 @@ struct cpu_hw_events { they've never been enabled yet */ int n_txn; /* the # last events in the below arrays; added in the current transaction */ + int n_txn_pair; + int n_txn_metric; int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */ u64 tags[X86_PMC_IDX_MAX]; @@ -285,6 +312,12 @@ struct cpu_hw_events { u64 tfa_shadow; /* + * Perf Metrics + */ + /* number of accepted metrics events */ + int n_metric; + + /* * AMD specific bits */ struct amd_nb *amd_nb; @@ -376,6 +409,19 @@ struct cpu_hw_events { EVENT_CONSTRAINT(c, (1ULL << (32+n)), FIXED_EVENT_FLAGS) /* + * The special metric counters do not actually exist. They are calculated from + * the combination of the FxCtr3 + MSR_PERF_METRICS. + * + * The special metric counters are mapped to a dummy offset for the scheduler. + * The sharing between multiple users of the same metric without multiplexing + * is not allowed, even though the hardware supports that in principle. + */ + +#define METRIC_EVENT_CONSTRAINT(c, n) \ + EVENT_CONSTRAINT(c, (1ULL << (INTEL_PMC_IDX_METRIC_BASE + n)), \ + INTEL_ARCH_EVENT_MASK) + +/* * Constraint on the Event code + UMask */ #define INTEL_UEVENT_CONSTRAINT(c, n) \ @@ -537,7 +583,7 @@ union perf_capabilities { */ u64 full_width_write:1; u64 pebs_baseline:1; - u64 pebs_metrics_available:1; + u64 perf_metrics:1; u64 pebs_output_pt_available:1; }; u64 capabilities; @@ -727,6 +773,12 @@ struct x86_pmu { atomic_t lbr_exclusive[x86_lbr_exclusive_max]; /* + * Intel perf metrics + */ + u64 (*update_topdown_event)(struct perf_event *event); + int (*set_topdown_event_period)(struct perf_event *event); + + /* * perf task context (i.e. struct perf_event_context::task_ctx_data) * switch helper to bridge calls from perf/core to perf/x86. * See struct pmu::swap_task_ctx() usage for examples; diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index 67b411f7e8c4..7c0120e2e957 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -815,6 +815,7 @@ static const struct x86_cpu_id rapl_model_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &model_spr), X86_MATCH_VENDOR_FAM(AMD, 0x17, &model_amd_fam17h), X86_MATCH_VENDOR_FAM(HYGON, 0x18, &model_amd_fam17h), + X86_MATCH_VENDOR_FAM(AMD, 0x19, &model_amd_fam17h), {}, }; MODULE_DEVICE_TABLE(x86cpu, rapl_model_match); diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 6035df1b49e1..e04d90af4c27 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -148,9 +148,9 @@ static inline bool hv_reenlightenment_available(void) * Check for required features and priviliges to make TSC frequency * change notifications work. */ - return ms_hyperv.features & HV_X64_ACCESS_FREQUENCY_MSRS && + return ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS && ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE && - ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT; + ms_hyperv.features & HV_ACCESS_REENLIGHTENMENT; } DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_reenlightenment) @@ -330,8 +330,8 @@ void __init hyperv_init(void) return; /* Absolutely required MSRs */ - required_msrs = HV_X64_MSR_HYPERCALL_AVAILABLE | - HV_X64_MSR_VP_INDEX_AVAILABLE; + required_msrs = HV_MSR_HYPERCALL_AVAILABLE | + HV_MSR_VP_INDEX_AVAILABLE; if ((ms_hyperv.features & required_msrs) != required_msrs) return; diff --git a/arch/x86/hyperv/hv_spinlock.c b/arch/x86/hyperv/hv_spinlock.c index 07f21a06392f..f3270c1fc48c 100644 --- a/arch/x86/hyperv/hv_spinlock.c +++ b/arch/x86/hyperv/hv_spinlock.c @@ -66,7 +66,7 @@ void __init hv_init_spinlocks(void) { if (!hv_pvspin || !apic || !(ms_hyperv.hints & HV_X64_CLUSTER_IPI_RECOMMENDED) || - !(ms_hyperv.features & HV_X64_MSR_GUEST_IDLE_AVAILABLE)) { + !(ms_hyperv.features & HV_MSR_GUEST_IDLE_AVAILABLE)) { pr_info("PV spinlocks disabled\n"); return; } diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index ca8a657edf59..a09fc37ead9d 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -239,7 +239,6 @@ beyond_if: (regs)->ss = __USER32_DS; regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0; - set_fs(USER_DS); return 0; } diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index ca0976456a6b..6d2df1ee427b 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -159,8 +159,6 @@ static inline u64 x86_default_get_root_pointer(void) extern int x86_acpi_numa_init(void); #endif /* CONFIG_ACPI_NUMA */ -#define acpi_unlazy_tlb(x) leave_mm(x) - #ifdef CONFIG_ACPI_APEI static inline pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr) { diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 2cc44e957c31..4e3099d9ae62 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -374,12 +374,12 @@ extern struct apic *apic; #define apic_driver(sym) \ static const struct apic *__apicdrivers_##sym __used \ __aligned(sizeof(struct apic *)) \ - __section(.apicdrivers) = { &sym } + __section(".apicdrivers") = { &sym } #define apic_drivers(sym1, sym2) \ static struct apic *__apicdrivers_##sym1##sym2[2] __used \ __aligned(sizeof(struct apic *)) \ - __section(.apicdrivers) = { &sym1, &sym2 } + __section(".apicdrivers") = { &sym1, &sym2 } extern struct apic *__apicdrivers[], *__apicdrivers_end[]; @@ -519,6 +519,14 @@ static inline bool apic_id_is_primary_thread(unsigned int id) { return false; } static inline void apic_smt_update(void) { } #endif +struct msi_msg; + +#ifdef CONFIG_PCI_MSI +void x86_vector_msi_compose_msg(struct irq_data *data, struct msi_msg *msg); +#else +# define x86_vector_msi_compose_msg NULL +#endif + extern void ioapic_zap_locks(void); #endif /* _ASM_X86_APIC_H */ diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h index 5a42f9206138..51e2bf27cc9b 100644 --- a/arch/x86/include/asm/asm-prototypes.h +++ b/arch/x86/include/asm/asm-prototypes.h @@ -5,6 +5,7 @@ #include <asm/string.h> #include <asm/page.h> #include <asm/checksum.h> +#include <asm/mce.h> #include <asm-generic/asm-prototypes.h> diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 5c15f95b1ba7..0603c7423aca 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -135,14 +135,21 @@ # define _ASM_EXTABLE_UA(from, to) \ _ASM_EXTABLE_HANDLE(from, to, ex_handler_uaccess) +# define _ASM_EXTABLE_CPY(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_copy) + # define _ASM_EXTABLE_FAULT(from, to) \ _ASM_EXTABLE_HANDLE(from, to, ex_handler_fault) -# define _ASM_NOKPROBE(entry) \ +# ifdef CONFIG_KPROBES +# define _ASM_NOKPROBE(entry) \ .pushsection "_kprobe_blacklist","aw" ; \ _ASM_ALIGN ; \ _ASM_PTR (entry); \ .popsection +# else +# define _ASM_NOKPROBE(entry) +# endif #else /* ! __ASSEMBLY__ */ # define _EXPAND_EXTABLE_HANDLE(x) #x @@ -160,6 +167,9 @@ # define _ASM_EXTABLE_UA(from, to) \ _ASM_EXTABLE_HANDLE(from, to, ex_handler_uaccess) +# define _ASM_EXTABLE_CPY(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_copy) + # define _ASM_EXTABLE_FAULT(from, to) \ _ASM_EXTABLE_HANDLE(from, to, ex_handler_fault) diff --git a/arch/x86/include/asm/cache.h b/arch/x86/include/asm/cache.h index abe08690a887..69404eae9983 100644 --- a/arch/x86/include/asm/cache.h +++ b/arch/x86/include/asm/cache.h @@ -8,7 +8,7 @@ #define L1_CACHE_SHIFT (CONFIG_X86_L1_CACHE_SHIFT) #define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT) -#define __read_mostly __attribute__((__section__(".data..read_mostly"))) +#define __read_mostly __section(".data..read_mostly") #define INTERNODE_CACHE_SHIFT CONFIG_X86_INTERNODE_CACHE_SHIFT #define INTERNODE_CACHE_BYTES (1 << INTERNODE_CACHE_SHIFT) diff --git a/arch/x86/include/asm/checksum.h b/arch/x86/include/asm/checksum.h index 0ada98d5d09f..bca625a60186 100644 --- a/arch/x86/include/asm/checksum.h +++ b/arch/x86/include/asm/checksum.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #define _HAVE_ARCH_COPY_AND_CSUM_FROM_USER 1 #define HAVE_CSUM_COPY_USER +#define _HAVE_ARCH_CSUM_AND_COPY #ifdef CONFIG_X86_32 # include <asm/checksum_32.h> #else diff --git a/arch/x86/include/asm/checksum_32.h b/arch/x86/include/asm/checksum_32.h index 11624c8a9d8d..17da95387997 100644 --- a/arch/x86/include/asm/checksum_32.h +++ b/arch/x86/include/asm/checksum_32.h @@ -27,9 +27,7 @@ asmlinkage __wsum csum_partial(const void *buff, int len, __wsum sum); * better 64-bit) boundary */ -asmlinkage __wsum csum_partial_copy_generic(const void *src, void *dst, - int len, __wsum sum, - int *src_err_ptr, int *dst_err_ptr); +asmlinkage __wsum csum_partial_copy_generic(const void *src, void *dst, int len); /* * Note: when you get a NULL pointer exception here this means someone @@ -38,26 +36,20 @@ asmlinkage __wsum csum_partial_copy_generic(const void *src, void *dst, * If you use these functions directly please don't forget the * access_ok(). */ -static inline __wsum csum_partial_copy_nocheck(const void *src, void *dst, - int len, __wsum sum) +static inline __wsum csum_partial_copy_nocheck(const void *src, void *dst, int len) { - return csum_partial_copy_generic(src, dst, len, sum, NULL, NULL); + return csum_partial_copy_generic(src, dst, len); } static inline __wsum csum_and_copy_from_user(const void __user *src, - void *dst, int len, - __wsum sum, int *err_ptr) + void *dst, int len) { __wsum ret; might_sleep(); - if (!user_access_begin(src, len)) { - if (len) - *err_ptr = -EFAULT; - return sum; - } - ret = csum_partial_copy_generic((__force void *)src, dst, - len, sum, err_ptr, NULL); + if (!user_access_begin(src, len)) + return 0; + ret = csum_partial_copy_generic((__force void *)src, dst, len); user_access_end(); return ret; @@ -178,23 +170,17 @@ static inline __sum16 csum_ipv6_magic(const struct in6_addr *saddr, */ static inline __wsum csum_and_copy_to_user(const void *src, void __user *dst, - int len, __wsum sum, - int *err_ptr) + int len) { __wsum ret; might_sleep(); - if (user_access_begin(dst, len)) { - ret = csum_partial_copy_generic(src, (__force void *)dst, - len, sum, NULL, err_ptr); - user_access_end(); - return ret; - } + if (!user_access_begin(dst, len)) + return 0; - if (len) - *err_ptr = -EFAULT; - - return (__force __wsum)-1; /* invalid checksum */ + ret = csum_partial_copy_generic(src, (__force void *)dst, len); + user_access_end(); + return ret; } #endif /* _ASM_X86_CHECKSUM_32_H */ diff --git a/arch/x86/include/asm/checksum_64.h b/arch/x86/include/asm/checksum_64.h index 0a289b87e872..407beebadaf4 100644 --- a/arch/x86/include/asm/checksum_64.h +++ b/arch/x86/include/asm/checksum_64.h @@ -130,17 +130,11 @@ static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, extern __wsum csum_partial(const void *buff, int len, __wsum sum); /* Do not call this directly. Use the wrappers below */ -extern __visible __wsum csum_partial_copy_generic(const void *src, const void *dst, - int len, __wsum sum, - int *src_err_ptr, int *dst_err_ptr); +extern __visible __wsum csum_partial_copy_generic(const void *src, void *dst, int len); - -extern __wsum csum_and_copy_from_user(const void __user *src, void *dst, - int len, __wsum isum, int *errp); -extern __wsum csum_and_copy_to_user(const void *src, void __user *dst, - int len, __wsum isum, int *errp); -extern __wsum csum_partial_copy_nocheck(const void *src, void *dst, - int len, __wsum sum); +extern __wsum csum_and_copy_from_user(const void __user *src, void *dst, int len); +extern __wsum csum_and_copy_to_user(const void *src, void __user *dst, int len); +extern __wsum csum_partial_copy_nocheck(const void *src, void *dst, int len); /** * ip_compute_csum - Compute an 16bit IP checksum. diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index d4edf281fff4..0e327a01f50f 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -27,8 +27,6 @@ typedef u16 compat_nlink_t; typedef u16 compat_ipc_pid_t; typedef u32 compat_caddr_t; typedef __kernel_fsid_t compat_fsid_t; -typedef s64 __attribute__((aligned(4))) compat_s64; -typedef u64 __attribute__((aligned(4))) compat_u64; struct compat_stat { compat_dev_t st_dev; @@ -211,6 +209,7 @@ static inline bool in_compat_syscall(void) return in_32bit_syscall(); } #define in_compat_syscall in_compat_syscall /* override the generic impl */ +#define compat_need_64bit_alignment_fixup in_ia32_syscall #endif struct compat_siginfo; diff --git a/arch/x86/include/asm/copy_mc_test.h b/arch/x86/include/asm/copy_mc_test.h new file mode 100644 index 000000000000..e4991ba96726 --- /dev/null +++ b/arch/x86/include/asm/copy_mc_test.h @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _COPY_MC_TEST_H_ +#define _COPY_MC_TEST_H_ + +#ifndef __ASSEMBLY__ +#ifdef CONFIG_COPY_MC_TEST +extern unsigned long copy_mc_test_src; +extern unsigned long copy_mc_test_dst; + +static inline void copy_mc_inject_src(void *addr) +{ + if (addr) + copy_mc_test_src = (unsigned long) addr; + else + copy_mc_test_src = ~0UL; +} + +static inline void copy_mc_inject_dst(void *addr) +{ + if (addr) + copy_mc_test_dst = (unsigned long) addr; + else + copy_mc_test_dst = ~0UL; +} +#else /* CONFIG_COPY_MC_TEST */ +static inline void copy_mc_inject_src(void *addr) +{ +} + +static inline void copy_mc_inject_dst(void *addr) +{ +} +#endif /* CONFIG_COPY_MC_TEST */ + +#else /* __ASSEMBLY__ */ +#include <asm/export.h> + +#ifdef CONFIG_COPY_MC_TEST +.macro COPY_MC_TEST_CTL + .pushsection .data + .align 8 + .globl copy_mc_test_src + copy_mc_test_src: + .quad 0 + EXPORT_SYMBOL_GPL(copy_mc_test_src) + .globl copy_mc_test_dst + copy_mc_test_dst: + .quad 0 + EXPORT_SYMBOL_GPL(copy_mc_test_dst) + .popsection +.endm + +.macro COPY_MC_TEST_SRC reg count target + leaq \count(\reg), %r9 + cmp copy_mc_test_src, %r9 + ja \target +.endm + +.macro COPY_MC_TEST_DST reg count target + leaq \count(\reg), %r9 + cmp copy_mc_test_dst, %r9 + ja \target +.endm +#else +.macro COPY_MC_TEST_CTL +.endm + +.macro COPY_MC_TEST_SRC reg count target +.endm + +.macro COPY_MC_TEST_DST reg count target +.endm +#endif /* CONFIG_COPY_MC_TEST */ +#endif /* __ASSEMBLY__ */ +#endif /* _COPY_MC_TEST_H_ */ diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h index 8902fdb7de13..3d52b094850a 100644 --- a/arch/x86/include/asm/cpu_entry_area.h +++ b/arch/x86/include/asm/cpu_entry_area.h @@ -11,25 +11,29 @@ #ifdef CONFIG_X86_64 /* Macro to enforce the same ordering and stack sizes */ -#define ESTACKS_MEMBERS(guardsize) \ - char DF_stack_guard[guardsize]; \ - char DF_stack[EXCEPTION_STKSZ]; \ - char NMI_stack_guard[guardsize]; \ - char NMI_stack[EXCEPTION_STKSZ]; \ - char DB_stack_guard[guardsize]; \ - char DB_stack[EXCEPTION_STKSZ]; \ - char MCE_stack_guard[guardsize]; \ - char MCE_stack[EXCEPTION_STKSZ]; \ - char IST_top_guard[guardsize]; \ +#define ESTACKS_MEMBERS(guardsize, optional_stack_size) \ + char DF_stack_guard[guardsize]; \ + char DF_stack[EXCEPTION_STKSZ]; \ + char NMI_stack_guard[guardsize]; \ + char NMI_stack[EXCEPTION_STKSZ]; \ + char DB_stack_guard[guardsize]; \ + char DB_stack[EXCEPTION_STKSZ]; \ + char MCE_stack_guard[guardsize]; \ + char MCE_stack[EXCEPTION_STKSZ]; \ + char VC_stack_guard[guardsize]; \ + char VC_stack[optional_stack_size]; \ + char VC2_stack_guard[guardsize]; \ + char VC2_stack[optional_stack_size]; \ + char IST_top_guard[guardsize]; \ /* The exception stacks' physical storage. No guard pages required */ struct exception_stacks { - ESTACKS_MEMBERS(0) + ESTACKS_MEMBERS(0, 0) }; /* The effective cpu entry area mapping with guard pages. */ struct cea_exception_stacks { - ESTACKS_MEMBERS(PAGE_SIZE) + ESTACKS_MEMBERS(PAGE_SIZE, EXCEPTION_STKSZ) }; /* @@ -40,6 +44,8 @@ enum exception_stack_ordering { ESTACK_NMI, ESTACK_DB, ESTACK_MCE, + ESTACK_VC, + ESTACK_VC2, N_EXCEPTION_STACKS }; @@ -139,4 +145,7 @@ static inline struct entry_stack *cpu_entry_stack(int cpu) #define __this_cpu_ist_top_va(name) \ CEA_ESTACK_TOP(__this_cpu_read(cea_exception_stacks), name) +#define __this_cpu_ist_bottom_va(name) \ + CEA_ESTACK_BOT(__this_cpu_read(cea_exception_stacks), name) + #endif diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 2901d5df4366..dad350d42ecf 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -96,7 +96,7 @@ #define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in IA32 userspace */ #define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */ #define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */ -/* free ( 3*32+17) */ +#define X86_FEATURE_SME_COHERENT ( 3*32+17) /* "" AMD hardware-enforced cache coherency */ #define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" LFENCE synchronizes RDTSC */ #define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ #define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ @@ -236,6 +236,7 @@ #define X86_FEATURE_EPT_AD ( 8*32+17) /* Intel Extended Page Table access-dirty bit */ #define X86_FEATURE_VMCALL ( 8*32+18) /* "" Hypervisor supports the VMCALL instruction */ #define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* "" VMware prefers VMMCALL hypercall instruction */ +#define X86_FEATURE_SEV_ES ( 8*32+20) /* AMD Secure Encrypted Virtualization - Encrypted State */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */ #define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ @@ -288,6 +289,7 @@ #define X86_FEATURE_FENCE_SWAPGS_USER (11*32+ 4) /* "" LFENCE in user entry SWAPGS path */ #define X86_FEATURE_FENCE_SWAPGS_KERNEL (11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */ #define X86_FEATURE_SPLIT_LOCK_DETECT (11*32+ 6) /* #AC for split lock */ +#define X86_FEATURE_PER_THREAD_MBA (11*32+ 7) /* "" Per-thread Memory Bandwidth Allocation */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */ @@ -353,6 +355,7 @@ #define X86_FEATURE_CLDEMOTE (16*32+25) /* CLDEMOTE instruction */ #define X86_FEATURE_MOVDIRI (16*32+27) /* MOVDIRI instruction */ #define X86_FEATURE_MOVDIR64B (16*32+28) /* MOVDIR64B instruction */ +#define X86_FEATURE_ENQCMD (16*32+29) /* ENQCMD and ENQCMDS instructions */ /* AMD-defined CPU features, CPUID level 0x80000007 (EBX), word 17 */ #define X86_FEATURE_OVERFLOW_RECOV (17*32+ 0) /* MCA overflow recovery support */ @@ -368,6 +371,7 @@ #define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */ #define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */ #define X86_FEATURE_SERIALIZE (18*32+14) /* SERIALIZE instruction */ +#define X86_FEATURE_TSXLDTRK (18*32+16) /* TSX Suspend Load Address Tracking */ #define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */ #define X86_FEATURE_ARCH_LBR (18*32+19) /* Intel ARCH LBR */ #define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */ diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index e89558a3fe4a..cfdf307ddc01 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -90,8 +90,6 @@ static __always_inline bool hw_breakpoint_active(void) return __this_cpu_read(cpu_dr7) & DR_GLOBAL_ENABLE_MASK; } -extern void aout_dump_debugregs(struct user *dump); - extern void hw_breakpoint_restore(void); static __always_inline unsigned long local_db_save(void) diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 1ced11d31932..476082a83d1c 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -383,6 +383,33 @@ static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit) void alloc_intr_gate(unsigned int n, const void *addr); +static inline void init_idt_data(struct idt_data *data, unsigned int n, + const void *addr) +{ + BUG_ON(n > 0xFF); + + memset(data, 0, sizeof(*data)); + data->vector = n; + data->addr = addr; + data->segment = __KERNEL_CS; + data->bits.type = GATE_INTERRUPT; + data->bits.p = 1; +} + +static inline void idt_init_desc(gate_desc *gate, const struct idt_data *d) +{ + unsigned long addr = (unsigned long) d->addr; + + gate->offset_low = (u16) addr; + gate->segment = (u16) d->segment; + gate->bits = d->bits; + gate->offset_middle = (u16) (addr >> 16); +#ifdef CONFIG_X86_64 + gate->offset_high = (u32) (addr >> 32); + gate->reserved = 0; +#endif +} + extern unsigned long system_vectors[]; extern void load_current_idt(void); diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h index a91f3b6e4f2a..f7e7099af595 100644 --- a/arch/x86/include/asm/desc_defs.h +++ b/arch/x86/include/asm/desc_defs.h @@ -74,6 +74,13 @@ struct idt_bits { p : 1; } __attribute__((packed)); +struct idt_data { + unsigned int vector; + unsigned int segment; + struct idt_bits bits; + const void *addr; +}; + struct gate_struct { u16 offset_low; u16 segment; @@ -109,6 +116,9 @@ struct desc_ptr { #endif /* !__ASSEMBLY__ */ +/* Boot IDT definitions */ +#define BOOT_IDT_ENTRIES 32 + /* Access rights as returned by LAR */ #define AR_TYPE_RODATA (0 * (1 << 9)) #define AR_TYPE_RWDATA (1 * (1 << 9)) diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 4ea8584682f9..5861d34f9771 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -56,6 +56,12 @@ # define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31)) #endif +#ifdef CONFIG_IOMMU_SUPPORT +# define DISABLE_ENQCMD 0 +#else +# define DISABLE_ENQCMD (1 << (X86_FEATURE_ENQCMD & 31)) +#endif + /* * Make sure to add features to the correct mask */ @@ -75,7 +81,8 @@ #define DISABLED_MASK13 0 #define DISABLED_MASK14 0 #define DISABLED_MASK15 0 -#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP) +#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP| \ + DISABLE_ENQCMD) #define DISABLED_MASK17 0 #define DISABLED_MASK18 0 #define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19) diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index fed67eafcacc..bb1654fe0ce7 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -8,10 +8,8 @@ */ #include <linux/scatterlist.h> -#include <linux/dma-debug.h> #include <asm/io.h> #include <asm/swiotlb.h> -#include <linux/dma-contiguous.h> extern int iommu_merge; extern int panic_on_overflow; diff --git a/arch/x86/include/asm/extable.h b/arch/x86/include/asm/extable.h index d8c2198d543b..1f0cbc52937c 100644 --- a/arch/x86/include/asm/extable.h +++ b/arch/x86/include/asm/extable.h @@ -29,10 +29,17 @@ struct pt_regs; (b)->handler = (tmp).handler - (delta); \ } while (0) +enum handler_type { + EX_HANDLER_NONE, + EX_HANDLER_FAULT, + EX_HANDLER_UACCESS, + EX_HANDLER_OTHER +}; + extern int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code, unsigned long fault_addr); extern int fixup_bug(struct pt_regs *regs, int trapnr); -extern bool ex_has_fault_handler(unsigned long ip); +extern enum handler_type ex_get_fault_handler_type(unsigned long ip); extern void early_fixup_exception(struct pt_regs *regs, int trapnr); #endif diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 0f0dd645b594..77217bd292bd 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -99,7 +99,7 @@ enum fixed_addresses { FIX_PCIE_MCFG, #endif #endif -#ifdef CONFIG_PARAVIRT +#ifdef CONFIG_PARAVIRT_XXL FIX_PARAVIRT_BOOTMAP, #endif #ifdef CONFIG_X86_INTEL_MID diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index b774c52e5411..dcd9503b1098 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -62,4 +62,16 @@ extern void switch_fpu_return(void); */ extern int cpu_has_xfeatures(u64 xfeatures_mask, const char **feature_name); +/* + * Tasks that are not using SVA have mm->pasid set to zero to note that they + * will not have the valid bit set in MSR_IA32_PASID while they are running. + */ +#define PASID_DISABLED 0 + +#ifdef CONFIG_IOMMU_SUPPORT +/* Update current's PASID MSR/state by mm's PASID. */ +void update_pasid(void); +#else +static inline void update_pasid(void) { } +#endif #endif /* _ASM_X86_FPU_API_H */ diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 0a460f2a3f90..8d33ad80704f 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -19,6 +19,7 @@ #include <asm/user.h> #include <asm/fpu/api.h> #include <asm/fpu/xstate.h> +#include <asm/fpu/xcr.h> #include <asm/cpufeature.h> #include <asm/trace/fpu.h> @@ -583,38 +584,13 @@ static inline void switch_fpu_finish(struct fpu *new_fpu) pkru_val = pk->pkru; } __write_pkru(pkru_val); -} - -/* - * MXCSR and XCR definitions: - */ - -static inline void ldmxcsr(u32 mxcsr) -{ - asm volatile("ldmxcsr %0" :: "m" (mxcsr)); -} - -extern unsigned int mxcsr_feature_mask; - -#define XCR_XFEATURE_ENABLED_MASK 0x00000000 -static inline u64 xgetbv(u32 index) -{ - u32 eax, edx; - - asm volatile(".byte 0x0f,0x01,0xd0" /* xgetbv */ - : "=a" (eax), "=d" (edx) - : "c" (index)); - return eax + ((u64)edx << 32); -} - -static inline void xsetbv(u32 index, u64 value) -{ - u32 eax = value; - u32 edx = value >> 32; - - asm volatile(".byte 0x0f,0x01,0xd1" /* xsetbv */ - : : "a" (eax), "d" (edx), "c" (index)); + /* + * Expensive PASID MSR write will be avoided in update_pasid() because + * TIF_NEED_FPU_LOAD was set. And the PASID state won't be updated + * unless it's different from mm->pasid to reduce overhead. + */ + update_pasid(); } #endif /* _ASM_X86_FPU_INTERNAL_H */ diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index c87364ea6446..f5a38a5f3ae1 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -114,7 +114,7 @@ enum xfeature { XFEATURE_Hi16_ZMM, XFEATURE_PT_UNIMPLEMENTED_SO_FAR, XFEATURE_PKRU, - XFEATURE_RSRVD_COMP_10, + XFEATURE_PASID, XFEATURE_RSRVD_COMP_11, XFEATURE_RSRVD_COMP_12, XFEATURE_RSRVD_COMP_13, @@ -134,6 +134,7 @@ enum xfeature { #define XFEATURE_MASK_Hi16_ZMM (1 << XFEATURE_Hi16_ZMM) #define XFEATURE_MASK_PT (1 << XFEATURE_PT_UNIMPLEMENTED_SO_FAR) #define XFEATURE_MASK_PKRU (1 << XFEATURE_PKRU) +#define XFEATURE_MASK_PASID (1 << XFEATURE_PASID) #define XFEATURE_MASK_LBR (1 << XFEATURE_LBR) #define XFEATURE_MASK_FPSSE (XFEATURE_MASK_FP | XFEATURE_MASK_SSE) @@ -256,6 +257,14 @@ struct arch_lbr_state { struct lbr_entry entries[]; } __packed; +/* + * State component 10 is supervisor state used for context-switching the + * PASID state. + */ +struct ia32_pasid_state { + u64 pasid; +} __packed; + struct xstate_header { u64 xfeatures; u64 xcomp_bv; diff --git a/arch/x86/include/asm/fpu/xcr.h b/arch/x86/include/asm/fpu/xcr.h new file mode 100644 index 000000000000..1c7ab8d95da5 --- /dev/null +++ b/arch/x86/include/asm/fpu/xcr.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_FPU_XCR_H +#define _ASM_X86_FPU_XCR_H + +/* + * MXCSR and XCR definitions: + */ + +static inline void ldmxcsr(u32 mxcsr) +{ + asm volatile("ldmxcsr %0" :: "m" (mxcsr)); +} + +extern unsigned int mxcsr_feature_mask; + +#define XCR_XFEATURE_ENABLED_MASK 0x00000000 + +static inline u64 xgetbv(u32 index) +{ + u32 eax, edx; + + asm volatile("xgetbv" : "=a" (eax), "=d" (edx) : "c" (index)); + return eax + ((u64)edx << 32); +} + +static inline void xsetbv(u32 index, u64 value) +{ + u32 eax = value; + u32 edx = value >> 32; + + asm volatile("xsetbv" :: "a" (eax), "d" (edx), "c" (index)); +} + +#endif /* _ASM_X86_FPU_XCR_H */ diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 14ab815132d4..47a92232d595 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -35,7 +35,7 @@ XFEATURE_MASK_BNDCSR) /* All currently supported supervisor features */ -#define XFEATURE_MASK_SUPERVISOR_SUPPORTED (0) +#define XFEATURE_MASK_SUPERVISOR_SUPPORTED (XFEATURE_MASK_PASID) /* * A supervisor state component may not always contain valuable information, diff --git a/arch/x86/include/asm/frame.h b/arch/x86/include/asm/frame.h index 296b346184b2..fb42659f6e98 100644 --- a/arch/x86/include/asm/frame.h +++ b/arch/x86/include/asm/frame.h @@ -60,12 +60,26 @@ #define FRAME_END "pop %" _ASM_BP "\n" #ifdef CONFIG_X86_64 + #define ENCODE_FRAME_POINTER \ "lea 1(%rsp), %rbp\n\t" + +static inline unsigned long encode_frame_pointer(struct pt_regs *regs) +{ + return (unsigned long)regs + 1; +} + #else /* !CONFIG_X86_64 */ + #define ENCODE_FRAME_POINTER \ "movl %esp, %ebp\n\t" \ "andl $0x7fffffff, %ebp\n\t" + +static inline unsigned long encode_frame_pointer(struct pt_regs *regs) +{ + return (unsigned long)regs & 0x7fffffff; +} + #endif /* CONFIG_X86_64 */ #endif /* __ASSEMBLY__ */ @@ -83,6 +97,11 @@ #define ENCODE_FRAME_POINTER +static inline unsigned long encode_frame_pointer(struct pt_regs *regs) +{ + return 0; +} + #endif #define FRAME_BEGIN diff --git a/arch/x86/include/asm/fsgsbase.h b/arch/x86/include/asm/fsgsbase.h index d552646411a9..35cff5f2becf 100644 --- a/arch/x86/include/asm/fsgsbase.h +++ b/arch/x86/include/asm/fsgsbase.h @@ -57,7 +57,7 @@ static inline unsigned long x86_fsbase_read_cpu(void) { unsigned long fsbase; - if (static_cpu_has(X86_FEATURE_FSGSBASE)) + if (boot_cpu_has(X86_FEATURE_FSGSBASE)) fsbase = rdfsbase(); else rdmsrl(MSR_FS_BASE, fsbase); @@ -67,7 +67,7 @@ static inline unsigned long x86_fsbase_read_cpu(void) static inline void x86_fsbase_write_cpu(unsigned long fsbase) { - if (static_cpu_has(X86_FEATURE_FSGSBASE)) + if (boot_cpu_has(X86_FEATURE_FSGSBASE)) wrfsbase(fsbase); else wrmsrl(MSR_FS_BASE, fsbase); diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 74c12437401e..a4aeeaace040 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -36,61 +36,56 @@ struct msi_desc; enum irq_alloc_type { X86_IRQ_ALLOC_TYPE_IOAPIC = 1, X86_IRQ_ALLOC_TYPE_HPET, - X86_IRQ_ALLOC_TYPE_MSI, - X86_IRQ_ALLOC_TYPE_MSIX, + X86_IRQ_ALLOC_TYPE_PCI_MSI, + X86_IRQ_ALLOC_TYPE_PCI_MSIX, X86_IRQ_ALLOC_TYPE_DMAR, X86_IRQ_ALLOC_TYPE_UV, + X86_IRQ_ALLOC_TYPE_IOAPIC_GET_PARENT, + X86_IRQ_ALLOC_TYPE_HPET_GET_PARENT, }; +struct ioapic_alloc_info { + int pin; + int node; + u32 trigger : 1; + u32 polarity : 1; + u32 valid : 1; + struct IO_APIC_route_entry *entry; +}; + +struct uv_alloc_info { + int limit; + int blade; + unsigned long offset; + char *name; + +}; + +/** + * irq_alloc_info - X86 specific interrupt allocation info + * @type: X86 specific allocation type + * @flags: Flags for allocation tweaks + * @devid: Device ID for allocations + * @hwirq: Associated hw interrupt number in the domain + * @mask: CPU mask for vector allocation + * @desc: Pointer to msi descriptor + * @data: Allocation specific data + * + * @ioapic: IOAPIC specific allocation data + * @uv: UV specific allocation data +*/ struct irq_alloc_info { enum irq_alloc_type type; u32 flags; - const struct cpumask *mask; /* CPU mask for vector allocation */ + u32 devid; + irq_hw_number_t hwirq; + const struct cpumask *mask; + struct msi_desc *desc; + void *data; + union { - int unused; -#ifdef CONFIG_HPET_TIMER - struct { - int hpet_id; - int hpet_index; - void *hpet_data; - }; -#endif -#ifdef CONFIG_PCI_MSI - struct { - struct pci_dev *msi_dev; - irq_hw_number_t msi_hwirq; - }; -#endif -#ifdef CONFIG_X86_IO_APIC - struct { - int ioapic_id; - int ioapic_pin; - int ioapic_node; - u32 ioapic_trigger : 1; - u32 ioapic_polarity : 1; - u32 ioapic_valid : 1; - struct IO_APIC_route_entry *ioapic_entry; - }; -#endif -#ifdef CONFIG_DMAR_TABLE - struct { - int dmar_id; - void *dmar_data; - }; -#endif -#ifdef CONFIG_X86_UV - struct { - int uv_limit; - int uv_blade; - unsigned long uv_offset; - char *uv_name; - }; -#endif -#if IS_ENABLED(CONFIG_VMD) - struct { - struct msi_desc *desc; - }; -#endif + struct ioapic_alloc_info ioapic; + struct uv_alloc_info uv; }; }; diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index 7a4d2062385c..0ed20e8bba9e 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -28,39 +28,6 @@ #define HYPERV_CPUID_MAX 0x4000ffff /* - * Aliases for Group A features that have X64 in the name. - * On x86/x64 these are HYPERV_CPUID_FEATURES.EAX bits. - */ - -#define HV_X64_MSR_VP_RUNTIME_AVAILABLE \ - HV_MSR_VP_RUNTIME_AVAILABLE -#define HV_X64_MSR_SYNIC_AVAILABLE \ - HV_MSR_SYNIC_AVAILABLE -#define HV_X64_MSR_APIC_ACCESS_AVAILABLE \ - HV_MSR_APIC_ACCESS_AVAILABLE -#define HV_X64_MSR_HYPERCALL_AVAILABLE \ - HV_MSR_HYPERCALL_AVAILABLE -#define HV_X64_MSR_VP_INDEX_AVAILABLE \ - HV_MSR_VP_INDEX_AVAILABLE -#define HV_X64_MSR_RESET_AVAILABLE \ - HV_MSR_RESET_AVAILABLE -#define HV_X64_MSR_GUEST_IDLE_AVAILABLE \ - HV_MSR_GUEST_IDLE_AVAILABLE -#define HV_X64_ACCESS_FREQUENCY_MSRS \ - HV_ACCESS_FREQUENCY_MSRS -#define HV_X64_ACCESS_REENLIGHTENMENT \ - HV_ACCESS_REENLIGHTENMENT -#define HV_X64_ACCESS_TSC_INVARIANT \ - HV_ACCESS_TSC_INVARIANT - -/* - * Aliases for Group B features that have X64 in the name. - * On x86/x64 these are HYPERV_CPUID_FEATURES.EBX bits. - */ -#define HV_X64_POST_MESSAGES HV_POST_MESSAGES -#define HV_X64_SIGNAL_EVENTS HV_SIGNAL_EVENTS - -/* * Group D Features. The bit assignments are custom to each architecture. * On x86/x64 these are HYPERV_CPUID_FEATURES.EDX bits. */ diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index a43366191212..b2442eb0ac2f 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -242,7 +242,7 @@ __visible noinstr void func(struct pt_regs *regs) \ instrumentation_begin(); \ irq_enter_rcu(); \ kvm_set_cpu_l1tf_flush_l1d(); \ - run_on_irqstack_cond(__##func, regs, regs); \ + run_sysvec_on_irqstack_cond(__##func, regs); \ irq_exit_rcu(); \ instrumentation_end(); \ irqentry_exit(regs, state); \ @@ -309,6 +309,19 @@ static __always_inline void __##func(struct pt_regs *regs) __visible void noist_##func(struct pt_regs *regs) /** + * DECLARE_IDTENTRY_VC - Declare functions for the VC entry point + * @vector: Vector number (ignored for C) + * @func: Function name of the entry point + * + * Maps to DECLARE_IDTENTRY_RAW_ERRORCODE, but declares also the + * safe_stack C handler. + */ +#define DECLARE_IDTENTRY_VC(vector, func) \ + DECLARE_IDTENTRY_RAW_ERRORCODE(vector, func); \ + __visible noinstr void ist_##func(struct pt_regs *regs, unsigned long error_code); \ + __visible noinstr void safe_stack_##func(struct pt_regs *regs, unsigned long error_code) + +/** * DEFINE_IDTENTRY_IST - Emit code for IST entry points * @func: Function name of the entry point * @@ -347,6 +360,35 @@ static __always_inline void __##func(struct pt_regs *regs) #define DEFINE_IDTENTRY_DF(func) \ DEFINE_IDTENTRY_RAW_ERRORCODE(func) +/** + * DEFINE_IDTENTRY_VC_SAFE_STACK - Emit code for VMM communication handler + which runs on a safe stack. + * @func: Function name of the entry point + * + * Maps to DEFINE_IDTENTRY_RAW_ERRORCODE + */ +#define DEFINE_IDTENTRY_VC_SAFE_STACK(func) \ + DEFINE_IDTENTRY_RAW_ERRORCODE(safe_stack_##func) + +/** + * DEFINE_IDTENTRY_VC_IST - Emit code for VMM communication handler + which runs on the VC fall-back stack + * @func: Function name of the entry point + * + * Maps to DEFINE_IDTENTRY_RAW_ERRORCODE + */ +#define DEFINE_IDTENTRY_VC_IST(func) \ + DEFINE_IDTENTRY_RAW_ERRORCODE(ist_##func) + +/** + * DEFINE_IDTENTRY_VC - Emit code for VMM communication handler + * @func: Function name of the entry point + * + * Maps to DEFINE_IDTENTRY_RAW_ERRORCODE + */ +#define DEFINE_IDTENTRY_VC(func) \ + DEFINE_IDTENTRY_RAW_ERRORCODE(func) + #else /* CONFIG_X86_64 */ /** @@ -433,6 +475,9 @@ __visible noinstr void func(struct pt_regs *regs, \ # define DECLARE_IDTENTRY_XENCB(vector, func) \ DECLARE_IDTENTRY(vector, func) +# define DECLARE_IDTENTRY_VC(vector, func) \ + idtentry_vc vector asm_##func func + #else # define DECLARE_IDTENTRY_MCE(vector, func) \ DECLARE_IDTENTRY(vector, func) @@ -547,7 +592,7 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_MC, exc_machine_check); /* NMI */ DECLARE_IDTENTRY_NMI(X86_TRAP_NMI, exc_nmi); -#if defined(CONFIG_XEN_PV) && defined(CONFIG_X86_64) +#ifdef CONFIG_XEN_PV DECLARE_IDTENTRY_RAW(X86_TRAP_NMI, xenpv_exc_nmi); #endif @@ -557,13 +602,18 @@ DECLARE_IDTENTRY_DEBUG(X86_TRAP_DB, exc_debug); #else DECLARE_IDTENTRY_RAW(X86_TRAP_DB, exc_debug); #endif -#if defined(CONFIG_XEN_PV) && defined(CONFIG_X86_64) +#ifdef CONFIG_XEN_PV DECLARE_IDTENTRY_RAW(X86_TRAP_DB, xenpv_exc_debug); #endif /* #DF */ DECLARE_IDTENTRY_DF(X86_TRAP_DF, exc_double_fault); +/* #VC */ +#ifdef CONFIG_AMD_MEM_ENCRYPT +DECLARE_IDTENTRY_VC(X86_TRAP_VC, exc_vmm_communication); +#endif + #ifdef CONFIG_XEN_PV DECLARE_IDTENTRY_XENCB(X86_TRAP_OTHER, exc_xen_hypervisor_callback); #endif @@ -591,10 +641,6 @@ DECLARE_IDTENTRY_SYSVEC(CALL_FUNCTION_VECTOR, sysvec_call_function); #endif #ifdef CONFIG_X86_LOCAL_APIC -# ifdef CONFIG_X86_UV -DECLARE_IDTENTRY_SYSVEC(UV_BAU_MESSAGE, sysvec_uv_bau_message); -# endif - # ifdef CONFIG_X86_MCE_THRESHOLD DECLARE_IDTENTRY_SYSVEC(THRESHOLD_APIC_VECTOR, sysvec_threshold); # endif diff --git a/arch/x86/include/asm/insn-eval.h b/arch/x86/include/asm/insn-eval.h index 2b6ccf2c49f1..a0f839aa144d 100644 --- a/arch/x86/include/asm/insn-eval.h +++ b/arch/x86/include/asm/insn-eval.h @@ -15,9 +15,15 @@ #define INSN_CODE_SEG_OPND_SZ(params) (params & 0xf) #define INSN_CODE_SEG_PARAMS(oper_sz, addr_sz) (oper_sz | (addr_sz << 4)) +bool insn_has_rep_prefix(struct insn *insn); void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs); int insn_get_modrm_rm_off(struct insn *insn, struct pt_regs *regs); +int insn_get_modrm_reg_off(struct insn *insn, struct pt_regs *regs); unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx); int insn_get_code_seg_params(struct pt_regs *regs); +int insn_fetch_from_user(struct pt_regs *regs, + unsigned char buf[MAX_INSN_SIZE]); +bool insn_decode(struct insn *insn, struct pt_regs *regs, + unsigned char buf[MAX_INSN_SIZE], int buf_size); #endif /* _ASM_X86_INSN_EVAL_H */ diff --git a/arch/x86/include/asm/intel-mid.h b/arch/x86/include/asm/intel-mid.h index de58391bdee0..cf0e25f45422 100644 --- a/arch/x86/include/asm/intel-mid.h +++ b/arch/x86/include/asm/intel-mid.h @@ -43,7 +43,7 @@ struct devs_id { #define sfi_device(i) \ static const struct devs_id *const __intel_mid_sfi_##i##_dev __used \ - __attribute__((__section__(".x86_intel_mid_dev.init"))) = &i + __section(".x86_intel_mid_dev.init") = &i /** * struct mid_sd_board_info - template for SD device creation diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index e1aa17a468a8..d726459d08e5 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -401,7 +401,7 @@ extern bool phys_mem_access_encrypted(unsigned long phys_addr, /** * iosubmit_cmds512 - copy data to single MMIO location, in 512-bit units - * @__dst: destination, in MMIO space (must be 512-bit aligned) + * @dst: destination, in MMIO space (must be 512-bit aligned) * @src: source * @count: number of 512 bits quantities to submit * @@ -412,25 +412,14 @@ extern bool phys_mem_access_encrypted(unsigned long phys_addr, * Warning: Do not use this helper unless your driver has checked that the CPU * instruction is supported on the platform. */ -static inline void iosubmit_cmds512(void __iomem *__dst, const void *src, +static inline void iosubmit_cmds512(void __iomem *dst, const void *src, size_t count) { - /* - * Note that this isn't an "on-stack copy", just definition of "dst" - * as a pointer to 64-bytes of stuff that is going to be overwritten. - * In the MOVDIR64B case that may be needed as you can use the - * MOVDIR64B instruction to copy arbitrary memory around. This trick - * lets the compiler know how much gets clobbered. - */ - volatile struct { char _[64]; } *dst = __dst; const u8 *from = src; const u8 *end = from + count * 64; while (from < end) { - /* MOVDIR64B [rdx], rax */ - asm volatile(".byte 0x66, 0x0f, 0x38, 0xf8, 0x02" - : "=m" (dst) - : "d" (from), "a" (dst)); + movdir64b(dst, from); from += 64; } } diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index 4bc985f1e2e4..af4a151d70b3 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -45,8 +45,6 @@ extern int irq_remap_enable_fault_handling(void); extern void panic_if_irq_remap(const char *msg); extern struct irq_domain * -irq_remapping_get_ir_irq_domain(struct irq_alloc_info *info); -extern struct irq_domain * irq_remapping_get_irq_domain(struct irq_alloc_info *info); /* Create PCI MSI/MSIx irqdomain, use @parent as the parent irqdomain. */ @@ -74,12 +72,6 @@ static inline void panic_if_irq_remap(const char *msg) } static inline struct irq_domain * -irq_remapping_get_ir_irq_domain(struct irq_alloc_info *info) -{ - return NULL; -} - -static inline struct irq_domain * irq_remapping_get_irq_domain(struct irq_alloc_info *info) { return NULL; diff --git a/arch/x86/include/asm/irq_stack.h b/arch/x86/include/asm/irq_stack.h index 4ae66f097101..775816965c6a 100644 --- a/arch/x86/include/asm/irq_stack.h +++ b/arch/x86/include/asm/irq_stack.h @@ -12,20 +12,50 @@ static __always_inline bool irqstack_active(void) return __this_cpu_read(irq_count) != -1; } -void asm_call_on_stack(void *sp, void *func, void *arg); +void asm_call_on_stack(void *sp, void (*func)(void), void *arg); +void asm_call_sysvec_on_stack(void *sp, void (*func)(struct pt_regs *regs), + struct pt_regs *regs); +void asm_call_irq_on_stack(void *sp, void (*func)(struct irq_desc *desc), + struct irq_desc *desc); -static __always_inline void __run_on_irqstack(void *func, void *arg) +static __always_inline void __run_on_irqstack(void (*func)(void)) { void *tos = __this_cpu_read(hardirq_stack_ptr); __this_cpu_add(irq_count, 1); - asm_call_on_stack(tos - 8, func, arg); + asm_call_on_stack(tos - 8, func, NULL); + __this_cpu_sub(irq_count, 1); +} + +static __always_inline void +__run_sysvec_on_irqstack(void (*func)(struct pt_regs *regs), + struct pt_regs *regs) +{ + void *tos = __this_cpu_read(hardirq_stack_ptr); + + __this_cpu_add(irq_count, 1); + asm_call_sysvec_on_stack(tos - 8, func, regs); + __this_cpu_sub(irq_count, 1); +} + +static __always_inline void +__run_irq_on_irqstack(void (*func)(struct irq_desc *desc), + struct irq_desc *desc) +{ + void *tos = __this_cpu_read(hardirq_stack_ptr); + + __this_cpu_add(irq_count, 1); + asm_call_irq_on_stack(tos - 8, func, desc); __this_cpu_sub(irq_count, 1); } #else /* CONFIG_X86_64 */ static inline bool irqstack_active(void) { return false; } -static inline void __run_on_irqstack(void *func, void *arg) { } +static inline void __run_on_irqstack(void (*func)(void)) { } +static inline void __run_sysvec_on_irqstack(void (*func)(struct pt_regs *regs), + struct pt_regs *regs) { } +static inline void __run_irq_on_irqstack(void (*func)(struct irq_desc *desc), + struct irq_desc *desc) { } #endif /* !CONFIG_X86_64 */ static __always_inline bool irq_needs_irq_stack(struct pt_regs *regs) @@ -37,17 +67,40 @@ static __always_inline bool irq_needs_irq_stack(struct pt_regs *regs) return !user_mode(regs) && !irqstack_active(); } -static __always_inline void run_on_irqstack_cond(void *func, void *arg, + +static __always_inline void run_on_irqstack_cond(void (*func)(void), struct pt_regs *regs) { - void (*__func)(void *arg) = func; + lockdep_assert_irqs_disabled(); + + if (irq_needs_irq_stack(regs)) + __run_on_irqstack(func); + else + func(); +} + +static __always_inline void +run_sysvec_on_irqstack_cond(void (*func)(struct pt_regs *regs), + struct pt_regs *regs) +{ + lockdep_assert_irqs_disabled(); + if (irq_needs_irq_stack(regs)) + __run_sysvec_on_irqstack(func, regs); + else + func(regs); +} + +static __always_inline void +run_irq_on_irqstack_cond(void (*func)(struct irq_desc *desc), struct irq_desc *desc, + struct pt_regs *regs) +{ lockdep_assert_irqs_disabled(); if (irq_needs_irq_stack(regs)) - __run_on_irqstack(__func, arg); + __run_irq_on_irqstack(func, desc); else - __func(arg); + func(desc); } #endif diff --git a/arch/x86/include/asm/irqdomain.h b/arch/x86/include/asm/irqdomain.h index c066ffae222b..cd684d45cb5f 100644 --- a/arch/x86/include/asm/irqdomain.h +++ b/arch/x86/include/asm/irqdomain.h @@ -51,9 +51,13 @@ extern int mp_irqdomain_ioapic_idx(struct irq_domain *domain); #endif /* CONFIG_X86_IO_APIC */ #ifdef CONFIG_PCI_MSI -extern void arch_init_msi_domain(struct irq_domain *domain); +void x86_create_pci_msi_domain(void); +struct irq_domain *native_create_pci_msi_domain(void); +extern struct irq_domain *x86_pci_msi_default_domain; #else -static inline void arch_init_msi_domain(struct irq_domain *domain) { } +static inline void x86_create_pci_msi_domain(void) { } +#define native_create_pci_msi_domain NULL +#define x86_pci_msi_default_domain NULL #endif #endif diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index 02a0cf547d7b..2dfc8d380dab 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -9,7 +9,7 @@ #include <asm/nospec-branch.h> /* Provide __cpuidle; we can't safely include <linux/cpu.h> */ -#define __cpuidle __attribute__((__section__(".cpuidle.text"))) +#define __cpuidle __section(".cpuidle.text") /* * Interrupt control: diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index 143bc9abe99c..991a7ad540c7 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -106,5 +106,9 @@ extern int kprobe_exceptions_notify(struct notifier_block *self, extern int kprobe_int3_handler(struct pt_regs *regs); extern int kprobe_debug_handler(struct pt_regs *regs); +#else + +static inline int kprobe_debug_handler(struct pt_regs *regs) { return 0; } + #endif /* CONFIG_KPROBES */ #endif /* _ASM_X86_KPROBES_H */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 5303dbc5c9bc..d44858b69353 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -80,13 +80,14 @@ #define KVM_REQ_HV_EXIT KVM_ARCH_REQ(21) #define KVM_REQ_HV_STIMER KVM_ARCH_REQ(22) #define KVM_REQ_LOAD_EOI_EXITMAP KVM_ARCH_REQ(23) -#define KVM_REQ_GET_VMCS12_PAGES KVM_ARCH_REQ(24) +#define KVM_REQ_GET_NESTED_STATE_PAGES KVM_ARCH_REQ(24) #define KVM_REQ_APICV_UPDATE \ KVM_ARCH_REQ_FLAGS(25, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_TLB_FLUSH_CURRENT KVM_ARCH_REQ(26) #define KVM_REQ_HV_TLB_FLUSH \ KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_APF_READY KVM_ARCH_REQ(28) +#define KVM_REQ_MSR_FILTER_CHANGED KVM_ARCH_REQ(29) #define CR0_RESERVED_BITS \ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ @@ -132,7 +133,7 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) #define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT) #define KVM_MIN_FREE_MMU_PAGES 5 #define KVM_REFILL_PAGES 25 -#define KVM_MAX_CPUID_ENTRIES 80 +#define KVM_MAX_CPUID_ENTRIES 256 #define KVM_NR_FIXED_MTRR_REGION 88 #define KVM_NR_VAR_MTRR 8 @@ -636,7 +637,7 @@ struct kvm_vcpu_arch { int halt_request; /* real mode on Intel only */ int cpuid_nent; - struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES]; + struct kvm_cpuid_entry2 *cpuid_entries; int maxphyaddr; int max_tdp_level; @@ -788,6 +789,21 @@ struct kvm_vcpu_arch { /* AMD MSRC001_0015 Hardware Configuration */ u64 msr_hwcr; + + /* pv related cpuid info */ + struct { + /* + * value of the eax register in the KVM_CPUID_FEATURES CPUID + * leaf. + */ + u32 features; + + /* + * indicates whether pv emulation should be disabled if features + * are not present in the guest's cpuid + */ + bool enforce; + } pv_cpuid; }; struct kvm_lpage_info { @@ -860,6 +876,13 @@ struct kvm_hv { struct kvm_hv_syndbg hv_syndbg; }; +struct msr_bitmap_range { + u32 flags; + u32 nmsrs; + u32 base; + unsigned long *bitmap; +}; + enum kvm_irqchip_mode { KVM_IRQCHIP_NONE, KVM_IRQCHIP_KERNEL, /* created with KVM_CREATE_IRQCHIP */ @@ -961,8 +984,31 @@ struct kvm_arch { bool guest_can_read_msr_platform_info; bool exception_payload_enabled; + /* Deflect RDMSR and WRMSR to user space when they trigger a #GP */ + u32 user_space_msr_mask; + + struct { + u8 count; + bool default_allow:1; + struct msr_bitmap_range ranges[16]; + } msr_filter; + struct kvm_pmu_event_filter *pmu_event_filter; struct task_struct *nx_lpage_recovery_thread; + + /* + * Whether the TDP MMU is enabled for this VM. This contains a + * snapshot of the TDP MMU module parameter from when the VM was + * created and remains unchanged for the life of the VM. If this is + * true, TDP MMU handler functions will run for various MMU + * operations. + */ + bool tdp_mmu_enabled; + + /* List of struct tdp_mmu_pages being used as roots */ + struct list_head tdp_mmu_roots; + /* List of struct tdp_mmu_pages not being used as roots */ + struct list_head tdp_mmu_pages; }; struct kvm_vm_stat { @@ -1069,7 +1115,7 @@ struct kvm_x86_ops { void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); int (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); - void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer); + int (*set_efer)(struct kvm_vcpu *vcpu, u64 efer); void (*get_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); @@ -1143,7 +1189,12 @@ struct kvm_x86_ops { /* Returns actual tsc_offset set in active VMCS */ u64 (*write_l1_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); - void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2); + /* + * Retrieve somewhat arbitrary exit information. Intended to be used + * only from within tracepoints to avoid VMREADs when tracing is off. + */ + void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2, + u32 *exit_int_info, u32 *exit_int_info_err_code); int (*check_intercept)(struct kvm_vcpu *vcpu, struct x86_instruction_info *info, @@ -1221,12 +1272,13 @@ struct kvm_x86_ops { int (*get_msr_feature)(struct kvm_msr_entry *entry); - bool (*need_emulation_on_page_fault)(struct kvm_vcpu *vcpu); + bool (*can_emulate_instruction)(struct kvm_vcpu *vcpu, void *insn, int insn_len); bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu); int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu); void (*migrate_timers)(struct kvm_vcpu *vcpu); + void (*msr_filter_changed)(struct kvm_vcpu *vcpu); }; struct kvm_x86_nested_ops { @@ -1238,7 +1290,7 @@ struct kvm_x86_nested_ops { int (*set_state)(struct kvm_vcpu *vcpu, struct kvm_nested_state __user *user_kvm_nested_state, struct kvm_nested_state *kvm_state); - bool (*get_vmcs12_pages)(struct kvm_vcpu *vcpu); + bool (*get_nested_state_pages)(struct kvm_vcpu *vcpu); int (*write_log_dirty)(struct kvm_vcpu *vcpu, gpa_t l2_gpa); int (*enable_evmcs)(struct kvm_vcpu *vcpu, @@ -1612,8 +1664,8 @@ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, unsigned long ipi_bitmap_high, u32 min, unsigned long icr, int op_64_bit); -void kvm_define_shared_msr(unsigned index, u32 msr); -int kvm_set_shared_msr(unsigned index, u64 val, u64 mask); +void kvm_define_user_return_msr(unsigned index, u32 msr); +int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask); u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc); u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc); diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index cf503824529c..a0f147893a04 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -136,9 +136,24 @@ #define MCE_HANDLED_NFIT BIT_ULL(3) #define MCE_HANDLED_EDAC BIT_ULL(4) #define MCE_HANDLED_MCELOG BIT_ULL(5) + +/* + * Indicates an MCE which has happened in kernel space but from + * which the kernel can recover simply by executing fixup_exception() + * so that an error is returned to the caller of the function that + * hit the machine check. + */ #define MCE_IN_KERNEL_RECOV BIT_ULL(6) /* + * Indicates an MCE that happened in kernel space while copying data + * from user. In this case fixup_exception() gets the kernel to the + * error exit for the copy function. Machine check handler can then + * treat it like a fault taken in user mode. + */ +#define MCE_IN_KERNEL_COPYIN BIT_ULL(7) + +/* * This structure contains all data related to the MCE log. Also * carries a signature to make it easier to find from external * debugging tools. Each entry is only valid when its finished flag @@ -174,6 +189,15 @@ extern void mce_unregister_decode_chain(struct notifier_block *nb); extern int mce_p5_enabled; +#ifdef CONFIG_ARCH_HAS_COPY_MC +extern void enable_copy_mc_fragile(void); +unsigned long __must_check copy_mc_fragile(void *dst, const void *src, unsigned cnt); +#else +static inline void enable_copy_mc_fragile(void) +{ +} +#endif + #ifdef CONFIG_X86_MCE int mcheck_init(void); void mcheck_cpu_init(struct cpuinfo_x86 *c); @@ -200,12 +224,8 @@ void mce_setup(struct mce *m); void mce_log(struct mce *m); DECLARE_PER_CPU(struct device *, mce_device); -/* - * Maximum banks number. - * This is the limit of the current register layout on - * Intel CPUs. - */ -#define MAX_NR_BANKS 32 +/* Maximum number of MCA banks per CPU. */ +#define MAX_NR_BANKS 64 #ifdef CONFIG_X86_MCE_INTEL void mce_intel_feature_init(struct cpuinfo_x86 *c); @@ -328,7 +348,6 @@ enum smca_bank_types { struct smca_hwid { unsigned int bank_type; /* Use with smca_bank_types for easy indexing. */ u32 hwid_mcatype; /* (hwid,mcatype) tuple */ - u32 xec_bitmap; /* Bitmap of valid ExtErrorCodes; current max is 21. */ u8 count; /* Number of instances. */ }; diff --git a/arch/x86/include/asm/mcsafe_test.h b/arch/x86/include/asm/mcsafe_test.h deleted file mode 100644 index eb59804b6201..000000000000 --- a/arch/x86/include/asm/mcsafe_test.h +++ /dev/null @@ -1,75 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _MCSAFE_TEST_H_ -#define _MCSAFE_TEST_H_ - -#ifndef __ASSEMBLY__ -#ifdef CONFIG_MCSAFE_TEST -extern unsigned long mcsafe_test_src; -extern unsigned long mcsafe_test_dst; - -static inline void mcsafe_inject_src(void *addr) -{ - if (addr) - mcsafe_test_src = (unsigned long) addr; - else - mcsafe_test_src = ~0UL; -} - -static inline void mcsafe_inject_dst(void *addr) -{ - if (addr) - mcsafe_test_dst = (unsigned long) addr; - else - mcsafe_test_dst = ~0UL; -} -#else /* CONFIG_MCSAFE_TEST */ -static inline void mcsafe_inject_src(void *addr) -{ -} - -static inline void mcsafe_inject_dst(void *addr) -{ -} -#endif /* CONFIG_MCSAFE_TEST */ - -#else /* __ASSEMBLY__ */ -#include <asm/export.h> - -#ifdef CONFIG_MCSAFE_TEST -.macro MCSAFE_TEST_CTL - .pushsection .data - .align 8 - .globl mcsafe_test_src - mcsafe_test_src: - .quad 0 - EXPORT_SYMBOL_GPL(mcsafe_test_src) - .globl mcsafe_test_dst - mcsafe_test_dst: - .quad 0 - EXPORT_SYMBOL_GPL(mcsafe_test_dst) - .popsection -.endm - -.macro MCSAFE_TEST_SRC reg count target - leaq \count(\reg), %r9 - cmp mcsafe_test_src, %r9 - ja \target -.endm - -.macro MCSAFE_TEST_DST reg count target - leaq \count(\reg), %r9 - cmp mcsafe_test_dst, %r9 - ja \target -.endm -#else -.macro MCSAFE_TEST_CTL -.endm - -.macro MCSAFE_TEST_SRC reg count target -.endm - -.macro MCSAFE_TEST_DST reg count target -.endm -#endif /* CONFIG_MCSAFE_TEST */ -#endif /* __ASSEMBLY__ */ -#endif /* _MCSAFE_TEST_H_ */ diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index 5049f6c22683..2f62bbdd9d12 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -19,6 +19,7 @@ #ifdef CONFIG_AMD_MEM_ENCRYPT extern u64 sme_me_mask; +extern u64 sev_status; extern bool sev_enabled; void sme_encrypt_execute(unsigned long encrypted_kernel_vaddr, @@ -48,10 +49,12 @@ void __init mem_encrypt_free_decrypted_mem(void); /* Architecture __weak replacement functions */ void __init mem_encrypt_init(void); +void __init sev_es_init_vc_handling(void); bool sme_active(void); bool sev_active(void); +bool sev_es_active(void); -#define __bss_decrypted __attribute__((__section__(".bss..decrypted"))) +#define __bss_decrypted __section(".bss..decrypted") #else /* !CONFIG_AMD_MEM_ENCRYPT */ @@ -70,8 +73,10 @@ static inline void __init sme_early_init(void) { } static inline void __init sme_encrypt_kernel(struct boot_params *bp) { } static inline void __init sme_enable(struct boot_params *bp) { } +static inline void sev_es_init_vc_handling(void) { } static inline bool sme_active(void) { return false; } static inline bool sev_active(void) { return false; } +static inline bool sev_es_active(void) { return false; } static inline int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size) { return 0; } diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index 606cbaebd336..e90ac7e9ae2c 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -67,21 +67,11 @@ static inline void find_smp_config(void) #ifdef CONFIG_X86_MPPARSE extern void e820__memblock_alloc_reserved_mpc_new(void); extern int enable_update_mptable; -extern int default_mpc_apic_id(struct mpc_cpu *m); -extern void default_smp_read_mpc_oem(struct mpc_table *mpc); -# ifdef CONFIG_X86_IO_APIC -extern void default_mpc_oem_bus_info(struct mpc_bus *m, char *str); -# else -# define default_mpc_oem_bus_info NULL -# endif extern void default_find_smp_config(void); extern void default_get_smp_config(unsigned int early); #else static inline void e820__memblock_alloc_reserved_mpc_new(void) { } #define enable_update_mptable 0 -#define default_mpc_apic_id NULL -#define default_smp_read_mpc_oem NULL -#define default_mpc_oem_bus_info NULL #define default_find_smp_config x86_init_noop #define default_get_smp_config x86_init_uint_noop #endif diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 4f77b8f22e54..ffc289992d1b 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -54,6 +54,7 @@ typedef int (*hyperv_fill_flush_list_func)( #define hv_enable_vdso_clocksource() \ vclocks_set_used(VDSO_CLOCKMODE_HVCLOCK); #define hv_get_raw_timer() rdtsc_ordered() +#define hv_get_vector() HYPERVISOR_CALLBACK_VECTOR /* * Reference to pv_ops must be inline so objtool diff --git a/arch/x86/include/asm/msi.h b/arch/x86/include/asm/msi.h index 25ddd0916bb2..cd30013d15d3 100644 --- a/arch/x86/include/asm/msi.h +++ b/arch/x86/include/asm/msi.h @@ -9,6 +9,4 @@ typedef struct irq_alloc_info msi_alloc_info_t; int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec, msi_alloc_info_t *arg); -void pci_msi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc); - #endif /* _ASM_X86_MSI_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 2859ee4f39a8..972a34d93505 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -257,6 +257,9 @@ #define MSR_IA32_LASTINTFROMIP 0x000001dd #define MSR_IA32_LASTINTTOIP 0x000001de +#define MSR_IA32_PASID 0x00000d93 +#define MSR_IA32_PASID_VALID BIT_ULL(31) + /* DEBUGCTLMSR bits (others vary by model): */ #define DEBUGCTLMSR_LBR (1UL << 0) /* last branch recording */ #define DEBUGCTLMSR_BTF_SHIFT 1 @@ -464,11 +467,15 @@ #define MSR_AMD64_IBSOP_REG_MASK ((1UL<<MSR_AMD64_IBSOP_REG_COUNT)-1) #define MSR_AMD64_IBSCTL 0xc001103a #define MSR_AMD64_IBSBRTARGET 0xc001103b +#define MSR_AMD64_ICIBSEXTDCTL 0xc001103c #define MSR_AMD64_IBSOPDATA4 0xc001103d #define MSR_AMD64_IBS_REG_COUNT_MAX 8 /* includes MSR_AMD64_IBSBRTARGET */ +#define MSR_AMD64_SEV_ES_GHCB 0xc0010130 #define MSR_AMD64_SEV 0xc0010131 #define MSR_AMD64_SEV_ENABLED_BIT 0 +#define MSR_AMD64_SEV_ES_ENABLED_BIT 1 #define MSR_AMD64_SEV_ENABLED BIT_ULL(MSR_AMD64_SEV_ENABLED_BIT) +#define MSR_AMD64_SEV_ES_ENABLED BIT_ULL(MSR_AMD64_SEV_ES_ENABLED_BIT) #define MSR_AMD64_VIRT_SPEC_CTRL 0xc001011f @@ -857,11 +864,14 @@ #define MSR_CORE_PERF_FIXED_CTR0 0x00000309 #define MSR_CORE_PERF_FIXED_CTR1 0x0000030a #define MSR_CORE_PERF_FIXED_CTR2 0x0000030b +#define MSR_CORE_PERF_FIXED_CTR3 0x0000030c #define MSR_CORE_PERF_FIXED_CTR_CTRL 0x0000038d #define MSR_CORE_PERF_GLOBAL_STATUS 0x0000038e #define MSR_CORE_PERF_GLOBAL_CTRL 0x0000038f #define MSR_CORE_PERF_GLOBAL_OVF_CTRL 0x00000390 +#define MSR_PERF_METRICS 0x00000329 + /* PERF_GLOBAL_OVF_CTL bits */ #define MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT 55 #define MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI (1ULL << MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT) diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 86f20d520a07..0b4920a7238e 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -60,22 +60,20 @@ struct saved_msrs { #define EAX_EDX_RET(val, low, high) "=A" (val) #endif -#ifdef CONFIG_TRACEPOINTS /* * Be very careful with includes. This header is prone to include loops. */ #include <asm/atomic.h> #include <linux/tracepoint-defs.h> -extern struct tracepoint __tracepoint_read_msr; -extern struct tracepoint __tracepoint_write_msr; -extern struct tracepoint __tracepoint_rdpmc; -#define msr_tracepoint_active(t) static_key_false(&(t).key) +#ifdef CONFIG_TRACEPOINTS +DECLARE_TRACEPOINT(read_msr); +DECLARE_TRACEPOINT(write_msr); +DECLARE_TRACEPOINT(rdpmc); extern void do_trace_write_msr(unsigned int msr, u64 val, int failed); extern void do_trace_read_msr(unsigned int msr, u64 val, int failed); extern void do_trace_rdpmc(unsigned int msr, u64 val, int failed); #else -#define msr_tracepoint_active(t) false static inline void do_trace_write_msr(unsigned int msr, u64 val, int failed) {} static inline void do_trace_read_msr(unsigned int msr, u64 val, int failed) {} static inline void do_trace_rdpmc(unsigned int msr, u64 val, int failed) {} @@ -128,7 +126,7 @@ static inline unsigned long long native_read_msr(unsigned int msr) val = __rdmsr(msr); - if (msr_tracepoint_active(__tracepoint_read_msr)) + if (tracepoint_enabled(read_msr)) do_trace_read_msr(msr, val, 0); return val; @@ -150,7 +148,7 @@ static inline unsigned long long native_read_msr_safe(unsigned int msr, _ASM_EXTABLE(2b, 3b) : [err] "=r" (*err), EAX_EDX_RET(val, low, high) : "c" (msr), [fault] "i" (-EIO)); - if (msr_tracepoint_active(__tracepoint_read_msr)) + if (tracepoint_enabled(read_msr)) do_trace_read_msr(msr, EAX_EDX_VAL(val, low, high), *err); return EAX_EDX_VAL(val, low, high); } @@ -161,7 +159,7 @@ native_write_msr(unsigned int msr, u32 low, u32 high) { __wrmsr(msr, low, high); - if (msr_tracepoint_active(__tracepoint_write_msr)) + if (tracepoint_enabled(write_msr)) do_trace_write_msr(msr, ((u64)high << 32 | low), 0); } @@ -181,7 +179,7 @@ native_write_msr_safe(unsigned int msr, u32 low, u32 high) : "c" (msr), "0" (low), "d" (high), [fault] "i" (-EIO) : "memory"); - if (msr_tracepoint_active(__tracepoint_write_msr)) + if (tracepoint_enabled(write_msr)) do_trace_write_msr(msr, ((u64)high << 32 | low), err); return err; } @@ -248,7 +246,7 @@ static inline unsigned long long native_read_pmc(int counter) DECLARE_ARGS(val, low, high); asm volatile("rdpmc" : EAX_EDX_RET(val, low, high) : "c" (counter)); - if (msr_tracepoint_active(__tracepoint_rdpmc)) + if (tracepoint_enabled(rdpmc)) do_trace_rdpmc(counter, EAX_EDX_VAL(val, low, high), 0); return EAX_EDX_VAL(val, low, high); } diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index e7752b4038ff..cb9ad6b73973 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -4,7 +4,7 @@ #define _ASM_X86_NOSPEC_BRANCH_H_ #include <linux/static_key.h> -#include <linux/frame.h> +#include <linux/objtool.h> #include <asm/alternative.h> #include <asm/alternative-asm.h> @@ -314,19 +314,19 @@ static inline void mds_idle_clear_cpu_buffers(void) * lfence * jmp spec_trap * do_rop: - * mov %rax,(%rsp) for x86_64 + * mov %rcx,(%rsp) for x86_64 * mov %edx,(%esp) for x86_32 * retq * * Without retpolines configured: * - * jmp *%rax for x86_64 + * jmp *%rcx for x86_64 * jmp *%edx for x86_32 */ #ifdef CONFIG_RETPOLINE # ifdef CONFIG_X86_64 -# define RETPOLINE_RAX_BPF_JIT_SIZE 17 -# define RETPOLINE_RAX_BPF_JIT() \ +# define RETPOLINE_RCX_BPF_JIT_SIZE 17 +# define RETPOLINE_RCX_BPF_JIT() \ do { \ EMIT1_off32(0xE8, 7); /* callq do_rop */ \ /* spec_trap: */ \ @@ -334,7 +334,7 @@ do { \ EMIT3(0x0F, 0xAE, 0xE8); /* lfence */ \ EMIT2(0xEB, 0xF9); /* jmp spec_trap */ \ /* do_rop: */ \ - EMIT4(0x48, 0x89, 0x04, 0x24); /* mov %rax,(%rsp) */ \ + EMIT4(0x48, 0x89, 0x0C, 0x24); /* mov %rcx,(%rsp) */ \ EMIT1(0xC3); /* retq */ \ } while (0) # else /* !CONFIG_X86_64 */ @@ -352,9 +352,9 @@ do { \ # endif #else /* !CONFIG_RETPOLINE */ # ifdef CONFIG_X86_64 -# define RETPOLINE_RAX_BPF_JIT_SIZE 2 -# define RETPOLINE_RAX_BPF_JIT() \ - EMIT2(0xFF, 0xE0); /* jmp *%rax */ +# define RETPOLINE_RCX_BPF_JIT_SIZE 2 +# define RETPOLINE_RCX_BPF_JIT() \ + EMIT2(0xFF, 0xE1); /* jmp *%rcx */ # else /* !CONFIG_X86_64 */ # define RETPOLINE_EDX_BPF_JIT() \ EMIT2(0xFF, 0xE2) /* jmp *%edx */ diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index bbfde3d2662f..e3bae2b60a0d 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -3,6 +3,7 @@ #define _ASM_X86_NUMA_H #include <linux/nodemask.h> +#include <linux/errno.h> #include <asm/topology.h> #include <asm/apicdef.h> @@ -62,12 +63,14 @@ extern void numa_clear_node(int cpu); extern void __init init_cpu_to_node(void); extern void numa_add_cpu(int cpu); extern void numa_remove_cpu(int cpu); +extern void init_gi_nodes(void); #else /* CONFIG_NUMA */ static inline void numa_set_node(int cpu, int node) { } static inline void numa_clear_node(int cpu) { } static inline void init_cpu_to_node(void) { } static inline void numa_add_cpu(int cpu) { } static inline void numa_remove_cpu(int cpu) { } +static inline void init_gi_nodes(void) { } #endif /* CONFIG_NUMA */ #ifdef CONFIG_DEBUG_PER_CPU_MAPS @@ -77,7 +80,12 @@ void debug_cpumask_set_cpu(int cpu, int node, bool enable); #ifdef CONFIG_NUMA_EMU #define FAKE_NODE_MIN_SIZE ((u64)32 << 20) #define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) -void numa_emu_cmdline(char *); +int numa_emu_cmdline(char *str); +#else /* CONFIG_NUMA_EMU */ +static inline int numa_emu_cmdline(char *str) +{ + return -EINVAL; +} #endif /* CONFIG_NUMA_EMU */ #endif /* _ASM_X86_NUMA_H */ diff --git a/arch/x86/include/asm/orc_types.h b/arch/x86/include/asm/orc_types.h index d25534940bde..fdbffec4cfde 100644 --- a/arch/x86/include/asm/orc_types.h +++ b/arch/x86/include/asm/orc_types.h @@ -39,27 +39,6 @@ #define ORC_REG_SP_INDIRECT 9 #define ORC_REG_MAX 15 -/* - * ORC_TYPE_CALL: Indicates that sp_reg+sp_offset resolves to PREV_SP (the - * caller's SP right before it made the call). Used for all callable - * functions, i.e. all C code and all callable asm functions. - * - * ORC_TYPE_REGS: Used in entry code to indicate that sp_reg+sp_offset points - * to a fully populated pt_regs from a syscall, interrupt, or exception. - * - * ORC_TYPE_REGS_IRET: Used in entry code to indicate that sp_reg+sp_offset - * points to the iret return frame. - * - * The UNWIND_HINT macros are used only for the unwind_hint struct. They - * aren't used in struct orc_entry due to size and complexity constraints. - * Objtool converts them to real types when it converts the hints to orc - * entries. - */ -#define ORC_TYPE_CALL 0 -#define ORC_TYPE_REGS 1 -#define ORC_TYPE_REGS_IRET 2 -#define UNWIND_HINT_TYPE_RET_OFFSET 3 - #ifndef __ASSEMBLY__ /* * This struct is more or less a vastly simplified version of the DWARF Call @@ -78,19 +57,6 @@ struct orc_entry { unsigned end:1; } __packed; -/* - * This struct is used by asm and inline asm code to manually annotate the - * location of registers on the stack for the ORC unwinder. - * - * Type can be either ORC_TYPE_* or UNWIND_HINT_TYPE_*. - */ -struct unwind_hint { - u32 ip; - s16 sp_offset; - u8 sp_reg; - u8 type; - u8 end; -}; #endif /* __ASSEMBLY__ */ #endif /* _ORC_TYPES_H */ diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h index 565ad755c785..f462895a33e4 100644 --- a/arch/x86/include/asm/page_32_types.h +++ b/arch/x86/include/asm/page_32_types.h @@ -42,6 +42,17 @@ #endif /* CONFIG_X86_PAE */ /* + * User space process size: 3GB (default). + */ +#define IA32_PAGE_OFFSET __PAGE_OFFSET +#define TASK_SIZE __PAGE_OFFSET +#define TASK_SIZE_LOW TASK_SIZE +#define TASK_SIZE_MAX TASK_SIZE +#define DEFAULT_MAP_WINDOW TASK_SIZE +#define STACK_TOP TASK_SIZE +#define STACK_TOP_MAX STACK_TOP + +/* * Kernel image size is limited to 512 MB (see in arch/x86/kernel/head_32.S) */ #define KERNEL_IMAGE_SIZE (512 * 1024 * 1024) diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 288b065955b7..3f49dac03617 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -28,6 +28,7 @@ #define IST_INDEX_NMI 1 #define IST_INDEX_DB 2 #define IST_INDEX_MCE 3 +#define IST_INDEX_VC 4 /* * Set __PAGE_OFFSET to the most negative possible address + @@ -59,6 +60,44 @@ #endif /* + * User space process size. This is the first address outside the user range. + * There are a few constraints that determine this: + * + * On Intel CPUs, if a SYSCALL instruction is at the highest canonical + * address, then that syscall will enter the kernel with a + * non-canonical return address, and SYSRET will explode dangerously. + * We avoid this particular problem by preventing anything executable + * from being mapped at the maximum canonical address. + * + * On AMD CPUs in the Ryzen family, there's a nasty bug in which the + * CPUs malfunction if they execute code from the highest canonical page. + * They'll speculate right off the end of the canonical space, and + * bad things happen. This is worked around in the same way as the + * Intel problem. + * + * With page table isolation enabled, we map the LDT in ... [stay tuned] + */ +#define TASK_SIZE_MAX ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE) + +#define DEFAULT_MAP_WINDOW ((1UL << 47) - PAGE_SIZE) + +/* This decides where the kernel will search for a free chunk of vm + * space during mmap's. + */ +#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \ + 0xc0000000 : 0xFFFFe000) + +#define TASK_SIZE_LOW (test_thread_flag(TIF_ADDR32) ? \ + IA32_PAGE_OFFSET : DEFAULT_MAP_WINDOW) +#define TASK_SIZE (test_thread_flag(TIF_ADDR32) ? \ + IA32_PAGE_OFFSET : TASK_SIZE_MAX) +#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_ADDR32)) ? \ + IA32_PAGE_OFFSET : TASK_SIZE_MAX) + +#define STACK_TOP TASK_SIZE_LOW +#define STACK_TOP_MAX TASK_SIZE_MAX + +/* * Maximum kernel image size is limited to 1 GiB, due to the fixmap living * in the next 1 GiB (see level2_kernel_pgt in arch/x86/kernel/head_64.S). * diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 3d2afecde50c..d25cc6830e89 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -160,8 +160,6 @@ static inline void wbinvd(void) PVOP_VCALL0(cpu.wbinvd); } -#define get_kernel_rpl() (pv_info.kernel_rpl) - static inline u64 paravirt_read_msr(unsigned msr) { return PVOP_CALL1(u64, cpu.read_msr, msr); @@ -277,12 +275,10 @@ static inline void load_TLS(struct thread_struct *t, unsigned cpu) PVOP_VCALL2(cpu.load_tls, t, cpu); } -#ifdef CONFIG_X86_64 static inline void load_gs_index(unsigned int gs) { PVOP_VCALL1(cpu.load_gs_index, gs); } -#endif static inline void write_ldt_entry(struct desc_struct *dt, int entry, const void *desc) @@ -375,52 +371,22 @@ static inline void paravirt_release_p4d(unsigned long pfn) static inline pte_t __pte(pteval_t val) { - pteval_t ret; - - if (sizeof(pteval_t) > sizeof(long)) - ret = PVOP_CALLEE2(pteval_t, mmu.make_pte, val, (u64)val >> 32); - else - ret = PVOP_CALLEE1(pteval_t, mmu.make_pte, val); - - return (pte_t) { .pte = ret }; + return (pte_t) { PVOP_CALLEE1(pteval_t, mmu.make_pte, val) }; } static inline pteval_t pte_val(pte_t pte) { - pteval_t ret; - - if (sizeof(pteval_t) > sizeof(long)) - ret = PVOP_CALLEE2(pteval_t, mmu.pte_val, - pte.pte, (u64)pte.pte >> 32); - else - ret = PVOP_CALLEE1(pteval_t, mmu.pte_val, pte.pte); - - return ret; + return PVOP_CALLEE1(pteval_t, mmu.pte_val, pte.pte); } static inline pgd_t __pgd(pgdval_t val) { - pgdval_t ret; - - if (sizeof(pgdval_t) > sizeof(long)) - ret = PVOP_CALLEE2(pgdval_t, mmu.make_pgd, val, (u64)val >> 32); - else - ret = PVOP_CALLEE1(pgdval_t, mmu.make_pgd, val); - - return (pgd_t) { ret }; + return (pgd_t) { PVOP_CALLEE1(pgdval_t, mmu.make_pgd, val) }; } static inline pgdval_t pgd_val(pgd_t pgd) { - pgdval_t ret; - - if (sizeof(pgdval_t) > sizeof(long)) - ret = PVOP_CALLEE2(pgdval_t, mmu.pgd_val, - pgd.pgd, (u64)pgd.pgd >> 32); - else - ret = PVOP_CALLEE1(pgdval_t, mmu.pgd_val, pgd.pgd); - - return ret; + return PVOP_CALLEE1(pgdval_t, mmu.pgd_val, pgd.pgd); } #define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION @@ -438,78 +404,34 @@ static inline void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned pte_t *ptep, pte_t old_pte, pte_t pte) { - if (sizeof(pteval_t) > sizeof(long)) - /* 5 arg words */ - pv_ops.mmu.ptep_modify_prot_commit(vma, addr, ptep, pte); - else - PVOP_VCALL4(mmu.ptep_modify_prot_commit, - vma, addr, ptep, pte.pte); + PVOP_VCALL4(mmu.ptep_modify_prot_commit, vma, addr, ptep, pte.pte); } static inline void set_pte(pte_t *ptep, pte_t pte) { - if (sizeof(pteval_t) > sizeof(long)) - PVOP_VCALL3(mmu.set_pte, ptep, pte.pte, (u64)pte.pte >> 32); - else - PVOP_VCALL2(mmu.set_pte, ptep, pte.pte); -} - -static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) -{ - if (sizeof(pteval_t) > sizeof(long)) - /* 5 arg words */ - pv_ops.mmu.set_pte_at(mm, addr, ptep, pte); - else - PVOP_VCALL4(mmu.set_pte_at, mm, addr, ptep, pte.pte); + PVOP_VCALL2(mmu.set_pte, ptep, pte.pte); } static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) { - pmdval_t val = native_pmd_val(pmd); - - if (sizeof(pmdval_t) > sizeof(long)) - PVOP_VCALL3(mmu.set_pmd, pmdp, val, (u64)val >> 32); - else - PVOP_VCALL2(mmu.set_pmd, pmdp, val); + PVOP_VCALL2(mmu.set_pmd, pmdp, native_pmd_val(pmd)); } -#if CONFIG_PGTABLE_LEVELS >= 3 static inline pmd_t __pmd(pmdval_t val) { - pmdval_t ret; - - if (sizeof(pmdval_t) > sizeof(long)) - ret = PVOP_CALLEE2(pmdval_t, mmu.make_pmd, val, (u64)val >> 32); - else - ret = PVOP_CALLEE1(pmdval_t, mmu.make_pmd, val); - - return (pmd_t) { ret }; + return (pmd_t) { PVOP_CALLEE1(pmdval_t, mmu.make_pmd, val) }; } static inline pmdval_t pmd_val(pmd_t pmd) { - pmdval_t ret; - - if (sizeof(pmdval_t) > sizeof(long)) - ret = PVOP_CALLEE2(pmdval_t, mmu.pmd_val, - pmd.pmd, (u64)pmd.pmd >> 32); - else - ret = PVOP_CALLEE1(pmdval_t, mmu.pmd_val, pmd.pmd); - - return ret; + return PVOP_CALLEE1(pmdval_t, mmu.pmd_val, pmd.pmd); } static inline void set_pud(pud_t *pudp, pud_t pud) { - pudval_t val = native_pud_val(pud); - - if (sizeof(pudval_t) > sizeof(long)) - PVOP_VCALL3(mmu.set_pud, pudp, val, (u64)val >> 32); - else - PVOP_VCALL2(mmu.set_pud, pudp, val); + PVOP_VCALL2(mmu.set_pud, pudp, native_pud_val(pud)); } -#if CONFIG_PGTABLE_LEVELS >= 4 + static inline pud_t __pud(pudval_t val) { pudval_t ret; @@ -526,7 +448,7 @@ static inline pudval_t pud_val(pud_t pud) static inline void pud_clear(pud_t *pudp) { - set_pud(pudp, __pud(0)); + set_pud(pudp, native_make_pud(0)); } static inline void set_p4d(p4d_t *p4dp, p4d_t p4d) @@ -563,40 +485,17 @@ static inline void __set_pgd(pgd_t *pgdp, pgd_t pgd) } while (0) #define pgd_clear(pgdp) do { \ - if (pgtable_l5_enabled()) \ - set_pgd(pgdp, __pgd(0)); \ + if (pgtable_l5_enabled()) \ + set_pgd(pgdp, native_make_pgd(0)); \ } while (0) #endif /* CONFIG_PGTABLE_LEVELS == 5 */ static inline void p4d_clear(p4d_t *p4dp) { - set_p4d(p4dp, __p4d(0)); + set_p4d(p4dp, native_make_p4d(0)); } -#endif /* CONFIG_PGTABLE_LEVELS == 4 */ - -#endif /* CONFIG_PGTABLE_LEVELS >= 3 */ - -#ifdef CONFIG_X86_PAE -/* Special-case pte-setting operations for PAE, which can't update a - 64-bit pte atomically */ -static inline void set_pte_atomic(pte_t *ptep, pte_t pte) -{ - PVOP_VCALL3(mmu.set_pte_atomic, ptep, pte.pte, pte.pte >> 32); -} - -static inline void pte_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ - PVOP_VCALL3(mmu.pte_clear, mm, addr, ptep); -} - -static inline void pmd_clear(pmd_t *pmdp) -{ - PVOP_VCALL1(mmu.pmd_clear, pmdp); -} -#else /* !CONFIG_X86_PAE */ static inline void set_pte_atomic(pte_t *ptep, pte_t pte) { set_pte(ptep, pte); @@ -605,14 +504,13 @@ static inline void set_pte_atomic(pte_t *ptep, pte_t pte) static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - set_pte_at(mm, addr, ptep, __pte(0)); + set_pte(ptep, native_make_pte(0)); } static inline void pmd_clear(pmd_t *pmdp) { - set_pmd(pmdp, __pmd(0)); + set_pmd(pmdp, native_make_pmd(0)); } -#endif /* CONFIG_X86_PAE */ #define __HAVE_ARCH_START_CONTEXT_SWITCH static inline void arch_start_context_switch(struct task_struct *prev) @@ -682,16 +580,9 @@ bool __raw_callee_save___native_vcpu_is_preempted(long cpu); #endif /* SMP && PARAVIRT_SPINLOCKS */ #ifdef CONFIG_X86_32 -#define PV_SAVE_REGS "pushl %ecx; pushl %edx;" -#define PV_RESTORE_REGS "popl %edx; popl %ecx;" - /* save and restore all caller-save registers, except return value */ #define PV_SAVE_ALL_CALLER_REGS "pushl %ecx;" #define PV_RESTORE_ALL_CALLER_REGS "popl %ecx;" - -#define PV_FLAGS_ARG "0" -#define PV_EXTRA_CLOBBERS -#define PV_VEXTRA_CLOBBERS #else /* save and restore all caller-save registers, except return value */ #define PV_SAVE_ALL_CALLER_REGS \ @@ -712,14 +603,6 @@ bool __raw_callee_save___native_vcpu_is_preempted(long cpu); "pop %rsi;" \ "pop %rdx;" \ "pop %rcx;" - -/* We save some registers, but all of them, that's too much. We clobber all - * caller saved registers but the argument parameter */ -#define PV_SAVE_REGS "pushq %%rdi;" -#define PV_RESTORE_REGS "popq %%rdi;" -#define PV_EXTRA_CLOBBERS EXTRA_CLOBBERS, "rcx" , "rdx", "rsi" -#define PV_VEXTRA_CLOBBERS EXTRA_CLOBBERS, "rdi", "rcx" , "rdx", "rsi" -#define PV_FLAGS_ARG "D" #endif /* diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 8dfcb2508e6d..0fad9f61c76a 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -68,13 +68,8 @@ struct paravirt_callee_save { /* general info */ struct pv_info { #ifdef CONFIG_PARAVIRT_XXL - unsigned int kernel_rpl; - int shared_kernel_pmd; - -#ifdef CONFIG_X86_64 u16 extra_user_64bit_cs; /* __USER_CS if none */ #endif -#endif const char *name; }; @@ -126,9 +121,7 @@ struct pv_cpu_ops { void (*set_ldt)(const void *desc, unsigned entries); unsigned long (*store_tr)(void); void (*load_tls)(struct thread_struct *t, unsigned int cpu); -#ifdef CONFIG_X86_64 void (*load_gs_index)(unsigned int idx); -#endif void (*write_ldt_entry)(struct desc_struct *ldt, int entrynum, const void *desc); void (*write_gdt_entry)(struct desc_struct *, @@ -249,8 +242,6 @@ struct pv_mmu_ops { /* Pagetable manipulation functions */ void (*set_pte)(pte_t *ptep, pte_t pteval); - void (*set_pte_at)(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pteval); void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval); pte_t (*ptep_modify_prot_start)(struct vm_area_struct *vma, unsigned long addr, @@ -264,21 +255,11 @@ struct pv_mmu_ops { struct paravirt_callee_save pgd_val; struct paravirt_callee_save make_pgd; -#if CONFIG_PGTABLE_LEVELS >= 3 -#ifdef CONFIG_X86_PAE - void (*set_pte_atomic)(pte_t *ptep, pte_t pteval); - void (*pte_clear)(struct mm_struct *mm, unsigned long addr, - pte_t *ptep); - void (*pmd_clear)(pmd_t *pmdp); - -#endif /* CONFIG_X86_PAE */ - void (*set_pud)(pud_t *pudp, pud_t pudval); struct paravirt_callee_save pmd_val; struct paravirt_callee_save make_pmd; -#if CONFIG_PGTABLE_LEVELS >= 4 struct paravirt_callee_save pud_val; struct paravirt_callee_save make_pud; @@ -291,10 +272,6 @@ struct pv_mmu_ops { void (*set_pgd)(pgd_t *pgdp, pgd_t pgdval); #endif /* CONFIG_PGTABLE_LEVELS >= 5 */ -#endif /* CONFIG_PGTABLE_LEVELS >= 4 */ - -#endif /* CONFIG_PGTABLE_LEVELS >= 3 */ - struct pv_lazy_ops lazy_mode; /* dom0 ops */ diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index 7ccb338507e3..d2c76c8d8cfd 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -105,17 +105,6 @@ static inline void early_quirks(void) { } extern void pci_iommu_alloc(void); -#ifdef CONFIG_PCI_MSI -/* implemented in arch/x86/kernel/apic/io_apic. */ -struct msi_desc; -int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type); -void native_teardown_msi_irq(unsigned int irq); -void native_restore_msi_irqs(struct pci_dev *dev); -#else -#define native_setup_msi_irqs NULL -#define native_teardown_msi_irq NULL -#endif - /* generic pci stuff */ #include <asm-generic/pci.h> diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index 73bb404f4d2a..490411dba438 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -114,9 +114,20 @@ extern const struct pci_raw_ops pci_direct_conf1; extern bool port_cf9_safe; /* arch_initcall level */ +#ifdef CONFIG_PCI_DIRECT extern int pci_direct_probe(void); extern void pci_direct_init(int type); +#else +static inline int pci_direct_probe(void) { return -1; } +static inline void pci_direct_init(int type) { } +#endif + +#ifdef CONFIG_PCI_BIOS extern void pci_pcbios_init(void); +#else +static inline void pci_pcbios_init(void) { } +#endif + extern void __init dmi_check_pciprobe(void); extern void __init dmi_check_skip_isa_align(void); diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 0c1b13720525..6960cd6d1f23 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -196,13 +196,29 @@ struct x86_pmu_capability { * Fixed-purpose performance events: */ +/* RDPMC offset for Fixed PMCs */ +#define INTEL_PMC_FIXED_RDPMC_BASE (1 << 30) +#define INTEL_PMC_FIXED_RDPMC_METRICS (1 << 29) + /* - * All 3 fixed-mode PMCs are configured via this single MSR: + * All the fixed-mode PMCs are configured via this single MSR: */ #define MSR_ARCH_PERFMON_FIXED_CTR_CTRL 0x38d /* - * The counts are available in three separate MSRs: + * There is no event-code assigned to the fixed-mode PMCs. + * + * For a fixed-mode PMC, which has an equivalent event on a general-purpose + * PMC, the event-code of the equivalent event is used for the fixed-mode PMC, + * e.g., Instr_Retired.Any and CPU_CLK_Unhalted.Core. + * + * For a fixed-mode PMC, which doesn't have an equivalent event, a + * pseudo-encoding is used, e.g., CPU_CLK_Unhalted.Ref and TOPDOWN.SLOTS. + * The pseudo event-code for a fixed-mode PMC must be 0x00. + * The pseudo umask-code is 0xX. The X equals the index of the fixed + * counter + 1, e.g., the fixed counter 2 has the pseudo-encoding 0x0300. + * + * The counts are available in separate MSRs: */ /* Instr_Retired.Any: */ @@ -213,29 +229,84 @@ struct x86_pmu_capability { #define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a #define INTEL_PMC_IDX_FIXED_CPU_CYCLES (INTEL_PMC_IDX_FIXED + 1) -/* CPU_CLK_Unhalted.Ref: */ +/* CPU_CLK_Unhalted.Ref: event=0x00,umask=0x3 (pseudo-encoding) */ #define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b #define INTEL_PMC_IDX_FIXED_REF_CYCLES (INTEL_PMC_IDX_FIXED + 2) #define INTEL_PMC_MSK_FIXED_REF_CYCLES (1ULL << INTEL_PMC_IDX_FIXED_REF_CYCLES) +/* TOPDOWN.SLOTS: event=0x00,umask=0x4 (pseudo-encoding) */ +#define MSR_ARCH_PERFMON_FIXED_CTR3 0x30c +#define INTEL_PMC_IDX_FIXED_SLOTS (INTEL_PMC_IDX_FIXED + 3) +#define INTEL_PMC_MSK_FIXED_SLOTS (1ULL << INTEL_PMC_IDX_FIXED_SLOTS) + /* * We model BTS tracing as another fixed-mode PMC. * - * We choose a value in the middle of the fixed event range, since lower + * We choose the value 47 for the fixed index of BTS, since lower * values are used by actual fixed events and higher values are used * to indicate other overflow conditions in the PERF_GLOBAL_STATUS msr. */ -#define INTEL_PMC_IDX_FIXED_BTS (INTEL_PMC_IDX_FIXED + 16) +#define INTEL_PMC_IDX_FIXED_BTS (INTEL_PMC_IDX_FIXED + 15) + +/* + * The PERF_METRICS MSR is modeled as several magic fixed-mode PMCs, one for + * each TopDown metric event. + * + * Internally the TopDown metric events are mapped to the FxCtr 3 (SLOTS). + */ +#define INTEL_PMC_IDX_METRIC_BASE (INTEL_PMC_IDX_FIXED + 16) +#define INTEL_PMC_IDX_TD_RETIRING (INTEL_PMC_IDX_METRIC_BASE + 0) +#define INTEL_PMC_IDX_TD_BAD_SPEC (INTEL_PMC_IDX_METRIC_BASE + 1) +#define INTEL_PMC_IDX_TD_FE_BOUND (INTEL_PMC_IDX_METRIC_BASE + 2) +#define INTEL_PMC_IDX_TD_BE_BOUND (INTEL_PMC_IDX_METRIC_BASE + 3) +#define INTEL_PMC_IDX_METRIC_END INTEL_PMC_IDX_TD_BE_BOUND +#define INTEL_PMC_MSK_TOPDOWN ((0xfull << INTEL_PMC_IDX_METRIC_BASE) | \ + INTEL_PMC_MSK_FIXED_SLOTS) -#define GLOBAL_STATUS_COND_CHG BIT_ULL(63) -#define GLOBAL_STATUS_BUFFER_OVF BIT_ULL(62) -#define GLOBAL_STATUS_UNC_OVF BIT_ULL(61) -#define GLOBAL_STATUS_ASIF BIT_ULL(60) -#define GLOBAL_STATUS_COUNTERS_FROZEN BIT_ULL(59) -#define GLOBAL_STATUS_LBRS_FROZEN_BIT 58 -#define GLOBAL_STATUS_LBRS_FROZEN BIT_ULL(GLOBAL_STATUS_LBRS_FROZEN_BIT) -#define GLOBAL_STATUS_TRACE_TOPAPMI BIT_ULL(55) +/* + * There is no event-code assigned to the TopDown events. + * + * For the slots event, use the pseudo code of the fixed counter 3. + * + * For the metric events, the pseudo event-code is 0x00. + * The pseudo umask-code starts from the middle of the pseudo event + * space, 0x80. + */ +#define INTEL_TD_SLOTS 0x0400 /* TOPDOWN.SLOTS */ +/* Level 1 metrics */ +#define INTEL_TD_METRIC_RETIRING 0x8000 /* Retiring metric */ +#define INTEL_TD_METRIC_BAD_SPEC 0x8100 /* Bad speculation metric */ +#define INTEL_TD_METRIC_FE_BOUND 0x8200 /* FE bound metric */ +#define INTEL_TD_METRIC_BE_BOUND 0x8300 /* BE bound metric */ +#define INTEL_TD_METRIC_MAX INTEL_TD_METRIC_BE_BOUND +#define INTEL_TD_METRIC_NUM 4 + +static inline bool is_metric_idx(int idx) +{ + return (unsigned)(idx - INTEL_PMC_IDX_METRIC_BASE) < INTEL_TD_METRIC_NUM; +} + +static inline bool is_topdown_idx(int idx) +{ + return is_metric_idx(idx) || idx == INTEL_PMC_IDX_FIXED_SLOTS; +} +#define INTEL_PMC_OTHER_TOPDOWN_BITS(bit) \ + (~(0x1ull << bit) & INTEL_PMC_MSK_TOPDOWN) + +#define GLOBAL_STATUS_COND_CHG BIT_ULL(63) +#define GLOBAL_STATUS_BUFFER_OVF_BIT 62 +#define GLOBAL_STATUS_BUFFER_OVF BIT_ULL(GLOBAL_STATUS_BUFFER_OVF_BIT) +#define GLOBAL_STATUS_UNC_OVF BIT_ULL(61) +#define GLOBAL_STATUS_ASIF BIT_ULL(60) +#define GLOBAL_STATUS_COUNTERS_FROZEN BIT_ULL(59) +#define GLOBAL_STATUS_LBRS_FROZEN_BIT 58 +#define GLOBAL_STATUS_LBRS_FROZEN BIT_ULL(GLOBAL_STATUS_LBRS_FROZEN_BIT) +#define GLOBAL_STATUS_TRACE_TOPAPMI_BIT 55 +#define GLOBAL_STATUS_TRACE_TOPAPMI BIT_ULL(GLOBAL_STATUS_TRACE_TOPAPMI_BIT) +#define GLOBAL_STATUS_PERF_METRICS_OVF_BIT 48 + +#define GLOBAL_CTRL_EN_PERF_METRICS 48 /* * We model guest LBR event tracing as another fixed-mode PMC like BTS. * @@ -334,6 +405,7 @@ struct pebs_xmm { #define IBS_OP_ENABLE (1ULL<<17) #define IBS_OP_MAX_CNT 0x0000FFFFULL #define IBS_OP_MAX_CNT_EXT 0x007FFFFFULL /* not a register bit mask */ +#define IBS_OP_MAX_CNT_EXT_MASK (0x7FULL<<20) /* separate upper 7 bits */ #define IBS_RIP_INVALID (1ULL<<38) #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h index 80fbb4a9ed87..56baf43befb4 100644 --- a/arch/x86/include/asm/pgtable-3level_types.h +++ b/arch/x86/include/asm/pgtable-3level_types.h @@ -20,12 +20,7 @@ typedef union { } pte_t; #endif /* !__ASSEMBLY__ */ -#ifdef CONFIG_PARAVIRT_XXL -#define SHARED_KERNEL_PMD ((!static_cpu_has(X86_FEATURE_PTI) && \ - (pv_info.shared_kernel_pmd))) -#else #define SHARED_KERNEL_PMD (!static_cpu_has(X86_FEATURE_PTI)) -#endif #define ARCH_PAGE_TABLE_SYNC_MASK (SHARED_KERNEL_PMD ? 0 : PGTBL_PMD_MODIFIED) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index b836138ce852..a02c67291cfc 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -28,7 +28,7 @@ #include <asm-generic/pgtable_uffd.h> extern pgd_t early_top_pgt[PTRS_PER_PGD]; -int __init __early_make_pgtable(unsigned long address, pmdval_t pmd); +bool __init __early_make_pgtable(unsigned long address, pmdval_t pmd); void ptdump_walk_pgd_level(struct seq_file *m, struct mm_struct *mm); void ptdump_walk_pgd_level_debugfs(struct seq_file *m, struct mm_struct *mm, @@ -63,7 +63,6 @@ extern pmdval_t early_pmd_flags; #include <asm/paravirt.h> #else /* !CONFIG_PARAVIRT_XXL */ #define set_pte(ptep, pte) native_set_pte(ptep, pte) -#define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte) #define set_pte_atomic(ptep, pte) \ native_set_pte_atomic(ptep, pte) @@ -1033,10 +1032,10 @@ static inline pud_t native_local_pudp_get_and_clear(pud_t *pudp) return res; } -static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep , pte_t pte) +static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) { - native_set_pte(ptep, pte); + set_pte(ptep, pte); } static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 8f63efb2a2cc..52e5f5f2240d 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -159,6 +159,4 @@ extern unsigned int ptrs_per_p4d; #define PGD_KERNEL_START ((PAGE_SIZE / 2) / sizeof(pgd_t)) -#define ARCH_PAGE_TABLE_SYNC_MASK (pgtable_l5_enabled() ? PGTBL_PGD_MODIFIED : PGTBL_P4D_MODIFIED) - #endif /* _ASM_X86_PGTABLE_64_DEFS_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 97143d87994c..82a08b585818 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -482,10 +482,6 @@ extern unsigned int fpu_user_xstate_size; struct perf_event; -typedef struct { - unsigned long seg; -} mm_segment_t; - struct thread_struct { /* Cached TLS descriptors: */ struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES]; @@ -517,7 +513,7 @@ struct thread_struct { /* Save middle states of ptrace breakpoints */ struct perf_event *ptrace_bps[HBP_NUM]; /* Debug status used for traps, single steps, etc... */ - unsigned long debugreg6; + unsigned long virtual_dr6; /* Keep track of the exact dr7 value set by the user */ unsigned long ptrace_dr7; /* Fault info: */ @@ -538,8 +534,6 @@ struct thread_struct { */ unsigned long iopl_emul; - mm_segment_t addr_limit; - unsigned int sig_on_uaccess_err:1; /* Floating point and extended processor state */ @@ -696,6 +690,7 @@ extern void load_direct_gdt(int); extern void load_fixmap_gdt(int); extern void load_percpu_segment(int); extern void cpu_init(void); +extern void cpu_init_exception_handling(void); extern void cr4_init(void); static inline unsigned long get_debugctlmsr(void) @@ -782,67 +777,15 @@ static inline void spin_lock_prefetch(const void *x) }) #ifdef CONFIG_X86_32 -/* - * User space process size: 3GB (default). - */ -#define IA32_PAGE_OFFSET PAGE_OFFSET -#define TASK_SIZE PAGE_OFFSET -#define TASK_SIZE_LOW TASK_SIZE -#define TASK_SIZE_MAX TASK_SIZE -#define DEFAULT_MAP_WINDOW TASK_SIZE -#define STACK_TOP TASK_SIZE -#define STACK_TOP_MAX STACK_TOP - #define INIT_THREAD { \ .sp0 = TOP_OF_INIT_STACK, \ .sysenter_cs = __KERNEL_CS, \ - .addr_limit = KERNEL_DS, \ } #define KSTK_ESP(task) (task_pt_regs(task)->sp) #else -/* - * User space process size. This is the first address outside the user range. - * There are a few constraints that determine this: - * - * On Intel CPUs, if a SYSCALL instruction is at the highest canonical - * address, then that syscall will enter the kernel with a - * non-canonical return address, and SYSRET will explode dangerously. - * We avoid this particular problem by preventing anything executable - * from being mapped at the maximum canonical address. - * - * On AMD CPUs in the Ryzen family, there's a nasty bug in which the - * CPUs malfunction if they execute code from the highest canonical page. - * They'll speculate right off the end of the canonical space, and - * bad things happen. This is worked around in the same way as the - * Intel problem. - * - * With page table isolation enabled, we map the LDT in ... [stay tuned] - */ -#define TASK_SIZE_MAX ((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE) - -#define DEFAULT_MAP_WINDOW ((1UL << 47) - PAGE_SIZE) - -/* This decides where the kernel will search for a free chunk of vm - * space during mmap's. - */ -#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \ - 0xc0000000 : 0xFFFFe000) - -#define TASK_SIZE_LOW (test_thread_flag(TIF_ADDR32) ? \ - IA32_PAGE_OFFSET : DEFAULT_MAP_WINDOW) -#define TASK_SIZE (test_thread_flag(TIF_ADDR32) ? \ - IA32_PAGE_OFFSET : TASK_SIZE_MAX) -#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_ADDR32)) ? \ - IA32_PAGE_OFFSET : TASK_SIZE_MAX) - -#define STACK_TOP TASK_SIZE_LOW -#define STACK_TOP_MAX TASK_SIZE_MAX - -#define INIT_THREAD { \ - .addr_limit = KERNEL_DS, \ -} +#define INIT_THREAD { } extern unsigned long KSTK_ESP(struct task_struct *task); diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index 28996fe19301..2c35f1c01a2d 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -10,6 +10,7 @@ void syscall_init(void); #ifdef CONFIG_X86_64 void entry_SYSCALL_64(void); +void entry_SYSCALL_64_safe_stack(void); long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2); #endif diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index b35030eeec36..5db5d083c873 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -21,6 +21,9 @@ struct real_mode_header { /* SMP trampoline */ u32 trampoline_start; u32 trampoline_header; +#ifdef CONFIG_AMD_MEM_ENCRYPT + u32 sev_es_trampoline_start; +#endif #ifdef CONFIG_X86_64 u32 trampoline_pgd; #endif @@ -57,6 +60,9 @@ extern unsigned char real_mode_blob_end[]; extern unsigned long initial_code; extern unsigned long initial_gs; extern unsigned long initial_stack; +#ifdef CONFIG_AMD_MEM_ENCRYPT +extern unsigned long initial_vc_handler; +#endif extern unsigned char real_mode_blob[]; extern unsigned char real_mode_relocs[]; @@ -66,6 +72,7 @@ extern unsigned char startup_32_smp[]; extern unsigned char boot_gdt[]; #else extern unsigned char secondary_startup_64[]; +extern unsigned char secondary_startup_64_no_verify[]; #endif static inline size_t real_mode_size_needed(void) diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h index 6847d85400a8..3ff0d48469f2 100644 --- a/arch/x86/include/asm/required-features.h +++ b/arch/x86/include/asm/required-features.h @@ -54,7 +54,7 @@ #endif #ifdef CONFIG_X86_64 -#ifdef CONFIG_PARAVIRT +#ifdef CONFIG_PARAVIRT_XXL /* Paravirtualized systems may not have PSE or PGE available */ #define NEED_PSE 0 #define NEED_PGE 0 diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 9646c300f128..7fdd4facfce7 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -222,15 +222,11 @@ #endif -#ifndef CONFIG_PARAVIRT_XXL -# define get_kernel_rpl() 0 -#endif - #define IDT_ENTRIES 256 #define NUM_EXCEPTION_VECTORS 32 /* Bitmask of exception vectors which push an error code on the stack: */ -#define EXCEPTION_ERRCODE_MASK 0x00027d00 +#define EXCEPTION_ERRCODE_MASK 0x20027d00 #define GDT_SIZE (GDT_ENTRIES*8) #define GDT_ENTRY_TLS_ENTRIES 3 diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 84b645cc8bc9..389d851a02c4 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -39,6 +39,8 @@ void vsmp_init(void); static inline void vsmp_init(void) { } #endif +struct pt_regs; + void setup_bios_corruption_check(void); void early_platform_quirks(void); @@ -48,7 +50,9 @@ extern void reserve_standard_io_resources(void); extern void i386_reserve_resources(void); extern unsigned long __startup_64(unsigned long physaddr, struct boot_params *bp); extern unsigned long __startup_secondary_64(void); -extern int early_make_pgtable(unsigned long address); +extern void startup_64_setup_env(unsigned long physbase); +extern void early_setup_idt(void); +extern void __init do_early_exception(struct pt_regs *regs, int trapnr); #ifdef CONFIG_X86_INTEL_MID extern void x86_intel_mid_early_setup(void); @@ -115,7 +119,7 @@ void *extend_brk(size_t size, size_t align); * executable.) */ #define RESERVE_BRK(name,sz) \ - static void __section(.discard.text) __used notrace \ + static void __section(".discard.text") __used notrace \ __brk_reservation_fn_##name##__(void) { \ asm volatile ( \ ".pushsection .brk_reservation,\"aw\",@nobits;" \ diff --git a/arch/x86/include/asm/sev-es.h b/arch/x86/include/asm/sev-es.h new file mode 100644 index 000000000000..cf1d957c7091 --- /dev/null +++ b/arch/x86/include/asm/sev-es.h @@ -0,0 +1,114 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * AMD Encrypted Register State Support + * + * Author: Joerg Roedel <jroedel@suse.de> + */ + +#ifndef __ASM_ENCRYPTED_STATE_H +#define __ASM_ENCRYPTED_STATE_H + +#include <linux/types.h> +#include <asm/insn.h> + +#define GHCB_SEV_INFO 0x001UL +#define GHCB_SEV_INFO_REQ 0x002UL +#define GHCB_INFO(v) ((v) & 0xfffUL) +#define GHCB_PROTO_MAX(v) (((v) >> 48) & 0xffffUL) +#define GHCB_PROTO_MIN(v) (((v) >> 32) & 0xffffUL) +#define GHCB_PROTO_OUR 0x0001UL +#define GHCB_SEV_CPUID_REQ 0x004UL +#define GHCB_CPUID_REQ_EAX 0 +#define GHCB_CPUID_REQ_EBX 1 +#define GHCB_CPUID_REQ_ECX 2 +#define GHCB_CPUID_REQ_EDX 3 +#define GHCB_CPUID_REQ(fn, reg) (GHCB_SEV_CPUID_REQ | \ + (((unsigned long)reg & 3) << 30) | \ + (((unsigned long)fn) << 32)) + +#define GHCB_PROTOCOL_MAX 0x0001UL +#define GHCB_DEFAULT_USAGE 0x0000UL + +#define GHCB_SEV_CPUID_RESP 0x005UL +#define GHCB_SEV_TERMINATE 0x100UL +#define GHCB_SEV_TERMINATE_REASON(reason_set, reason_val) \ + (((((u64)reason_set) & 0x7) << 12) | \ + ((((u64)reason_val) & 0xff) << 16)) +#define GHCB_SEV_ES_REASON_GENERAL_REQUEST 0 +#define GHCB_SEV_ES_REASON_PROTOCOL_UNSUPPORTED 1 + +#define GHCB_SEV_GHCB_RESP_CODE(v) ((v) & 0xfff) +#define VMGEXIT() { asm volatile("rep; vmmcall\n\r"); } + +enum es_result { + ES_OK, /* All good */ + ES_UNSUPPORTED, /* Requested operation not supported */ + ES_VMM_ERROR, /* Unexpected state from the VMM */ + ES_DECODE_FAILED, /* Instruction decoding failed */ + ES_EXCEPTION, /* Instruction caused exception */ + ES_RETRY, /* Retry instruction emulation */ +}; + +struct es_fault_info { + unsigned long vector; + unsigned long error_code; + unsigned long cr2; +}; + +struct pt_regs; + +/* ES instruction emulation context */ +struct es_em_ctxt { + struct pt_regs *regs; + struct insn insn; + struct es_fault_info fi; +}; + +void do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code); + +static inline u64 lower_bits(u64 val, unsigned int bits) +{ + u64 mask = (1ULL << bits) - 1; + + return (val & mask); +} + +struct real_mode_header; +enum stack_type; + +/* Early IDT entry points for #VC handler */ +extern void vc_no_ghcb(void); +extern void vc_boot_ghcb(void); +extern bool handle_vc_boot_ghcb(struct pt_regs *regs); + +#ifdef CONFIG_AMD_MEM_ENCRYPT +extern struct static_key_false sev_es_enable_key; +extern void __sev_es_ist_enter(struct pt_regs *regs); +extern void __sev_es_ist_exit(void); +static __always_inline void sev_es_ist_enter(struct pt_regs *regs) +{ + if (static_branch_unlikely(&sev_es_enable_key)) + __sev_es_ist_enter(regs); +} +static __always_inline void sev_es_ist_exit(void) +{ + if (static_branch_unlikely(&sev_es_enable_key)) + __sev_es_ist_exit(); +} +extern int sev_es_setup_ap_jump_table(struct real_mode_header *rmh); +extern void __sev_es_nmi_complete(void); +static __always_inline void sev_es_nmi_complete(void) +{ + if (static_branch_unlikely(&sev_es_enable_key)) + __sev_es_nmi_complete(); +} +extern int __init sev_es_efi_map_ghcbs(pgd_t *pgd); +#else +static inline void sev_es_ist_enter(struct pt_regs *regs) { } +static inline void sev_es_ist_exit(void) { } +static inline int sev_es_setup_ap_jump_table(struct real_mode_header *rmh) { return 0; } +static inline void sev_es_nmi_complete(void) { } +static inline int sev_es_efi_map_ghcbs(pgd_t *pgd) { return 0; } +#endif + +#endif diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 59a3e13204c3..cc177b4431ae 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -11,45 +11,47 @@ #include <linux/jump_label.h> /* - * Volatile isn't enough to prevent the compiler from reordering the - * read/write functions for the control registers and messing everything up. - * A memory clobber would solve the problem, but would prevent reordering of - * all loads stores around it, which can hurt performance. Solution is to - * use a variable and mimic reads and writes to it to enforce serialization + * The compiler should not reorder volatile asm statements with respect to each + * other: they should execute in program order. However GCC 4.9.x and 5.x have + * a bug (which was fixed in 8.1, 7.3 and 6.5) where they might reorder + * volatile asm. The write functions are not affected since they have memory + * clobbers preventing reordering. To prevent reads from being reordered with + * respect to writes, use a dummy memory operand. */ -extern unsigned long __force_order; + +#define __FORCE_ORDER "m"(*(unsigned int *)0x1000UL) void native_write_cr0(unsigned long val); static inline unsigned long native_read_cr0(void) { unsigned long val; - asm volatile("mov %%cr0,%0\n\t" : "=r" (val), "=m" (__force_order)); + asm volatile("mov %%cr0,%0\n\t" : "=r" (val) : __FORCE_ORDER); return val; } static __always_inline unsigned long native_read_cr2(void) { unsigned long val; - asm volatile("mov %%cr2,%0\n\t" : "=r" (val), "=m" (__force_order)); + asm volatile("mov %%cr2,%0\n\t" : "=r" (val) : __FORCE_ORDER); return val; } static __always_inline void native_write_cr2(unsigned long val) { - asm volatile("mov %0,%%cr2": : "r" (val), "m" (__force_order)); + asm volatile("mov %0,%%cr2": : "r" (val) : "memory"); } static inline unsigned long __native_read_cr3(void) { unsigned long val; - asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order)); + asm volatile("mov %%cr3,%0\n\t" : "=r" (val) : __FORCE_ORDER); return val; } static inline void native_write_cr3(unsigned long val) { - asm volatile("mov %0,%%cr3": : "r" (val), "m" (__force_order)); + asm volatile("mov %0,%%cr3": : "r" (val) : "memory"); } static inline unsigned long native_read_cr4(void) @@ -64,10 +66,10 @@ static inline unsigned long native_read_cr4(void) asm volatile("1: mov %%cr4, %0\n" "2:\n" _ASM_EXTABLE(1b, 2b) - : "=r" (val), "=m" (__force_order) : "0" (0)); + : "=r" (val) : "0" (0), __FORCE_ORDER); #else /* CR4 always exists on x86_64. */ - asm volatile("mov %%cr4,%0\n\t" : "=r" (val), "=m" (__force_order)); + asm volatile("mov %%cr4,%0\n\t" : "=r" (val) : __FORCE_ORDER); #endif return val; } @@ -234,6 +236,76 @@ static inline void clwb(volatile void *__p) #define nop() asm volatile ("nop") +static inline void serialize(void) +{ + /* Instruction opcode for SERIALIZE; supported in binutils >= 2.35. */ + asm volatile(".byte 0xf, 0x1, 0xe8" ::: "memory"); +} + +/* The dst parameter must be 64-bytes aligned */ +static inline void movdir64b(void *dst, const void *src) +{ + const struct { char _[64]; } *__src = src; + struct { char _[64]; } *__dst = dst; + + /* + * MOVDIR64B %(rdx), rax. + * + * Both __src and __dst must be memory constraints in order to tell the + * compiler that no other memory accesses should be reordered around + * this one. + * + * Also, both must be supplied as lvalues because this tells + * the compiler what the object is (its size) the instruction accesses. + * I.e., not the pointers but what they point to, thus the deref'ing '*'. + */ + asm volatile(".byte 0x66, 0x0f, 0x38, 0xf8, 0x02" + : "+m" (*__dst) + : "m" (*__src), "a" (__dst), "d" (__src)); +} + +/** + * enqcmds - Enqueue a command in supervisor (CPL0) mode + * @dst: destination, in MMIO space (must be 512-bit aligned) + * @src: 512 bits memory operand + * + * The ENQCMDS instruction allows software to write a 512-bit command to + * a 512-bit-aligned special MMIO region that supports the instruction. + * A return status is loaded into the ZF flag in the RFLAGS register. + * ZF = 0 equates to success, and ZF = 1 indicates retry or error. + * + * This function issues the ENQCMDS instruction to submit data from + * kernel space to MMIO space, in a unit of 512 bits. Order of data access + * is not guaranteed, nor is a memory barrier performed afterwards. It + * returns 0 on success and -EAGAIN on failure. + * + * Warning: Do not use this helper unless your driver has checked that the + * ENQCMDS instruction is supported on the platform and the device accepts + * ENQCMDS. + */ +static inline int enqcmds(void __iomem *dst, const void *src) +{ + const struct { char _[64]; } *__src = src; + struct { char _[64]; } *__dst = dst; + int zf; + + /* + * ENQCMDS %(rdx), rax + * + * See movdir64b()'s comment on operand specification. + */ + asm volatile(".byte 0xf3, 0x0f, 0x38, 0xf8, 0x02, 0x66, 0x90" + CC_SET(z) + : CC_OUT(z) (zf), "+m" (*__dst) + : "m" (*__src), "a" (__dst), "d" (__src)); + + /* Submission failure is indicated via EFLAGS.ZF=1 */ + if (zf) + return -EAGAIN; + + return 0; +} + #endif /* __KERNEL__ */ #endif /* _ASM_X86_SPECIAL_INSNS_H */ diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index 5ae5a68e469d..49600643faba 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -35,6 +35,8 @@ bool in_entry_stack(unsigned long *stack, struct stack_info *info); int get_stack_info(unsigned long *stack, struct task_struct *task, struct stack_info *info, unsigned long *visit_mask); +bool get_stack_info_noinstr(unsigned long *stack, struct task_struct *task, + struct stack_info *info); const char *stack_type_name(enum stack_type type); diff --git a/arch/x86/include/asm/static_call.h b/arch/x86/include/asm/static_call.h new file mode 100644 index 000000000000..c37f11999d0c --- /dev/null +++ b/arch/x86/include/asm/static_call.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_STATIC_CALL_H +#define _ASM_STATIC_CALL_H + +#include <asm/text-patching.h> + +/* + * For CONFIG_HAVE_STATIC_CALL_INLINE, this is a temporary trampoline which + * uses the current value of the key->func pointer to do an indirect jump to + * the function. This trampoline is only used during boot, before the call + * sites get patched by static_call_update(). The name of this trampoline has + * a magical aspect: objtool uses it to find static call sites so it can create + * the .static_call_sites section. + * + * For CONFIG_HAVE_STATIC_CALL, this is a permanent trampoline which + * does a direct jump to the function. The direct jump gets patched by + * static_call_update(). + * + * Having the trampoline in a special section forces GCC to emit a JMP.d32 when + * it does tail-call optimization on the call; since you cannot compute the + * relative displacement across sections. + */ + +#define __ARCH_DEFINE_STATIC_CALL_TRAMP(name, insns) \ + asm(".pushsection .static_call.text, \"ax\" \n" \ + ".align 4 \n" \ + ".globl " STATIC_CALL_TRAMP_STR(name) " \n" \ + STATIC_CALL_TRAMP_STR(name) ": \n" \ + insns " \n" \ + ".type " STATIC_CALL_TRAMP_STR(name) ", @function \n" \ + ".size " STATIC_CALL_TRAMP_STR(name) ", . - " STATIC_CALL_TRAMP_STR(name) " \n" \ + ".popsection \n") + +#define ARCH_DEFINE_STATIC_CALL_TRAMP(name, func) \ + __ARCH_DEFINE_STATIC_CALL_TRAMP(name, ".byte 0xe9; .long " #func " - (. + 4)") + +#define ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name) \ + __ARCH_DEFINE_STATIC_CALL_TRAMP(name, "ret; nop; nop; nop; nop") + +#endif /* _ASM_STATIC_CALL_H */ diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index 75314c3dbe47..6e450827f677 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -82,38 +82,6 @@ int strcmp(const char *cs, const char *ct); #endif -#define __HAVE_ARCH_MEMCPY_MCSAFE 1 -__must_check unsigned long __memcpy_mcsafe(void *dst, const void *src, - size_t cnt); -DECLARE_STATIC_KEY_FALSE(mcsafe_key); - -/** - * memcpy_mcsafe - copy memory with indication if a machine check happened - * - * @dst: destination address - * @src: source address - * @cnt: number of bytes to copy - * - * Low level memory copy function that catches machine checks - * We only call into the "safe" function on systems that can - * actually do machine check recovery. Everyone else can just - * use memcpy(). - * - * Return 0 for success, or number of bytes not copied if there was an - * exception. - */ -static __always_inline __must_check unsigned long -memcpy_mcsafe(void *dst, const void *src, size_t cnt) -{ -#ifdef CONFIG_X86_MCE - if (static_branch_unlikely(&mcsafe_key)) - return __memcpy_mcsafe(dst, src, cnt); - else -#endif - memcpy(dst, src, cnt); - return 0; -} - #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE #define __HAVE_ARCH_MEMCPY_FLUSHCACHE 1 void __memcpy_flushcache(void *dst, const void *src, size_t cnt); diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 8a1f5382a4ea..71d630bb5e08 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -3,10 +3,54 @@ #define __SVM_H #include <uapi/asm/svm.h> - +#include <uapi/asm/kvm.h> + +/* + * 32-bit intercept words in the VMCB Control Area, starting + * at Byte offset 000h. + */ + +enum intercept_words { + INTERCEPT_CR = 0, + INTERCEPT_DR, + INTERCEPT_EXCEPTION, + INTERCEPT_WORD3, + INTERCEPT_WORD4, + INTERCEPT_WORD5, + MAX_INTERCEPT, +}; enum { - INTERCEPT_INTR, + /* Byte offset 000h (word 0) */ + INTERCEPT_CR0_READ = 0, + INTERCEPT_CR3_READ = 3, + INTERCEPT_CR4_READ = 4, + INTERCEPT_CR8_READ = 8, + INTERCEPT_CR0_WRITE = 16, + INTERCEPT_CR3_WRITE = 16 + 3, + INTERCEPT_CR4_WRITE = 16 + 4, + INTERCEPT_CR8_WRITE = 16 + 8, + /* Byte offset 004h (word 1) */ + INTERCEPT_DR0_READ = 32, + INTERCEPT_DR1_READ, + INTERCEPT_DR2_READ, + INTERCEPT_DR3_READ, + INTERCEPT_DR4_READ, + INTERCEPT_DR5_READ, + INTERCEPT_DR6_READ, + INTERCEPT_DR7_READ, + INTERCEPT_DR0_WRITE = 48, + INTERCEPT_DR1_WRITE, + INTERCEPT_DR2_WRITE, + INTERCEPT_DR3_WRITE, + INTERCEPT_DR4_WRITE, + INTERCEPT_DR5_WRITE, + INTERCEPT_DR6_WRITE, + INTERCEPT_DR7_WRITE, + /* Byte offset 008h (word 2) */ + INTERCEPT_EXCEPTION_OFFSET = 64, + /* Byte offset 00Ch (word 3) */ + INTERCEPT_INTR = 96, INTERCEPT_NMI, INTERCEPT_SMI, INTERCEPT_INIT, @@ -38,7 +82,8 @@ enum { INTERCEPT_TASK_SWITCH, INTERCEPT_FERR_FREEZE, INTERCEPT_SHUTDOWN, - INTERCEPT_VMRUN, + /* Byte offset 010h (word 4) */ + INTERCEPT_VMRUN = 128, INTERCEPT_VMMCALL, INTERCEPT_VMLOAD, INTERCEPT_VMSAVE, @@ -53,15 +98,18 @@ enum { INTERCEPT_MWAIT_COND, INTERCEPT_XSETBV, INTERCEPT_RDPRU, + /* Byte offset 014h (word 5) */ + INTERCEPT_INVLPGB = 160, + INTERCEPT_INVLPGB_ILLEGAL, + INTERCEPT_INVPCID, + INTERCEPT_MCOMMIT, + INTERCEPT_TLBSYNC, }; struct __attribute__ ((__packed__)) vmcb_control_area { - u32 intercept_cr; - u32 intercept_dr; - u32 intercept_exceptions; - u64 intercept; - u8 reserved_1[40]; + u32 intercepts[MAX_INTERCEPT]; + u32 reserved_1[15 - MAX_INTERCEPT]; u16 pause_filter_thresh; u16 pause_filter_count; u64 iopm_base_pa; @@ -150,14 +198,14 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define SVM_NESTED_CTL_NP_ENABLE BIT(0) #define SVM_NESTED_CTL_SEV_ENABLE BIT(1) -struct __attribute__ ((__packed__)) vmcb_seg { +struct vmcb_seg { u16 selector; u16 attrib; u32 limit; u64 base; -}; +} __packed; -struct __attribute__ ((__packed__)) vmcb_save_area { +struct vmcb_save_area { struct vmcb_seg es; struct vmcb_seg cs; struct vmcb_seg ss; @@ -200,20 +248,67 @@ struct __attribute__ ((__packed__)) vmcb_save_area { u64 br_to; u64 last_excp_from; u64 last_excp_to; -}; + /* + * The following part of the save area is valid only for + * SEV-ES guests when referenced through the GHCB. + */ + u8 reserved_7[104]; + u64 reserved_8; /* rax already available at 0x01f8 */ + u64 rcx; + u64 rdx; + u64 rbx; + u64 reserved_9; /* rsp already available at 0x01d8 */ + u64 rbp; + u64 rsi; + u64 rdi; + u64 r8; + u64 r9; + u64 r10; + u64 r11; + u64 r12; + u64 r13; + u64 r14; + u64 r15; + u8 reserved_10[16]; + u64 sw_exit_code; + u64 sw_exit_info_1; + u64 sw_exit_info_2; + u64 sw_scratch; + u8 reserved_11[56]; + u64 xcr0; + u8 valid_bitmap[16]; + u64 x87_state_gpa; +} __packed; + +struct ghcb { + struct vmcb_save_area save; + u8 reserved_save[2048 - sizeof(struct vmcb_save_area)]; + + u8 shared_buffer[2032]; + + u8 reserved_1[10]; + u16 protocol_version; /* negotiated SEV-ES/GHCB protocol version */ + u32 ghcb_usage; +} __packed; + + +#define EXPECTED_VMCB_SAVE_AREA_SIZE 1032 +#define EXPECTED_VMCB_CONTROL_AREA_SIZE 256 +#define EXPECTED_GHCB_SIZE PAGE_SIZE static inline void __unused_size_checks(void) { - BUILD_BUG_ON(sizeof(struct vmcb_save_area) != 0x298); - BUILD_BUG_ON(sizeof(struct vmcb_control_area) != 256); + BUILD_BUG_ON(sizeof(struct vmcb_save_area) != EXPECTED_VMCB_SAVE_AREA_SIZE); + BUILD_BUG_ON(sizeof(struct vmcb_control_area) != EXPECTED_VMCB_CONTROL_AREA_SIZE); + BUILD_BUG_ON(sizeof(struct ghcb) != EXPECTED_GHCB_SIZE); } -struct __attribute__ ((__packed__)) vmcb { +struct vmcb { struct vmcb_control_area control; u8 reserved_control[1024 - sizeof(struct vmcb_control_area)]; struct vmcb_save_area save; -}; +} __packed; #define SVM_CPUID_FUNC 0x8000000a @@ -240,32 +335,6 @@ struct __attribute__ ((__packed__)) vmcb { #define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK #define SVM_SELECTOR_CODE_MASK (1 << 3) -#define INTERCEPT_CR0_READ 0 -#define INTERCEPT_CR3_READ 3 -#define INTERCEPT_CR4_READ 4 -#define INTERCEPT_CR8_READ 8 -#define INTERCEPT_CR0_WRITE (16 + 0) -#define INTERCEPT_CR3_WRITE (16 + 3) -#define INTERCEPT_CR4_WRITE (16 + 4) -#define INTERCEPT_CR8_WRITE (16 + 8) - -#define INTERCEPT_DR0_READ 0 -#define INTERCEPT_DR1_READ 1 -#define INTERCEPT_DR2_READ 2 -#define INTERCEPT_DR3_READ 3 -#define INTERCEPT_DR4_READ 4 -#define INTERCEPT_DR5_READ 5 -#define INTERCEPT_DR6_READ 6 -#define INTERCEPT_DR7_READ 7 -#define INTERCEPT_DR0_WRITE (16 + 0) -#define INTERCEPT_DR1_WRITE (16 + 1) -#define INTERCEPT_DR2_WRITE (16 + 2) -#define INTERCEPT_DR3_WRITE (16 + 3) -#define INTERCEPT_DR4_WRITE (16 + 4) -#define INTERCEPT_DR5_WRITE (16 + 5) -#define INTERCEPT_DR6_WRITE (16 + 6) -#define INTERCEPT_DR7_WRITE (16 + 7) - #define SVM_EVTINJ_VEC_MASK 0xff #define SVM_EVTINJ_TYPE_SHIFT 8 @@ -298,4 +367,47 @@ struct __attribute__ ((__packed__)) vmcb { #define SVM_CR0_SELECTIVE_MASK (X86_CR0_TS | X86_CR0_MP) +/* GHCB Accessor functions */ + +#define GHCB_BITMAP_IDX(field) \ + (offsetof(struct vmcb_save_area, field) / sizeof(u64)) + +#define DEFINE_GHCB_ACCESSORS(field) \ + static inline bool ghcb_##field##_is_valid(const struct ghcb *ghcb) \ + { \ + return test_bit(GHCB_BITMAP_IDX(field), \ + (unsigned long *)&ghcb->save.valid_bitmap); \ + } \ + \ + static inline void ghcb_set_##field(struct ghcb *ghcb, u64 value) \ + { \ + __set_bit(GHCB_BITMAP_IDX(field), \ + (unsigned long *)&ghcb->save.valid_bitmap); \ + ghcb->save.field = value; \ + } + +DEFINE_GHCB_ACCESSORS(cpl) +DEFINE_GHCB_ACCESSORS(rip) +DEFINE_GHCB_ACCESSORS(rsp) +DEFINE_GHCB_ACCESSORS(rax) +DEFINE_GHCB_ACCESSORS(rcx) +DEFINE_GHCB_ACCESSORS(rdx) +DEFINE_GHCB_ACCESSORS(rbx) +DEFINE_GHCB_ACCESSORS(rbp) +DEFINE_GHCB_ACCESSORS(rsi) +DEFINE_GHCB_ACCESSORS(rdi) +DEFINE_GHCB_ACCESSORS(r8) +DEFINE_GHCB_ACCESSORS(r9) +DEFINE_GHCB_ACCESSORS(r10) +DEFINE_GHCB_ACCESSORS(r11) +DEFINE_GHCB_ACCESSORS(r12) +DEFINE_GHCB_ACCESSORS(r13) +DEFINE_GHCB_ACCESSORS(r14) +DEFINE_GHCB_ACCESSORS(r15) +DEFINE_GHCB_ACCESSORS(sw_exit_code) +DEFINE_GHCB_ACCESSORS(sw_exit_info_1) +DEFINE_GHCB_ACCESSORS(sw_exit_info_2) +DEFINE_GHCB_ACCESSORS(sw_scratch) +DEFINE_GHCB_ACCESSORS(xcr0) + #endif diff --git a/arch/x86/include/asm/sync_core.h b/arch/x86/include/asm/sync_core.h index fdb5b356e59b..0fd4a9dfb29c 100644 --- a/arch/x86/include/asm/sync_core.h +++ b/arch/x86/include/asm/sync_core.h @@ -5,6 +5,7 @@ #include <linux/preempt.h> #include <asm/processor.h> #include <asm/cpufeature.h> +#include <asm/special_insns.h> #ifdef CONFIG_X86_32 static inline void iret_to_self(void) @@ -46,22 +47,34 @@ static inline void iret_to_self(void) * * b) Text was modified on a different CPU, may subsequently be * executed on this CPU, and you want to make sure the new version - * gets executed. This generally means you're calling this in a IPI. + * gets executed. This generally means you're calling this in an IPI. * * If you're calling this for a different reason, you're probably doing * it wrong. + * + * Like all of Linux's memory ordering operations, this is a + * compiler barrier as well. */ static inline void sync_core(void) { /* - * There are quite a few ways to do this. IRET-to-self is nice - * because it works on every CPU, at any CPL (so it's compatible - * with paravirtualization), and it never exits to a hypervisor. - * The only down sides are that it's a bit slow (it seems to be - * a bit more than 2x slower than the fastest options) and that - * it unmasks NMIs. The "push %cs" is needed because, in - * paravirtual environments, __KERNEL_CS may not be a valid CS - * value when we do IRET directly. + * The SERIALIZE instruction is the most straightforward way to + * do this, but it is not universally available. + */ + if (static_cpu_has(X86_FEATURE_SERIALIZE)) { + serialize(); + return; + } + + /* + * For all other processors, there are quite a few ways to do this. + * IRET-to-self is nice because it works on every CPU, at any CPL + * (so it's compatible with paravirtualization), and it never exits + * to a hypervisor. The only downsides are that it's a bit slow + * (it seems to be a bit more than 2x slower than the fastest + * options) and that it unmasks NMIs. The "push %cs" is needed, + * because in paravirtual environments __KERNEL_CS may not be a + * valid CS value when we do IRET directly. * * In case NMI unmasking or performance ever becomes a problem, * the next best option appears to be MOV-to-CR2 and an @@ -71,9 +84,6 @@ static inline void sync_core(void) * CPUID is the conventional way, but it's nasty: it doesn't * exist on some 486-like CPUs, and it usually exits to a * hypervisor. - * - * Like all of Linux's memory ordering operations, this is a - * compiler barrier as well. */ iret_to_self(); } diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index 6593b42cb379..b7421780e4e9 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -53,6 +53,9 @@ extern void text_poke_finish(void); #define INT3_INSN_SIZE 1 #define INT3_INSN_OPCODE 0xCC +#define RET_INSN_SIZE 1 +#define RET_INSN_OPCODE 0xC3 + #define CALL_INSN_SIZE 5 #define CALL_INSN_OPCODE 0xE8 @@ -73,6 +76,7 @@ static __always_inline int text_opcode_size(u8 opcode) switch(opcode) { __CASE(INT3); + __CASE(RET); __CASE(CALL); __CASE(JMP32); __CASE(JMP8); @@ -141,11 +145,26 @@ void int3_emulate_push(struct pt_regs *regs, unsigned long val) } static __always_inline +unsigned long int3_emulate_pop(struct pt_regs *regs) +{ + unsigned long val = *(unsigned long *)regs->sp; + regs->sp += sizeof(unsigned long); + return val; +} + +static __always_inline void int3_emulate_call(struct pt_regs *regs, unsigned long func) { int3_emulate_push(regs, regs->ip - INT3_INSN_SIZE + CALL_INSN_SIZE); int3_emulate_jmp(regs, func); } + +static __always_inline +void int3_emulate_ret(struct pt_regs *regs) +{ + unsigned long ip = int3_emulate_pop(regs); + int3_emulate_jmp(regs, ip); +} #endif /* !CONFIG_UML_X86 */ #endif /* _ASM_X86_TEXT_PATCHING_H */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 267701ae3d86..44733a4bfc42 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -102,7 +102,6 @@ struct thread_info { #define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */ #define TIF_ADDR32 29 /* 32-bit address space on 64 bits */ #define TIF_X32 30 /* 32-bit native x86-64 binary */ -#define TIF_FSCHECK 31 /* Check FS is USER_DS on return */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) @@ -131,7 +130,6 @@ struct thread_info { #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) #define _TIF_ADDR32 (1 << TIF_ADDR32) #define _TIF_X32 (1 << TIF_X32) -#define _TIF_FSCHECK (1 << TIF_FSCHECK) /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW_BASE \ diff --git a/arch/x86/include/asm/trap_pf.h b/arch/x86/include/asm/trap_pf.h new file mode 100644 index 000000000000..305bc1214aef --- /dev/null +++ b/arch/x86/include/asm/trap_pf.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_TRAP_PF_H +#define _ASM_X86_TRAP_PF_H + +/* + * Page fault error code bits: + * + * bit 0 == 0: no page found 1: protection fault + * bit 1 == 0: read access 1: write access + * bit 2 == 0: kernel-mode access 1: user-mode access + * bit 3 == 1: use of reserved bit detected + * bit 4 == 1: fault was an instruction fetch + * bit 5 == 1: protection keys block access + */ +enum x86_pf_error_code { + X86_PF_PROT = 1 << 0, + X86_PF_WRITE = 1 << 1, + X86_PF_USER = 1 << 2, + X86_PF_RSVD = 1 << 3, + X86_PF_INSTR = 1 << 4, + X86_PF_PK = 1 << 5, +}; + +#endif /* _ASM_X86_TRAP_PF_H */ diff --git a/arch/x86/include/asm/trapnr.h b/arch/x86/include/asm/trapnr.h index 082f45631fa9..f5d2325aa0b7 100644 --- a/arch/x86/include/asm/trapnr.h +++ b/arch/x86/include/asm/trapnr.h @@ -26,6 +26,7 @@ #define X86_TRAP_XF 19 /* SIMD Floating-Point Exception */ #define X86_TRAP_VE 20 /* Virtualization Exception */ #define X86_TRAP_CP 21 /* Control Protection Exception */ +#define X86_TRAP_VC 29 /* VMM Communication Exception */ #define X86_TRAP_IRET 32 /* IRET Exception */ #endif diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 714b1a30e7b0..7f7200021bd1 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -8,12 +8,14 @@ #include <asm/debugreg.h> #include <asm/idtentry.h> #include <asm/siginfo.h> /* TRAP_TRACE, ... */ +#include <asm/trap_pf.h> #ifdef CONFIG_X86_64 asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs); asmlinkage __visible notrace struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s); void __init trap_init(void); +asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *eregs); #endif #ifdef CONFIG_X86_F00F_BUG @@ -35,28 +37,12 @@ extern int panic_on_unrecovered_nmi; void math_emulate(struct math_emu_info *); +bool fault_in_kernel_space(unsigned long address); + #ifdef CONFIG_VMAP_STACK void __noreturn handle_stack_overflow(const char *message, struct pt_regs *regs, unsigned long fault_address); #endif -/* - * Page fault error code bits: - * - * bit 0 == 0: no page found 1: protection fault - * bit 1 == 0: read access 1: write access - * bit 2 == 0: kernel-mode access 1: user-mode access - * bit 3 == 1: use of reserved bit detected - * bit 4 == 1: fault was an instruction fetch - * bit 5 == 1: protection keys block access - */ -enum x86_pf_error_code { - X86_PF_PROT = 1 << 0, - X86_PF_WRITE = 1 << 1, - X86_PF_USER = 1 << 2, - X86_PF_RSVD = 1 << 3, - X86_PF_INSTR = 1 << 4, - X86_PF_PK = 1 << 5, -}; #endif /* _ASM_X86_TRAPS_H */ diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index ecefaffd15d4..c9fa7be3df82 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -13,30 +13,6 @@ #include <asm/extable.h> /* - * The fs value determines whether argument validity checking should be - * performed or not. If get_fs() == USER_DS, checking is performed, with - * get_fs() == KERNEL_DS, checking is bypassed. - * - * For historical reasons, these macros are grossly misnamed. - */ - -#define MAKE_MM_SEG(s) ((mm_segment_t) { (s) }) - -#define KERNEL_DS MAKE_MM_SEG(-1UL) -#define USER_DS MAKE_MM_SEG(TASK_SIZE_MAX) - -#define get_fs() (current->thread.addr_limit) -static inline void set_fs(mm_segment_t fs) -{ - current->thread.addr_limit = fs; - /* On user-mode return, check fs is correct */ - set_thread_flag(TIF_FSCHECK); -} - -#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) -#define user_addr_max() (current->thread.addr_limit.seg) - -/* * Test whether a block of memory is a valid user space address. * Returns 0 if the range is valid, nonzero otherwise. */ @@ -93,28 +69,17 @@ static inline bool pagefault_disabled(void); #define access_ok(addr, size) \ ({ \ WARN_ON_IN_IRQ(); \ - likely(!__range_not_ok(addr, size, user_addr_max())); \ + likely(!__range_not_ok(addr, size, TASK_SIZE_MAX)); \ }) -/* - * These are the main single-value transfer routines. They automatically - * use the right size if we just have the right pointer type. - * - * This gets kind of ugly. We want to return _two_ values in "get_user()" - * and yet we don't want to do any pointers, because that is too much - * of a performance impact. Thus we have a few rather ugly macros here, - * and hide all the ugliness from the user. - * - * The "__xxx" versions of the user access functions are versions that - * do not verify the address space, that must have been done previously - * with a separate "access_ok()" call (this is used when we do multiple - * accesses to the same area of user memory). - */ - extern int __get_user_1(void); extern int __get_user_2(void); extern int __get_user_4(void); extern int __get_user_8(void); +extern int __get_user_nocheck_1(void); +extern int __get_user_nocheck_2(void); +extern int __get_user_nocheck_4(void); +extern int __get_user_nocheck_8(void); extern int __get_user_bad(void); #define __uaccess_begin() stac() @@ -138,25 +103,12 @@ extern int __get_user_bad(void); #define __typefits(x,type,not) \ __builtin_choose_expr(sizeof(x)<=sizeof(type),(unsigned type)0,not) -/** - * get_user - Get a simple variable from user space. - * @x: Variable to store result. - * @ptr: Source address, in user space. - * - * Context: User context only. This function may sleep if pagefaults are - * enabled. - * - * This macro copies a single simple variable from user space to kernel - * space. It supports simple types like char and int, but not larger - * data types like structures or arrays. - * - * @ptr must have pointer-to-simple-variable type, and the result of - * dereferencing @ptr must be assignable to @x without a cast. - * - * Return: zero on success, or -EFAULT on error. - * On error, the variable @x is set to zero. - */ /* + * This is used for both get_user() and __get_user() to expand to + * the proper special function call that has odd calling conventions + * due to returning both a value and an error, and that depends on + * the size of the pointer passed in. + * * Careful: we have to cast the result to the type of the pointer * for sign reasons. * @@ -169,13 +121,12 @@ extern int __get_user_bad(void); * Clang/LLVM cares about the size of the register, but still wants * the base register for something that ends up being a pair. */ -#define get_user(x, ptr) \ +#define do_get_user_call(fn,x,ptr) \ ({ \ int __ret_gu; \ register __inttype(*(ptr)) __val_gu asm("%"_ASM_DX); \ __chk_user_ptr(ptr); \ - might_fault(); \ - asm volatile("call __get_user_%P4" \ + asm volatile("call __" #fn "_%P4" \ : "=a" (__ret_gu), "=r" (__val_gu), \ ASM_CALL_CONSTRAINT \ : "0" (ptr), "i" (sizeof(*(ptr)))); \ @@ -183,10 +134,48 @@ extern int __get_user_bad(void); __builtin_expect(__ret_gu, 0); \ }) -#define __put_user_x(size, x, ptr, __ret_pu) \ - asm volatile("call __put_user_" #size : "=a" (__ret_pu) \ - : "0" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx") +/** + * get_user - Get a simple variable from user space. + * @x: Variable to store result. + * @ptr: Source address, in user space. + * + * Context: User context only. This function may sleep if pagefaults are + * enabled. + * + * This macro copies a single simple variable from user space to kernel + * space. It supports simple types like char and int, but not larger + * data types like structures or arrays. + * + * @ptr must have pointer-to-simple-variable type, and the result of + * dereferencing @ptr must be assignable to @x without a cast. + * + * Return: zero on success, or -EFAULT on error. + * On error, the variable @x is set to zero. + */ +#define get_user(x,ptr) ({ might_fault(); do_get_user_call(get_user,x,ptr); }) +/** + * __get_user - Get a simple variable from user space, with less checking. + * @x: Variable to store result. + * @ptr: Source address, in user space. + * + * Context: User context only. This function may sleep if pagefaults are + * enabled. + * + * This macro copies a single simple variable from user space to kernel + * space. It supports simple types like char and int, but not larger + * data types like structures or arrays. + * + * @ptr must have pointer-to-simple-variable type, and the result of + * dereferencing @ptr must be assignable to @x without a cast. + * + * Caller must check the pointer with access_ok() before calling this + * function. + * + * Return: zero on success, or -EFAULT on error. + * On error, the variable @x is set to zero. + */ +#define __get_user(x,ptr) do_get_user_call(get_user_nocheck,x,ptr) #ifdef CONFIG_X86_32 @@ -199,25 +188,49 @@ extern int __get_user_bad(void); : : "A" (x), "r" (addr) \ : : label) -#define __put_user_x8(x, ptr, __ret_pu) \ - asm volatile("call __put_user_8" : "=a" (__ret_pu) \ - : "A" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx") #else #define __put_user_goto_u64(x, ptr, label) \ __put_user_goto(x, ptr, "q", "er", label) -#define __put_user_x8(x, ptr, __ret_pu) __put_user_x(8, x, ptr, __ret_pu) #endif extern void __put_user_bad(void); /* * Strange magic calling convention: pointer in %ecx, - * value in %eax(:%edx), return value in %eax. clobbers %rbx + * value in %eax(:%edx), return value in %ecx. clobbers %rbx */ extern void __put_user_1(void); extern void __put_user_2(void); extern void __put_user_4(void); extern void __put_user_8(void); +extern void __put_user_nocheck_1(void); +extern void __put_user_nocheck_2(void); +extern void __put_user_nocheck_4(void); +extern void __put_user_nocheck_8(void); + +/* + * ptr must be evaluated and assigned to the temporary __ptr_pu before + * the assignment of x to __val_pu, to avoid any function calls + * involved in the ptr expression (possibly implicitly generated due + * to KASAN) from clobbering %ax. + */ +#define do_put_user_call(fn,x,ptr) \ +({ \ + int __ret_pu; \ + void __user *__ptr_pu; \ + register __typeof__(*(ptr)) __val_pu asm("%"_ASM_AX); \ + __chk_user_ptr(ptr); \ + __ptr_pu = (ptr); \ + __val_pu = (x); \ + asm volatile("call __" #fn "_%P[size]" \ + : "=c" (__ret_pu), \ + ASM_CALL_CONSTRAINT \ + : "0" (__ptr_pu), \ + "r" (__val_pu), \ + [size] "i" (sizeof(*(ptr))) \ + :"ebx"); \ + __builtin_expect(__ret_pu, 0); \ +}) /** * put_user - Write a simple value into user space. @@ -236,32 +249,29 @@ extern void __put_user_8(void); * * Return: zero on success, or -EFAULT on error. */ -#define put_user(x, ptr) \ -({ \ - int __ret_pu; \ - __typeof__(*(ptr)) __pu_val; \ - __chk_user_ptr(ptr); \ - might_fault(); \ - __pu_val = x; \ - switch (sizeof(*(ptr))) { \ - case 1: \ - __put_user_x(1, __pu_val, ptr, __ret_pu); \ - break; \ - case 2: \ - __put_user_x(2, __pu_val, ptr, __ret_pu); \ - break; \ - case 4: \ - __put_user_x(4, __pu_val, ptr, __ret_pu); \ - break; \ - case 8: \ - __put_user_x8(__pu_val, ptr, __ret_pu); \ - break; \ - default: \ - __put_user_x(X, __pu_val, ptr, __ret_pu); \ - break; \ - } \ - __builtin_expect(__ret_pu, 0); \ -}) +#define put_user(x, ptr) ({ might_fault(); do_put_user_call(put_user,x,ptr); }) + +/** + * __put_user - Write a simple value into user space, with less checking. + * @x: Value to copy to user space. + * @ptr: Destination address, in user space. + * + * Context: User context only. This function may sleep if pagefaults are + * enabled. + * + * This macro copies a single simple value from kernel space to user + * space. It supports simple types like char and int, but not larger + * data types like structures or arrays. + * + * @ptr must have pointer-to-simple-variable type, and @x must be assignable + * to the result of dereferencing @ptr. + * + * Caller must check the pointer with access_ok() before calling this + * function. + * + * Return: zero on success, or -EFAULT on error. + */ +#define __put_user(x, ptr) do_put_user_call(put_user_nocheck,x,ptr) #define __put_user_size(x, ptr, size, label) \ do { \ @@ -284,6 +294,55 @@ do { \ } \ } while (0) +#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT + +#ifdef CONFIG_X86_32 +#define __get_user_asm_u64(x, ptr, label) do { \ + unsigned int __gu_low, __gu_high; \ + const unsigned int __user *__gu_ptr; \ + __gu_ptr = (const void __user *)(ptr); \ + __get_user_asm(__gu_low, ptr, "l", "=r", label); \ + __get_user_asm(__gu_high, ptr+1, "l", "=r", label); \ + (x) = ((unsigned long long)__gu_high << 32) | __gu_low; \ +} while (0) +#else +#define __get_user_asm_u64(x, ptr, label) \ + __get_user_asm(x, ptr, "q", "=r", label) +#endif + +#define __get_user_size(x, ptr, size, label) \ +do { \ + __chk_user_ptr(ptr); \ + switch (size) { \ + unsigned char x_u8__; \ + case 1: \ + __get_user_asm(x_u8__, ptr, "b", "=q", label); \ + (x) = x_u8__; \ + break; \ + case 2: \ + __get_user_asm(x, ptr, "w", "=r", label); \ + break; \ + case 4: \ + __get_user_asm(x, ptr, "l", "=r", label); \ + break; \ + case 8: \ + __get_user_asm_u64(x, ptr, label); \ + break; \ + default: \ + (x) = __get_user_bad(); \ + } \ +} while (0) + +#define __get_user_asm(x, addr, itype, ltype, label) \ + asm_volatile_goto("\n" \ + "1: mov"itype" %[umem],%[output]\n" \ + _ASM_EXTABLE_UA(1b, %l2) \ + : [output] ltype(x) \ + : [umem] "m" (__m(addr)) \ + : : label) + +#else // !CONFIG_CC_HAS_ASM_GOTO_OUTPUT + #ifdef CONFIG_X86_32 #define __get_user_asm_u64(x, ptr, retval) \ ({ \ @@ -343,7 +402,7 @@ do { \ "2:\n" \ ".section .fixup,\"ax\"\n" \ "3: mov %[efault],%[errout]\n" \ - " xor"itype" %[output],%[output]\n" \ + " xorl %k[output],%k[output]\n" \ " jmp 2b\n" \ ".previous\n" \ _ASM_EXTABLE_UA(1b, 3b) \ @@ -352,33 +411,7 @@ do { \ : [umem] "m" (__m(addr)), \ [efault] "i" (-EFAULT), "0" (err)) -#define __put_user_nocheck(x, ptr, size) \ -({ \ - __label__ __pu_label; \ - int __pu_err = -EFAULT; \ - __typeof__(*(ptr)) __pu_val = (x); \ - __typeof__(ptr) __pu_ptr = (ptr); \ - __typeof__(size) __pu_size = (size); \ - __uaccess_begin(); \ - __put_user_size(__pu_val, __pu_ptr, __pu_size, __pu_label); \ - __pu_err = 0; \ -__pu_label: \ - __uaccess_end(); \ - __builtin_expect(__pu_err, 0); \ -}) - -#define __get_user_nocheck(x, ptr, size) \ -({ \ - int __gu_err; \ - __inttype(*(ptr)) __gu_val; \ - __typeof__(ptr) __gu_ptr = (ptr); \ - __typeof__(size) __gu_size = (size); \ - __uaccess_begin_nospec(); \ - __get_user_size(__gu_val, __gu_ptr, __gu_size, __gu_err); \ - __uaccess_end(); \ - (x) = (__force __typeof__(*(ptr)))__gu_val; \ - __builtin_expect(__gu_err, 0); \ -}) +#endif // CONFIG_CC_ASM_GOTO_OUTPUT /* FIXME: this hack is definitely wrong -AK */ struct __large_struct { unsigned long buf[100]; }; @@ -396,55 +429,6 @@ struct __large_struct { unsigned long buf[100]; }; : : ltype(x), "m" (__m(addr)) \ : : label) -/** - * __get_user - Get a simple variable from user space, with less checking. - * @x: Variable to store result. - * @ptr: Source address, in user space. - * - * Context: User context only. This function may sleep if pagefaults are - * enabled. - * - * This macro copies a single simple variable from user space to kernel - * space. It supports simple types like char and int, but not larger - * data types like structures or arrays. - * - * @ptr must have pointer-to-simple-variable type, and the result of - * dereferencing @ptr must be assignable to @x without a cast. - * - * Caller must check the pointer with access_ok() before calling this - * function. - * - * Return: zero on success, or -EFAULT on error. - * On error, the variable @x is set to zero. - */ - -#define __get_user(x, ptr) \ - __get_user_nocheck((x), (ptr), sizeof(*(ptr))) - -/** - * __put_user - Write a simple value into user space, with less checking. - * @x: Value to copy to user space. - * @ptr: Destination address, in user space. - * - * Context: User context only. This function may sleep if pagefaults are - * enabled. - * - * This macro copies a single simple value from kernel space to user - * space. It supports simple types like char and int, but not larger - * data types like structures or arrays. - * - * @ptr must have pointer-to-simple-variable type, and @x must be assignable - * to the result of dereferencing @ptr. - * - * Caller must check the pointer with access_ok() before calling this - * function. - * - * Return: zero on success, or -EFAULT on error. - */ - -#define __put_user(x, ptr) \ - __put_user_nocheck((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr))) - extern unsigned long copy_from_user_nmi(void *to, const void __user *from, unsigned long n); extern __must_check long @@ -455,6 +439,15 @@ extern __must_check long strnlen_user(const char __user *str, long n); unsigned long __must_check clear_user(void __user *mem, unsigned long len); unsigned long __must_check __clear_user(void __user *mem, unsigned long len); +#ifdef CONFIG_ARCH_HAS_COPY_MC +unsigned long __must_check +copy_mc_to_kernel(void *to, const void *from, unsigned len); +#define copy_mc_to_kernel copy_mc_to_kernel + +unsigned long __must_check +copy_mc_to_user(void *to, const void *from, unsigned len); +#endif + /* * movsl can be slow when source and dest are not both 8-byte aligned */ @@ -494,6 +487,14 @@ static __must_check __always_inline bool user_access_begin(const void __user *pt #define unsafe_put_user(x, ptr, label) \ __put_user_size((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)), label) +#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT +#define unsafe_get_user(x, ptr, err_label) \ +do { \ + __inttype(*(ptr)) __gu_val; \ + __get_user_size(__gu_val, (ptr), sizeof(*(ptr)), err_label); \ + (x) = (__force __typeof__(*(ptr)))__gu_val; \ +} while (0) +#else // !CONFIG_CC_HAS_ASM_GOTO_OUTPUT #define unsafe_get_user(x, ptr, err_label) \ do { \ int __gu_err; \ @@ -502,6 +503,7 @@ do { \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ if (unlikely(__gu_err)) goto err_label; \ } while (0) +#endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT /* * We want the unsafe accessors to always be inlined and use @@ -528,6 +530,11 @@ do { \ #define HAVE_GET_KERNEL_NOFAULT +#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT +#define __get_kernel_nofault(dst, src, type, err_label) \ + __get_user_size(*((type *)(dst)), (__force type __user *)(src), \ + sizeof(type), err_label) +#else // !CONFIG_CC_HAS_ASM_GOTO_OUTPUT #define __get_kernel_nofault(dst, src, type, err_label) \ do { \ int __kr_err; \ @@ -537,6 +544,7 @@ do { \ if (unlikely(__kr_err)) \ goto err_label; \ } while (0) +#endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT #define __put_kernel_nofault(dst, src, type, err_label) \ __put_user_size(*((type *)(src)), (__force type __user *)(dst), \ diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index bc10e3dc64fe..e7265a552f4f 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -47,22 +47,6 @@ copy_user_generic(void *to, const void *from, unsigned len) } static __always_inline __must_check unsigned long -copy_to_user_mcsafe(void *to, const void *from, unsigned len) -{ - unsigned long ret; - - __uaccess_begin(); - /* - * Note, __memcpy_mcsafe() is explicitly used since it can - * handle exceptions / faults. memcpy_mcsafe() may fall back to - * memcpy() which lacks this handling. - */ - ret = __memcpy_mcsafe(to, from, len); - __uaccess_end(); - return ret; -} - -static __always_inline __must_check unsigned long raw_copy_from_user(void *dst, const void __user *src, unsigned long size) { return copy_user_generic(dst, (__force void *)src, size); @@ -102,8 +86,4 @@ __copy_from_user_flushcache(void *dst, const void __user *src, unsigned size) kasan_check_write(dst, size); return __copy_user_flushcache(dst, src, size); } - -unsigned long -mcsafe_handle_tail(char *to, char *from, unsigned len); - #endif /* _ASM_X86_UACCESS_64_H */ diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h index 7d903fdb3f43..664d4610d700 100644 --- a/arch/x86/include/asm/unwind_hints.h +++ b/arch/x86/include/asm/unwind_hints.h @@ -1,51 +1,17 @@ #ifndef _ASM_X86_UNWIND_HINTS_H #define _ASM_X86_UNWIND_HINTS_H +#include <linux/objtool.h> + #include "orc_types.h" #ifdef __ASSEMBLY__ -/* - * In asm, there are two kinds of code: normal C-type callable functions and - * the rest. The normal callable functions can be called by other code, and - * don't do anything unusual with the stack. Such normal callable functions - * are annotated with the ENTRY/ENDPROC macros. Most asm code falls in this - * category. In this case, no special debugging annotations are needed because - * objtool can automatically generate the ORC data for the ORC unwinder to read - * at runtime. - * - * Anything which doesn't fall into the above category, such as syscall and - * interrupt handlers, tends to not be called directly by other functions, and - * often does unusual non-C-function-type things with the stack pointer. Such - * code needs to be annotated such that objtool can understand it. The - * following CFI hint macros are for this type of code. - * - * These macros provide hints to objtool about the state of the stack at each - * instruction. Objtool starts from the hints and follows the code flow, - * making automatic CFI adjustments when it sees pushes and pops, filling out - * the debuginfo as necessary. It will also warn if it sees any - * inconsistencies. - */ -.macro UNWIND_HINT sp_reg=ORC_REG_SP sp_offset=0 type=ORC_TYPE_CALL end=0 -#ifdef CONFIG_STACK_VALIDATION -.Lunwind_hint_ip_\@: - .pushsection .discard.unwind_hints - /* struct unwind_hint */ - .long .Lunwind_hint_ip_\@ - . - .short \sp_offset - .byte \sp_reg - .byte \type - .byte \end - .balign 4 - .popsection -#endif -.endm - .macro UNWIND_HINT_EMPTY - UNWIND_HINT sp_reg=ORC_REG_UNDEFINED end=1 + UNWIND_HINT sp_reg=ORC_REG_UNDEFINED type=UNWIND_HINT_TYPE_CALL end=1 .endm -.macro UNWIND_HINT_REGS base=%rsp offset=0 indirect=0 extra=1 iret=0 +.macro UNWIND_HINT_REGS base=%rsp offset=0 indirect=0 extra=1 partial=0 .if \base == %rsp .if \indirect .set sp_reg, ORC_REG_SP_INDIRECT @@ -66,24 +32,24 @@ .set sp_offset, \offset - .if \iret - .set type, ORC_TYPE_REGS_IRET + .if \partial + .set type, UNWIND_HINT_TYPE_REGS_PARTIAL .elseif \extra == 0 - .set type, ORC_TYPE_REGS_IRET + .set type, UNWIND_HINT_TYPE_REGS_PARTIAL .set sp_offset, \offset + (16*8) .else - .set type, ORC_TYPE_REGS + .set type, UNWIND_HINT_TYPE_REGS .endif UNWIND_HINT sp_reg=sp_reg sp_offset=sp_offset type=type .endm .macro UNWIND_HINT_IRET_REGS base=%rsp offset=0 - UNWIND_HINT_REGS base=\base offset=\offset iret=1 + UNWIND_HINT_REGS base=\base offset=\offset partial=1 .endm .macro UNWIND_HINT_FUNC sp_offset=8 - UNWIND_HINT sp_offset=\sp_offset + UNWIND_HINT sp_reg=ORC_REG_SP sp_offset=\sp_offset type=UNWIND_HINT_TYPE_CALL .endm /* @@ -92,7 +58,7 @@ * initial_func_cfi. */ .macro UNWIND_HINT_RET_OFFSET sp_offset=8 - UNWIND_HINT type=UNWIND_HINT_TYPE_RET_OFFSET sp_offset=\sp_offset + UNWIND_HINT sp_reg=ORC_REG_SP type=UNWIND_HINT_TYPE_RET_OFFSET sp_offset=\sp_offset .endm #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h index 70050d0136c3..08b3d810dfba 100644 --- a/arch/x86/include/asm/uv/bios.h +++ b/arch/x86/include/asm/uv/bios.h @@ -5,8 +5,9 @@ /* * UV BIOS layer definitions. * - * Copyright (c) 2008-2009 Silicon Graphics, Inc. All Rights Reserved. - * Copyright (c) Russ Anderson <rja@sgi.com> + * (C) Copyright 2020 Hewlett Packard Enterprise Development LP + * Copyright (C) 2007-2017 Silicon Graphics, Inc. All rights reserved. + * Copyright (c) Russ Anderson <rja@sgi.com> */ #include <linux/rtc.h> @@ -71,6 +72,11 @@ struct uv_gam_range_entry { u32 limit; /* PA bits 56:26 (UV_GAM_RANGE_SHFT) */ }; +#define UV_AT_SIZE 8 /* 7 character arch type + NULL char */ +struct uv_arch_type_entry { + char archtype[UV_AT_SIZE]; +}; + #define UV_SYSTAB_SIG "UVST" #define UV_SYSTAB_VERSION_1 1 /* UV2/3 BIOS version */ #define UV_SYSTAB_VERSION_UV4 0x400 /* UV4 BIOS base version */ @@ -79,10 +85,14 @@ struct uv_gam_range_entry { #define UV_SYSTAB_VERSION_UV4_3 0x403 /* - GAM Range PXM Value */ #define UV_SYSTAB_VERSION_UV4_LATEST UV_SYSTAB_VERSION_UV4_3 +#define UV_SYSTAB_VERSION_UV5 0x500 /* UV5 GAM base version */ +#define UV_SYSTAB_VERSION_UV5_LATEST UV_SYSTAB_VERSION_UV5 + #define UV_SYSTAB_TYPE_UNUSED 0 /* End of table (offset == 0) */ #define UV_SYSTAB_TYPE_GAM_PARAMS 1 /* GAM PARAM conversions */ #define UV_SYSTAB_TYPE_GAM_RNG_TBL 2 /* GAM entry table */ -#define UV_SYSTAB_TYPE_MAX 3 +#define UV_SYSTAB_TYPE_ARCH_TYPE 3 /* UV arch type */ +#define UV_SYSTAB_TYPE_MAX 4 /* * The UV system table describes specific firmware @@ -133,6 +143,7 @@ extern s64 uv_bios_reserved_page_pa(u64, u64 *, u64 *, u64 *); extern int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus); extern int uv_bios_init(void); +extern unsigned long get_uv_systab_phys(bool msg); extern unsigned long sn_rtc_cycles_per_second; extern int uv_type; diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h index e48aea9ba47d..172d3e4a9e4b 100644 --- a/arch/x86/include/asm/uv/uv.h +++ b/arch/x86/include/asm/uv/uv.h @@ -35,10 +35,8 @@ extern int is_uv_hubbed(int uvtype); extern void uv_cpu_init(void); extern void uv_nmi_init(void); extern void uv_system_init(void); -extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, - const struct flush_tlb_info *info); -#else /* X86_UV */ +#else /* !X86_UV */ static inline enum uv_system_type get_uv_system_type(void) { return UV_NONE; } static inline bool is_early_uv_system(void) { return 0; } diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h deleted file mode 100644 index cd24804955d7..000000000000 --- a/arch/x86/include/asm/uv/uv_bau.h +++ /dev/null @@ -1,755 +0,0 @@ -/* - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * SGI UV Broadcast Assist Unit definitions - * - * Copyright (C) 2008-2011 Silicon Graphics, Inc. All rights reserved. - */ - -#ifndef _ASM_X86_UV_UV_BAU_H -#define _ASM_X86_UV_UV_BAU_H - -#include <linux/bitmap.h> -#include <asm/idtentry.h> - -#define BITSPERBYTE 8 - -/* - * Broadcast Assist Unit messaging structures - * - * Selective Broadcast activations are induced by software action - * specifying a particular 8-descriptor "set" via a 6-bit index written - * to an MMR. - * Thus there are 64 unique 512-byte sets of SB descriptors - one set for - * each 6-bit index value. These descriptor sets are mapped in sequence - * starting with set 0 located at the address specified in the - * BAU_SB_DESCRIPTOR_BASE register, set 1 is located at BASE + 512, - * set 2 is at BASE + 2*512, set 3 at BASE + 3*512, and so on. - * - * We will use one set for sending BAU messages from each of the - * cpu's on the uvhub. - * - * TLB shootdown will use the first of the 8 descriptors of each set. - * Each of the descriptors is 64 bytes in size (8*64 = 512 bytes in a set). - */ - -#define MAX_CPUS_PER_UVHUB 128 -#define MAX_CPUS_PER_SOCKET 64 -#define ADP_SZ 64 /* hardware-provided max. */ -#define UV_CPUS_PER_AS 32 /* hardware-provided max. */ -#define ITEMS_PER_DESC 8 -/* the 'throttle' to prevent the hardware stay-busy bug */ -#define MAX_BAU_CONCURRENT 3 -#define UV_ACT_STATUS_MASK 0x3 -#define UV_ACT_STATUS_SIZE 2 -#define UV_DISTRIBUTION_SIZE 256 -#define UV_SW_ACK_NPENDING 8 -#define UV_NET_ENDPOINT_INTD 0x28 -#define UV_PAYLOADQ_GNODE_SHIFT 49 -#define UV_PTC_BASENAME "sgi_uv/ptc_statistics" -#define UV_BAU_BASENAME "sgi_uv/bau_tunables" -#define UV_BAU_TUNABLES_DIR "sgi_uv" -#define UV_BAU_TUNABLES_FILE "bau_tunables" -#define WHITESPACE " \t\n" -#define cpubit_isset(cpu, bau_local_cpumask) \ - test_bit((cpu), (bau_local_cpumask).bits) - -/* [19:16] SOFT_ACK timeout period 19: 1 is urgency 7 17:16 1 is multiplier */ -/* - * UV2: Bit 19 selects between - * (0): 10 microsecond timebase and - * (1): 80 microseconds - * we're using 560us - */ -#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD (15UL) -/* assuming UV3 is the same */ - -#define BAU_MISC_CONTROL_MULT_MASK 3 - -#define UVH_AGING_PRESCALE_SEL 0x000000b000UL -/* [30:28] URGENCY_7 an index into a table of times */ -#define BAU_URGENCY_7_SHIFT 28 -#define BAU_URGENCY_7_MASK 7 - -#define UVH_TRANSACTION_TIMEOUT 0x000000b200UL -/* [45:40] BAU - BAU transaction timeout select - a multiplier */ -#define BAU_TRANS_SHIFT 40 -#define BAU_TRANS_MASK 0x3f - -/* - * shorten some awkward names - */ -#define AS_PUSH_SHIFT UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT -#define SOFTACK_MSHIFT UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT -#define SOFTACK_PSHIFT UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT -#define SOFTACK_TIMEOUT_PERIOD UV_INTD_SOFT_ACK_TIMEOUT_PERIOD -#define PREFETCH_HINT_SHFT UV3H_LB_BAU_MISC_CONTROL_ENABLE_INTD_PREFETCH_HINT_SHFT -#define SB_STATUS_SHFT UV3H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT -#define write_gmmr uv_write_global_mmr64 -#define write_lmmr uv_write_local_mmr -#define read_lmmr uv_read_local_mmr -#define read_gmmr uv_read_global_mmr64 - -/* - * bits in UVH_LB_BAU_SB_ACTIVATION_STATUS_0/1 - */ -#define DS_IDLE 0 -#define DS_ACTIVE 1 -#define DS_DESTINATION_TIMEOUT 2 -#define DS_SOURCE_TIMEOUT 3 -/* - * bits put together from HRP_LB_BAU_SB_ACTIVATION_STATUS_0/1/2 - * values 1 and 3 will not occur - * Decoded meaning ERROR BUSY AUX ERR - * ------------------------------- ---- ----- ------- - * IDLE 0 0 0 - * BUSY (active) 0 1 0 - * SW Ack Timeout (destination) 1 0 0 - * SW Ack INTD rejected (strong NACK) 1 0 1 - * Source Side Time Out Detected 1 1 0 - * Destination Side PUT Failed 1 1 1 - */ -#define UV2H_DESC_IDLE 0 -#define UV2H_DESC_BUSY 2 -#define UV2H_DESC_DEST_TIMEOUT 4 -#define UV2H_DESC_DEST_STRONG_NACK 5 -#define UV2H_DESC_SOURCE_TIMEOUT 6 -#define UV2H_DESC_DEST_PUT_ERR 7 - -/* - * delay for 'plugged' timeout retries, in microseconds - */ -#define PLUGGED_DELAY 10 - -/* - * threshholds at which to use IPI to free resources - */ -/* after this # consecutive 'plugged' timeouts, use IPI to release resources */ -#define PLUGSB4RESET 100 -/* after this many consecutive timeouts, use IPI to release resources */ -#define TIMEOUTSB4RESET 1 -/* at this number uses of IPI to release resources, giveup the request */ -#define IPI_RESET_LIMIT 1 -/* after this # consecutive successes, bump up the throttle if it was lowered */ -#define COMPLETE_THRESHOLD 5 -/* after this # of giveups (fall back to kernel IPI's) disable the use of - the BAU for a period of time */ -#define GIVEUP_LIMIT 100 - -#define UV_LB_SUBNODEID 0x10 - -#define UV_SA_SHFT UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT -#define UV_SA_MASK UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK -/* 4 bits of software ack period */ -#define UV2_ACK_MASK 0x7UL -#define UV2_ACK_UNITS_SHFT 3 -#define UV2_EXT_SHFT UV2H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT - -/* - * number of entries in the destination side payload queue - */ -#define DEST_Q_SIZE 20 -/* - * number of destination side software ack resources - */ -#define DEST_NUM_RESOURCES 8 -/* - * completion statuses for sending a TLB flush message - */ -#define FLUSH_RETRY_PLUGGED 1 -#define FLUSH_RETRY_TIMEOUT 2 -#define FLUSH_GIVEUP 3 -#define FLUSH_COMPLETE 4 - -/* - * tuning the action when the numalink network is extremely delayed - */ -#define CONGESTED_RESPONSE_US 1000 /* 'long' response time, in - microseconds */ -#define CONGESTED_REPS 10 /* long delays averaged over - this many broadcasts */ -#define DISABLED_PERIOD 10 /* time for the bau to be - disabled, in seconds */ -/* see msg_type: */ -#define MSG_NOOP 0 -#define MSG_REGULAR 1 -#define MSG_RETRY 2 - -#define BAU_DESC_QUALIFIER 0x534749 - -enum uv_bau_version { - UV_BAU_V2 = 2, - UV_BAU_V3, - UV_BAU_V4, -}; - -/* - * Distribution: 32 bytes (256 bits) (bytes 0-0x1f of descriptor) - * If the 'multilevel' flag in the header portion of the descriptor - * has been set to 0, then endpoint multi-unicast mode is selected. - * The distribution specification (32 bytes) is interpreted as a 256-bit - * distribution vector. Adjacent bits correspond to consecutive even numbered - * nodeIDs. The result of adding the index of a given bit to the 15-bit - * 'base_dest_nasid' field of the header corresponds to the - * destination nodeID associated with that specified bit. - */ -struct pnmask { - unsigned long bits[BITS_TO_LONGS(UV_DISTRIBUTION_SIZE)]; -}; - -/* - * mask of cpu's on a uvhub - * (during initialization we need to check that unsigned long has - * enough bits for max. cpu's per uvhub) - */ -struct bau_local_cpumask { - unsigned long bits; -}; - -/* - * Payload: 16 bytes (128 bits) (bytes 0x20-0x2f of descriptor) - * only 12 bytes (96 bits) of the payload area are usable. - * An additional 3 bytes (bits 27:4) of the header address are carried - * to the next bytes of the destination payload queue. - * And an additional 2 bytes of the header Suppl_A field are also - * carried to the destination payload queue. - * But the first byte of the Suppl_A becomes bits 127:120 (the 16th byte) - * of the destination payload queue, which is written by the hardware - * with the s/w ack resource bit vector. - * [ effective message contents (16 bytes (128 bits) maximum), not counting - * the s/w ack bit vector ] - */ - -/** - * struct uv2_3_bau_msg_payload - defines payload for INTD transactions - * @address: Signifies a page or all TLB's of the cpu - * @sending_cpu: CPU from which the message originates - * @acknowledge_count: CPUs on the destination Hub that received the interrupt - */ -struct uv2_3_bau_msg_payload { - u64 address; - u16 sending_cpu; - u16 acknowledge_count; -}; - -/** - * struct uv4_bau_msg_payload - defines payload for INTD transactions - * @address: Signifies a page or all TLB's of the cpu - * @sending_cpu: CPU from which the message originates - * @acknowledge_count: CPUs on the destination Hub that received the interrupt - * @qualifier: Set by source to verify origin of INTD broadcast - */ -struct uv4_bau_msg_payload { - u64 address; - u16 sending_cpu; - u16 acknowledge_count; - u32 reserved:8; - u32 qualifier:24; -}; - -/* - * UV2 Message header: 16 bytes (128 bits) (bytes 0x30-0x3f of descriptor) - * see figure 9-2 of harp_sys.pdf - * assuming UV3 is the same - */ -struct uv2_3_bau_msg_header { - unsigned int base_dest_nasid:15; /* nasid of the first bit */ - /* bits 14:0 */ /* in uvhub map */ - unsigned int dest_subnodeid:5; /* must be 0x10, for the LB */ - /* bits 19:15 */ - unsigned int rsvd_1:1; /* must be zero */ - /* bit 20 */ - /* Address bits 59:21 */ - /* bits 25:2 of address (44:21) are payload */ - /* these next 24 bits become bytes 12-14 of msg */ - /* bits 28:21 land in byte 12 */ - unsigned int replied_to:1; /* sent as 0 by the source to - byte 12 */ - /* bit 21 */ - unsigned int msg_type:3; /* software type of the - message */ - /* bits 24:22 */ - unsigned int canceled:1; /* message canceled, resource - is to be freed*/ - /* bit 25 */ - unsigned int payload_1:3; /* not currently used */ - /* bits 28:26 */ - - /* bits 36:29 land in byte 13 */ - unsigned int payload_2a:3; /* not currently used */ - unsigned int payload_2b:5; /* not currently used */ - /* bits 36:29 */ - - /* bits 44:37 land in byte 14 */ - unsigned int payload_3:8; /* not currently used */ - /* bits 44:37 */ - - unsigned int rsvd_2:7; /* reserved */ - /* bits 51:45 */ - unsigned int swack_flag:1; /* software acknowledge flag */ - /* bit 52 */ - unsigned int rsvd_3a:3; /* must be zero */ - unsigned int rsvd_3b:8; /* must be zero */ - unsigned int rsvd_3c:8; /* must be zero */ - unsigned int rsvd_3d:3; /* must be zero */ - /* bits 74:53 */ - unsigned int fairness:3; /* usually zero */ - /* bits 77:75 */ - - unsigned int sequence:16; /* message sequence number */ - /* bits 93:78 Suppl_A */ - unsigned int chaining:1; /* next descriptor is part of - this activation*/ - /* bit 94 */ - unsigned int multilevel:1; /* multi-level multicast - format */ - /* bit 95 */ - unsigned int rsvd_4:24; /* ordered / source node / - source subnode / aging - must be zero */ - /* bits 119:96 */ - unsigned int command:8; /* message type */ - /* bits 127:120 */ -}; - -/* - * The activation descriptor: - * The format of the message to send, plus all accompanying control - * Should be 64 bytes - */ -struct bau_desc { - struct pnmask distribution; - /* - * message template, consisting of header and payload: - */ - union bau_msg_header { - struct uv2_3_bau_msg_header uv2_3_hdr; - } header; - - union bau_payload_header { - struct uv2_3_bau_msg_payload uv2_3; - struct uv4_bau_msg_payload uv4; - } payload; -}; -/* UV2: - * -payload-- ---------header------ - * bytes 0-11 bits 70-78 bits 21-44 - * A B (2) C (3) - * - * A/B/C are moved to: - * A C B - * bytes 0-11 bytes 12-14 bytes 16-17 (byte 15 filled in by hw as vector) - * ------------payload queue----------- - */ - -/* - * The payload queue on the destination side is an array of these. - * With BAU_MISC_CONTROL set for software acknowledge mode, the messages - * are 32 bytes (2 micropackets) (256 bits) in length, but contain only 17 - * bytes of usable data, including the sw ack vector in byte 15 (bits 127:120) - * (12 bytes come from bau_msg_payload, 3 from payload_1, 2 from - * swack_vec and payload_2) - * "Enabling Software Acknowledgment mode (see Section 4.3.3 Software - * Acknowledge Processing) also selects 32 byte (17 bytes usable) payload - * operation." - */ -struct bau_pq_entry { - unsigned long address; /* signifies a page or all TLB's - of the cpu */ - /* 64 bits, bytes 0-7 */ - unsigned short sending_cpu; /* cpu that sent the message */ - /* 16 bits, bytes 8-9 */ - unsigned short acknowledge_count; /* filled in by destination */ - /* 16 bits, bytes 10-11 */ - /* these next 3 bytes come from bits 58-81 of the message header */ - unsigned short replied_to:1; /* sent as 0 by the source */ - unsigned short msg_type:3; /* software message type */ - unsigned short canceled:1; /* sent as 0 by the source */ - unsigned short unused1:3; /* not currently using */ - /* byte 12 */ - unsigned char unused2a; /* not currently using */ - /* byte 13 */ - unsigned char unused2; /* not currently using */ - /* byte 14 */ - unsigned char swack_vec; /* filled in by the hardware */ - /* byte 15 (bits 127:120) */ - unsigned short sequence; /* message sequence number */ - /* bytes 16-17 */ - unsigned char unused4[2]; /* not currently using bytes 18-19 */ - /* bytes 18-19 */ - int number_of_cpus; /* filled in at destination */ - /* 32 bits, bytes 20-23 (aligned) */ - unsigned char unused5[8]; /* not using */ - /* bytes 24-31 */ -}; - -struct msg_desc { - struct bau_pq_entry *msg; - int msg_slot; - struct bau_pq_entry *queue_first; - struct bau_pq_entry *queue_last; -}; - -struct reset_args { - int sender; -}; - -/* - * This structure is allocated per_cpu for UV TLB shootdown statistics. - */ -struct ptc_stats { - /* sender statistics */ - unsigned long s_giveup; /* number of fall backs to - IPI-style flushes */ - unsigned long s_requestor; /* number of shootdown - requests */ - unsigned long s_stimeout; /* source side timeouts */ - unsigned long s_dtimeout; /* destination side timeouts */ - unsigned long s_strongnacks; /* number of strong nack's */ - unsigned long s_time; /* time spent in sending side */ - unsigned long s_retriesok; /* successful retries */ - unsigned long s_ntargcpu; /* total number of cpu's - targeted */ - unsigned long s_ntargself; /* times the sending cpu was - targeted */ - unsigned long s_ntarglocals; /* targets of cpus on the local - blade */ - unsigned long s_ntargremotes; /* targets of cpus on remote - blades */ - unsigned long s_ntarglocaluvhub; /* targets of the local hub */ - unsigned long s_ntargremoteuvhub; /* remotes hubs targeted */ - unsigned long s_ntarguvhub; /* total number of uvhubs - targeted */ - unsigned long s_ntarguvhub16; /* number of times target - hubs >= 16*/ - unsigned long s_ntarguvhub8; /* number of times target - hubs >= 8 */ - unsigned long s_ntarguvhub4; /* number of times target - hubs >= 4 */ - unsigned long s_ntarguvhub2; /* number of times target - hubs >= 2 */ - unsigned long s_ntarguvhub1; /* number of times target - hubs == 1 */ - unsigned long s_resets_plug; /* ipi-style resets from plug - state */ - unsigned long s_resets_timeout; /* ipi-style resets from - timeouts */ - unsigned long s_busy; /* status stayed busy past - s/w timer */ - unsigned long s_throttles; /* waits in throttle */ - unsigned long s_retry_messages; /* retry broadcasts */ - unsigned long s_bau_reenabled; /* for bau enable/disable */ - unsigned long s_bau_disabled; /* for bau enable/disable */ - unsigned long s_uv2_wars; /* uv2 workaround, perm. busy */ - unsigned long s_uv2_wars_hw; /* uv2 workaround, hiwater */ - unsigned long s_uv2_war_waits; /* uv2 workaround, long waits */ - unsigned long s_overipilimit; /* over the ipi reset limit */ - unsigned long s_giveuplimit; /* disables, over giveup limit*/ - unsigned long s_enters; /* entries to the driver */ - unsigned long s_ipifordisabled; /* fall back to IPI; disabled */ - unsigned long s_plugged; /* plugged by h/w bug*/ - unsigned long s_congested; /* giveup on long wait */ - /* destination statistics */ - unsigned long d_alltlb; /* times all tlb's on this - cpu were flushed */ - unsigned long d_onetlb; /* times just one tlb on this - cpu was flushed */ - unsigned long d_multmsg; /* interrupts with multiple - messages */ - unsigned long d_nomsg; /* interrupts with no message */ - unsigned long d_time; /* time spent on destination - side */ - unsigned long d_requestee; /* number of messages - processed */ - unsigned long d_retries; /* number of retry messages - processed */ - unsigned long d_canceled; /* number of messages canceled - by retries */ - unsigned long d_nocanceled; /* retries that found nothing - to cancel */ - unsigned long d_resets; /* number of ipi-style requests - processed */ - unsigned long d_rcanceled; /* number of messages canceled - by resets */ -}; - -struct tunables { - int *tunp; - int deflt; -}; - -struct hub_and_pnode { - short uvhub; - short pnode; -}; - -struct socket_desc { - short num_cpus; - short cpu_number[MAX_CPUS_PER_SOCKET]; -}; - -struct uvhub_desc { - unsigned short socket_mask; - short num_cpus; - short uvhub; - short pnode; - struct socket_desc socket[2]; -}; - -/** - * struct bau_control - * @status_mmr: location of status mmr, determined by uvhub_cpu - * @status_index: index of ERR|BUSY bits in status mmr, determined by uvhub_cpu - * - * Per-cpu control struct containing CPU topology information and BAU tuneables. - */ -struct bau_control { - struct bau_desc *descriptor_base; - struct bau_pq_entry *queue_first; - struct bau_pq_entry *queue_last; - struct bau_pq_entry *bau_msg_head; - struct bau_control *uvhub_master; - struct bau_control *socket_master; - struct ptc_stats *statp; - cpumask_t *cpumask; - unsigned long timeout_interval; - unsigned long set_bau_on_time; - atomic_t active_descriptor_count; - int plugged_tries; - int timeout_tries; - int ipi_attempts; - int conseccompletes; - u64 status_mmr; - int status_index; - bool nobau; - short baudisabled; - short cpu; - short osnode; - short uvhub_cpu; - short uvhub; - short uvhub_version; - short cpus_in_socket; - short cpus_in_uvhub; - short partition_base_pnode; - short busy; /* all were busy (war) */ - unsigned short message_number; - unsigned short uvhub_quiesce; - short socket_acknowledge_count[DEST_Q_SIZE]; - cycles_t send_message; - cycles_t period_end; - cycles_t period_time; - spinlock_t uvhub_lock; - spinlock_t queue_lock; - spinlock_t disable_lock; - /* tunables */ - int max_concurr; - int max_concurr_const; - int plugged_delay; - int plugsb4reset; - int timeoutsb4reset; - int ipi_reset_limit; - int complete_threshold; - int cong_response_us; - int cong_reps; - cycles_t disabled_period; - int period_giveups; - int giveup_limit; - long period_requests; - struct hub_and_pnode *thp; -}; - -/* Abstracted BAU functions */ -struct bau_operations { - unsigned long (*read_l_sw_ack)(void); - unsigned long (*read_g_sw_ack)(int pnode); - unsigned long (*bau_gpa_to_offset)(unsigned long vaddr); - void (*write_l_sw_ack)(unsigned long mmr); - void (*write_g_sw_ack)(int pnode, unsigned long mmr); - void (*write_payload_first)(int pnode, unsigned long mmr); - void (*write_payload_last)(int pnode, unsigned long mmr); - int (*wait_completion)(struct bau_desc*, - struct bau_control*, long try); -}; - -static inline void write_mmr_data_broadcast(int pnode, unsigned long mmr_image) -{ - write_gmmr(pnode, UVH_BAU_DATA_BROADCAST, mmr_image); -} - -static inline void write_mmr_descriptor_base(int pnode, unsigned long mmr_image) -{ - write_gmmr(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE, mmr_image); -} - -static inline void write_mmr_activation(unsigned long index) -{ - write_lmmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index); -} - -static inline void write_gmmr_activation(int pnode, unsigned long mmr_image) -{ - write_gmmr(pnode, UVH_LB_BAU_SB_ACTIVATION_CONTROL, mmr_image); -} - -static inline void write_mmr_proc_payload_first(int pnode, unsigned long mmr_image) -{ - write_gmmr(pnode, UV4H_LB_PROC_INTD_QUEUE_FIRST, mmr_image); -} - -static inline void write_mmr_proc_payload_last(int pnode, unsigned long mmr_image) -{ - write_gmmr(pnode, UV4H_LB_PROC_INTD_QUEUE_LAST, mmr_image); -} - -static inline void write_mmr_payload_first(int pnode, unsigned long mmr_image) -{ - write_gmmr(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST, mmr_image); -} - -static inline void write_mmr_payload_tail(int pnode, unsigned long mmr_image) -{ - write_gmmr(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL, mmr_image); -} - -static inline void write_mmr_payload_last(int pnode, unsigned long mmr_image) -{ - write_gmmr(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST, mmr_image); -} - -static inline void write_mmr_misc_control(int pnode, unsigned long mmr_image) -{ - write_gmmr(pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image); -} - -static inline unsigned long read_mmr_misc_control(int pnode) -{ - return read_gmmr(pnode, UVH_LB_BAU_MISC_CONTROL); -} - -static inline void write_mmr_sw_ack(unsigned long mr) -{ - uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, mr); -} - -static inline void write_gmmr_sw_ack(int pnode, unsigned long mr) -{ - write_gmmr(pnode, UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, mr); -} - -static inline unsigned long read_mmr_sw_ack(void) -{ - return read_lmmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); -} - -static inline unsigned long read_gmmr_sw_ack(int pnode) -{ - return read_gmmr(pnode, UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); -} - -static inline void write_mmr_proc_sw_ack(unsigned long mr) -{ - uv_write_local_mmr(UV4H_LB_PROC_INTD_SOFT_ACK_CLEAR, mr); -} - -static inline void write_gmmr_proc_sw_ack(int pnode, unsigned long mr) -{ - write_gmmr(pnode, UV4H_LB_PROC_INTD_SOFT_ACK_CLEAR, mr); -} - -static inline unsigned long read_mmr_proc_sw_ack(void) -{ - return read_lmmr(UV4H_LB_PROC_INTD_SOFT_ACK_PENDING); -} - -static inline unsigned long read_gmmr_proc_sw_ack(int pnode) -{ - return read_gmmr(pnode, UV4H_LB_PROC_INTD_SOFT_ACK_PENDING); -} - -static inline void write_mmr_data_config(int pnode, unsigned long mr) -{ - uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, mr); -} - -static inline int bau_uvhub_isset(int uvhub, struct pnmask *dstp) -{ - return constant_test_bit(uvhub, &dstp->bits[0]); -} -static inline void bau_uvhub_set(int pnode, struct pnmask *dstp) -{ - __set_bit(pnode, &dstp->bits[0]); -} -static inline void bau_uvhubs_clear(struct pnmask *dstp, - int nbits) -{ - bitmap_zero(&dstp->bits[0], nbits); -} -static inline int bau_uvhub_weight(struct pnmask *dstp) -{ - return bitmap_weight((unsigned long *)&dstp->bits[0], - UV_DISTRIBUTION_SIZE); -} - -static inline void bau_cpubits_clear(struct bau_local_cpumask *dstp, int nbits) -{ - bitmap_zero(&dstp->bits, nbits); -} - -struct atomic_short { - short counter; -}; - -/* - * atomic_read_short - read a short atomic variable - * @v: pointer of type atomic_short - * - * Atomically reads the value of @v. - */ -static inline int atomic_read_short(const struct atomic_short *v) -{ - return v->counter; -} - -/* - * atom_asr - add and return a short int - * @i: short value to add - * @v: pointer of type atomic_short - * - * Atomically adds @i to @v and returns @i + @v - */ -static inline int atom_asr(short i, struct atomic_short *v) -{ - short __i = i; - asm volatile(LOCK_PREFIX "xaddw %0, %1" - : "+r" (i), "+m" (v->counter) - : : "memory"); - return i + __i; -} - -/* - * conditionally add 1 to *v, unless *v is >= u - * return 0 if we cannot add 1 to *v because it is >= u - * return 1 if we can add 1 to *v because it is < u - * the add is atomic - * - * This is close to atomic_add_unless(), but this allows the 'u' value - * to be lowered below the current 'v'. atomic_add_unless can only stop - * on equal. - */ -static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u) -{ - spin_lock(lock); - if (atomic_read(v) >= u) { - spin_unlock(lock); - return 0; - } - atomic_inc(v); - spin_unlock(lock); - return 1; -} - -void uv_bau_message_interrupt(struct pt_regs *regs); - -#endif /* _ASM_X86_UV_UV_BAU_H */ diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index 100d66806503..5002f52be332 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -5,6 +5,7 @@ * * SGI UV architectural definitions * + * (C) Copyright 2020 Hewlett Packard Enterprise Development LP * Copyright (C) 2007-2014 Silicon Graphics, Inc. All rights reserved. */ @@ -129,17 +130,6 @@ */ #define UV_MAX_NASID_VALUE (UV_MAX_NUMALINK_BLADES * 2) -/* System Controller Interface Reg info */ -struct uv_scir_s { - struct timer_list timer; - unsigned long offset; - unsigned long last; - unsigned long idle_on; - unsigned long idle_off; - unsigned char state; - unsigned char enabled; -}; - /* GAM (globally addressed memory) range table */ struct uv_gam_range_s { u32 limit; /* PA bits 56:26 (GAM_RANGE_SHFT) */ @@ -155,6 +145,8 @@ struct uv_gam_range_s { * available in the L3 cache on the cpu socket for the node. */ struct uv_hub_info_s { + unsigned int hub_type; + unsigned char hub_revision; unsigned long global_mmr_base; unsigned long global_mmr_shift; unsigned long gpa_mask; @@ -167,9 +159,9 @@ struct uv_hub_info_s { unsigned char m_val; unsigned char n_val; unsigned char gr_table_len; - unsigned char hub_revision; unsigned char apic_pnode_shift; unsigned char gpa_shift; + unsigned char nasid_shift; unsigned char m_shift; unsigned char n_lshift; unsigned int gnode_extra; @@ -191,16 +183,13 @@ struct uv_hub_info_s { struct uv_cpu_info_s { void *p_uv_hub_info; unsigned char blade_cpu_id; - struct uv_scir_s scir; + void *reserved; }; DECLARE_PER_CPU(struct uv_cpu_info_s, __uv_cpu_info); #define uv_cpu_info this_cpu_ptr(&__uv_cpu_info) #define uv_cpu_info_per(cpu) (&per_cpu(__uv_cpu_info, cpu)) -#define uv_scir_info (&uv_cpu_info->scir) -#define uv_cpu_scir_info(cpu) (&uv_cpu_info_per(cpu)->scir) - /* Node specific hub common info struct */ extern void **__uv_hub_info_list; static inline struct uv_hub_info_s *uv_hub_info_list(int node) @@ -219,6 +208,17 @@ static inline struct uv_hub_info_s *uv_cpu_hub_info(int cpu) return (struct uv_hub_info_s *)uv_cpu_info_per(cpu)->p_uv_hub_info; } +static inline int uv_hub_type(void) +{ + return uv_hub_info->hub_type; +} + +static inline __init void uv_hub_type_set(int uvmask) +{ + uv_hub_info->hub_type = uvmask; +} + + /* * HUB revision ranges for each UV HUB architecture. * This is a software convention - NOT the hardware revision numbers in @@ -228,39 +228,31 @@ static inline struct uv_hub_info_s *uv_cpu_hub_info(int cpu) #define UV3_HUB_REVISION_BASE 5 #define UV4_HUB_REVISION_BASE 7 #define UV4A_HUB_REVISION_BASE 8 /* UV4 (fixed) rev 2 */ +#define UV5_HUB_REVISION_BASE 9 -static inline int is_uv2_hub(void) -{ - return is_uv_hubbed(uv(2)); -} - -static inline int is_uv3_hub(void) -{ - return is_uv_hubbed(uv(3)); -} +static inline int is_uv(int uvmask) { return uv_hub_type() & uvmask; } +static inline int is_uv1_hub(void) { return 0; } +static inline int is_uv2_hub(void) { return is_uv(UV2); } +static inline int is_uv3_hub(void) { return is_uv(UV3); } +static inline int is_uv4a_hub(void) { return is_uv(UV4A); } +static inline int is_uv4_hub(void) { return is_uv(UV4); } +static inline int is_uv5_hub(void) { return is_uv(UV5); } -/* First test "is UV4A", then "is UV4" */ -static inline int is_uv4a_hub(void) -{ - if (is_uv_hubbed(uv(4))) - return (uv_hub_info->hub_revision == UV4A_HUB_REVISION_BASE); - return 0; -} +/* + * UV4A is a revision of UV4. So on UV4A, both is_uv4_hub() and + * is_uv4a_hub() return true, While on UV4, only is_uv4_hub() + * returns true. So to get true results, first test if is UV4A, + * then test if is UV4. + */ -static inline int is_uv4_hub(void) -{ - return is_uv_hubbed(uv(4)); -} +/* UVX class: UV2,3,4 */ +static inline int is_uvx_hub(void) { return is_uv(UVX); } -static inline int is_uvx_hub(void) -{ - return (is_uv_hubbed(-2) >= uv(2)); -} +/* UVY class: UV5,..? */ +static inline int is_uvy_hub(void) { return is_uv(UVY); } -static inline int is_uv_hub(void) -{ - return is_uvx_hub(); -} +/* Any UV Hubbed System */ +static inline int is_uv_hub(void) { return is_uv(UV_ANY); } union uvh_apicid { unsigned long v; @@ -282,9 +274,11 @@ union uvh_apicid { * g - GNODE (full 15-bit global nasid, right shifted 1) * p - PNODE (local part of nsids, right shifted 1) */ -#define UV_NASID_TO_PNODE(n) (((n) >> 1) & uv_hub_info->pnode_mask) +#define UV_NASID_TO_PNODE(n) \ + (((n) >> uv_hub_info->nasid_shift) & uv_hub_info->pnode_mask) #define UV_PNODE_TO_GNODE(p) ((p) |uv_hub_info->gnode_extra) -#define UV_PNODE_TO_NASID(p) (UV_PNODE_TO_GNODE(p) << 1) +#define UV_PNODE_TO_NASID(p) \ + (UV_PNODE_TO_GNODE(p) << uv_hub_info->nasid_shift) #define UV2_LOCAL_MMR_BASE 0xfa000000UL #define UV2_GLOBAL_MMR32_BASE 0xfc000000UL @@ -297,29 +291,42 @@ union uvh_apicid { #define UV3_GLOBAL_MMR32_SIZE (32UL * 1024 * 1024) #define UV4_LOCAL_MMR_BASE 0xfa000000UL -#define UV4_GLOBAL_MMR32_BASE 0xfc000000UL +#define UV4_GLOBAL_MMR32_BASE 0 #define UV4_LOCAL_MMR_SIZE (32UL * 1024 * 1024) -#define UV4_GLOBAL_MMR32_SIZE (16UL * 1024 * 1024) +#define UV4_GLOBAL_MMR32_SIZE 0 + +#define UV5_LOCAL_MMR_BASE 0xfa000000UL +#define UV5_GLOBAL_MMR32_BASE 0 +#define UV5_LOCAL_MMR_SIZE (32UL * 1024 * 1024) +#define UV5_GLOBAL_MMR32_SIZE 0 #define UV_LOCAL_MMR_BASE ( \ - is_uv2_hub() ? UV2_LOCAL_MMR_BASE : \ - is_uv3_hub() ? UV3_LOCAL_MMR_BASE : \ - /*is_uv4_hub*/ UV4_LOCAL_MMR_BASE) + is_uv(UV2) ? UV2_LOCAL_MMR_BASE : \ + is_uv(UV3) ? UV3_LOCAL_MMR_BASE : \ + is_uv(UV4) ? UV4_LOCAL_MMR_BASE : \ + is_uv(UV5) ? UV5_LOCAL_MMR_BASE : \ + 0) #define UV_GLOBAL_MMR32_BASE ( \ - is_uv2_hub() ? UV2_GLOBAL_MMR32_BASE : \ - is_uv3_hub() ? UV3_GLOBAL_MMR32_BASE : \ - /*is_uv4_hub*/ UV4_GLOBAL_MMR32_BASE) + is_uv(UV2) ? UV2_GLOBAL_MMR32_BASE : \ + is_uv(UV3) ? UV3_GLOBAL_MMR32_BASE : \ + is_uv(UV4) ? UV4_GLOBAL_MMR32_BASE : \ + is_uv(UV5) ? UV5_GLOBAL_MMR32_BASE : \ + 0) #define UV_LOCAL_MMR_SIZE ( \ - is_uv2_hub() ? UV2_LOCAL_MMR_SIZE : \ - is_uv3_hub() ? UV3_LOCAL_MMR_SIZE : \ - /*is_uv4_hub*/ UV4_LOCAL_MMR_SIZE) + is_uv(UV2) ? UV2_LOCAL_MMR_SIZE : \ + is_uv(UV3) ? UV3_LOCAL_MMR_SIZE : \ + is_uv(UV4) ? UV4_LOCAL_MMR_SIZE : \ + is_uv(UV5) ? UV5_LOCAL_MMR_SIZE : \ + 0) #define UV_GLOBAL_MMR32_SIZE ( \ - is_uv2_hub() ? UV2_GLOBAL_MMR32_SIZE : \ - is_uv3_hub() ? UV3_GLOBAL_MMR32_SIZE : \ - /*is_uv4_hub*/ UV4_GLOBAL_MMR32_SIZE) + is_uv(UV2) ? UV2_GLOBAL_MMR32_SIZE : \ + is_uv(UV3) ? UV3_GLOBAL_MMR32_SIZE : \ + is_uv(UV4) ? UV4_GLOBAL_MMR32_SIZE : \ + is_uv(UV5) ? UV5_GLOBAL_MMR32_SIZE : \ + 0) #define UV_GLOBAL_MMR64_BASE (uv_hub_info->global_mmr_base) @@ -720,7 +727,7 @@ extern void uv_nmi_setup_hubless(void); #define UVH_TSC_SYNC_SHIFT_UV2K 16 /* UV2/3k have different bits */ #define UVH_TSC_SYNC_MASK 3 /* 0011 */ #define UVH_TSC_SYNC_VALID 3 /* 0011 */ -#define UVH_TSC_SYNC_INVALID 2 /* 0010 */ +#define UVH_TSC_SYNC_UNKNOWN 0 /* 0000 */ /* BMC sets a bit this MMR non-zero before sending an NMI */ #define UVH_NMI_MMR UVH_BIOS_KERNEL_MMR @@ -728,19 +735,6 @@ extern void uv_nmi_setup_hubless(void); #define UVH_NMI_MMR_SHIFT 63 #define UVH_NMI_MMR_TYPE "SCRATCH5" -/* Newer SMM NMI handler, not present in all systems */ -#define UVH_NMI_MMRX UVH_EVENT_OCCURRED0 -#define UVH_NMI_MMRX_CLEAR UVH_EVENT_OCCURRED0_ALIAS -#define UVH_NMI_MMRX_SHIFT UVH_EVENT_OCCURRED0_EXTIO_INT0_SHFT -#define UVH_NMI_MMRX_TYPE "EXTIO_INT0" - -/* Non-zero indicates newer SMM NMI handler present */ -#define UVH_NMI_MMRX_SUPPORTED UVH_EXTIO_INT0_BROADCAST - -/* Indicates to BIOS that we want to use the newer SMM NMI handler */ -#define UVH_NMI_MMRX_REQ UVH_BIOS_KERNEL_MMR_ALIAS_2 -#define UVH_NMI_MMRX_REQ_SHIFT 62 - struct uv_hub_nmi_s { raw_spinlock_t nmi_lock; atomic_t in_nmi; /* flag this node in UV NMI IRQ */ @@ -772,29 +766,6 @@ DECLARE_PER_CPU(struct uv_cpu_nmi_s, uv_cpu_nmi); #define UV_NMI_STATE_DUMP 2 #define UV_NMI_STATE_DUMP_DONE 3 -/* Update SCIR state */ -static inline void uv_set_scir_bits(unsigned char value) -{ - if (uv_scir_info->state != value) { - uv_scir_info->state = value; - uv_write_local_mmr8(uv_scir_info->offset, value); - } -} - -static inline unsigned long uv_scir_offset(int apicid) -{ - return SCIR_LOCAL_MMR_BASE | (apicid & 0x3f); -} - -static inline void uv_set_cpu_scir_bits(int cpu, unsigned char value) -{ - if (uv_cpu_scir_info(cpu)->state != value) { - uv_write_global_mmr8(uv_cpu_to_pnode(cpu), - uv_cpu_scir_info(cpu)->offset, value); - uv_cpu_scir_info(cpu)->state = value; - } -} - /* * Get the minimum revision number of the hub chips within the partition. * (See UVx_HUB_REVISION_BASE above for specific values.) diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h index 775bf143a072..57fa67373262 100644 --- a/arch/x86/include/asm/uv/uv_mmrs.h +++ b/arch/x86/include/asm/uv/uv_mmrs.h @@ -3,8 +3,9 @@ * License. See the file "COPYING" in the main directory of this archive * for more details. * - * SGI UV MMR definitions + * HPE UV MMR definitions * + * (C) Copyright 2020 Hewlett Packard Enterprise Development LP * Copyright (C) 2007-2016 Silicon Graphics, Inc. All rights reserved. */ @@ -18,42 +19,43 @@ * grouped by architecture types. * * UVH - definitions common to all UV hub types. - * UVXH - definitions common to all UV eXtended hub types (currently 2, 3, 4). - * UV2H - definitions specific to UV type 2 hub. - * UV3H - definitions specific to UV type 3 hub. + * UVXH - definitions common to UVX class (2, 3, 4). + * UVYH - definitions common to UVY class (5). + * UV5H - definitions specific to UV type 5 hub. + * UV4AH - definitions specific to UV type 4A hub. * UV4H - definitions specific to UV type 4 hub. - * - * So in general, MMR addresses and structures are identical on all hubs types. - * These MMRs are identified as: - * #define UVH_xxx <address> - * union uvh_xxx { - * unsigned long v; - * struct uvh_int_cmpd_s { - * } s; - * }; + * UV3H - definitions specific to UV type 3 hub. + * UV2H - definitions specific to UV type 2 hub. * * If the MMR exists on all hub types but have different addresses, - * use a conditional operator to define the value at runtime. - * #define UV2Hxxx b - * #define UV3Hxxx c - * #define UV4Hxxx d - * #define UV4AHxxx e - * #define UVHxxx (is_uv2_hub() ? UV2Hxxx : - * (is_uv3_hub() ? UV3Hxxx : - * (is_uv4a_hub() ? UV4AHxxx : - * UV4Hxxx)) + * use a conditional operator to define the value at runtime. Any + * that are not defined are blank. + * (UV4A variations only generated if different from uv4) + * #define UVHxxx ( + * is_uv(UV5) ? UV5Hxxx value : + * is_uv(UV4A) ? UV4AHxxx value : + * is_uv(UV4) ? UV4Hxxx value : + * is_uv(UV3) ? UV3Hxxx value : + * is_uv(UV2) ? UV2Hxxx value : + * <ucv> or <undef value>) + * + * Class UVX has UVs (2|3|4|4A). + * Class UVY has UVs (5). * * union uvh_xxx { * unsigned long v; * struct uvh_xxx_s { # Common fields only * } s; - * struct uv2h_xxx_s { # Full UV2 definition (*) - * } s2; - * struct uv3h_xxx_s { # Full UV3 definition (*) - * } s3; - * (NOTE: No struct uv4ah_xxx_s members exist) + * struct uv5h_xxx_s { # Full UV5 definition (*) + * } s5; + * struct uv4ah_xxx_s { # Full UV4A definition (*) + * } s4a; * struct uv4h_xxx_s { # Full UV4 definition (*) * } s4; + * struct uv3h_xxx_s { # Full UV3 definition (*) + * } s3; + * struct uv2h_xxx_s { # Full UV2 definition (*) + * } s2; * }; * (* - if present and different than the common struct) * @@ -62,429 +64,499 @@ * if the contents is the same for all hubs, only the "s" structure is * generated. * - * If the MMR exists on ONLY 1 type of hub, no generic definition is - * generated: - * #define UVnH_xxx <uvn address> - * union uvnh_xxx { - * unsigned long v; - * struct uvh_int_cmpd_s { - * } sn; - * }; - * - * (GEN Flags: mflags_opt= undefs=function UV234=UVXH) + * (GEN Flags: undefs=function) */ + /* UV bit masks */ +#define UV2 (1 << 0) +#define UV3 (1 << 1) +#define UV4 (1 << 2) +#define UV4A (1 << 3) +#define UV5 (1 << 4) +#define UVX (UV2|UV3|UV4) +#define UVY (UV5) +#define UV_ANY (~0) + + + + #define UV_MMR_ENABLE (1UL << 63) +#define UV1_HUB_PART_NUMBER 0x88a5 #define UV2_HUB_PART_NUMBER 0x8eb8 #define UV2_HUB_PART_NUMBER_X 0x1111 #define UV3_HUB_PART_NUMBER 0x9578 #define UV3_HUB_PART_NUMBER_X 0x4321 #define UV4_HUB_PART_NUMBER 0x99a1 +#define UV5_HUB_PART_NUMBER 0xa171 /* Error function to catch undefined references */ extern unsigned long uv_undefined(char *str); /* ========================================================================= */ -/* UVH_BAU_DATA_BROADCAST */ -/* ========================================================================= */ -#define UVH_BAU_DATA_BROADCAST 0x61688UL - -#define UV2H_BAU_DATA_BROADCAST_32 0x440 -#define UV3H_BAU_DATA_BROADCAST_32 0x440 -#define UV4H_BAU_DATA_BROADCAST_32 0x360 -#define UVH_BAU_DATA_BROADCAST_32 ( \ - is_uv2_hub() ? UV2H_BAU_DATA_BROADCAST_32 : \ - is_uv3_hub() ? UV3H_BAU_DATA_BROADCAST_32 : \ - /*is_uv4_hub*/ UV4H_BAU_DATA_BROADCAST_32) - -#define UVH_BAU_DATA_BROADCAST_ENABLE_SHFT 0 -#define UVH_BAU_DATA_BROADCAST_ENABLE_MASK 0x0000000000000001UL - - -union uvh_bau_data_broadcast_u { - unsigned long v; - struct uvh_bau_data_broadcast_s { - unsigned long enable:1; /* RW */ - unsigned long rsvd_1_63:63; - } s; -}; - -/* ========================================================================= */ -/* UVH_BAU_DATA_CONFIG */ -/* ========================================================================= */ -#define UVH_BAU_DATA_CONFIG 0x61680UL - -#define UV2H_BAU_DATA_CONFIG_32 0x438 -#define UV3H_BAU_DATA_CONFIG_32 0x438 -#define UV4H_BAU_DATA_CONFIG_32 0x358 -#define UVH_BAU_DATA_CONFIG_32 ( \ - is_uv2_hub() ? UV2H_BAU_DATA_CONFIG_32 : \ - is_uv3_hub() ? UV3H_BAU_DATA_CONFIG_32 : \ - /*is_uv4_hub*/ UV4H_BAU_DATA_CONFIG_32) - -#define UVH_BAU_DATA_CONFIG_VECTOR_SHFT 0 -#define UVH_BAU_DATA_CONFIG_DM_SHFT 8 -#define UVH_BAU_DATA_CONFIG_DESTMODE_SHFT 11 -#define UVH_BAU_DATA_CONFIG_STATUS_SHFT 12 -#define UVH_BAU_DATA_CONFIG_P_SHFT 13 -#define UVH_BAU_DATA_CONFIG_T_SHFT 15 -#define UVH_BAU_DATA_CONFIG_M_SHFT 16 -#define UVH_BAU_DATA_CONFIG_APIC_ID_SHFT 32 -#define UVH_BAU_DATA_CONFIG_VECTOR_MASK 0x00000000000000ffUL -#define UVH_BAU_DATA_CONFIG_DM_MASK 0x0000000000000700UL -#define UVH_BAU_DATA_CONFIG_DESTMODE_MASK 0x0000000000000800UL -#define UVH_BAU_DATA_CONFIG_STATUS_MASK 0x0000000000001000UL -#define UVH_BAU_DATA_CONFIG_P_MASK 0x0000000000002000UL -#define UVH_BAU_DATA_CONFIG_T_MASK 0x0000000000008000UL -#define UVH_BAU_DATA_CONFIG_M_MASK 0x0000000000010000UL -#define UVH_BAU_DATA_CONFIG_APIC_ID_MASK 0xffffffff00000000UL - - -union uvh_bau_data_config_u { - unsigned long v; - struct uvh_bau_data_config_s { - unsigned long vector_:8; /* RW */ - unsigned long dm:3; /* RW */ - unsigned long destmode:1; /* RW */ - unsigned long status:1; /* RO */ - unsigned long p:1; /* RO */ - unsigned long rsvd_14:1; - unsigned long t:1; /* RO */ - unsigned long m:1; /* RW */ - unsigned long rsvd_17_31:15; - unsigned long apic_id:32; /* RW */ - } s; -}; - -/* ========================================================================= */ /* UVH_EVENT_OCCURRED0 */ /* ========================================================================= */ #define UVH_EVENT_OCCURRED0 0x70000UL -#define UVH_EVENT_OCCURRED0_32 0x5e8 +/* UVH common defines*/ #define UVH_EVENT_OCCURRED0_LB_HCERR_SHFT 0 -#define UVH_EVENT_OCCURRED0_RH_AOERR0_SHFT 11 #define UVH_EVENT_OCCURRED0_LB_HCERR_MASK 0x0000000000000001UL -#define UVH_EVENT_OCCURRED0_RH_AOERR0_MASK 0x0000000000000800UL +/* UVXH common defines */ #define UVXH_EVENT_OCCURRED0_RH_HCERR_SHFT 2 -#define UVXH_EVENT_OCCURRED0_LH0_HCERR_SHFT 3 -#define UVXH_EVENT_OCCURRED0_LH1_HCERR_SHFT 4 -#define UVXH_EVENT_OCCURRED0_GR0_HCERR_SHFT 5 -#define UVXH_EVENT_OCCURRED0_GR1_HCERR_SHFT 6 -#define UVXH_EVENT_OCCURRED0_NI0_HCERR_SHFT 7 -#define UVXH_EVENT_OCCURRED0_NI1_HCERR_SHFT 8 -#define UVXH_EVENT_OCCURRED0_LB_AOERR0_SHFT 9 -#define UVXH_EVENT_OCCURRED0_LH0_AOERR0_SHFT 12 -#define UVXH_EVENT_OCCURRED0_LH1_AOERR0_SHFT 13 -#define UVXH_EVENT_OCCURRED0_GR0_AOERR0_SHFT 14 -#define UVXH_EVENT_OCCURRED0_GR1_AOERR0_SHFT 15 -#define UVXH_EVENT_OCCURRED0_XB_AOERR0_SHFT 16 #define UVXH_EVENT_OCCURRED0_RH_HCERR_MASK 0x0000000000000004UL +#define UVXH_EVENT_OCCURRED0_LH0_HCERR_SHFT 3 #define UVXH_EVENT_OCCURRED0_LH0_HCERR_MASK 0x0000000000000008UL +#define UVXH_EVENT_OCCURRED0_LH1_HCERR_SHFT 4 #define UVXH_EVENT_OCCURRED0_LH1_HCERR_MASK 0x0000000000000010UL +#define UVXH_EVENT_OCCURRED0_GR0_HCERR_SHFT 5 #define UVXH_EVENT_OCCURRED0_GR0_HCERR_MASK 0x0000000000000020UL +#define UVXH_EVENT_OCCURRED0_GR1_HCERR_SHFT 6 #define UVXH_EVENT_OCCURRED0_GR1_HCERR_MASK 0x0000000000000040UL +#define UVXH_EVENT_OCCURRED0_NI0_HCERR_SHFT 7 #define UVXH_EVENT_OCCURRED0_NI0_HCERR_MASK 0x0000000000000080UL +#define UVXH_EVENT_OCCURRED0_NI1_HCERR_SHFT 8 #define UVXH_EVENT_OCCURRED0_NI1_HCERR_MASK 0x0000000000000100UL +#define UVXH_EVENT_OCCURRED0_LB_AOERR0_SHFT 9 #define UVXH_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000000200UL +#define UVXH_EVENT_OCCURRED0_RH_AOERR0_SHFT 11 +#define UVXH_EVENT_OCCURRED0_RH_AOERR0_MASK 0x0000000000000800UL +#define UVXH_EVENT_OCCURRED0_LH0_AOERR0_SHFT 12 #define UVXH_EVENT_OCCURRED0_LH0_AOERR0_MASK 0x0000000000001000UL +#define UVXH_EVENT_OCCURRED0_LH1_AOERR0_SHFT 13 #define UVXH_EVENT_OCCURRED0_LH1_AOERR0_MASK 0x0000000000002000UL +#define UVXH_EVENT_OCCURRED0_GR0_AOERR0_SHFT 14 #define UVXH_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000004000UL +#define UVXH_EVENT_OCCURRED0_GR1_AOERR0_SHFT 15 #define UVXH_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000008000UL +#define UVXH_EVENT_OCCURRED0_XB_AOERR0_SHFT 16 #define UVXH_EVENT_OCCURRED0_XB_AOERR0_MASK 0x0000000000010000UL -#define UV2H_EVENT_OCCURRED0_QP_HCERR_SHFT 1 -#define UV2H_EVENT_OCCURRED0_QP_AOERR0_SHFT 10 -#define UV2H_EVENT_OCCURRED0_RT_AOERR0_SHFT 17 -#define UV2H_EVENT_OCCURRED0_NI0_AOERR0_SHFT 18 -#define UV2H_EVENT_OCCURRED0_NI1_AOERR0_SHFT 19 -#define UV2H_EVENT_OCCURRED0_LB_AOERR1_SHFT 20 -#define UV2H_EVENT_OCCURRED0_QP_AOERR1_SHFT 21 -#define UV2H_EVENT_OCCURRED0_RH_AOERR1_SHFT 22 -#define UV2H_EVENT_OCCURRED0_LH0_AOERR1_SHFT 23 -#define UV2H_EVENT_OCCURRED0_LH1_AOERR1_SHFT 24 -#define UV2H_EVENT_OCCURRED0_GR0_AOERR1_SHFT 25 -#define UV2H_EVENT_OCCURRED0_GR1_AOERR1_SHFT 26 -#define UV2H_EVENT_OCCURRED0_XB_AOERR1_SHFT 27 -#define UV2H_EVENT_OCCURRED0_RT_AOERR1_SHFT 28 -#define UV2H_EVENT_OCCURRED0_NI0_AOERR1_SHFT 29 -#define UV2H_EVENT_OCCURRED0_NI1_AOERR1_SHFT 30 -#define UV2H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 31 -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 32 -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 33 -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 34 -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 35 -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 36 -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 37 -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 38 -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 39 -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 40 -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 41 -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 42 -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 43 -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 44 -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 45 -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 46 -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 47 -#define UV2H_EVENT_OCCURRED0_L1_NMI_INT_SHFT 48 -#define UV2H_EVENT_OCCURRED0_STOP_CLOCK_SHFT 49 -#define UV2H_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 50 -#define UV2H_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 51 -#define UV2H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 52 -#define UV2H_EVENT_OCCURRED0_IPI_INT_SHFT 53 -#define UV2H_EVENT_OCCURRED0_EXTIO_INT0_SHFT 54 -#define UV2H_EVENT_OCCURRED0_EXTIO_INT1_SHFT 55 -#define UV2H_EVENT_OCCURRED0_EXTIO_INT2_SHFT 56 -#define UV2H_EVENT_OCCURRED0_EXTIO_INT3_SHFT 57 -#define UV2H_EVENT_OCCURRED0_PROFILE_INT_SHFT 58 -#define UV2H_EVENT_OCCURRED0_QP_HCERR_MASK 0x0000000000000002UL -#define UV2H_EVENT_OCCURRED0_QP_AOERR0_MASK 0x0000000000000400UL -#define UV2H_EVENT_OCCURRED0_RT_AOERR0_MASK 0x0000000000020000UL -#define UV2H_EVENT_OCCURRED0_NI0_AOERR0_MASK 0x0000000000040000UL -#define UV2H_EVENT_OCCURRED0_NI1_AOERR0_MASK 0x0000000000080000UL -#define UV2H_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000100000UL -#define UV2H_EVENT_OCCURRED0_QP_AOERR1_MASK 0x0000000000200000UL -#define UV2H_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000400000UL -#define UV2H_EVENT_OCCURRED0_LH0_AOERR1_MASK 0x0000000000800000UL -#define UV2H_EVENT_OCCURRED0_LH1_AOERR1_MASK 0x0000000001000000UL -#define UV2H_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000002000000UL -#define UV2H_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000004000000UL -#define UV2H_EVENT_OCCURRED0_XB_AOERR1_MASK 0x0000000008000000UL -#define UV2H_EVENT_OCCURRED0_RT_AOERR1_MASK 0x0000000010000000UL -#define UV2H_EVENT_OCCURRED0_NI0_AOERR1_MASK 0x0000000020000000UL -#define UV2H_EVENT_OCCURRED0_NI1_AOERR1_MASK 0x0000000040000000UL -#define UV2H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000080000000UL -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000100000000UL -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000200000000UL -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000400000000UL -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000800000000UL -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000001000000000UL -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000002000000000UL -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000004000000000UL -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000008000000000UL -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000010000000000UL -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000020000000000UL -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000040000000000UL -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000080000000000UL -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000100000000000UL -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000200000000000UL -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000400000000000UL -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000800000000000UL -#define UV2H_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0001000000000000UL -#define UV2H_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0002000000000000UL -#define UV2H_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0004000000000000UL -#define UV2H_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0008000000000000UL -#define UV2H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0010000000000000UL -#define UV2H_EVENT_OCCURRED0_IPI_INT_MASK 0x0020000000000000UL -#define UV2H_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0040000000000000UL -#define UV2H_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0080000000000000UL -#define UV2H_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0100000000000000UL -#define UV2H_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0200000000000000UL -#define UV2H_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0400000000000000UL - -#define UV3H_EVENT_OCCURRED0_QP_HCERR_SHFT 1 -#define UV3H_EVENT_OCCURRED0_QP_AOERR0_SHFT 10 -#define UV3H_EVENT_OCCURRED0_RT_AOERR0_SHFT 17 -#define UV3H_EVENT_OCCURRED0_NI0_AOERR0_SHFT 18 -#define UV3H_EVENT_OCCURRED0_NI1_AOERR0_SHFT 19 -#define UV3H_EVENT_OCCURRED0_LB_AOERR1_SHFT 20 -#define UV3H_EVENT_OCCURRED0_QP_AOERR1_SHFT 21 -#define UV3H_EVENT_OCCURRED0_RH_AOERR1_SHFT 22 -#define UV3H_EVENT_OCCURRED0_LH0_AOERR1_SHFT 23 -#define UV3H_EVENT_OCCURRED0_LH1_AOERR1_SHFT 24 -#define UV3H_EVENT_OCCURRED0_GR0_AOERR1_SHFT 25 -#define UV3H_EVENT_OCCURRED0_GR1_AOERR1_SHFT 26 -#define UV3H_EVENT_OCCURRED0_XB_AOERR1_SHFT 27 -#define UV3H_EVENT_OCCURRED0_RT_AOERR1_SHFT 28 -#define UV3H_EVENT_OCCURRED0_NI0_AOERR1_SHFT 29 -#define UV3H_EVENT_OCCURRED0_NI1_AOERR1_SHFT 30 -#define UV3H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 31 -#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 32 -#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 33 -#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 34 -#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 35 -#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 36 -#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 37 -#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 38 -#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 39 -#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 40 -#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 41 -#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 42 -#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 43 -#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 44 -#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 45 -#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 46 -#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 47 -#define UV3H_EVENT_OCCURRED0_L1_NMI_INT_SHFT 48 -#define UV3H_EVENT_OCCURRED0_STOP_CLOCK_SHFT 49 -#define UV3H_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 50 -#define UV3H_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 51 -#define UV3H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 52 -#define UV3H_EVENT_OCCURRED0_IPI_INT_SHFT 53 -#define UV3H_EVENT_OCCURRED0_EXTIO_INT0_SHFT 54 -#define UV3H_EVENT_OCCURRED0_EXTIO_INT1_SHFT 55 -#define UV3H_EVENT_OCCURRED0_EXTIO_INT2_SHFT 56 -#define UV3H_EVENT_OCCURRED0_EXTIO_INT3_SHFT 57 -#define UV3H_EVENT_OCCURRED0_PROFILE_INT_SHFT 58 -#define UV3H_EVENT_OCCURRED0_QP_HCERR_MASK 0x0000000000000002UL -#define UV3H_EVENT_OCCURRED0_QP_AOERR0_MASK 0x0000000000000400UL -#define UV3H_EVENT_OCCURRED0_RT_AOERR0_MASK 0x0000000000020000UL -#define UV3H_EVENT_OCCURRED0_NI0_AOERR0_MASK 0x0000000000040000UL -#define UV3H_EVENT_OCCURRED0_NI1_AOERR0_MASK 0x0000000000080000UL -#define UV3H_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000100000UL -#define UV3H_EVENT_OCCURRED0_QP_AOERR1_MASK 0x0000000000200000UL -#define UV3H_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000400000UL -#define UV3H_EVENT_OCCURRED0_LH0_AOERR1_MASK 0x0000000000800000UL -#define UV3H_EVENT_OCCURRED0_LH1_AOERR1_MASK 0x0000000001000000UL -#define UV3H_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000002000000UL -#define UV3H_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000004000000UL -#define UV3H_EVENT_OCCURRED0_XB_AOERR1_MASK 0x0000000008000000UL -#define UV3H_EVENT_OCCURRED0_RT_AOERR1_MASK 0x0000000010000000UL -#define UV3H_EVENT_OCCURRED0_NI0_AOERR1_MASK 0x0000000020000000UL -#define UV3H_EVENT_OCCURRED0_NI1_AOERR1_MASK 0x0000000040000000UL -#define UV3H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000080000000UL -#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000100000000UL -#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000200000000UL -#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000400000000UL -#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000800000000UL -#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000001000000000UL -#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000002000000000UL -#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000004000000000UL -#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000008000000000UL -#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000010000000000UL -#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000020000000000UL -#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000040000000000UL -#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000080000000000UL -#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000100000000000UL -#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000200000000000UL -#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000400000000000UL -#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000800000000000UL -#define UV3H_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0001000000000000UL -#define UV3H_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0002000000000000UL -#define UV3H_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0004000000000000UL -#define UV3H_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0008000000000000UL -#define UV3H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0010000000000000UL -#define UV3H_EVENT_OCCURRED0_IPI_INT_MASK 0x0020000000000000UL -#define UV3H_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0040000000000000UL -#define UV3H_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0080000000000000UL -#define UV3H_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0100000000000000UL -#define UV3H_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0200000000000000UL -#define UV3H_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0400000000000000UL - +/* UVYH common defines */ +#define UVYH_EVENT_OCCURRED0_KT_HCERR_SHFT 1 +#define UVYH_EVENT_OCCURRED0_KT_HCERR_MASK 0x0000000000000002UL +#define UVYH_EVENT_OCCURRED0_RH0_HCERR_SHFT 2 +#define UVYH_EVENT_OCCURRED0_RH0_HCERR_MASK 0x0000000000000004UL +#define UVYH_EVENT_OCCURRED0_RH1_HCERR_SHFT 3 +#define UVYH_EVENT_OCCURRED0_RH1_HCERR_MASK 0x0000000000000008UL +#define UVYH_EVENT_OCCURRED0_LH0_HCERR_SHFT 4 +#define UVYH_EVENT_OCCURRED0_LH0_HCERR_MASK 0x0000000000000010UL +#define UVYH_EVENT_OCCURRED0_LH1_HCERR_SHFT 5 +#define UVYH_EVENT_OCCURRED0_LH1_HCERR_MASK 0x0000000000000020UL +#define UVYH_EVENT_OCCURRED0_LH2_HCERR_SHFT 6 +#define UVYH_EVENT_OCCURRED0_LH2_HCERR_MASK 0x0000000000000040UL +#define UVYH_EVENT_OCCURRED0_LH3_HCERR_SHFT 7 +#define UVYH_EVENT_OCCURRED0_LH3_HCERR_MASK 0x0000000000000080UL +#define UVYH_EVENT_OCCURRED0_XB_HCERR_SHFT 8 +#define UVYH_EVENT_OCCURRED0_XB_HCERR_MASK 0x0000000000000100UL +#define UVYH_EVENT_OCCURRED0_RDM_HCERR_SHFT 9 +#define UVYH_EVENT_OCCURRED0_RDM_HCERR_MASK 0x0000000000000200UL +#define UVYH_EVENT_OCCURRED0_NI0_HCERR_SHFT 10 +#define UVYH_EVENT_OCCURRED0_NI0_HCERR_MASK 0x0000000000000400UL +#define UVYH_EVENT_OCCURRED0_NI1_HCERR_SHFT 11 +#define UVYH_EVENT_OCCURRED0_NI1_HCERR_MASK 0x0000000000000800UL +#define UVYH_EVENT_OCCURRED0_LB_AOERR0_SHFT 12 +#define UVYH_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000001000UL +#define UVYH_EVENT_OCCURRED0_KT_AOERR0_SHFT 13 +#define UVYH_EVENT_OCCURRED0_KT_AOERR0_MASK 0x0000000000002000UL +#define UVYH_EVENT_OCCURRED0_RH0_AOERR0_SHFT 14 +#define UVYH_EVENT_OCCURRED0_RH0_AOERR0_MASK 0x0000000000004000UL +#define UVYH_EVENT_OCCURRED0_RH1_AOERR0_SHFT 15 +#define UVYH_EVENT_OCCURRED0_RH1_AOERR0_MASK 0x0000000000008000UL +#define UVYH_EVENT_OCCURRED0_LH0_AOERR0_SHFT 16 +#define UVYH_EVENT_OCCURRED0_LH0_AOERR0_MASK 0x0000000000010000UL +#define UVYH_EVENT_OCCURRED0_LH1_AOERR0_SHFT 17 +#define UVYH_EVENT_OCCURRED0_LH1_AOERR0_MASK 0x0000000000020000UL +#define UVYH_EVENT_OCCURRED0_LH2_AOERR0_SHFT 18 +#define UVYH_EVENT_OCCURRED0_LH2_AOERR0_MASK 0x0000000000040000UL +#define UVYH_EVENT_OCCURRED0_LH3_AOERR0_SHFT 19 +#define UVYH_EVENT_OCCURRED0_LH3_AOERR0_MASK 0x0000000000080000UL +#define UVYH_EVENT_OCCURRED0_XB_AOERR0_SHFT 20 +#define UVYH_EVENT_OCCURRED0_XB_AOERR0_MASK 0x0000000000100000UL +#define UVYH_EVENT_OCCURRED0_RDM_AOERR0_SHFT 21 +#define UVYH_EVENT_OCCURRED0_RDM_AOERR0_MASK 0x0000000000200000UL +#define UVYH_EVENT_OCCURRED0_RT0_AOERR0_SHFT 22 +#define UVYH_EVENT_OCCURRED0_RT0_AOERR0_MASK 0x0000000000400000UL +#define UVYH_EVENT_OCCURRED0_RT1_AOERR0_SHFT 23 +#define UVYH_EVENT_OCCURRED0_RT1_AOERR0_MASK 0x0000000000800000UL +#define UVYH_EVENT_OCCURRED0_NI0_AOERR0_SHFT 24 +#define UVYH_EVENT_OCCURRED0_NI0_AOERR0_MASK 0x0000000001000000UL +#define UVYH_EVENT_OCCURRED0_NI1_AOERR0_SHFT 25 +#define UVYH_EVENT_OCCURRED0_NI1_AOERR0_MASK 0x0000000002000000UL +#define UVYH_EVENT_OCCURRED0_LB_AOERR1_SHFT 26 +#define UVYH_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000004000000UL +#define UVYH_EVENT_OCCURRED0_KT_AOERR1_SHFT 27 +#define UVYH_EVENT_OCCURRED0_KT_AOERR1_MASK 0x0000000008000000UL +#define UVYH_EVENT_OCCURRED0_RH0_AOERR1_SHFT 28 +#define UVYH_EVENT_OCCURRED0_RH0_AOERR1_MASK 0x0000000010000000UL +#define UVYH_EVENT_OCCURRED0_RH1_AOERR1_SHFT 29 +#define UVYH_EVENT_OCCURRED0_RH1_AOERR1_MASK 0x0000000020000000UL +#define UVYH_EVENT_OCCURRED0_LH0_AOERR1_SHFT 30 +#define UVYH_EVENT_OCCURRED0_LH0_AOERR1_MASK 0x0000000040000000UL +#define UVYH_EVENT_OCCURRED0_LH1_AOERR1_SHFT 31 +#define UVYH_EVENT_OCCURRED0_LH1_AOERR1_MASK 0x0000000080000000UL +#define UVYH_EVENT_OCCURRED0_LH2_AOERR1_SHFT 32 +#define UVYH_EVENT_OCCURRED0_LH2_AOERR1_MASK 0x0000000100000000UL +#define UVYH_EVENT_OCCURRED0_LH3_AOERR1_SHFT 33 +#define UVYH_EVENT_OCCURRED0_LH3_AOERR1_MASK 0x0000000200000000UL +#define UVYH_EVENT_OCCURRED0_XB_AOERR1_SHFT 34 +#define UVYH_EVENT_OCCURRED0_XB_AOERR1_MASK 0x0000000400000000UL +#define UVYH_EVENT_OCCURRED0_RDM_AOERR1_SHFT 35 +#define UVYH_EVENT_OCCURRED0_RDM_AOERR1_MASK 0x0000000800000000UL +#define UVYH_EVENT_OCCURRED0_RT0_AOERR1_SHFT 36 +#define UVYH_EVENT_OCCURRED0_RT0_AOERR1_MASK 0x0000001000000000UL +#define UVYH_EVENT_OCCURRED0_RT1_AOERR1_SHFT 37 +#define UVYH_EVENT_OCCURRED0_RT1_AOERR1_MASK 0x0000002000000000UL +#define UVYH_EVENT_OCCURRED0_NI0_AOERR1_SHFT 38 +#define UVYH_EVENT_OCCURRED0_NI0_AOERR1_MASK 0x0000004000000000UL +#define UVYH_EVENT_OCCURRED0_NI1_AOERR1_SHFT 39 +#define UVYH_EVENT_OCCURRED0_NI1_AOERR1_MASK 0x0000008000000000UL +#define UVYH_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 40 +#define UVYH_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000010000000000UL +#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 41 +#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000020000000000UL +#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 42 +#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000040000000000UL +#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 43 +#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000080000000000UL +#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 44 +#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000100000000000UL +#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 45 +#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000200000000000UL +#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 46 +#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000400000000000UL +#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 47 +#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000800000000000UL +#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 48 +#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0001000000000000UL +#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 49 +#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0002000000000000UL +#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 50 +#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0004000000000000UL +#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 51 +#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0008000000000000UL +#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 52 +#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0010000000000000UL +#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 53 +#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0020000000000000UL +#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 54 +#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0040000000000000UL +#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 55 +#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0080000000000000UL +#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 56 +#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0100000000000000UL +#define UVYH_EVENT_OCCURRED0_L1_NMI_INT_SHFT 57 +#define UVYH_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0200000000000000UL +#define UVYH_EVENT_OCCURRED0_STOP_CLOCK_SHFT 58 +#define UVYH_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0400000000000000UL +#define UVYH_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 59 +#define UVYH_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0800000000000000UL +#define UVYH_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 60 +#define UVYH_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x1000000000000000UL +#define UVYH_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 61 +#define UVYH_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x2000000000000000UL + +/* UV4 unique defines */ #define UV4H_EVENT_OCCURRED0_KT_HCERR_SHFT 1 -#define UV4H_EVENT_OCCURRED0_KT_AOERR0_SHFT 10 -#define UV4H_EVENT_OCCURRED0_RTQ0_AOERR0_SHFT 17 -#define UV4H_EVENT_OCCURRED0_RTQ1_AOERR0_SHFT 18 -#define UV4H_EVENT_OCCURRED0_RTQ2_AOERR0_SHFT 19 -#define UV4H_EVENT_OCCURRED0_RTQ3_AOERR0_SHFT 20 -#define UV4H_EVENT_OCCURRED0_NI0_AOERR0_SHFT 21 -#define UV4H_EVENT_OCCURRED0_NI1_AOERR0_SHFT 22 -#define UV4H_EVENT_OCCURRED0_LB_AOERR1_SHFT 23 -#define UV4H_EVENT_OCCURRED0_KT_AOERR1_SHFT 24 -#define UV4H_EVENT_OCCURRED0_RH_AOERR1_SHFT 25 -#define UV4H_EVENT_OCCURRED0_LH0_AOERR1_SHFT 26 -#define UV4H_EVENT_OCCURRED0_LH1_AOERR1_SHFT 27 -#define UV4H_EVENT_OCCURRED0_GR0_AOERR1_SHFT 28 -#define UV4H_EVENT_OCCURRED0_GR1_AOERR1_SHFT 29 -#define UV4H_EVENT_OCCURRED0_XB_AOERR1_SHFT 30 -#define UV4H_EVENT_OCCURRED0_RTQ0_AOERR1_SHFT 31 -#define UV4H_EVENT_OCCURRED0_RTQ1_AOERR1_SHFT 32 -#define UV4H_EVENT_OCCURRED0_RTQ2_AOERR1_SHFT 33 -#define UV4H_EVENT_OCCURRED0_RTQ3_AOERR1_SHFT 34 -#define UV4H_EVENT_OCCURRED0_NI0_AOERR1_SHFT 35 -#define UV4H_EVENT_OCCURRED0_NI1_AOERR1_SHFT 36 -#define UV4H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 37 -#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 38 -#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 39 -#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 40 -#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 41 -#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 42 -#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 43 -#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 44 -#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 45 -#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 46 -#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 47 -#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 48 -#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 49 -#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 50 -#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 51 -#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 52 -#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 53 -#define UV4H_EVENT_OCCURRED0_L1_NMI_INT_SHFT 54 -#define UV4H_EVENT_OCCURRED0_STOP_CLOCK_SHFT 55 -#define UV4H_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 56 -#define UV4H_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 57 -#define UV4H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 58 -#define UV4H_EVENT_OCCURRED0_IPI_INT_SHFT 59 -#define UV4H_EVENT_OCCURRED0_EXTIO_INT0_SHFT 60 -#define UV4H_EVENT_OCCURRED0_EXTIO_INT1_SHFT 61 -#define UV4H_EVENT_OCCURRED0_EXTIO_INT2_SHFT 62 -#define UV4H_EVENT_OCCURRED0_EXTIO_INT3_SHFT 63 #define UV4H_EVENT_OCCURRED0_KT_HCERR_MASK 0x0000000000000002UL +#define UV4H_EVENT_OCCURRED0_KT_AOERR0_SHFT 10 #define UV4H_EVENT_OCCURRED0_KT_AOERR0_MASK 0x0000000000000400UL +#define UV4H_EVENT_OCCURRED0_RTQ0_AOERR0_SHFT 17 #define UV4H_EVENT_OCCURRED0_RTQ0_AOERR0_MASK 0x0000000000020000UL +#define UV4H_EVENT_OCCURRED0_RTQ1_AOERR0_SHFT 18 #define UV4H_EVENT_OCCURRED0_RTQ1_AOERR0_MASK 0x0000000000040000UL +#define UV4H_EVENT_OCCURRED0_RTQ2_AOERR0_SHFT 19 #define UV4H_EVENT_OCCURRED0_RTQ2_AOERR0_MASK 0x0000000000080000UL +#define UV4H_EVENT_OCCURRED0_RTQ3_AOERR0_SHFT 20 #define UV4H_EVENT_OCCURRED0_RTQ3_AOERR0_MASK 0x0000000000100000UL +#define UV4H_EVENT_OCCURRED0_NI0_AOERR0_SHFT 21 #define UV4H_EVENT_OCCURRED0_NI0_AOERR0_MASK 0x0000000000200000UL +#define UV4H_EVENT_OCCURRED0_NI1_AOERR0_SHFT 22 #define UV4H_EVENT_OCCURRED0_NI1_AOERR0_MASK 0x0000000000400000UL +#define UV4H_EVENT_OCCURRED0_LB_AOERR1_SHFT 23 #define UV4H_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000800000UL +#define UV4H_EVENT_OCCURRED0_KT_AOERR1_SHFT 24 #define UV4H_EVENT_OCCURRED0_KT_AOERR1_MASK 0x0000000001000000UL +#define UV4H_EVENT_OCCURRED0_RH_AOERR1_SHFT 25 #define UV4H_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000002000000UL +#define UV4H_EVENT_OCCURRED0_LH0_AOERR1_SHFT 26 #define UV4H_EVENT_OCCURRED0_LH0_AOERR1_MASK 0x0000000004000000UL +#define UV4H_EVENT_OCCURRED0_LH1_AOERR1_SHFT 27 #define UV4H_EVENT_OCCURRED0_LH1_AOERR1_MASK 0x0000000008000000UL +#define UV4H_EVENT_OCCURRED0_GR0_AOERR1_SHFT 28 #define UV4H_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000010000000UL +#define UV4H_EVENT_OCCURRED0_GR1_AOERR1_SHFT 29 #define UV4H_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000020000000UL +#define UV4H_EVENT_OCCURRED0_XB_AOERR1_SHFT 30 #define UV4H_EVENT_OCCURRED0_XB_AOERR1_MASK 0x0000000040000000UL +#define UV4H_EVENT_OCCURRED0_RTQ0_AOERR1_SHFT 31 #define UV4H_EVENT_OCCURRED0_RTQ0_AOERR1_MASK 0x0000000080000000UL +#define UV4H_EVENT_OCCURRED0_RTQ1_AOERR1_SHFT 32 #define UV4H_EVENT_OCCURRED0_RTQ1_AOERR1_MASK 0x0000000100000000UL +#define UV4H_EVENT_OCCURRED0_RTQ2_AOERR1_SHFT 33 #define UV4H_EVENT_OCCURRED0_RTQ2_AOERR1_MASK 0x0000000200000000UL +#define UV4H_EVENT_OCCURRED0_RTQ3_AOERR1_SHFT 34 #define UV4H_EVENT_OCCURRED0_RTQ3_AOERR1_MASK 0x0000000400000000UL +#define UV4H_EVENT_OCCURRED0_NI0_AOERR1_SHFT 35 #define UV4H_EVENT_OCCURRED0_NI0_AOERR1_MASK 0x0000000800000000UL +#define UV4H_EVENT_OCCURRED0_NI1_AOERR1_SHFT 36 #define UV4H_EVENT_OCCURRED0_NI1_AOERR1_MASK 0x0000001000000000UL +#define UV4H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 37 #define UV4H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000002000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 38 #define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000004000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 39 #define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000008000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 40 #define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000010000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 41 #define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000020000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 42 #define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000040000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 43 #define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000080000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 44 #define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000100000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 45 #define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000200000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 46 #define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000400000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 47 #define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000800000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 48 #define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0001000000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 49 #define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0002000000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 50 #define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0004000000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 51 #define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0008000000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 52 #define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0010000000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 53 #define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0020000000000000UL +#define UV4H_EVENT_OCCURRED0_L1_NMI_INT_SHFT 54 #define UV4H_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0040000000000000UL +#define UV4H_EVENT_OCCURRED0_STOP_CLOCK_SHFT 55 #define UV4H_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0080000000000000UL +#define UV4H_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 56 #define UV4H_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0100000000000000UL +#define UV4H_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 57 #define UV4H_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0200000000000000UL +#define UV4H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 58 #define UV4H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0400000000000000UL +#define UV4H_EVENT_OCCURRED0_IPI_INT_SHFT 59 #define UV4H_EVENT_OCCURRED0_IPI_INT_MASK 0x0800000000000000UL +#define UV4H_EVENT_OCCURRED0_EXTIO_INT0_SHFT 60 #define UV4H_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x1000000000000000UL +#define UV4H_EVENT_OCCURRED0_EXTIO_INT1_SHFT 61 #define UV4H_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x2000000000000000UL +#define UV4H_EVENT_OCCURRED0_EXTIO_INT2_SHFT 62 #define UV4H_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x4000000000000000UL +#define UV4H_EVENT_OCCURRED0_EXTIO_INT3_SHFT 63 #define UV4H_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x8000000000000000UL +/* UV3 unique defines */ +#define UV3H_EVENT_OCCURRED0_QP_HCERR_SHFT 1 +#define UV3H_EVENT_OCCURRED0_QP_HCERR_MASK 0x0000000000000002UL +#define UV3H_EVENT_OCCURRED0_QP_AOERR0_SHFT 10 +#define UV3H_EVENT_OCCURRED0_QP_AOERR0_MASK 0x0000000000000400UL +#define UV3H_EVENT_OCCURRED0_RT_AOERR0_SHFT 17 +#define UV3H_EVENT_OCCURRED0_RT_AOERR0_MASK 0x0000000000020000UL +#define UV3H_EVENT_OCCURRED0_NI0_AOERR0_SHFT 18 +#define UV3H_EVENT_OCCURRED0_NI0_AOERR0_MASK 0x0000000000040000UL +#define UV3H_EVENT_OCCURRED0_NI1_AOERR0_SHFT 19 +#define UV3H_EVENT_OCCURRED0_NI1_AOERR0_MASK 0x0000000000080000UL +#define UV3H_EVENT_OCCURRED0_LB_AOERR1_SHFT 20 +#define UV3H_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000100000UL +#define UV3H_EVENT_OCCURRED0_QP_AOERR1_SHFT 21 +#define UV3H_EVENT_OCCURRED0_QP_AOERR1_MASK 0x0000000000200000UL +#define UV3H_EVENT_OCCURRED0_RH_AOERR1_SHFT 22 +#define UV3H_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000400000UL +#define UV3H_EVENT_OCCURRED0_LH0_AOERR1_SHFT 23 +#define UV3H_EVENT_OCCURRED0_LH0_AOERR1_MASK 0x0000000000800000UL +#define UV3H_EVENT_OCCURRED0_LH1_AOERR1_SHFT 24 +#define UV3H_EVENT_OCCURRED0_LH1_AOERR1_MASK 0x0000000001000000UL +#define UV3H_EVENT_OCCURRED0_GR0_AOERR1_SHFT 25 +#define UV3H_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000002000000UL +#define UV3H_EVENT_OCCURRED0_GR1_AOERR1_SHFT 26 +#define UV3H_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000004000000UL +#define UV3H_EVENT_OCCURRED0_XB_AOERR1_SHFT 27 +#define UV3H_EVENT_OCCURRED0_XB_AOERR1_MASK 0x0000000008000000UL +#define UV3H_EVENT_OCCURRED0_RT_AOERR1_SHFT 28 +#define UV3H_EVENT_OCCURRED0_RT_AOERR1_MASK 0x0000000010000000UL +#define UV3H_EVENT_OCCURRED0_NI0_AOERR1_SHFT 29 +#define UV3H_EVENT_OCCURRED0_NI0_AOERR1_MASK 0x0000000020000000UL +#define UV3H_EVENT_OCCURRED0_NI1_AOERR1_SHFT 30 +#define UV3H_EVENT_OCCURRED0_NI1_AOERR1_MASK 0x0000000040000000UL +#define UV3H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 31 +#define UV3H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000080000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 32 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000100000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 33 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000200000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 34 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000400000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 35 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000800000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 36 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000001000000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 37 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000002000000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 38 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000004000000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 39 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000008000000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 40 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000010000000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 41 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000020000000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 42 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000040000000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 43 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000080000000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 44 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000100000000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 45 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000200000000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 46 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000400000000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 47 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000800000000000UL +#define UV3H_EVENT_OCCURRED0_L1_NMI_INT_SHFT 48 +#define UV3H_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0001000000000000UL +#define UV3H_EVENT_OCCURRED0_STOP_CLOCK_SHFT 49 +#define UV3H_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0002000000000000UL +#define UV3H_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 50 +#define UV3H_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0004000000000000UL +#define UV3H_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 51 +#define UV3H_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0008000000000000UL +#define UV3H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 52 +#define UV3H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0010000000000000UL +#define UV3H_EVENT_OCCURRED0_IPI_INT_SHFT 53 +#define UV3H_EVENT_OCCURRED0_IPI_INT_MASK 0x0020000000000000UL +#define UV3H_EVENT_OCCURRED0_EXTIO_INT0_SHFT 54 +#define UV3H_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0040000000000000UL +#define UV3H_EVENT_OCCURRED0_EXTIO_INT1_SHFT 55 +#define UV3H_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0080000000000000UL +#define UV3H_EVENT_OCCURRED0_EXTIO_INT2_SHFT 56 +#define UV3H_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0100000000000000UL +#define UV3H_EVENT_OCCURRED0_EXTIO_INT3_SHFT 57 +#define UV3H_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0200000000000000UL +#define UV3H_EVENT_OCCURRED0_PROFILE_INT_SHFT 58 +#define UV3H_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0400000000000000UL + +/* UV2 unique defines */ +#define UV2H_EVENT_OCCURRED0_QP_HCERR_SHFT 1 +#define UV2H_EVENT_OCCURRED0_QP_HCERR_MASK 0x0000000000000002UL +#define UV2H_EVENT_OCCURRED0_QP_AOERR0_SHFT 10 +#define UV2H_EVENT_OCCURRED0_QP_AOERR0_MASK 0x0000000000000400UL +#define UV2H_EVENT_OCCURRED0_RT_AOERR0_SHFT 17 +#define UV2H_EVENT_OCCURRED0_RT_AOERR0_MASK 0x0000000000020000UL +#define UV2H_EVENT_OCCURRED0_NI0_AOERR0_SHFT 18 +#define UV2H_EVENT_OCCURRED0_NI0_AOERR0_MASK 0x0000000000040000UL +#define UV2H_EVENT_OCCURRED0_NI1_AOERR0_SHFT 19 +#define UV2H_EVENT_OCCURRED0_NI1_AOERR0_MASK 0x0000000000080000UL +#define UV2H_EVENT_OCCURRED0_LB_AOERR1_SHFT 20 +#define UV2H_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000100000UL +#define UV2H_EVENT_OCCURRED0_QP_AOERR1_SHFT 21 +#define UV2H_EVENT_OCCURRED0_QP_AOERR1_MASK 0x0000000000200000UL +#define UV2H_EVENT_OCCURRED0_RH_AOERR1_SHFT 22 +#define UV2H_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000400000UL +#define UV2H_EVENT_OCCURRED0_LH0_AOERR1_SHFT 23 +#define UV2H_EVENT_OCCURRED0_LH0_AOERR1_MASK 0x0000000000800000UL +#define UV2H_EVENT_OCCURRED0_LH1_AOERR1_SHFT 24 +#define UV2H_EVENT_OCCURRED0_LH1_AOERR1_MASK 0x0000000001000000UL +#define UV2H_EVENT_OCCURRED0_GR0_AOERR1_SHFT 25 +#define UV2H_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000002000000UL +#define UV2H_EVENT_OCCURRED0_GR1_AOERR1_SHFT 26 +#define UV2H_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000004000000UL +#define UV2H_EVENT_OCCURRED0_XB_AOERR1_SHFT 27 +#define UV2H_EVENT_OCCURRED0_XB_AOERR1_MASK 0x0000000008000000UL +#define UV2H_EVENT_OCCURRED0_RT_AOERR1_SHFT 28 +#define UV2H_EVENT_OCCURRED0_RT_AOERR1_MASK 0x0000000010000000UL +#define UV2H_EVENT_OCCURRED0_NI0_AOERR1_SHFT 29 +#define UV2H_EVENT_OCCURRED0_NI0_AOERR1_MASK 0x0000000020000000UL +#define UV2H_EVENT_OCCURRED0_NI1_AOERR1_SHFT 30 +#define UV2H_EVENT_OCCURRED0_NI1_AOERR1_MASK 0x0000000040000000UL +#define UV2H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 31 +#define UV2H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000080000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 32 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000100000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 33 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000200000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 34 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000400000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 35 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000800000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 36 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000001000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 37 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000002000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 38 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000004000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 39 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000008000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 40 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000010000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 41 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000020000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 42 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000040000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 43 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000080000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 44 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000100000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 45 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000200000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 46 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000400000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 47 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000800000000000UL +#define UV2H_EVENT_OCCURRED0_L1_NMI_INT_SHFT 48 +#define UV2H_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0001000000000000UL +#define UV2H_EVENT_OCCURRED0_STOP_CLOCK_SHFT 49 +#define UV2H_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0002000000000000UL +#define UV2H_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 50 +#define UV2H_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0004000000000000UL +#define UV2H_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 51 +#define UV2H_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0008000000000000UL +#define UV2H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 52 +#define UV2H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0010000000000000UL +#define UV2H_EVENT_OCCURRED0_IPI_INT_SHFT 53 +#define UV2H_EVENT_OCCURRED0_IPI_INT_MASK 0x0020000000000000UL +#define UV2H_EVENT_OCCURRED0_EXTIO_INT0_SHFT 54 +#define UV2H_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0040000000000000UL +#define UV2H_EVENT_OCCURRED0_EXTIO_INT1_SHFT 55 +#define UV2H_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0080000000000000UL +#define UV2H_EVENT_OCCURRED0_EXTIO_INT2_SHFT 56 +#define UV2H_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0100000000000000UL +#define UV2H_EVENT_OCCURRED0_EXTIO_INT3_SHFT 57 +#define UV2H_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0200000000000000UL +#define UV2H_EVENT_OCCURRED0_PROFILE_INT_SHFT 58 +#define UV2H_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0400000000000000UL + +#define UVH_EVENT_OCCURRED0_EXTIO_INT0_MASK ( \ + is_uv(UV4) ? 0x1000000000000000UL : \ + is_uv(UV3) ? 0x0040000000000000UL : \ + is_uv(UV2) ? 0x0040000000000000UL : \ + 0) #define UVH_EVENT_OCCURRED0_EXTIO_INT0_SHFT ( \ - is_uv2_hub() ? UV2H_EVENT_OCCURRED0_EXTIO_INT0_SHFT : \ - is_uv3_hub() ? UV3H_EVENT_OCCURRED0_EXTIO_INT0_SHFT : \ - /*is_uv4_hub*/ UV4H_EVENT_OCCURRED0_EXTIO_INT0_SHFT) + is_uv(UV4) ? 60 : \ + is_uv(UV3) ? 54 : \ + is_uv(UV2) ? 54 : \ + -1) union uvh_event_occurred0_u { unsigned long v; + + /* UVH common struct */ struct uvh_event_occurred0_s { - unsigned long lb_hcerr:1; /* RW, W1C */ - unsigned long rsvd_1_10:10; - unsigned long rh_aoerr0:1; /* RW, W1C */ - unsigned long rsvd_12_63:52; + unsigned long lb_hcerr:1; /* RW */ + unsigned long rsvd_1_63:63; } s; + + /* UVXH common struct */ struct uvxh_event_occurred0_s { unsigned long lb_hcerr:1; /* RW */ unsigned long rsvd_1:1; @@ -505,6 +577,142 @@ union uvh_event_occurred0_u { unsigned long xb_aoerr0:1; /* RW */ unsigned long rsvd_17_63:47; } sx; + + /* UVYH common struct */ + struct uvyh_event_occurred0_s { + unsigned long lb_hcerr:1; /* RW */ + unsigned long kt_hcerr:1; /* RW */ + unsigned long rh0_hcerr:1; /* RW */ + unsigned long rh1_hcerr:1; /* RW */ + unsigned long lh0_hcerr:1; /* RW */ + unsigned long lh1_hcerr:1; /* RW */ + unsigned long lh2_hcerr:1; /* RW */ + unsigned long lh3_hcerr:1; /* RW */ + unsigned long xb_hcerr:1; /* RW */ + unsigned long rdm_hcerr:1; /* RW */ + unsigned long ni0_hcerr:1; /* RW */ + unsigned long ni1_hcerr:1; /* RW */ + unsigned long lb_aoerr0:1; /* RW */ + unsigned long kt_aoerr0:1; /* RW */ + unsigned long rh0_aoerr0:1; /* RW */ + unsigned long rh1_aoerr0:1; /* RW */ + unsigned long lh0_aoerr0:1; /* RW */ + unsigned long lh1_aoerr0:1; /* RW */ + unsigned long lh2_aoerr0:1; /* RW */ + unsigned long lh3_aoerr0:1; /* RW */ + unsigned long xb_aoerr0:1; /* RW */ + unsigned long rdm_aoerr0:1; /* RW */ + unsigned long rt0_aoerr0:1; /* RW */ + unsigned long rt1_aoerr0:1; /* RW */ + unsigned long ni0_aoerr0:1; /* RW */ + unsigned long ni1_aoerr0:1; /* RW */ + unsigned long lb_aoerr1:1; /* RW */ + unsigned long kt_aoerr1:1; /* RW */ + unsigned long rh0_aoerr1:1; /* RW */ + unsigned long rh1_aoerr1:1; /* RW */ + unsigned long lh0_aoerr1:1; /* RW */ + unsigned long lh1_aoerr1:1; /* RW */ + unsigned long lh2_aoerr1:1; /* RW */ + unsigned long lh3_aoerr1:1; /* RW */ + unsigned long xb_aoerr1:1; /* RW */ + unsigned long rdm_aoerr1:1; /* RW */ + unsigned long rt0_aoerr1:1; /* RW */ + unsigned long rt1_aoerr1:1; /* RW */ + unsigned long ni0_aoerr1:1; /* RW */ + unsigned long ni1_aoerr1:1; /* RW */ + unsigned long system_shutdown_int:1; /* RW */ + unsigned long lb_irq_int_0:1; /* RW */ + unsigned long lb_irq_int_1:1; /* RW */ + unsigned long lb_irq_int_2:1; /* RW */ + unsigned long lb_irq_int_3:1; /* RW */ + unsigned long lb_irq_int_4:1; /* RW */ + unsigned long lb_irq_int_5:1; /* RW */ + unsigned long lb_irq_int_6:1; /* RW */ + unsigned long lb_irq_int_7:1; /* RW */ + unsigned long lb_irq_int_8:1; /* RW */ + unsigned long lb_irq_int_9:1; /* RW */ + unsigned long lb_irq_int_10:1; /* RW */ + unsigned long lb_irq_int_11:1; /* RW */ + unsigned long lb_irq_int_12:1; /* RW */ + unsigned long lb_irq_int_13:1; /* RW */ + unsigned long lb_irq_int_14:1; /* RW */ + unsigned long lb_irq_int_15:1; /* RW */ + unsigned long l1_nmi_int:1; /* RW */ + unsigned long stop_clock:1; /* RW */ + unsigned long asic_to_l1:1; /* RW */ + unsigned long l1_to_asic:1; /* RW */ + unsigned long la_seq_trigger:1; /* RW */ + unsigned long rsvd_62_63:2; + } sy; + + /* UV5 unique struct */ + struct uv5h_event_occurred0_s { + unsigned long lb_hcerr:1; /* RW */ + unsigned long kt_hcerr:1; /* RW */ + unsigned long rh0_hcerr:1; /* RW */ + unsigned long rh1_hcerr:1; /* RW */ + unsigned long lh0_hcerr:1; /* RW */ + unsigned long lh1_hcerr:1; /* RW */ + unsigned long lh2_hcerr:1; /* RW */ + unsigned long lh3_hcerr:1; /* RW */ + unsigned long xb_hcerr:1; /* RW */ + unsigned long rdm_hcerr:1; /* RW */ + unsigned long ni0_hcerr:1; /* RW */ + unsigned long ni1_hcerr:1; /* RW */ + unsigned long lb_aoerr0:1; /* RW */ + unsigned long kt_aoerr0:1; /* RW */ + unsigned long rh0_aoerr0:1; /* RW */ + unsigned long rh1_aoerr0:1; /* RW */ + unsigned long lh0_aoerr0:1; /* RW */ + unsigned long lh1_aoerr0:1; /* RW */ + unsigned long lh2_aoerr0:1; /* RW */ + unsigned long lh3_aoerr0:1; /* RW */ + unsigned long xb_aoerr0:1; /* RW */ + unsigned long rdm_aoerr0:1; /* RW */ + unsigned long rt0_aoerr0:1; /* RW */ + unsigned long rt1_aoerr0:1; /* RW */ + unsigned long ni0_aoerr0:1; /* RW */ + unsigned long ni1_aoerr0:1; /* RW */ + unsigned long lb_aoerr1:1; /* RW */ + unsigned long kt_aoerr1:1; /* RW */ + unsigned long rh0_aoerr1:1; /* RW */ + unsigned long rh1_aoerr1:1; /* RW */ + unsigned long lh0_aoerr1:1; /* RW */ + unsigned long lh1_aoerr1:1; /* RW */ + unsigned long lh2_aoerr1:1; /* RW */ + unsigned long lh3_aoerr1:1; /* RW */ + unsigned long xb_aoerr1:1; /* RW */ + unsigned long rdm_aoerr1:1; /* RW */ + unsigned long rt0_aoerr1:1; /* RW */ + unsigned long rt1_aoerr1:1; /* RW */ + unsigned long ni0_aoerr1:1; /* RW */ + unsigned long ni1_aoerr1:1; /* RW */ + unsigned long system_shutdown_int:1; /* RW */ + unsigned long lb_irq_int_0:1; /* RW */ + unsigned long lb_irq_int_1:1; /* RW */ + unsigned long lb_irq_int_2:1; /* RW */ + unsigned long lb_irq_int_3:1; /* RW */ + unsigned long lb_irq_int_4:1; /* RW */ + unsigned long lb_irq_int_5:1; /* RW */ + unsigned long lb_irq_int_6:1; /* RW */ + unsigned long lb_irq_int_7:1; /* RW */ + unsigned long lb_irq_int_8:1; /* RW */ + unsigned long lb_irq_int_9:1; /* RW */ + unsigned long lb_irq_int_10:1; /* RW */ + unsigned long lb_irq_int_11:1; /* RW */ + unsigned long lb_irq_int_12:1; /* RW */ + unsigned long lb_irq_int_13:1; /* RW */ + unsigned long lb_irq_int_14:1; /* RW */ + unsigned long lb_irq_int_15:1; /* RW */ + unsigned long l1_nmi_int:1; /* RW */ + unsigned long stop_clock:1; /* RW */ + unsigned long asic_to_l1:1; /* RW */ + unsigned long l1_to_asic:1; /* RW */ + unsigned long la_seq_trigger:1; /* RW */ + unsigned long rsvd_62_63:2; + } s5; + + /* UV4 unique struct */ struct uv4h_event_occurred0_s { unsigned long lb_hcerr:1; /* RW */ unsigned long kt_hcerr:1; /* RW */ @@ -571,13 +779,1355 @@ union uvh_event_occurred0_u { unsigned long extio_int2:1; /* RW */ unsigned long extio_int3:1; /* RW */ } s4; + + /* UV3 unique struct */ + struct uv3h_event_occurred0_s { + unsigned long lb_hcerr:1; /* RW */ + unsigned long qp_hcerr:1; /* RW */ + unsigned long rh_hcerr:1; /* RW */ + unsigned long lh0_hcerr:1; /* RW */ + unsigned long lh1_hcerr:1; /* RW */ + unsigned long gr0_hcerr:1; /* RW */ + unsigned long gr1_hcerr:1; /* RW */ + unsigned long ni0_hcerr:1; /* RW */ + unsigned long ni1_hcerr:1; /* RW */ + unsigned long lb_aoerr0:1; /* RW */ + unsigned long qp_aoerr0:1; /* RW */ + unsigned long rh_aoerr0:1; /* RW */ + unsigned long lh0_aoerr0:1; /* RW */ + unsigned long lh1_aoerr0:1; /* RW */ + unsigned long gr0_aoerr0:1; /* RW */ + unsigned long gr1_aoerr0:1; /* RW */ + unsigned long xb_aoerr0:1; /* RW */ + unsigned long rt_aoerr0:1; /* RW */ + unsigned long ni0_aoerr0:1; /* RW */ + unsigned long ni1_aoerr0:1; /* RW */ + unsigned long lb_aoerr1:1; /* RW */ + unsigned long qp_aoerr1:1; /* RW */ + unsigned long rh_aoerr1:1; /* RW */ + unsigned long lh0_aoerr1:1; /* RW */ + unsigned long lh1_aoerr1:1; /* RW */ + unsigned long gr0_aoerr1:1; /* RW */ + unsigned long gr1_aoerr1:1; /* RW */ + unsigned long xb_aoerr1:1; /* RW */ + unsigned long rt_aoerr1:1; /* RW */ + unsigned long ni0_aoerr1:1; /* RW */ + unsigned long ni1_aoerr1:1; /* RW */ + unsigned long system_shutdown_int:1; /* RW */ + unsigned long lb_irq_int_0:1; /* RW */ + unsigned long lb_irq_int_1:1; /* RW */ + unsigned long lb_irq_int_2:1; /* RW */ + unsigned long lb_irq_int_3:1; /* RW */ + unsigned long lb_irq_int_4:1; /* RW */ + unsigned long lb_irq_int_5:1; /* RW */ + unsigned long lb_irq_int_6:1; /* RW */ + unsigned long lb_irq_int_7:1; /* RW */ + unsigned long lb_irq_int_8:1; /* RW */ + unsigned long lb_irq_int_9:1; /* RW */ + unsigned long lb_irq_int_10:1; /* RW */ + unsigned long lb_irq_int_11:1; /* RW */ + unsigned long lb_irq_int_12:1; /* RW */ + unsigned long lb_irq_int_13:1; /* RW */ + unsigned long lb_irq_int_14:1; /* RW */ + unsigned long lb_irq_int_15:1; /* RW */ + unsigned long l1_nmi_int:1; /* RW */ + unsigned long stop_clock:1; /* RW */ + unsigned long asic_to_l1:1; /* RW */ + unsigned long l1_to_asic:1; /* RW */ + unsigned long la_seq_trigger:1; /* RW */ + unsigned long ipi_int:1; /* RW */ + unsigned long extio_int0:1; /* RW */ + unsigned long extio_int1:1; /* RW */ + unsigned long extio_int2:1; /* RW */ + unsigned long extio_int3:1; /* RW */ + unsigned long profile_int:1; /* RW */ + unsigned long rsvd_59_63:5; + } s3; + + /* UV2 unique struct */ + struct uv2h_event_occurred0_s { + unsigned long lb_hcerr:1; /* RW */ + unsigned long qp_hcerr:1; /* RW */ + unsigned long rh_hcerr:1; /* RW */ + unsigned long lh0_hcerr:1; /* RW */ + unsigned long lh1_hcerr:1; /* RW */ + unsigned long gr0_hcerr:1; /* RW */ + unsigned long gr1_hcerr:1; /* RW */ + unsigned long ni0_hcerr:1; /* RW */ + unsigned long ni1_hcerr:1; /* RW */ + unsigned long lb_aoerr0:1; /* RW */ + unsigned long qp_aoerr0:1; /* RW */ + unsigned long rh_aoerr0:1; /* RW */ + unsigned long lh0_aoerr0:1; /* RW */ + unsigned long lh1_aoerr0:1; /* RW */ + unsigned long gr0_aoerr0:1; /* RW */ + unsigned long gr1_aoerr0:1; /* RW */ + unsigned long xb_aoerr0:1; /* RW */ + unsigned long rt_aoerr0:1; /* RW */ + unsigned long ni0_aoerr0:1; /* RW */ + unsigned long ni1_aoerr0:1; /* RW */ + unsigned long lb_aoerr1:1; /* RW */ + unsigned long qp_aoerr1:1; /* RW */ + unsigned long rh_aoerr1:1; /* RW */ + unsigned long lh0_aoerr1:1; /* RW */ + unsigned long lh1_aoerr1:1; /* RW */ + unsigned long gr0_aoerr1:1; /* RW */ + unsigned long gr1_aoerr1:1; /* RW */ + unsigned long xb_aoerr1:1; /* RW */ + unsigned long rt_aoerr1:1; /* RW */ + unsigned long ni0_aoerr1:1; /* RW */ + unsigned long ni1_aoerr1:1; /* RW */ + unsigned long system_shutdown_int:1; /* RW */ + unsigned long lb_irq_int_0:1; /* RW */ + unsigned long lb_irq_int_1:1; /* RW */ + unsigned long lb_irq_int_2:1; /* RW */ + unsigned long lb_irq_int_3:1; /* RW */ + unsigned long lb_irq_int_4:1; /* RW */ + unsigned long lb_irq_int_5:1; /* RW */ + unsigned long lb_irq_int_6:1; /* RW */ + unsigned long lb_irq_int_7:1; /* RW */ + unsigned long lb_irq_int_8:1; /* RW */ + unsigned long lb_irq_int_9:1; /* RW */ + unsigned long lb_irq_int_10:1; /* RW */ + unsigned long lb_irq_int_11:1; /* RW */ + unsigned long lb_irq_int_12:1; /* RW */ + unsigned long lb_irq_int_13:1; /* RW */ + unsigned long lb_irq_int_14:1; /* RW */ + unsigned long lb_irq_int_15:1; /* RW */ + unsigned long l1_nmi_int:1; /* RW */ + unsigned long stop_clock:1; /* RW */ + unsigned long asic_to_l1:1; /* RW */ + unsigned long l1_to_asic:1; /* RW */ + unsigned long la_seq_trigger:1; /* RW */ + unsigned long ipi_int:1; /* RW */ + unsigned long extio_int0:1; /* RW */ + unsigned long extio_int1:1; /* RW */ + unsigned long extio_int2:1; /* RW */ + unsigned long extio_int3:1; /* RW */ + unsigned long profile_int:1; /* RW */ + unsigned long rsvd_59_63:5; + } s2; }; /* ========================================================================= */ /* UVH_EVENT_OCCURRED0_ALIAS */ /* ========================================================================= */ #define UVH_EVENT_OCCURRED0_ALIAS 0x70008UL -#define UVH_EVENT_OCCURRED0_ALIAS_32 0x5f0 + + +/* ========================================================================= */ +/* UVH_EVENT_OCCURRED1 */ +/* ========================================================================= */ +#define UVH_EVENT_OCCURRED1 0x70080UL + + + +/* UVYH common defines */ +#define UVYH_EVENT_OCCURRED1_IPI_INT_SHFT 0 +#define UVYH_EVENT_OCCURRED1_IPI_INT_MASK 0x0000000000000001UL +#define UVYH_EVENT_OCCURRED1_EXTIO_INT0_SHFT 1 +#define UVYH_EVENT_OCCURRED1_EXTIO_INT0_MASK 0x0000000000000002UL +#define UVYH_EVENT_OCCURRED1_EXTIO_INT1_SHFT 2 +#define UVYH_EVENT_OCCURRED1_EXTIO_INT1_MASK 0x0000000000000004UL +#define UVYH_EVENT_OCCURRED1_EXTIO_INT2_SHFT 3 +#define UVYH_EVENT_OCCURRED1_EXTIO_INT2_MASK 0x0000000000000008UL +#define UVYH_EVENT_OCCURRED1_EXTIO_INT3_SHFT 4 +#define UVYH_EVENT_OCCURRED1_EXTIO_INT3_MASK 0x0000000000000010UL +#define UVYH_EVENT_OCCURRED1_PROFILE_INT_SHFT 5 +#define UVYH_EVENT_OCCURRED1_PROFILE_INT_MASK 0x0000000000000020UL +#define UVYH_EVENT_OCCURRED1_BAU_DATA_SHFT 6 +#define UVYH_EVENT_OCCURRED1_BAU_DATA_MASK 0x0000000000000040UL +#define UVYH_EVENT_OCCURRED1_PROC_GENERAL_SHFT 7 +#define UVYH_EVENT_OCCURRED1_PROC_GENERAL_MASK 0x0000000000000080UL +#define UVYH_EVENT_OCCURRED1_XH_TLB_INT0_SHFT 8 +#define UVYH_EVENT_OCCURRED1_XH_TLB_INT0_MASK 0x0000000000000100UL +#define UVYH_EVENT_OCCURRED1_XH_TLB_INT1_SHFT 9 +#define UVYH_EVENT_OCCURRED1_XH_TLB_INT1_MASK 0x0000000000000200UL +#define UVYH_EVENT_OCCURRED1_XH_TLB_INT2_SHFT 10 +#define UVYH_EVENT_OCCURRED1_XH_TLB_INT2_MASK 0x0000000000000400UL +#define UVYH_EVENT_OCCURRED1_XH_TLB_INT3_SHFT 11 +#define UVYH_EVENT_OCCURRED1_XH_TLB_INT3_MASK 0x0000000000000800UL +#define UVYH_EVENT_OCCURRED1_XH_TLB_INT4_SHFT 12 +#define UVYH_EVENT_OCCURRED1_XH_TLB_INT4_MASK 0x0000000000001000UL +#define UVYH_EVENT_OCCURRED1_XH_TLB_INT5_SHFT 13 +#define UVYH_EVENT_OCCURRED1_XH_TLB_INT5_MASK 0x0000000000002000UL +#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT0_SHFT 14 +#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT0_MASK 0x0000000000004000UL +#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT1_SHFT 15 +#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT1_MASK 0x0000000000008000UL +#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT2_SHFT 16 +#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT2_MASK 0x0000000000010000UL +#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT3_SHFT 17 +#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT3_MASK 0x0000000000020000UL +#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT4_SHFT 18 +#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT4_MASK 0x0000000000040000UL +#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT5_SHFT 19 +#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT5_MASK 0x0000000000080000UL +#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT6_SHFT 20 +#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT6_MASK 0x0000000000100000UL +#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT7_SHFT 21 +#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT7_MASK 0x0000000000200000UL +#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT8_SHFT 22 +#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT8_MASK 0x0000000000400000UL +#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT9_SHFT 23 +#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT9_MASK 0x0000000000800000UL +#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT10_SHFT 24 +#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT10_MASK 0x0000000001000000UL +#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT11_SHFT 25 +#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT11_MASK 0x0000000002000000UL +#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT12_SHFT 26 +#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT12_MASK 0x0000000004000000UL +#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT13_SHFT 27 +#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT13_MASK 0x0000000008000000UL +#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT14_SHFT 28 +#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT14_MASK 0x0000000010000000UL +#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT15_SHFT 29 +#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT15_MASK 0x0000000020000000UL +#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT16_SHFT 30 +#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT16_MASK 0x0000000040000000UL +#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT17_SHFT 31 +#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT17_MASK 0x0000000080000000UL +#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT18_SHFT 32 +#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT18_MASK 0x0000000100000000UL +#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT19_SHFT 33 +#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT19_MASK 0x0000000200000000UL +#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT20_SHFT 34 +#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT20_MASK 0x0000000400000000UL +#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT21_SHFT 35 +#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT21_MASK 0x0000000800000000UL +#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT22_SHFT 36 +#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT22_MASK 0x0000001000000000UL +#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT23_SHFT 37 +#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT23_MASK 0x0000002000000000UL + +/* UV4 unique defines */ +#define UV4H_EVENT_OCCURRED1_PROFILE_INT_SHFT 0 +#define UV4H_EVENT_OCCURRED1_PROFILE_INT_MASK 0x0000000000000001UL +#define UV4H_EVENT_OCCURRED1_BAU_DATA_SHFT 1 +#define UV4H_EVENT_OCCURRED1_BAU_DATA_MASK 0x0000000000000002UL +#define UV4H_EVENT_OCCURRED1_PROC_GENERAL_SHFT 2 +#define UV4H_EVENT_OCCURRED1_PROC_GENERAL_MASK 0x0000000000000004UL +#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT0_SHFT 3 +#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT0_MASK 0x0000000000000008UL +#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT1_SHFT 4 +#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT1_MASK 0x0000000000000010UL +#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT2_SHFT 5 +#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT2_MASK 0x0000000000000020UL +#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT3_SHFT 6 +#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT3_MASK 0x0000000000000040UL +#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT4_SHFT 7 +#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT4_MASK 0x0000000000000080UL +#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT5_SHFT 8 +#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT5_MASK 0x0000000000000100UL +#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT6_SHFT 9 +#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT6_MASK 0x0000000000000200UL +#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT7_SHFT 10 +#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT7_MASK 0x0000000000000400UL +#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT8_SHFT 11 +#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT8_MASK 0x0000000000000800UL +#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT9_SHFT 12 +#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT9_MASK 0x0000000000001000UL +#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT10_SHFT 13 +#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT10_MASK 0x0000000000002000UL +#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT11_SHFT 14 +#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT11_MASK 0x0000000000004000UL +#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT12_SHFT 15 +#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT12_MASK 0x0000000000008000UL +#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT13_SHFT 16 +#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT13_MASK 0x0000000000010000UL +#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT14_SHFT 17 +#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT14_MASK 0x0000000000020000UL +#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT15_SHFT 18 +#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT15_MASK 0x0000000000040000UL +#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT16_SHFT 19 +#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT16_MASK 0x0000000000080000UL +#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT17_SHFT 20 +#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT17_MASK 0x0000000000100000UL +#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT18_SHFT 21 +#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT18_MASK 0x0000000000200000UL +#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT19_SHFT 22 +#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT19_MASK 0x0000000000400000UL +#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT20_SHFT 23 +#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT20_MASK 0x0000000000800000UL +#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT21_SHFT 24 +#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT21_MASK 0x0000000001000000UL +#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT22_SHFT 25 +#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT22_MASK 0x0000000002000000UL +#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT23_SHFT 26 +#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT23_MASK 0x0000000004000000UL +#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT0_SHFT 27 +#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT0_MASK 0x0000000008000000UL +#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT1_SHFT 28 +#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT1_MASK 0x0000000010000000UL +#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT2_SHFT 29 +#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT2_MASK 0x0000000020000000UL +#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT3_SHFT 30 +#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT3_MASK 0x0000000040000000UL +#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT4_SHFT 31 +#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT4_MASK 0x0000000080000000UL +#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT5_SHFT 32 +#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT5_MASK 0x0000000100000000UL +#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT6_SHFT 33 +#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT6_MASK 0x0000000200000000UL +#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT7_SHFT 34 +#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT7_MASK 0x0000000400000000UL +#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT8_SHFT 35 +#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT8_MASK 0x0000000800000000UL +#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT9_SHFT 36 +#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT9_MASK 0x0000001000000000UL +#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT10_SHFT 37 +#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT10_MASK 0x0000002000000000UL +#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT11_SHFT 38 +#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT11_MASK 0x0000004000000000UL +#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT12_SHFT 39 +#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT12_MASK 0x0000008000000000UL +#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT13_SHFT 40 +#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT13_MASK 0x0000010000000000UL +#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT14_SHFT 41 +#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT14_MASK 0x0000020000000000UL +#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT15_SHFT 42 +#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT15_MASK 0x0000040000000000UL +#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT16_SHFT 43 +#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT16_MASK 0x0000080000000000UL +#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT17_SHFT 44 +#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT17_MASK 0x0000100000000000UL +#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT18_SHFT 45 +#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT18_MASK 0x0000200000000000UL +#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT19_SHFT 46 +#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT19_MASK 0x0000400000000000UL +#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT20_SHFT 47 +#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT20_MASK 0x0000800000000000UL +#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT21_SHFT 48 +#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT21_MASK 0x0001000000000000UL +#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT22_SHFT 49 +#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT22_MASK 0x0002000000000000UL +#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT23_SHFT 50 +#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT23_MASK 0x0004000000000000UL + +/* UV3 unique defines */ +#define UV3H_EVENT_OCCURRED1_BAU_DATA_SHFT 0 +#define UV3H_EVENT_OCCURRED1_BAU_DATA_MASK 0x0000000000000001UL +#define UV3H_EVENT_OCCURRED1_POWER_MANAGEMENT_REQ_SHFT 1 +#define UV3H_EVENT_OCCURRED1_POWER_MANAGEMENT_REQ_MASK 0x0000000000000002UL +#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT0_SHFT 2 +#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT0_MASK 0x0000000000000004UL +#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT1_SHFT 3 +#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT1_MASK 0x0000000000000008UL +#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT2_SHFT 4 +#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT2_MASK 0x0000000000000010UL +#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT3_SHFT 5 +#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT3_MASK 0x0000000000000020UL +#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT4_SHFT 6 +#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT4_MASK 0x0000000000000040UL +#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT5_SHFT 7 +#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT5_MASK 0x0000000000000080UL +#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT6_SHFT 8 +#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT6_MASK 0x0000000000000100UL +#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT7_SHFT 9 +#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT7_MASK 0x0000000000000200UL +#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT8_SHFT 10 +#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT8_MASK 0x0000000000000400UL +#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT9_SHFT 11 +#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT9_MASK 0x0000000000000800UL +#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT10_SHFT 12 +#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT10_MASK 0x0000000000001000UL +#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT11_SHFT 13 +#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT11_MASK 0x0000000000002000UL +#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT12_SHFT 14 +#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT12_MASK 0x0000000000004000UL +#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT13_SHFT 15 +#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT13_MASK 0x0000000000008000UL +#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT14_SHFT 16 +#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT14_MASK 0x0000000000010000UL +#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT15_SHFT 17 +#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT15_MASK 0x0000000000020000UL +#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT0_SHFT 18 +#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT0_MASK 0x0000000000040000UL +#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT1_SHFT 19 +#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT1_MASK 0x0000000000080000UL +#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT2_SHFT 20 +#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT2_MASK 0x0000000000100000UL +#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT3_SHFT 21 +#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT3_MASK 0x0000000000200000UL +#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT4_SHFT 22 +#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT4_MASK 0x0000000000400000UL +#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT5_SHFT 23 +#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT5_MASK 0x0000000000800000UL +#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT6_SHFT 24 +#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT6_MASK 0x0000000001000000UL +#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT7_SHFT 25 +#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT7_MASK 0x0000000002000000UL +#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT8_SHFT 26 +#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT8_MASK 0x0000000004000000UL +#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT9_SHFT 27 +#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT9_MASK 0x0000000008000000UL +#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT10_SHFT 28 +#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT10_MASK 0x0000000010000000UL +#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT11_SHFT 29 +#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT11_MASK 0x0000000020000000UL +#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT12_SHFT 30 +#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT12_MASK 0x0000000040000000UL +#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT13_SHFT 31 +#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT13_MASK 0x0000000080000000UL +#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT14_SHFT 32 +#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT14_MASK 0x0000000100000000UL +#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT15_SHFT 33 +#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT15_MASK 0x0000000200000000UL +#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT0_SHFT 34 +#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT0_MASK 0x0000000400000000UL +#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT1_SHFT 35 +#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT1_MASK 0x0000000800000000UL +#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT2_SHFT 36 +#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT2_MASK 0x0000001000000000UL +#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT3_SHFT 37 +#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT3_MASK 0x0000002000000000UL +#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT4_SHFT 38 +#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT4_MASK 0x0000004000000000UL +#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT5_SHFT 39 +#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT5_MASK 0x0000008000000000UL +#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT6_SHFT 40 +#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT6_MASK 0x0000010000000000UL +#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT7_SHFT 41 +#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT7_MASK 0x0000020000000000UL +#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT8_SHFT 42 +#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT8_MASK 0x0000040000000000UL +#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT9_SHFT 43 +#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT9_MASK 0x0000080000000000UL +#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT10_SHFT 44 +#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT10_MASK 0x0000100000000000UL +#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT11_SHFT 45 +#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT11_MASK 0x0000200000000000UL +#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT12_SHFT 46 +#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT12_MASK 0x0000400000000000UL +#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT13_SHFT 47 +#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT13_MASK 0x0000800000000000UL +#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT14_SHFT 48 +#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT14_MASK 0x0001000000000000UL +#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT15_SHFT 49 +#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT15_MASK 0x0002000000000000UL +#define UV3H_EVENT_OCCURRED1_RTC_INTERVAL_INT_SHFT 50 +#define UV3H_EVENT_OCCURRED1_RTC_INTERVAL_INT_MASK 0x0004000000000000UL +#define UV3H_EVENT_OCCURRED1_BAU_DASHBOARD_INT_SHFT 51 +#define UV3H_EVENT_OCCURRED1_BAU_DASHBOARD_INT_MASK 0x0008000000000000UL + +/* UV2 unique defines */ +#define UV2H_EVENT_OCCURRED1_BAU_DATA_SHFT 0 +#define UV2H_EVENT_OCCURRED1_BAU_DATA_MASK 0x0000000000000001UL +#define UV2H_EVENT_OCCURRED1_POWER_MANAGEMENT_REQ_SHFT 1 +#define UV2H_EVENT_OCCURRED1_POWER_MANAGEMENT_REQ_MASK 0x0000000000000002UL +#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT0_SHFT 2 +#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT0_MASK 0x0000000000000004UL +#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT1_SHFT 3 +#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT1_MASK 0x0000000000000008UL +#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT2_SHFT 4 +#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT2_MASK 0x0000000000000010UL +#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT3_SHFT 5 +#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT3_MASK 0x0000000000000020UL +#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT4_SHFT 6 +#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT4_MASK 0x0000000000000040UL +#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT5_SHFT 7 +#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT5_MASK 0x0000000000000080UL +#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT6_SHFT 8 +#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT6_MASK 0x0000000000000100UL +#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT7_SHFT 9 +#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT7_MASK 0x0000000000000200UL +#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT8_SHFT 10 +#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT8_MASK 0x0000000000000400UL +#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT9_SHFT 11 +#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT9_MASK 0x0000000000000800UL +#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT10_SHFT 12 +#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT10_MASK 0x0000000000001000UL +#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT11_SHFT 13 +#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT11_MASK 0x0000000000002000UL +#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT12_SHFT 14 +#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT12_MASK 0x0000000000004000UL +#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT13_SHFT 15 +#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT13_MASK 0x0000000000008000UL +#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT14_SHFT 16 +#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT14_MASK 0x0000000000010000UL +#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT15_SHFT 17 +#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT15_MASK 0x0000000000020000UL +#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT0_SHFT 18 +#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT0_MASK 0x0000000000040000UL +#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT1_SHFT 19 +#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT1_MASK 0x0000000000080000UL +#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT2_SHFT 20 +#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT2_MASK 0x0000000000100000UL +#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT3_SHFT 21 +#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT3_MASK 0x0000000000200000UL +#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT4_SHFT 22 +#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT4_MASK 0x0000000000400000UL +#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT5_SHFT 23 +#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT5_MASK 0x0000000000800000UL +#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT6_SHFT 24 +#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT6_MASK 0x0000000001000000UL +#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT7_SHFT 25 +#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT7_MASK 0x0000000002000000UL +#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT8_SHFT 26 +#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT8_MASK 0x0000000004000000UL +#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT9_SHFT 27 +#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT9_MASK 0x0000000008000000UL +#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT10_SHFT 28 +#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT10_MASK 0x0000000010000000UL +#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT11_SHFT 29 +#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT11_MASK 0x0000000020000000UL +#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT12_SHFT 30 +#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT12_MASK 0x0000000040000000UL +#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT13_SHFT 31 +#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT13_MASK 0x0000000080000000UL +#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT14_SHFT 32 +#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT14_MASK 0x0000000100000000UL +#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT15_SHFT 33 +#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT15_MASK 0x0000000200000000UL +#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT0_SHFT 34 +#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT0_MASK 0x0000000400000000UL +#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT1_SHFT 35 +#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT1_MASK 0x0000000800000000UL +#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT2_SHFT 36 +#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT2_MASK 0x0000001000000000UL +#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT3_SHFT 37 +#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT3_MASK 0x0000002000000000UL +#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT4_SHFT 38 +#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT4_MASK 0x0000004000000000UL +#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT5_SHFT 39 +#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT5_MASK 0x0000008000000000UL +#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT6_SHFT 40 +#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT6_MASK 0x0000010000000000UL +#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT7_SHFT 41 +#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT7_MASK 0x0000020000000000UL +#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT8_SHFT 42 +#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT8_MASK 0x0000040000000000UL +#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT9_SHFT 43 +#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT9_MASK 0x0000080000000000UL +#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT10_SHFT 44 +#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT10_MASK 0x0000100000000000UL +#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT11_SHFT 45 +#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT11_MASK 0x0000200000000000UL +#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT12_SHFT 46 +#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT12_MASK 0x0000400000000000UL +#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT13_SHFT 47 +#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT13_MASK 0x0000800000000000UL +#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT14_SHFT 48 +#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT14_MASK 0x0001000000000000UL +#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT15_SHFT 49 +#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT15_MASK 0x0002000000000000UL +#define UV2H_EVENT_OCCURRED1_RTC_INTERVAL_INT_SHFT 50 +#define UV2H_EVENT_OCCURRED1_RTC_INTERVAL_INT_MASK 0x0004000000000000UL +#define UV2H_EVENT_OCCURRED1_BAU_DASHBOARD_INT_SHFT 51 +#define UV2H_EVENT_OCCURRED1_BAU_DASHBOARD_INT_MASK 0x0008000000000000UL + +#define UVH_EVENT_OCCURRED1_EXTIO_INT0_MASK ( \ + is_uv(UV5) ? 0x0000000000000002UL : \ + 0) +#define UVH_EVENT_OCCURRED1_EXTIO_INT0_SHFT ( \ + is_uv(UV5) ? 1 : \ + -1) + +union uvyh_event_occurred1_u { + unsigned long v; + + /* UVYH common struct */ + struct uvyh_event_occurred1_s { + unsigned long ipi_int:1; /* RW */ + unsigned long extio_int0:1; /* RW */ + unsigned long extio_int1:1; /* RW */ + unsigned long extio_int2:1; /* RW */ + unsigned long extio_int3:1; /* RW */ + unsigned long profile_int:1; /* RW */ + unsigned long bau_data:1; /* RW */ + unsigned long proc_general:1; /* RW */ + unsigned long xh_tlb_int0:1; /* RW */ + unsigned long xh_tlb_int1:1; /* RW */ + unsigned long xh_tlb_int2:1; /* RW */ + unsigned long xh_tlb_int3:1; /* RW */ + unsigned long xh_tlb_int4:1; /* RW */ + unsigned long xh_tlb_int5:1; /* RW */ + unsigned long rdm_tlb_int0:1; /* RW */ + unsigned long rdm_tlb_int1:1; /* RW */ + unsigned long rdm_tlb_int2:1; /* RW */ + unsigned long rdm_tlb_int3:1; /* RW */ + unsigned long rdm_tlb_int4:1; /* RW */ + unsigned long rdm_tlb_int5:1; /* RW */ + unsigned long rdm_tlb_int6:1; /* RW */ + unsigned long rdm_tlb_int7:1; /* RW */ + unsigned long rdm_tlb_int8:1; /* RW */ + unsigned long rdm_tlb_int9:1; /* RW */ + unsigned long rdm_tlb_int10:1; /* RW */ + unsigned long rdm_tlb_int11:1; /* RW */ + unsigned long rdm_tlb_int12:1; /* RW */ + unsigned long rdm_tlb_int13:1; /* RW */ + unsigned long rdm_tlb_int14:1; /* RW */ + unsigned long rdm_tlb_int15:1; /* RW */ + unsigned long rdm_tlb_int16:1; /* RW */ + unsigned long rdm_tlb_int17:1; /* RW */ + unsigned long rdm_tlb_int18:1; /* RW */ + unsigned long rdm_tlb_int19:1; /* RW */ + unsigned long rdm_tlb_int20:1; /* RW */ + unsigned long rdm_tlb_int21:1; /* RW */ + unsigned long rdm_tlb_int22:1; /* RW */ + unsigned long rdm_tlb_int23:1; /* RW */ + unsigned long rsvd_38_63:26; + } sy; + + /* UV5 unique struct */ + struct uv5h_event_occurred1_s { + unsigned long ipi_int:1; /* RW */ + unsigned long extio_int0:1; /* RW */ + unsigned long extio_int1:1; /* RW */ + unsigned long extio_int2:1; /* RW */ + unsigned long extio_int3:1; /* RW */ + unsigned long profile_int:1; /* RW */ + unsigned long bau_data:1; /* RW */ + unsigned long proc_general:1; /* RW */ + unsigned long xh_tlb_int0:1; /* RW */ + unsigned long xh_tlb_int1:1; /* RW */ + unsigned long xh_tlb_int2:1; /* RW */ + unsigned long xh_tlb_int3:1; /* RW */ + unsigned long xh_tlb_int4:1; /* RW */ + unsigned long xh_tlb_int5:1; /* RW */ + unsigned long rdm_tlb_int0:1; /* RW */ + unsigned long rdm_tlb_int1:1; /* RW */ + unsigned long rdm_tlb_int2:1; /* RW */ + unsigned long rdm_tlb_int3:1; /* RW */ + unsigned long rdm_tlb_int4:1; /* RW */ + unsigned long rdm_tlb_int5:1; /* RW */ + unsigned long rdm_tlb_int6:1; /* RW */ + unsigned long rdm_tlb_int7:1; /* RW */ + unsigned long rdm_tlb_int8:1; /* RW */ + unsigned long rdm_tlb_int9:1; /* RW */ + unsigned long rdm_tlb_int10:1; /* RW */ + unsigned long rdm_tlb_int11:1; /* RW */ + unsigned long rdm_tlb_int12:1; /* RW */ + unsigned long rdm_tlb_int13:1; /* RW */ + unsigned long rdm_tlb_int14:1; /* RW */ + unsigned long rdm_tlb_int15:1; /* RW */ + unsigned long rdm_tlb_int16:1; /* RW */ + unsigned long rdm_tlb_int17:1; /* RW */ + unsigned long rdm_tlb_int18:1; /* RW */ + unsigned long rdm_tlb_int19:1; /* RW */ + unsigned long rdm_tlb_int20:1; /* RW */ + unsigned long rdm_tlb_int21:1; /* RW */ + unsigned long rdm_tlb_int22:1; /* RW */ + unsigned long rdm_tlb_int23:1; /* RW */ + unsigned long rsvd_38_63:26; + } s5; + + /* UV4 unique struct */ + struct uv4h_event_occurred1_s { + unsigned long profile_int:1; /* RW */ + unsigned long bau_data:1; /* RW */ + unsigned long proc_general:1; /* RW */ + unsigned long gr0_tlb_int0:1; /* RW */ + unsigned long gr0_tlb_int1:1; /* RW */ + unsigned long gr0_tlb_int2:1; /* RW */ + unsigned long gr0_tlb_int3:1; /* RW */ + unsigned long gr0_tlb_int4:1; /* RW */ + unsigned long gr0_tlb_int5:1; /* RW */ + unsigned long gr0_tlb_int6:1; /* RW */ + unsigned long gr0_tlb_int7:1; /* RW */ + unsigned long gr0_tlb_int8:1; /* RW */ + unsigned long gr0_tlb_int9:1; /* RW */ + unsigned long gr0_tlb_int10:1; /* RW */ + unsigned long gr0_tlb_int11:1; /* RW */ + unsigned long gr0_tlb_int12:1; /* RW */ + unsigned long gr0_tlb_int13:1; /* RW */ + unsigned long gr0_tlb_int14:1; /* RW */ + unsigned long gr0_tlb_int15:1; /* RW */ + unsigned long gr0_tlb_int16:1; /* RW */ + unsigned long gr0_tlb_int17:1; /* RW */ + unsigned long gr0_tlb_int18:1; /* RW */ + unsigned long gr0_tlb_int19:1; /* RW */ + unsigned long gr0_tlb_int20:1; /* RW */ + unsigned long gr0_tlb_int21:1; /* RW */ + unsigned long gr0_tlb_int22:1; /* RW */ + unsigned long gr0_tlb_int23:1; /* RW */ + unsigned long gr1_tlb_int0:1; /* RW */ + unsigned long gr1_tlb_int1:1; /* RW */ + unsigned long gr1_tlb_int2:1; /* RW */ + unsigned long gr1_tlb_int3:1; /* RW */ + unsigned long gr1_tlb_int4:1; /* RW */ + unsigned long gr1_tlb_int5:1; /* RW */ + unsigned long gr1_tlb_int6:1; /* RW */ + unsigned long gr1_tlb_int7:1; /* RW */ + unsigned long gr1_tlb_int8:1; /* RW */ + unsigned long gr1_tlb_int9:1; /* RW */ + unsigned long gr1_tlb_int10:1; /* RW */ + unsigned long gr1_tlb_int11:1; /* RW */ + unsigned long gr1_tlb_int12:1; /* RW */ + unsigned long gr1_tlb_int13:1; /* RW */ + unsigned long gr1_tlb_int14:1; /* RW */ + unsigned long gr1_tlb_int15:1; /* RW */ + unsigned long gr1_tlb_int16:1; /* RW */ + unsigned long gr1_tlb_int17:1; /* RW */ + unsigned long gr1_tlb_int18:1; /* RW */ + unsigned long gr1_tlb_int19:1; /* RW */ + unsigned long gr1_tlb_int20:1; /* RW */ + unsigned long gr1_tlb_int21:1; /* RW */ + unsigned long gr1_tlb_int22:1; /* RW */ + unsigned long gr1_tlb_int23:1; /* RW */ + unsigned long rsvd_51_63:13; + } s4; + + /* UV3 unique struct */ + struct uv3h_event_occurred1_s { + unsigned long bau_data:1; /* RW */ + unsigned long power_management_req:1; /* RW */ + unsigned long message_accelerator_int0:1; /* RW */ + unsigned long message_accelerator_int1:1; /* RW */ + unsigned long message_accelerator_int2:1; /* RW */ + unsigned long message_accelerator_int3:1; /* RW */ + unsigned long message_accelerator_int4:1; /* RW */ + unsigned long message_accelerator_int5:1; /* RW */ + unsigned long message_accelerator_int6:1; /* RW */ + unsigned long message_accelerator_int7:1; /* RW */ + unsigned long message_accelerator_int8:1; /* RW */ + unsigned long message_accelerator_int9:1; /* RW */ + unsigned long message_accelerator_int10:1; /* RW */ + unsigned long message_accelerator_int11:1; /* RW */ + unsigned long message_accelerator_int12:1; /* RW */ + unsigned long message_accelerator_int13:1; /* RW */ + unsigned long message_accelerator_int14:1; /* RW */ + unsigned long message_accelerator_int15:1; /* RW */ + unsigned long gr0_tlb_int0:1; /* RW */ + unsigned long gr0_tlb_int1:1; /* RW */ + unsigned long gr0_tlb_int2:1; /* RW */ + unsigned long gr0_tlb_int3:1; /* RW */ + unsigned long gr0_tlb_int4:1; /* RW */ + unsigned long gr0_tlb_int5:1; /* RW */ + unsigned long gr0_tlb_int6:1; /* RW */ + unsigned long gr0_tlb_int7:1; /* RW */ + unsigned long gr0_tlb_int8:1; /* RW */ + unsigned long gr0_tlb_int9:1; /* RW */ + unsigned long gr0_tlb_int10:1; /* RW */ + unsigned long gr0_tlb_int11:1; /* RW */ + unsigned long gr0_tlb_int12:1; /* RW */ + unsigned long gr0_tlb_int13:1; /* RW */ + unsigned long gr0_tlb_int14:1; /* RW */ + unsigned long gr0_tlb_int15:1; /* RW */ + unsigned long gr1_tlb_int0:1; /* RW */ + unsigned long gr1_tlb_int1:1; /* RW */ + unsigned long gr1_tlb_int2:1; /* RW */ + unsigned long gr1_tlb_int3:1; /* RW */ + unsigned long gr1_tlb_int4:1; /* RW */ + unsigned long gr1_tlb_int5:1; /* RW */ + unsigned long gr1_tlb_int6:1; /* RW */ + unsigned long gr1_tlb_int7:1; /* RW */ + unsigned long gr1_tlb_int8:1; /* RW */ + unsigned long gr1_tlb_int9:1; /* RW */ + unsigned long gr1_tlb_int10:1; /* RW */ + unsigned long gr1_tlb_int11:1; /* RW */ + unsigned long gr1_tlb_int12:1; /* RW */ + unsigned long gr1_tlb_int13:1; /* RW */ + unsigned long gr1_tlb_int14:1; /* RW */ + unsigned long gr1_tlb_int15:1; /* RW */ + unsigned long rtc_interval_int:1; /* RW */ + unsigned long bau_dashboard_int:1; /* RW */ + unsigned long rsvd_52_63:12; + } s3; + + /* UV2 unique struct */ + struct uv2h_event_occurred1_s { + unsigned long bau_data:1; /* RW */ + unsigned long power_management_req:1; /* RW */ + unsigned long message_accelerator_int0:1; /* RW */ + unsigned long message_accelerator_int1:1; /* RW */ + unsigned long message_accelerator_int2:1; /* RW */ + unsigned long message_accelerator_int3:1; /* RW */ + unsigned long message_accelerator_int4:1; /* RW */ + unsigned long message_accelerator_int5:1; /* RW */ + unsigned long message_accelerator_int6:1; /* RW */ + unsigned long message_accelerator_int7:1; /* RW */ + unsigned long message_accelerator_int8:1; /* RW */ + unsigned long message_accelerator_int9:1; /* RW */ + unsigned long message_accelerator_int10:1; /* RW */ + unsigned long message_accelerator_int11:1; /* RW */ + unsigned long message_accelerator_int12:1; /* RW */ + unsigned long message_accelerator_int13:1; /* RW */ + unsigned long message_accelerator_int14:1; /* RW */ + unsigned long message_accelerator_int15:1; /* RW */ + unsigned long gr0_tlb_int0:1; /* RW */ + unsigned long gr0_tlb_int1:1; /* RW */ + unsigned long gr0_tlb_int2:1; /* RW */ + unsigned long gr0_tlb_int3:1; /* RW */ + unsigned long gr0_tlb_int4:1; /* RW */ + unsigned long gr0_tlb_int5:1; /* RW */ + unsigned long gr0_tlb_int6:1; /* RW */ + unsigned long gr0_tlb_int7:1; /* RW */ + unsigned long gr0_tlb_int8:1; /* RW */ + unsigned long gr0_tlb_int9:1; /* RW */ + unsigned long gr0_tlb_int10:1; /* RW */ + unsigned long gr0_tlb_int11:1; /* RW */ + unsigned long gr0_tlb_int12:1; /* RW */ + unsigned long gr0_tlb_int13:1; /* RW */ + unsigned long gr0_tlb_int14:1; /* RW */ + unsigned long gr0_tlb_int15:1; /* RW */ + unsigned long gr1_tlb_int0:1; /* RW */ + unsigned long gr1_tlb_int1:1; /* RW */ + unsigned long gr1_tlb_int2:1; /* RW */ + unsigned long gr1_tlb_int3:1; /* RW */ + unsigned long gr1_tlb_int4:1; /* RW */ + unsigned long gr1_tlb_int5:1; /* RW */ + unsigned long gr1_tlb_int6:1; /* RW */ + unsigned long gr1_tlb_int7:1; /* RW */ + unsigned long gr1_tlb_int8:1; /* RW */ + unsigned long gr1_tlb_int9:1; /* RW */ + unsigned long gr1_tlb_int10:1; /* RW */ + unsigned long gr1_tlb_int11:1; /* RW */ + unsigned long gr1_tlb_int12:1; /* RW */ + unsigned long gr1_tlb_int13:1; /* RW */ + unsigned long gr1_tlb_int14:1; /* RW */ + unsigned long gr1_tlb_int15:1; /* RW */ + unsigned long rtc_interval_int:1; /* RW */ + unsigned long bau_dashboard_int:1; /* RW */ + unsigned long rsvd_52_63:12; + } s2; +}; + +/* ========================================================================= */ +/* UVH_EVENT_OCCURRED1_ALIAS */ +/* ========================================================================= */ +#define UVH_EVENT_OCCURRED1_ALIAS 0x70088UL + + +/* ========================================================================= */ +/* UVH_EVENT_OCCURRED2 */ +/* ========================================================================= */ +#define UVH_EVENT_OCCURRED2 0x70100UL + + + +/* UVYH common defines */ +#define UVYH_EVENT_OCCURRED2_RTC_INTERVAL_INT_SHFT 0 +#define UVYH_EVENT_OCCURRED2_RTC_INTERVAL_INT_MASK 0x0000000000000001UL +#define UVYH_EVENT_OCCURRED2_BAU_DASHBOARD_INT_SHFT 1 +#define UVYH_EVENT_OCCURRED2_BAU_DASHBOARD_INT_MASK 0x0000000000000002UL +#define UVYH_EVENT_OCCURRED2_RTC_0_SHFT 2 +#define UVYH_EVENT_OCCURRED2_RTC_0_MASK 0x0000000000000004UL +#define UVYH_EVENT_OCCURRED2_RTC_1_SHFT 3 +#define UVYH_EVENT_OCCURRED2_RTC_1_MASK 0x0000000000000008UL +#define UVYH_EVENT_OCCURRED2_RTC_2_SHFT 4 +#define UVYH_EVENT_OCCURRED2_RTC_2_MASK 0x0000000000000010UL +#define UVYH_EVENT_OCCURRED2_RTC_3_SHFT 5 +#define UVYH_EVENT_OCCURRED2_RTC_3_MASK 0x0000000000000020UL +#define UVYH_EVENT_OCCURRED2_RTC_4_SHFT 6 +#define UVYH_EVENT_OCCURRED2_RTC_4_MASK 0x0000000000000040UL +#define UVYH_EVENT_OCCURRED2_RTC_5_SHFT 7 +#define UVYH_EVENT_OCCURRED2_RTC_5_MASK 0x0000000000000080UL +#define UVYH_EVENT_OCCURRED2_RTC_6_SHFT 8 +#define UVYH_EVENT_OCCURRED2_RTC_6_MASK 0x0000000000000100UL +#define UVYH_EVENT_OCCURRED2_RTC_7_SHFT 9 +#define UVYH_EVENT_OCCURRED2_RTC_7_MASK 0x0000000000000200UL +#define UVYH_EVENT_OCCURRED2_RTC_8_SHFT 10 +#define UVYH_EVENT_OCCURRED2_RTC_8_MASK 0x0000000000000400UL +#define UVYH_EVENT_OCCURRED2_RTC_9_SHFT 11 +#define UVYH_EVENT_OCCURRED2_RTC_9_MASK 0x0000000000000800UL +#define UVYH_EVENT_OCCURRED2_RTC_10_SHFT 12 +#define UVYH_EVENT_OCCURRED2_RTC_10_MASK 0x0000000000001000UL +#define UVYH_EVENT_OCCURRED2_RTC_11_SHFT 13 +#define UVYH_EVENT_OCCURRED2_RTC_11_MASK 0x0000000000002000UL +#define UVYH_EVENT_OCCURRED2_RTC_12_SHFT 14 +#define UVYH_EVENT_OCCURRED2_RTC_12_MASK 0x0000000000004000UL +#define UVYH_EVENT_OCCURRED2_RTC_13_SHFT 15 +#define UVYH_EVENT_OCCURRED2_RTC_13_MASK 0x0000000000008000UL +#define UVYH_EVENT_OCCURRED2_RTC_14_SHFT 16 +#define UVYH_EVENT_OCCURRED2_RTC_14_MASK 0x0000000000010000UL +#define UVYH_EVENT_OCCURRED2_RTC_15_SHFT 17 +#define UVYH_EVENT_OCCURRED2_RTC_15_MASK 0x0000000000020000UL +#define UVYH_EVENT_OCCURRED2_RTC_16_SHFT 18 +#define UVYH_EVENT_OCCURRED2_RTC_16_MASK 0x0000000000040000UL +#define UVYH_EVENT_OCCURRED2_RTC_17_SHFT 19 +#define UVYH_EVENT_OCCURRED2_RTC_17_MASK 0x0000000000080000UL +#define UVYH_EVENT_OCCURRED2_RTC_18_SHFT 20 +#define UVYH_EVENT_OCCURRED2_RTC_18_MASK 0x0000000000100000UL +#define UVYH_EVENT_OCCURRED2_RTC_19_SHFT 21 +#define UVYH_EVENT_OCCURRED2_RTC_19_MASK 0x0000000000200000UL +#define UVYH_EVENT_OCCURRED2_RTC_20_SHFT 22 +#define UVYH_EVENT_OCCURRED2_RTC_20_MASK 0x0000000000400000UL +#define UVYH_EVENT_OCCURRED2_RTC_21_SHFT 23 +#define UVYH_EVENT_OCCURRED2_RTC_21_MASK 0x0000000000800000UL +#define UVYH_EVENT_OCCURRED2_RTC_22_SHFT 24 +#define UVYH_EVENT_OCCURRED2_RTC_22_MASK 0x0000000001000000UL +#define UVYH_EVENT_OCCURRED2_RTC_23_SHFT 25 +#define UVYH_EVENT_OCCURRED2_RTC_23_MASK 0x0000000002000000UL +#define UVYH_EVENT_OCCURRED2_RTC_24_SHFT 26 +#define UVYH_EVENT_OCCURRED2_RTC_24_MASK 0x0000000004000000UL +#define UVYH_EVENT_OCCURRED2_RTC_25_SHFT 27 +#define UVYH_EVENT_OCCURRED2_RTC_25_MASK 0x0000000008000000UL +#define UVYH_EVENT_OCCURRED2_RTC_26_SHFT 28 +#define UVYH_EVENT_OCCURRED2_RTC_26_MASK 0x0000000010000000UL +#define UVYH_EVENT_OCCURRED2_RTC_27_SHFT 29 +#define UVYH_EVENT_OCCURRED2_RTC_27_MASK 0x0000000020000000UL +#define UVYH_EVENT_OCCURRED2_RTC_28_SHFT 30 +#define UVYH_EVENT_OCCURRED2_RTC_28_MASK 0x0000000040000000UL +#define UVYH_EVENT_OCCURRED2_RTC_29_SHFT 31 +#define UVYH_EVENT_OCCURRED2_RTC_29_MASK 0x0000000080000000UL +#define UVYH_EVENT_OCCURRED2_RTC_30_SHFT 32 +#define UVYH_EVENT_OCCURRED2_RTC_30_MASK 0x0000000100000000UL +#define UVYH_EVENT_OCCURRED2_RTC_31_SHFT 33 +#define UVYH_EVENT_OCCURRED2_RTC_31_MASK 0x0000000200000000UL + +/* UV4 unique defines */ +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT0_SHFT 0 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT0_MASK 0x0000000000000001UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT1_SHFT 1 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT1_MASK 0x0000000000000002UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT2_SHFT 2 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT2_MASK 0x0000000000000004UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT3_SHFT 3 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT3_MASK 0x0000000000000008UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT4_SHFT 4 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT4_MASK 0x0000000000000010UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT5_SHFT 5 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT5_MASK 0x0000000000000020UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT6_SHFT 6 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT6_MASK 0x0000000000000040UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT7_SHFT 7 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT7_MASK 0x0000000000000080UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT8_SHFT 8 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT8_MASK 0x0000000000000100UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT9_SHFT 9 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT9_MASK 0x0000000000000200UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT10_SHFT 10 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT10_MASK 0x0000000000000400UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT11_SHFT 11 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT11_MASK 0x0000000000000800UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT12_SHFT 12 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT12_MASK 0x0000000000001000UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT13_SHFT 13 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT13_MASK 0x0000000000002000UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT14_SHFT 14 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT14_MASK 0x0000000000004000UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT15_SHFT 15 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT15_MASK 0x0000000000008000UL +#define UV4H_EVENT_OCCURRED2_RTC_INTERVAL_INT_SHFT 16 +#define UV4H_EVENT_OCCURRED2_RTC_INTERVAL_INT_MASK 0x0000000000010000UL +#define UV4H_EVENT_OCCURRED2_BAU_DASHBOARD_INT_SHFT 17 +#define UV4H_EVENT_OCCURRED2_BAU_DASHBOARD_INT_MASK 0x0000000000020000UL +#define UV4H_EVENT_OCCURRED2_RTC_0_SHFT 18 +#define UV4H_EVENT_OCCURRED2_RTC_0_MASK 0x0000000000040000UL +#define UV4H_EVENT_OCCURRED2_RTC_1_SHFT 19 +#define UV4H_EVENT_OCCURRED2_RTC_1_MASK 0x0000000000080000UL +#define UV4H_EVENT_OCCURRED2_RTC_2_SHFT 20 +#define UV4H_EVENT_OCCURRED2_RTC_2_MASK 0x0000000000100000UL +#define UV4H_EVENT_OCCURRED2_RTC_3_SHFT 21 +#define UV4H_EVENT_OCCURRED2_RTC_3_MASK 0x0000000000200000UL +#define UV4H_EVENT_OCCURRED2_RTC_4_SHFT 22 +#define UV4H_EVENT_OCCURRED2_RTC_4_MASK 0x0000000000400000UL +#define UV4H_EVENT_OCCURRED2_RTC_5_SHFT 23 +#define UV4H_EVENT_OCCURRED2_RTC_5_MASK 0x0000000000800000UL +#define UV4H_EVENT_OCCURRED2_RTC_6_SHFT 24 +#define UV4H_EVENT_OCCURRED2_RTC_6_MASK 0x0000000001000000UL +#define UV4H_EVENT_OCCURRED2_RTC_7_SHFT 25 +#define UV4H_EVENT_OCCURRED2_RTC_7_MASK 0x0000000002000000UL +#define UV4H_EVENT_OCCURRED2_RTC_8_SHFT 26 +#define UV4H_EVENT_OCCURRED2_RTC_8_MASK 0x0000000004000000UL +#define UV4H_EVENT_OCCURRED2_RTC_9_SHFT 27 +#define UV4H_EVENT_OCCURRED2_RTC_9_MASK 0x0000000008000000UL +#define UV4H_EVENT_OCCURRED2_RTC_10_SHFT 28 +#define UV4H_EVENT_OCCURRED2_RTC_10_MASK 0x0000000010000000UL +#define UV4H_EVENT_OCCURRED2_RTC_11_SHFT 29 +#define UV4H_EVENT_OCCURRED2_RTC_11_MASK 0x0000000020000000UL +#define UV4H_EVENT_OCCURRED2_RTC_12_SHFT 30 +#define UV4H_EVENT_OCCURRED2_RTC_12_MASK 0x0000000040000000UL +#define UV4H_EVENT_OCCURRED2_RTC_13_SHFT 31 +#define UV4H_EVENT_OCCURRED2_RTC_13_MASK 0x0000000080000000UL +#define UV4H_EVENT_OCCURRED2_RTC_14_SHFT 32 +#define UV4H_EVENT_OCCURRED2_RTC_14_MASK 0x0000000100000000UL +#define UV4H_EVENT_OCCURRED2_RTC_15_SHFT 33 +#define UV4H_EVENT_OCCURRED2_RTC_15_MASK 0x0000000200000000UL +#define UV4H_EVENT_OCCURRED2_RTC_16_SHFT 34 +#define UV4H_EVENT_OCCURRED2_RTC_16_MASK 0x0000000400000000UL +#define UV4H_EVENT_OCCURRED2_RTC_17_SHFT 35 +#define UV4H_EVENT_OCCURRED2_RTC_17_MASK 0x0000000800000000UL +#define UV4H_EVENT_OCCURRED2_RTC_18_SHFT 36 +#define UV4H_EVENT_OCCURRED2_RTC_18_MASK 0x0000001000000000UL +#define UV4H_EVENT_OCCURRED2_RTC_19_SHFT 37 +#define UV4H_EVENT_OCCURRED2_RTC_19_MASK 0x0000002000000000UL +#define UV4H_EVENT_OCCURRED2_RTC_20_SHFT 38 +#define UV4H_EVENT_OCCURRED2_RTC_20_MASK 0x0000004000000000UL +#define UV4H_EVENT_OCCURRED2_RTC_21_SHFT 39 +#define UV4H_EVENT_OCCURRED2_RTC_21_MASK 0x0000008000000000UL +#define UV4H_EVENT_OCCURRED2_RTC_22_SHFT 40 +#define UV4H_EVENT_OCCURRED2_RTC_22_MASK 0x0000010000000000UL +#define UV4H_EVENT_OCCURRED2_RTC_23_SHFT 41 +#define UV4H_EVENT_OCCURRED2_RTC_23_MASK 0x0000020000000000UL +#define UV4H_EVENT_OCCURRED2_RTC_24_SHFT 42 +#define UV4H_EVENT_OCCURRED2_RTC_24_MASK 0x0000040000000000UL +#define UV4H_EVENT_OCCURRED2_RTC_25_SHFT 43 +#define UV4H_EVENT_OCCURRED2_RTC_25_MASK 0x0000080000000000UL +#define UV4H_EVENT_OCCURRED2_RTC_26_SHFT 44 +#define UV4H_EVENT_OCCURRED2_RTC_26_MASK 0x0000100000000000UL +#define UV4H_EVENT_OCCURRED2_RTC_27_SHFT 45 +#define UV4H_EVENT_OCCURRED2_RTC_27_MASK 0x0000200000000000UL +#define UV4H_EVENT_OCCURRED2_RTC_28_SHFT 46 +#define UV4H_EVENT_OCCURRED2_RTC_28_MASK 0x0000400000000000UL +#define UV4H_EVENT_OCCURRED2_RTC_29_SHFT 47 +#define UV4H_EVENT_OCCURRED2_RTC_29_MASK 0x0000800000000000UL +#define UV4H_EVENT_OCCURRED2_RTC_30_SHFT 48 +#define UV4H_EVENT_OCCURRED2_RTC_30_MASK 0x0001000000000000UL +#define UV4H_EVENT_OCCURRED2_RTC_31_SHFT 49 +#define UV4H_EVENT_OCCURRED2_RTC_31_MASK 0x0002000000000000UL + +/* UV3 unique defines */ +#define UV3H_EVENT_OCCURRED2_RTC_0_SHFT 0 +#define UV3H_EVENT_OCCURRED2_RTC_0_MASK 0x0000000000000001UL +#define UV3H_EVENT_OCCURRED2_RTC_1_SHFT 1 +#define UV3H_EVENT_OCCURRED2_RTC_1_MASK 0x0000000000000002UL +#define UV3H_EVENT_OCCURRED2_RTC_2_SHFT 2 +#define UV3H_EVENT_OCCURRED2_RTC_2_MASK 0x0000000000000004UL +#define UV3H_EVENT_OCCURRED2_RTC_3_SHFT 3 +#define UV3H_EVENT_OCCURRED2_RTC_3_MASK 0x0000000000000008UL +#define UV3H_EVENT_OCCURRED2_RTC_4_SHFT 4 +#define UV3H_EVENT_OCCURRED2_RTC_4_MASK 0x0000000000000010UL +#define UV3H_EVENT_OCCURRED2_RTC_5_SHFT 5 +#define UV3H_EVENT_OCCURRED2_RTC_5_MASK 0x0000000000000020UL +#define UV3H_EVENT_OCCURRED2_RTC_6_SHFT 6 +#define UV3H_EVENT_OCCURRED2_RTC_6_MASK 0x0000000000000040UL +#define UV3H_EVENT_OCCURRED2_RTC_7_SHFT 7 +#define UV3H_EVENT_OCCURRED2_RTC_7_MASK 0x0000000000000080UL +#define UV3H_EVENT_OCCURRED2_RTC_8_SHFT 8 +#define UV3H_EVENT_OCCURRED2_RTC_8_MASK 0x0000000000000100UL +#define UV3H_EVENT_OCCURRED2_RTC_9_SHFT 9 +#define UV3H_EVENT_OCCURRED2_RTC_9_MASK 0x0000000000000200UL +#define UV3H_EVENT_OCCURRED2_RTC_10_SHFT 10 +#define UV3H_EVENT_OCCURRED2_RTC_10_MASK 0x0000000000000400UL +#define UV3H_EVENT_OCCURRED2_RTC_11_SHFT 11 +#define UV3H_EVENT_OCCURRED2_RTC_11_MASK 0x0000000000000800UL +#define UV3H_EVENT_OCCURRED2_RTC_12_SHFT 12 +#define UV3H_EVENT_OCCURRED2_RTC_12_MASK 0x0000000000001000UL +#define UV3H_EVENT_OCCURRED2_RTC_13_SHFT 13 +#define UV3H_EVENT_OCCURRED2_RTC_13_MASK 0x0000000000002000UL +#define UV3H_EVENT_OCCURRED2_RTC_14_SHFT 14 +#define UV3H_EVENT_OCCURRED2_RTC_14_MASK 0x0000000000004000UL +#define UV3H_EVENT_OCCURRED2_RTC_15_SHFT 15 +#define UV3H_EVENT_OCCURRED2_RTC_15_MASK 0x0000000000008000UL +#define UV3H_EVENT_OCCURRED2_RTC_16_SHFT 16 +#define UV3H_EVENT_OCCURRED2_RTC_16_MASK 0x0000000000010000UL +#define UV3H_EVENT_OCCURRED2_RTC_17_SHFT 17 +#define UV3H_EVENT_OCCURRED2_RTC_17_MASK 0x0000000000020000UL +#define UV3H_EVENT_OCCURRED2_RTC_18_SHFT 18 +#define UV3H_EVENT_OCCURRED2_RTC_18_MASK 0x0000000000040000UL +#define UV3H_EVENT_OCCURRED2_RTC_19_SHFT 19 +#define UV3H_EVENT_OCCURRED2_RTC_19_MASK 0x0000000000080000UL +#define UV3H_EVENT_OCCURRED2_RTC_20_SHFT 20 +#define UV3H_EVENT_OCCURRED2_RTC_20_MASK 0x0000000000100000UL +#define UV3H_EVENT_OCCURRED2_RTC_21_SHFT 21 +#define UV3H_EVENT_OCCURRED2_RTC_21_MASK 0x0000000000200000UL +#define UV3H_EVENT_OCCURRED2_RTC_22_SHFT 22 +#define UV3H_EVENT_OCCURRED2_RTC_22_MASK 0x0000000000400000UL +#define UV3H_EVENT_OCCURRED2_RTC_23_SHFT 23 +#define UV3H_EVENT_OCCURRED2_RTC_23_MASK 0x0000000000800000UL +#define UV3H_EVENT_OCCURRED2_RTC_24_SHFT 24 +#define UV3H_EVENT_OCCURRED2_RTC_24_MASK 0x0000000001000000UL +#define UV3H_EVENT_OCCURRED2_RTC_25_SHFT 25 +#define UV3H_EVENT_OCCURRED2_RTC_25_MASK 0x0000000002000000UL +#define UV3H_EVENT_OCCURRED2_RTC_26_SHFT 26 +#define UV3H_EVENT_OCCURRED2_RTC_26_MASK 0x0000000004000000UL +#define UV3H_EVENT_OCCURRED2_RTC_27_SHFT 27 +#define UV3H_EVENT_OCCURRED2_RTC_27_MASK 0x0000000008000000UL +#define UV3H_EVENT_OCCURRED2_RTC_28_SHFT 28 +#define UV3H_EVENT_OCCURRED2_RTC_28_MASK 0x0000000010000000UL +#define UV3H_EVENT_OCCURRED2_RTC_29_SHFT 29 +#define UV3H_EVENT_OCCURRED2_RTC_29_MASK 0x0000000020000000UL +#define UV3H_EVENT_OCCURRED2_RTC_30_SHFT 30 +#define UV3H_EVENT_OCCURRED2_RTC_30_MASK 0x0000000040000000UL +#define UV3H_EVENT_OCCURRED2_RTC_31_SHFT 31 +#define UV3H_EVENT_OCCURRED2_RTC_31_MASK 0x0000000080000000UL + +/* UV2 unique defines */ +#define UV2H_EVENT_OCCURRED2_RTC_0_SHFT 0 +#define UV2H_EVENT_OCCURRED2_RTC_0_MASK 0x0000000000000001UL +#define UV2H_EVENT_OCCURRED2_RTC_1_SHFT 1 +#define UV2H_EVENT_OCCURRED2_RTC_1_MASK 0x0000000000000002UL +#define UV2H_EVENT_OCCURRED2_RTC_2_SHFT 2 +#define UV2H_EVENT_OCCURRED2_RTC_2_MASK 0x0000000000000004UL +#define UV2H_EVENT_OCCURRED2_RTC_3_SHFT 3 +#define UV2H_EVENT_OCCURRED2_RTC_3_MASK 0x0000000000000008UL +#define UV2H_EVENT_OCCURRED2_RTC_4_SHFT 4 +#define UV2H_EVENT_OCCURRED2_RTC_4_MASK 0x0000000000000010UL +#define UV2H_EVENT_OCCURRED2_RTC_5_SHFT 5 +#define UV2H_EVENT_OCCURRED2_RTC_5_MASK 0x0000000000000020UL +#define UV2H_EVENT_OCCURRED2_RTC_6_SHFT 6 +#define UV2H_EVENT_OCCURRED2_RTC_6_MASK 0x0000000000000040UL +#define UV2H_EVENT_OCCURRED2_RTC_7_SHFT 7 +#define UV2H_EVENT_OCCURRED2_RTC_7_MASK 0x0000000000000080UL +#define UV2H_EVENT_OCCURRED2_RTC_8_SHFT 8 +#define UV2H_EVENT_OCCURRED2_RTC_8_MASK 0x0000000000000100UL +#define UV2H_EVENT_OCCURRED2_RTC_9_SHFT 9 +#define UV2H_EVENT_OCCURRED2_RTC_9_MASK 0x0000000000000200UL +#define UV2H_EVENT_OCCURRED2_RTC_10_SHFT 10 +#define UV2H_EVENT_OCCURRED2_RTC_10_MASK 0x0000000000000400UL +#define UV2H_EVENT_OCCURRED2_RTC_11_SHFT 11 +#define UV2H_EVENT_OCCURRED2_RTC_11_MASK 0x0000000000000800UL +#define UV2H_EVENT_OCCURRED2_RTC_12_SHFT 12 +#define UV2H_EVENT_OCCURRED2_RTC_12_MASK 0x0000000000001000UL +#define UV2H_EVENT_OCCURRED2_RTC_13_SHFT 13 +#define UV2H_EVENT_OCCURRED2_RTC_13_MASK 0x0000000000002000UL +#define UV2H_EVENT_OCCURRED2_RTC_14_SHFT 14 +#define UV2H_EVENT_OCCURRED2_RTC_14_MASK 0x0000000000004000UL +#define UV2H_EVENT_OCCURRED2_RTC_15_SHFT 15 +#define UV2H_EVENT_OCCURRED2_RTC_15_MASK 0x0000000000008000UL +#define UV2H_EVENT_OCCURRED2_RTC_16_SHFT 16 +#define UV2H_EVENT_OCCURRED2_RTC_16_MASK 0x0000000000010000UL +#define UV2H_EVENT_OCCURRED2_RTC_17_SHFT 17 +#define UV2H_EVENT_OCCURRED2_RTC_17_MASK 0x0000000000020000UL +#define UV2H_EVENT_OCCURRED2_RTC_18_SHFT 18 +#define UV2H_EVENT_OCCURRED2_RTC_18_MASK 0x0000000000040000UL +#define UV2H_EVENT_OCCURRED2_RTC_19_SHFT 19 +#define UV2H_EVENT_OCCURRED2_RTC_19_MASK 0x0000000000080000UL +#define UV2H_EVENT_OCCURRED2_RTC_20_SHFT 20 +#define UV2H_EVENT_OCCURRED2_RTC_20_MASK 0x0000000000100000UL +#define UV2H_EVENT_OCCURRED2_RTC_21_SHFT 21 +#define UV2H_EVENT_OCCURRED2_RTC_21_MASK 0x0000000000200000UL +#define UV2H_EVENT_OCCURRED2_RTC_22_SHFT 22 +#define UV2H_EVENT_OCCURRED2_RTC_22_MASK 0x0000000000400000UL +#define UV2H_EVENT_OCCURRED2_RTC_23_SHFT 23 +#define UV2H_EVENT_OCCURRED2_RTC_23_MASK 0x0000000000800000UL +#define UV2H_EVENT_OCCURRED2_RTC_24_SHFT 24 +#define UV2H_EVENT_OCCURRED2_RTC_24_MASK 0x0000000001000000UL +#define UV2H_EVENT_OCCURRED2_RTC_25_SHFT 25 +#define UV2H_EVENT_OCCURRED2_RTC_25_MASK 0x0000000002000000UL +#define UV2H_EVENT_OCCURRED2_RTC_26_SHFT 26 +#define UV2H_EVENT_OCCURRED2_RTC_26_MASK 0x0000000004000000UL +#define UV2H_EVENT_OCCURRED2_RTC_27_SHFT 27 +#define UV2H_EVENT_OCCURRED2_RTC_27_MASK 0x0000000008000000UL +#define UV2H_EVENT_OCCURRED2_RTC_28_SHFT 28 +#define UV2H_EVENT_OCCURRED2_RTC_28_MASK 0x0000000010000000UL +#define UV2H_EVENT_OCCURRED2_RTC_29_SHFT 29 +#define UV2H_EVENT_OCCURRED2_RTC_29_MASK 0x0000000020000000UL +#define UV2H_EVENT_OCCURRED2_RTC_30_SHFT 30 +#define UV2H_EVENT_OCCURRED2_RTC_30_MASK 0x0000000040000000UL +#define UV2H_EVENT_OCCURRED2_RTC_31_SHFT 31 +#define UV2H_EVENT_OCCURRED2_RTC_31_MASK 0x0000000080000000UL + +#define UVH_EVENT_OCCURRED2_RTC_1_MASK ( \ + is_uv(UV5) ? 0x0000000000000008UL : \ + is_uv(UV4) ? 0x0000000000080000UL : \ + is_uv(UV3) ? 0x0000000000000002UL : \ + is_uv(UV2) ? 0x0000000000000002UL : \ + 0) +#define UVH_EVENT_OCCURRED2_RTC_1_SHFT ( \ + is_uv(UV5) ? 3 : \ + is_uv(UV4) ? 19 : \ + is_uv(UV3) ? 1 : \ + is_uv(UV2) ? 1 : \ + -1) + +union uvyh_event_occurred2_u { + unsigned long v; + + /* UVYH common struct */ + struct uvyh_event_occurred2_s { + unsigned long rtc_interval_int:1; /* RW */ + unsigned long bau_dashboard_int:1; /* RW */ + unsigned long rtc_0:1; /* RW */ + unsigned long rtc_1:1; /* RW */ + unsigned long rtc_2:1; /* RW */ + unsigned long rtc_3:1; /* RW */ + unsigned long rtc_4:1; /* RW */ + unsigned long rtc_5:1; /* RW */ + unsigned long rtc_6:1; /* RW */ + unsigned long rtc_7:1; /* RW */ + unsigned long rtc_8:1; /* RW */ + unsigned long rtc_9:1; /* RW */ + unsigned long rtc_10:1; /* RW */ + unsigned long rtc_11:1; /* RW */ + unsigned long rtc_12:1; /* RW */ + unsigned long rtc_13:1; /* RW */ + unsigned long rtc_14:1; /* RW */ + unsigned long rtc_15:1; /* RW */ + unsigned long rtc_16:1; /* RW */ + unsigned long rtc_17:1; /* RW */ + unsigned long rtc_18:1; /* RW */ + unsigned long rtc_19:1; /* RW */ + unsigned long rtc_20:1; /* RW */ + unsigned long rtc_21:1; /* RW */ + unsigned long rtc_22:1; /* RW */ + unsigned long rtc_23:1; /* RW */ + unsigned long rtc_24:1; /* RW */ + unsigned long rtc_25:1; /* RW */ + unsigned long rtc_26:1; /* RW */ + unsigned long rtc_27:1; /* RW */ + unsigned long rtc_28:1; /* RW */ + unsigned long rtc_29:1; /* RW */ + unsigned long rtc_30:1; /* RW */ + unsigned long rtc_31:1; /* RW */ + unsigned long rsvd_34_63:30; + } sy; + + /* UV5 unique struct */ + struct uv5h_event_occurred2_s { + unsigned long rtc_interval_int:1; /* RW */ + unsigned long bau_dashboard_int:1; /* RW */ + unsigned long rtc_0:1; /* RW */ + unsigned long rtc_1:1; /* RW */ + unsigned long rtc_2:1; /* RW */ + unsigned long rtc_3:1; /* RW */ + unsigned long rtc_4:1; /* RW */ + unsigned long rtc_5:1; /* RW */ + unsigned long rtc_6:1; /* RW */ + unsigned long rtc_7:1; /* RW */ + unsigned long rtc_8:1; /* RW */ + unsigned long rtc_9:1; /* RW */ + unsigned long rtc_10:1; /* RW */ + unsigned long rtc_11:1; /* RW */ + unsigned long rtc_12:1; /* RW */ + unsigned long rtc_13:1; /* RW */ + unsigned long rtc_14:1; /* RW */ + unsigned long rtc_15:1; /* RW */ + unsigned long rtc_16:1; /* RW */ + unsigned long rtc_17:1; /* RW */ + unsigned long rtc_18:1; /* RW */ + unsigned long rtc_19:1; /* RW */ + unsigned long rtc_20:1; /* RW */ + unsigned long rtc_21:1; /* RW */ + unsigned long rtc_22:1; /* RW */ + unsigned long rtc_23:1; /* RW */ + unsigned long rtc_24:1; /* RW */ + unsigned long rtc_25:1; /* RW */ + unsigned long rtc_26:1; /* RW */ + unsigned long rtc_27:1; /* RW */ + unsigned long rtc_28:1; /* RW */ + unsigned long rtc_29:1; /* RW */ + unsigned long rtc_30:1; /* RW */ + unsigned long rtc_31:1; /* RW */ + unsigned long rsvd_34_63:30; + } s5; + + /* UV4 unique struct */ + struct uv4h_event_occurred2_s { + unsigned long message_accelerator_int0:1; /* RW */ + unsigned long message_accelerator_int1:1; /* RW */ + unsigned long message_accelerator_int2:1; /* RW */ + unsigned long message_accelerator_int3:1; /* RW */ + unsigned long message_accelerator_int4:1; /* RW */ + unsigned long message_accelerator_int5:1; /* RW */ + unsigned long message_accelerator_int6:1; /* RW */ + unsigned long message_accelerator_int7:1; /* RW */ + unsigned long message_accelerator_int8:1; /* RW */ + unsigned long message_accelerator_int9:1; /* RW */ + unsigned long message_accelerator_int10:1; /* RW */ + unsigned long message_accelerator_int11:1; /* RW */ + unsigned long message_accelerator_int12:1; /* RW */ + unsigned long message_accelerator_int13:1; /* RW */ + unsigned long message_accelerator_int14:1; /* RW */ + unsigned long message_accelerator_int15:1; /* RW */ + unsigned long rtc_interval_int:1; /* RW */ + unsigned long bau_dashboard_int:1; /* RW */ + unsigned long rtc_0:1; /* RW */ + unsigned long rtc_1:1; /* RW */ + unsigned long rtc_2:1; /* RW */ + unsigned long rtc_3:1; /* RW */ + unsigned long rtc_4:1; /* RW */ + unsigned long rtc_5:1; /* RW */ + unsigned long rtc_6:1; /* RW */ + unsigned long rtc_7:1; /* RW */ + unsigned long rtc_8:1; /* RW */ + unsigned long rtc_9:1; /* RW */ + unsigned long rtc_10:1; /* RW */ + unsigned long rtc_11:1; /* RW */ + unsigned long rtc_12:1; /* RW */ + unsigned long rtc_13:1; /* RW */ + unsigned long rtc_14:1; /* RW */ + unsigned long rtc_15:1; /* RW */ + unsigned long rtc_16:1; /* RW */ + unsigned long rtc_17:1; /* RW */ + unsigned long rtc_18:1; /* RW */ + unsigned long rtc_19:1; /* RW */ + unsigned long rtc_20:1; /* RW */ + unsigned long rtc_21:1; /* RW */ + unsigned long rtc_22:1; /* RW */ + unsigned long rtc_23:1; /* RW */ + unsigned long rtc_24:1; /* RW */ + unsigned long rtc_25:1; /* RW */ + unsigned long rtc_26:1; /* RW */ + unsigned long rtc_27:1; /* RW */ + unsigned long rtc_28:1; /* RW */ + unsigned long rtc_29:1; /* RW */ + unsigned long rtc_30:1; /* RW */ + unsigned long rtc_31:1; /* RW */ + unsigned long rsvd_50_63:14; + } s4; + + /* UV3 unique struct */ + struct uv3h_event_occurred2_s { + unsigned long rtc_0:1; /* RW */ + unsigned long rtc_1:1; /* RW */ + unsigned long rtc_2:1; /* RW */ + unsigned long rtc_3:1; /* RW */ + unsigned long rtc_4:1; /* RW */ + unsigned long rtc_5:1; /* RW */ + unsigned long rtc_6:1; /* RW */ + unsigned long rtc_7:1; /* RW */ + unsigned long rtc_8:1; /* RW */ + unsigned long rtc_9:1; /* RW */ + unsigned long rtc_10:1; /* RW */ + unsigned long rtc_11:1; /* RW */ + unsigned long rtc_12:1; /* RW */ + unsigned long rtc_13:1; /* RW */ + unsigned long rtc_14:1; /* RW */ + unsigned long rtc_15:1; /* RW */ + unsigned long rtc_16:1; /* RW */ + unsigned long rtc_17:1; /* RW */ + unsigned long rtc_18:1; /* RW */ + unsigned long rtc_19:1; /* RW */ + unsigned long rtc_20:1; /* RW */ + unsigned long rtc_21:1; /* RW */ + unsigned long rtc_22:1; /* RW */ + unsigned long rtc_23:1; /* RW */ + unsigned long rtc_24:1; /* RW */ + unsigned long rtc_25:1; /* RW */ + unsigned long rtc_26:1; /* RW */ + unsigned long rtc_27:1; /* RW */ + unsigned long rtc_28:1; /* RW */ + unsigned long rtc_29:1; /* RW */ + unsigned long rtc_30:1; /* RW */ + unsigned long rtc_31:1; /* RW */ + unsigned long rsvd_32_63:32; + } s3; + + /* UV2 unique struct */ + struct uv2h_event_occurred2_s { + unsigned long rtc_0:1; /* RW */ + unsigned long rtc_1:1; /* RW */ + unsigned long rtc_2:1; /* RW */ + unsigned long rtc_3:1; /* RW */ + unsigned long rtc_4:1; /* RW */ + unsigned long rtc_5:1; /* RW */ + unsigned long rtc_6:1; /* RW */ + unsigned long rtc_7:1; /* RW */ + unsigned long rtc_8:1; /* RW */ + unsigned long rtc_9:1; /* RW */ + unsigned long rtc_10:1; /* RW */ + unsigned long rtc_11:1; /* RW */ + unsigned long rtc_12:1; /* RW */ + unsigned long rtc_13:1; /* RW */ + unsigned long rtc_14:1; /* RW */ + unsigned long rtc_15:1; /* RW */ + unsigned long rtc_16:1; /* RW */ + unsigned long rtc_17:1; /* RW */ + unsigned long rtc_18:1; /* RW */ + unsigned long rtc_19:1; /* RW */ + unsigned long rtc_20:1; /* RW */ + unsigned long rtc_21:1; /* RW */ + unsigned long rtc_22:1; /* RW */ + unsigned long rtc_23:1; /* RW */ + unsigned long rtc_24:1; /* RW */ + unsigned long rtc_25:1; /* RW */ + unsigned long rtc_26:1; /* RW */ + unsigned long rtc_27:1; /* RW */ + unsigned long rtc_28:1; /* RW */ + unsigned long rtc_29:1; /* RW */ + unsigned long rtc_30:1; /* RW */ + unsigned long rtc_31:1; /* RW */ + unsigned long rsvd_32_63:32; + } s2; +}; + +/* ========================================================================= */ +/* UVH_EVENT_OCCURRED2_ALIAS */ +/* ========================================================================= */ +#define UVH_EVENT_OCCURRED2_ALIAS 0x70108UL /* ========================================================================= */ @@ -585,51 +2135,148 @@ union uvh_event_occurred0_u { /* ========================================================================= */ #define UVH_EXTIO_INT0_BROADCAST 0x61448UL -#define UV2H_EXTIO_INT0_BROADCAST_32 0x3f0 -#define UV3H_EXTIO_INT0_BROADCAST_32 0x3f0 -#define UV4H_EXTIO_INT0_BROADCAST_32 0x310 -#define UVH_EXTIO_INT0_BROADCAST_32 ( \ - is_uv2_hub() ? UV2H_EXTIO_INT0_BROADCAST_32 : \ - is_uv3_hub() ? UV3H_EXTIO_INT0_BROADCAST_32 : \ - /*is_uv4_hub*/ UV4H_EXTIO_INT0_BROADCAST_32) - +/* UVH common defines*/ #define UVH_EXTIO_INT0_BROADCAST_ENABLE_SHFT 0 #define UVH_EXTIO_INT0_BROADCAST_ENABLE_MASK 0x0000000000000001UL union uvh_extio_int0_broadcast_u { unsigned long v; + + /* UVH common struct */ struct uvh_extio_int0_broadcast_s { unsigned long enable:1; /* RW */ unsigned long rsvd_1_63:63; } s; + + /* UV5 unique struct */ + struct uv5h_extio_int0_broadcast_s { + unsigned long enable:1; /* RW */ + unsigned long rsvd_1_63:63; + } s5; + + /* UV4 unique struct */ + struct uv4h_extio_int0_broadcast_s { + unsigned long enable:1; /* RW */ + unsigned long rsvd_1_63:63; + } s4; + + /* UV3 unique struct */ + struct uv3h_extio_int0_broadcast_s { + unsigned long enable:1; /* RW */ + unsigned long rsvd_1_63:63; + } s3; + + /* UV2 unique struct */ + struct uv2h_extio_int0_broadcast_s { + unsigned long enable:1; /* RW */ + unsigned long rsvd_1_63:63; + } s2; +}; + +/* ========================================================================= */ +/* UVH_GR0_GAM_GR_CONFIG */ +/* ========================================================================= */ +#define UVH_GR0_GAM_GR_CONFIG ( \ + is_uv(UV5) ? 0x600028UL : \ + is_uv(UV4) ? 0x600028UL : \ + is_uv(UV3) ? 0xc00028UL : \ + is_uv(UV2) ? 0xc00028UL : \ + 0) + + + +/* UVYH common defines */ +#define UVYH_GR0_GAM_GR_CONFIG_SUBSPACE_SHFT 10 +#define UVYH_GR0_GAM_GR_CONFIG_SUBSPACE_MASK 0x0000000000000400UL + +/* UV4 unique defines */ +#define UV4H_GR0_GAM_GR_CONFIG_SUBSPACE_SHFT 10 +#define UV4H_GR0_GAM_GR_CONFIG_SUBSPACE_MASK 0x0000000000000400UL + +/* UV3 unique defines */ +#define UV3H_GR0_GAM_GR_CONFIG_M_SKT_SHFT 0 +#define UV3H_GR0_GAM_GR_CONFIG_M_SKT_MASK 0x000000000000003fUL +#define UV3H_GR0_GAM_GR_CONFIG_SUBSPACE_SHFT 10 +#define UV3H_GR0_GAM_GR_CONFIG_SUBSPACE_MASK 0x0000000000000400UL + +/* UV2 unique defines */ +#define UV2H_GR0_GAM_GR_CONFIG_N_GR_SHFT 0 +#define UV2H_GR0_GAM_GR_CONFIG_N_GR_MASK 0x000000000000000fUL + + +union uvyh_gr0_gam_gr_config_u { + unsigned long v; + + /* UVYH common struct */ + struct uvyh_gr0_gam_gr_config_s { + unsigned long rsvd_0_9:10; + unsigned long subspace:1; /* RW */ + unsigned long rsvd_11_63:53; + } sy; + + /* UV5 unique struct */ + struct uv5h_gr0_gam_gr_config_s { + unsigned long rsvd_0_9:10; + unsigned long subspace:1; /* RW */ + unsigned long rsvd_11_63:53; + } s5; + + /* UV4 unique struct */ + struct uv4h_gr0_gam_gr_config_s { + unsigned long rsvd_0_9:10; + unsigned long subspace:1; /* RW */ + unsigned long rsvd_11_63:53; + } s4; + + /* UV3 unique struct */ + struct uv3h_gr0_gam_gr_config_s { + unsigned long m_skt:6; /* RW */ + unsigned long undef_6_9:4; /* Undefined */ + unsigned long subspace:1; /* RW */ + unsigned long reserved:53; + } s3; + + /* UV2 unique struct */ + struct uv2h_gr0_gam_gr_config_s { + unsigned long n_gr:4; /* RW */ + unsigned long reserved:60; + } s2; }; /* ========================================================================= */ /* UVH_GR0_TLB_INT0_CONFIG */ /* ========================================================================= */ -#define UVH_GR0_TLB_INT0_CONFIG 0x61b00UL - -#define UVH_GR0_TLB_INT0_CONFIG_VECTOR_SHFT 0 -#define UVH_GR0_TLB_INT0_CONFIG_DM_SHFT 8 -#define UVH_GR0_TLB_INT0_CONFIG_DESTMODE_SHFT 11 -#define UVH_GR0_TLB_INT0_CONFIG_STATUS_SHFT 12 -#define UVH_GR0_TLB_INT0_CONFIG_P_SHFT 13 -#define UVH_GR0_TLB_INT0_CONFIG_T_SHFT 15 -#define UVH_GR0_TLB_INT0_CONFIG_M_SHFT 16 -#define UVH_GR0_TLB_INT0_CONFIG_APIC_ID_SHFT 32 -#define UVH_GR0_TLB_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL -#define UVH_GR0_TLB_INT0_CONFIG_DM_MASK 0x0000000000000700UL -#define UVH_GR0_TLB_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL -#define UVH_GR0_TLB_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL -#define UVH_GR0_TLB_INT0_CONFIG_P_MASK 0x0000000000002000UL -#define UVH_GR0_TLB_INT0_CONFIG_T_MASK 0x0000000000008000UL -#define UVH_GR0_TLB_INT0_CONFIG_M_MASK 0x0000000000010000UL -#define UVH_GR0_TLB_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL +#define UVH_GR0_TLB_INT0_CONFIG ( \ + is_uv(UV4) ? 0x61b00UL : \ + is_uv(UV3) ? 0x61b00UL : \ + is_uv(UV2) ? 0x61b00UL : \ + uv_undefined("UVH_GR0_TLB_INT0_CONFIG")) + + +/* UVXH common defines */ +#define UVXH_GR0_TLB_INT0_CONFIG_VECTOR_SHFT 0 +#define UVXH_GR0_TLB_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL +#define UVXH_GR0_TLB_INT0_CONFIG_DM_SHFT 8 +#define UVXH_GR0_TLB_INT0_CONFIG_DM_MASK 0x0000000000000700UL +#define UVXH_GR0_TLB_INT0_CONFIG_DESTMODE_SHFT 11 +#define UVXH_GR0_TLB_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL +#define UVXH_GR0_TLB_INT0_CONFIG_STATUS_SHFT 12 +#define UVXH_GR0_TLB_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL +#define UVXH_GR0_TLB_INT0_CONFIG_P_SHFT 13 +#define UVXH_GR0_TLB_INT0_CONFIG_P_MASK 0x0000000000002000UL +#define UVXH_GR0_TLB_INT0_CONFIG_T_SHFT 15 +#define UVXH_GR0_TLB_INT0_CONFIG_T_MASK 0x0000000000008000UL +#define UVXH_GR0_TLB_INT0_CONFIG_M_SHFT 16 +#define UVXH_GR0_TLB_INT0_CONFIG_M_MASK 0x0000000000010000UL +#define UVXH_GR0_TLB_INT0_CONFIG_APIC_ID_SHFT 32 +#define UVXH_GR0_TLB_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL union uvh_gr0_tlb_int0_config_u { unsigned long v; + + /* UVH common struct */ struct uvh_gr0_tlb_int0_config_s { unsigned long vector_:8; /* RW */ unsigned long dm:3; /* RW */ @@ -642,33 +2289,97 @@ union uvh_gr0_tlb_int0_config_u { unsigned long rsvd_17_31:15; unsigned long apic_id:32; /* RW */ } s; + + /* UVXH common struct */ + struct uvxh_gr0_tlb_int0_config_s { + unsigned long vector_:8; /* RW */ + unsigned long dm:3; /* RW */ + unsigned long destmode:1; /* RW */ + unsigned long status:1; /* RO */ + unsigned long p:1; /* RO */ + unsigned long rsvd_14:1; + unsigned long t:1; /* RO */ + unsigned long m:1; /* RW */ + unsigned long rsvd_17_31:15; + unsigned long apic_id:32; /* RW */ + } sx; + + /* UV4 unique struct */ + struct uv4h_gr0_tlb_int0_config_s { + unsigned long vector_:8; /* RW */ + unsigned long dm:3; /* RW */ + unsigned long destmode:1; /* RW */ + unsigned long status:1; /* RO */ + unsigned long p:1; /* RO */ + unsigned long rsvd_14:1; + unsigned long t:1; /* RO */ + unsigned long m:1; /* RW */ + unsigned long rsvd_17_31:15; + unsigned long apic_id:32; /* RW */ + } s4; + + /* UV3 unique struct */ + struct uv3h_gr0_tlb_int0_config_s { + unsigned long vector_:8; /* RW */ + unsigned long dm:3; /* RW */ + unsigned long destmode:1; /* RW */ + unsigned long status:1; /* RO */ + unsigned long p:1; /* RO */ + unsigned long rsvd_14:1; + unsigned long t:1; /* RO */ + unsigned long m:1; /* RW */ + unsigned long rsvd_17_31:15; + unsigned long apic_id:32; /* RW */ + } s3; + + /* UV2 unique struct */ + struct uv2h_gr0_tlb_int0_config_s { + unsigned long vector_:8; /* RW */ + unsigned long dm:3; /* RW */ + unsigned long destmode:1; /* RW */ + unsigned long status:1; /* RO */ + unsigned long p:1; /* RO */ + unsigned long rsvd_14:1; + unsigned long t:1; /* RO */ + unsigned long m:1; /* RW */ + unsigned long rsvd_17_31:15; + unsigned long apic_id:32; /* RW */ + } s2; }; /* ========================================================================= */ /* UVH_GR0_TLB_INT1_CONFIG */ /* ========================================================================= */ -#define UVH_GR0_TLB_INT1_CONFIG 0x61b40UL - -#define UVH_GR0_TLB_INT1_CONFIG_VECTOR_SHFT 0 -#define UVH_GR0_TLB_INT1_CONFIG_DM_SHFT 8 -#define UVH_GR0_TLB_INT1_CONFIG_DESTMODE_SHFT 11 -#define UVH_GR0_TLB_INT1_CONFIG_STATUS_SHFT 12 -#define UVH_GR0_TLB_INT1_CONFIG_P_SHFT 13 -#define UVH_GR0_TLB_INT1_CONFIG_T_SHFT 15 -#define UVH_GR0_TLB_INT1_CONFIG_M_SHFT 16 -#define UVH_GR0_TLB_INT1_CONFIG_APIC_ID_SHFT 32 -#define UVH_GR0_TLB_INT1_CONFIG_VECTOR_MASK 0x00000000000000ffUL -#define UVH_GR0_TLB_INT1_CONFIG_DM_MASK 0x0000000000000700UL -#define UVH_GR0_TLB_INT1_CONFIG_DESTMODE_MASK 0x0000000000000800UL -#define UVH_GR0_TLB_INT1_CONFIG_STATUS_MASK 0x0000000000001000UL -#define UVH_GR0_TLB_INT1_CONFIG_P_MASK 0x0000000000002000UL -#define UVH_GR0_TLB_INT1_CONFIG_T_MASK 0x0000000000008000UL -#define UVH_GR0_TLB_INT1_CONFIG_M_MASK 0x0000000000010000UL -#define UVH_GR0_TLB_INT1_CONFIG_APIC_ID_MASK 0xffffffff00000000UL +#define UVH_GR0_TLB_INT1_CONFIG ( \ + is_uv(UV4) ? 0x61b40UL : \ + is_uv(UV3) ? 0x61b40UL : \ + is_uv(UV2) ? 0x61b40UL : \ + uv_undefined("UVH_GR0_TLB_INT1_CONFIG")) + + +/* UVXH common defines */ +#define UVXH_GR0_TLB_INT1_CONFIG_VECTOR_SHFT 0 +#define UVXH_GR0_TLB_INT1_CONFIG_VECTOR_MASK 0x00000000000000ffUL +#define UVXH_GR0_TLB_INT1_CONFIG_DM_SHFT 8 +#define UVXH_GR0_TLB_INT1_CONFIG_DM_MASK 0x0000000000000700UL +#define UVXH_GR0_TLB_INT1_CONFIG_DESTMODE_SHFT 11 +#define UVXH_GR0_TLB_INT1_CONFIG_DESTMODE_MASK 0x0000000000000800UL +#define UVXH_GR0_TLB_INT1_CONFIG_STATUS_SHFT 12 +#define UVXH_GR0_TLB_INT1_CONFIG_STATUS_MASK 0x0000000000001000UL +#define UVXH_GR0_TLB_INT1_CONFIG_P_SHFT 13 +#define UVXH_GR0_TLB_INT1_CONFIG_P_MASK 0x0000000000002000UL +#define UVXH_GR0_TLB_INT1_CONFIG_T_SHFT 15 +#define UVXH_GR0_TLB_INT1_CONFIG_T_MASK 0x0000000000008000UL +#define UVXH_GR0_TLB_INT1_CONFIG_M_SHFT 16 +#define UVXH_GR0_TLB_INT1_CONFIG_M_MASK 0x0000000000010000UL +#define UVXH_GR0_TLB_INT1_CONFIG_APIC_ID_SHFT 32 +#define UVXH_GR0_TLB_INT1_CONFIG_APIC_ID_MASK 0xffffffff00000000UL union uvh_gr0_tlb_int1_config_u { unsigned long v; + + /* UVH common struct */ struct uvh_gr0_tlb_int1_config_s { unsigned long vector_:8; /* RW */ unsigned long dm:3; /* RW */ @@ -681,382 +2392,97 @@ union uvh_gr0_tlb_int1_config_u { unsigned long rsvd_17_31:15; unsigned long apic_id:32; /* RW */ } s; -}; -/* ========================================================================= */ -/* UVH_GR0_TLB_MMR_CONTROL */ -/* ========================================================================= */ -#define UV2H_GR0_TLB_MMR_CONTROL 0xc01080UL -#define UV3H_GR0_TLB_MMR_CONTROL 0xc01080UL -#define UV4H_GR0_TLB_MMR_CONTROL 0x601080UL -#define UVH_GR0_TLB_MMR_CONTROL ( \ - is_uv2_hub() ? UV2H_GR0_TLB_MMR_CONTROL : \ - is_uv3_hub() ? UV3H_GR0_TLB_MMR_CONTROL : \ - /*is_uv4_hub*/ UV4H_GR0_TLB_MMR_CONTROL) - -#define UVH_GR0_TLB_MMR_CONTROL_INDEX_SHFT 0 -#define UVH_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 -#define UVH_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20 -#define UVH_GR0_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30 -#define UVH_GR0_TLB_MMR_CONTROL_MMR_READ_SHFT 31 -#define UVH_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL -#define UVH_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL -#define UVH_GR0_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL -#define UVH_GR0_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL - -#define UVXH_GR0_TLB_MMR_CONTROL_INDEX_SHFT 0 -#define UVXH_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 -#define UVXH_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20 -#define UVXH_GR0_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30 -#define UVXH_GR0_TLB_MMR_CONTROL_MMR_READ_SHFT 31 -#define UVXH_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_SHFT 32 -#define UVXH_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL -#define UVXH_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL -#define UVXH_GR0_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL -#define UVXH_GR0_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL -#define UVXH_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_MASK 0x0000000100000000UL - -#define UV2H_GR0_TLB_MMR_CONTROL_INDEX_SHFT 0 -#define UV2H_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT 12 -#define UV2H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 -#define UV2H_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20 -#define UV2H_GR0_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30 -#define UV2H_GR0_TLB_MMR_CONTROL_MMR_READ_SHFT 31 -#define UV2H_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_SHFT 32 -#define UV2H_GR0_TLB_MMR_CONTROL_MMR_INJ_CON_SHFT 48 -#define UV2H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_SHFT 52 -#define UV2H_GR0_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL -#define UV2H_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL -#define UV2H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL -#define UV2H_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL -#define UV2H_GR0_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL -#define UV2H_GR0_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL -#define UV2H_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_MASK 0x0000000100000000UL -#define UV2H_GR0_TLB_MMR_CONTROL_MMR_INJ_CON_MASK 0x0001000000000000UL -#define UV2H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_MASK 0x0010000000000000UL - -#define UV3H_GR0_TLB_MMR_CONTROL_INDEX_SHFT 0 -#define UV3H_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT 12 -#define UV3H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 -#define UV3H_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20 -#define UV3H_GR0_TLB_MMR_CONTROL_ECC_SEL_SHFT 21 -#define UV3H_GR0_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30 -#define UV3H_GR0_TLB_MMR_CONTROL_MMR_READ_SHFT 31 -#define UV3H_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_SHFT 32 -#define UV3H_GR0_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL -#define UV3H_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL -#define UV3H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL -#define UV3H_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL -#define UV3H_GR0_TLB_MMR_CONTROL_ECC_SEL_MASK 0x0000000000200000UL -#define UV3H_GR0_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL -#define UV3H_GR0_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL -#define UV3H_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_MASK 0x0000000100000000UL - -#define UV4H_GR0_TLB_MMR_CONTROL_INDEX_SHFT 0 -#define UV4H_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT 13 -#define UV4H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 -#define UV4H_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20 -#define UV4H_GR0_TLB_MMR_CONTROL_ECC_SEL_SHFT 21 -#define UV4H_GR0_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30 -#define UV4H_GR0_TLB_MMR_CONTROL_MMR_READ_SHFT 31 -#define UV4H_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_SHFT 32 -#define UV4H_GR0_TLB_MMR_CONTROL_PAGE_SIZE_SHFT 59 -#define UV4H_GR0_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000001fffUL -#define UV4H_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000006000UL -#define UV4H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL -#define UV4H_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL -#define UV4H_GR0_TLB_MMR_CONTROL_ECC_SEL_MASK 0x0000000000200000UL -#define UV4H_GR0_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL -#define UV4H_GR0_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL -#define UV4H_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_MASK 0x0000000100000000UL -#define UV4H_GR0_TLB_MMR_CONTROL_PAGE_SIZE_MASK 0xf800000000000000UL - -#define UVH_GR0_TLB_MMR_CONTROL_INDEX_MASK ( \ - is_uv2_hub() ? UV2H_GR0_TLB_MMR_CONTROL_INDEX_MASK : \ - is_uv3_hub() ? UV3H_GR0_TLB_MMR_CONTROL_INDEX_MASK : \ - /*is_uv4_hub*/ UV4H_GR0_TLB_MMR_CONTROL_INDEX_MASK) -#define UVH_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK ( \ - is_uv2_hub() ? UV2H_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK : \ - is_uv3_hub() ? UV3H_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK : \ - /*is_uv4_hub*/ UV4H_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK) -#define UVH_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT ( \ - is_uv2_hub() ? UV2H_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT : \ - is_uv3_hub() ? UV3H_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT : \ - /*is_uv4_hub*/ UV4H_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT) - -union uvh_gr0_tlb_mmr_control_u { - unsigned long v; - struct uvh_gr0_tlb_mmr_control_s { - unsigned long rsvd_0_15:16; - unsigned long auto_valid_en:1; /* RW */ - unsigned long rsvd_17_19:3; - unsigned long mmr_hash_index_en:1; /* RW */ - unsigned long rsvd_21_29:9; - unsigned long mmr_write:1; /* WP */ - unsigned long mmr_read:1; /* WP */ - unsigned long rsvd_32_48:17; - unsigned long rsvd_49_51:3; - unsigned long rsvd_52_63:12; - } s; - struct uvxh_gr0_tlb_mmr_control_s { - unsigned long rsvd_0_15:16; - unsigned long auto_valid_en:1; /* RW */ - unsigned long rsvd_17_19:3; - unsigned long mmr_hash_index_en:1; /* RW */ - unsigned long rsvd_21_29:9; - unsigned long mmr_write:1; /* WP */ - unsigned long mmr_read:1; /* WP */ - unsigned long mmr_op_done:1; /* RW */ - unsigned long rsvd_33_47:15; - unsigned long rsvd_48:1; - unsigned long rsvd_49_51:3; - unsigned long rsvd_52_63:12; + /* UVXH common struct */ + struct uvxh_gr0_tlb_int1_config_s { + unsigned long vector_:8; /* RW */ + unsigned long dm:3; /* RW */ + unsigned long destmode:1; /* RW */ + unsigned long status:1; /* RO */ + unsigned long p:1; /* RO */ + unsigned long rsvd_14:1; + unsigned long t:1; /* RO */ + unsigned long m:1; /* RW */ + unsigned long rsvd_17_31:15; + unsigned long apic_id:32; /* RW */ } sx; - struct uv2h_gr0_tlb_mmr_control_s { - unsigned long index:12; /* RW */ - unsigned long mem_sel:2; /* RW */ - unsigned long rsvd_14_15:2; - unsigned long auto_valid_en:1; /* RW */ - unsigned long rsvd_17_19:3; - unsigned long mmr_hash_index_en:1; /* RW */ - unsigned long rsvd_21_29:9; - unsigned long mmr_write:1; /* WP */ - unsigned long mmr_read:1; /* WP */ - unsigned long mmr_op_done:1; /* RW */ - unsigned long rsvd_33_47:15; - unsigned long mmr_inj_con:1; /* RW */ - unsigned long rsvd_49_51:3; - unsigned long mmr_inj_tlbram:1; /* RW */ - unsigned long rsvd_53_63:11; - } s2; - struct uv3h_gr0_tlb_mmr_control_s { - unsigned long index:12; /* RW */ - unsigned long mem_sel:2; /* RW */ - unsigned long rsvd_14_15:2; - unsigned long auto_valid_en:1; /* RW */ - unsigned long rsvd_17_19:3; - unsigned long mmr_hash_index_en:1; /* RW */ - unsigned long ecc_sel:1; /* RW */ - unsigned long rsvd_22_29:8; - unsigned long mmr_write:1; /* WP */ - unsigned long mmr_read:1; /* WP */ - unsigned long mmr_op_done:1; /* RW */ - unsigned long rsvd_33_47:15; - unsigned long undef_48:1; /* Undefined */ - unsigned long rsvd_49_51:3; - unsigned long undef_52:1; /* Undefined */ - unsigned long rsvd_53_63:11; - } s3; - struct uv4h_gr0_tlb_mmr_control_s { - unsigned long index:13; /* RW */ - unsigned long mem_sel:2; /* RW */ - unsigned long rsvd_15:1; - unsigned long auto_valid_en:1; /* RW */ - unsigned long rsvd_17_19:3; - unsigned long mmr_hash_index_en:1; /* RW */ - unsigned long ecc_sel:1; /* RW */ - unsigned long rsvd_22_29:8; - unsigned long mmr_write:1; /* WP */ - unsigned long mmr_read:1; /* WP */ - unsigned long mmr_op_done:1; /* RW */ - unsigned long rsvd_33_47:15; - unsigned long undef_48:1; /* Undefined */ - unsigned long rsvd_49_51:3; - unsigned long rsvd_52_58:7; - unsigned long page_size:5; /* RW */ + + /* UV4 unique struct */ + struct uv4h_gr0_tlb_int1_config_s { + unsigned long vector_:8; /* RW */ + unsigned long dm:3; /* RW */ + unsigned long destmode:1; /* RW */ + unsigned long status:1; /* RO */ + unsigned long p:1; /* RO */ + unsigned long rsvd_14:1; + unsigned long t:1; /* RO */ + unsigned long m:1; /* RW */ + unsigned long rsvd_17_31:15; + unsigned long apic_id:32; /* RW */ } s4; -}; -/* ========================================================================= */ -/* UVH_GR0_TLB_MMR_READ_DATA_HI */ -/* ========================================================================= */ -#define UV2H_GR0_TLB_MMR_READ_DATA_HI 0xc010a0UL -#define UV3H_GR0_TLB_MMR_READ_DATA_HI 0xc010a0UL -#define UV4H_GR0_TLB_MMR_READ_DATA_HI 0x6010a0UL -#define UVH_GR0_TLB_MMR_READ_DATA_HI ( \ - is_uv2_hub() ? UV2H_GR0_TLB_MMR_READ_DATA_HI : \ - is_uv3_hub() ? UV3H_GR0_TLB_MMR_READ_DATA_HI : \ - /*is_uv4_hub*/ UV4H_GR0_TLB_MMR_READ_DATA_HI) - -#define UVH_GR0_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 - -#define UVXH_GR0_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 - -#define UV2H_GR0_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 -#define UV2H_GR0_TLB_MMR_READ_DATA_HI_GAA_SHFT 41 -#define UV2H_GR0_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43 -#define UV2H_GR0_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44 -#define UV2H_GR0_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL -#define UV2H_GR0_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL -#define UV2H_GR0_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL -#define UV2H_GR0_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL - -#define UV3H_GR0_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 -#define UV3H_GR0_TLB_MMR_READ_DATA_HI_GAA_SHFT 41 -#define UV3H_GR0_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43 -#define UV3H_GR0_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44 -#define UV3H_GR0_TLB_MMR_READ_DATA_HI_AA_EXT_SHFT 45 -#define UV3H_GR0_TLB_MMR_READ_DATA_HI_WAY_ECC_SHFT 55 -#define UV3H_GR0_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL -#define UV3H_GR0_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL -#define UV3H_GR0_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL -#define UV3H_GR0_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL -#define UV3H_GR0_TLB_MMR_READ_DATA_HI_AA_EXT_MASK 0x0000200000000000UL -#define UV3H_GR0_TLB_MMR_READ_DATA_HI_WAY_ECC_MASK 0xff80000000000000UL - -#define UV4H_GR0_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 -#define UV4H_GR0_TLB_MMR_READ_DATA_HI_PNID_SHFT 34 -#define UV4H_GR0_TLB_MMR_READ_DATA_HI_GAA_SHFT 49 -#define UV4H_GR0_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 51 -#define UV4H_GR0_TLB_MMR_READ_DATA_HI_LARGER_SHFT 52 -#define UV4H_GR0_TLB_MMR_READ_DATA_HI_AA_EXT_SHFT 53 -#define UV4H_GR0_TLB_MMR_READ_DATA_HI_WAY_ECC_SHFT 55 -#define UV4H_GR0_TLB_MMR_READ_DATA_HI_PFN_MASK 0x00000003ffffffffUL -#define UV4H_GR0_TLB_MMR_READ_DATA_HI_PNID_MASK 0x0001fffc00000000UL -#define UV4H_GR0_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0006000000000000UL -#define UV4H_GR0_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0008000000000000UL -#define UV4H_GR0_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0010000000000000UL -#define UV4H_GR0_TLB_MMR_READ_DATA_HI_AA_EXT_MASK 0x0020000000000000UL -#define UV4H_GR0_TLB_MMR_READ_DATA_HI_WAY_ECC_MASK 0xff80000000000000UL - - -union uvh_gr0_tlb_mmr_read_data_hi_u { - unsigned long v; - struct uv2h_gr0_tlb_mmr_read_data_hi_s { - unsigned long pfn:41; /* RO */ - unsigned long gaa:2; /* RO */ - unsigned long dirty:1; /* RO */ - unsigned long larger:1; /* RO */ - unsigned long rsvd_45_63:19; - } s2; - struct uv3h_gr0_tlb_mmr_read_data_hi_s { - unsigned long pfn:41; /* RO */ - unsigned long gaa:2; /* RO */ - unsigned long dirty:1; /* RO */ - unsigned long larger:1; /* RO */ - unsigned long aa_ext:1; /* RO */ - unsigned long undef_46_54:9; /* Undefined */ - unsigned long way_ecc:9; /* RO */ + /* UV3 unique struct */ + struct uv3h_gr0_tlb_int1_config_s { + unsigned long vector_:8; /* RW */ + unsigned long dm:3; /* RW */ + unsigned long destmode:1; /* RW */ + unsigned long status:1; /* RO */ + unsigned long p:1; /* RO */ + unsigned long rsvd_14:1; + unsigned long t:1; /* RO */ + unsigned long m:1; /* RW */ + unsigned long rsvd_17_31:15; + unsigned long apic_id:32; /* RW */ } s3; - struct uv4h_gr0_tlb_mmr_read_data_hi_s { - unsigned long pfn:34; /* RO */ - unsigned long pnid:15; /* RO */ - unsigned long gaa:2; /* RO */ - unsigned long dirty:1; /* RO */ - unsigned long larger:1; /* RO */ - unsigned long aa_ext:1; /* RO */ - unsigned long undef_54:1; /* Undefined */ - unsigned long way_ecc:9; /* RO */ - } s4; -}; -/* ========================================================================= */ -/* UVH_GR0_TLB_MMR_READ_DATA_LO */ -/* ========================================================================= */ -#define UV2H_GR0_TLB_MMR_READ_DATA_LO 0xc010a8UL -#define UV3H_GR0_TLB_MMR_READ_DATA_LO 0xc010a8UL -#define UV4H_GR0_TLB_MMR_READ_DATA_LO 0x6010a8UL -#define UVH_GR0_TLB_MMR_READ_DATA_LO ( \ - is_uv2_hub() ? UV2H_GR0_TLB_MMR_READ_DATA_LO : \ - is_uv3_hub() ? UV3H_GR0_TLB_MMR_READ_DATA_LO : \ - /*is_uv4_hub*/ UV4H_GR0_TLB_MMR_READ_DATA_LO) - -#define UVH_GR0_TLB_MMR_READ_DATA_LO_VPN_SHFT 0 -#define UVH_GR0_TLB_MMR_READ_DATA_LO_ASID_SHFT 39 -#define UVH_GR0_TLB_MMR_READ_DATA_LO_VALID_SHFT 63 -#define UVH_GR0_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL -#define UVH_GR0_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL -#define UVH_GR0_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL - -#define UVXH_GR0_TLB_MMR_READ_DATA_LO_VPN_SHFT 0 -#define UVXH_GR0_TLB_MMR_READ_DATA_LO_ASID_SHFT 39 -#define UVXH_GR0_TLB_MMR_READ_DATA_LO_VALID_SHFT 63 -#define UVXH_GR0_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL -#define UVXH_GR0_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL -#define UVXH_GR0_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL - -#define UV2H_GR0_TLB_MMR_READ_DATA_LO_VPN_SHFT 0 -#define UV2H_GR0_TLB_MMR_READ_DATA_LO_ASID_SHFT 39 -#define UV2H_GR0_TLB_MMR_READ_DATA_LO_VALID_SHFT 63 -#define UV2H_GR0_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL -#define UV2H_GR0_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL -#define UV2H_GR0_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL - -#define UV3H_GR0_TLB_MMR_READ_DATA_LO_VPN_SHFT 0 -#define UV3H_GR0_TLB_MMR_READ_DATA_LO_ASID_SHFT 39 -#define UV3H_GR0_TLB_MMR_READ_DATA_LO_VALID_SHFT 63 -#define UV3H_GR0_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL -#define UV3H_GR0_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL -#define UV3H_GR0_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL - -#define UV4H_GR0_TLB_MMR_READ_DATA_LO_VPN_SHFT 0 -#define UV4H_GR0_TLB_MMR_READ_DATA_LO_ASID_SHFT 39 -#define UV4H_GR0_TLB_MMR_READ_DATA_LO_VALID_SHFT 63 -#define UV4H_GR0_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL -#define UV4H_GR0_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL -#define UV4H_GR0_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL - - -union uvh_gr0_tlb_mmr_read_data_lo_u { - unsigned long v; - struct uvh_gr0_tlb_mmr_read_data_lo_s { - unsigned long vpn:39; /* RO */ - unsigned long asid:24; /* RO */ - unsigned long valid:1; /* RO */ - } s; - struct uvxh_gr0_tlb_mmr_read_data_lo_s { - unsigned long vpn:39; /* RO */ - unsigned long asid:24; /* RO */ - unsigned long valid:1; /* RO */ - } sx; - struct uv2h_gr0_tlb_mmr_read_data_lo_s { - unsigned long vpn:39; /* RO */ - unsigned long asid:24; /* RO */ - unsigned long valid:1; /* RO */ + /* UV2 unique struct */ + struct uv2h_gr0_tlb_int1_config_s { + unsigned long vector_:8; /* RW */ + unsigned long dm:3; /* RW */ + unsigned long destmode:1; /* RW */ + unsigned long status:1; /* RO */ + unsigned long p:1; /* RO */ + unsigned long rsvd_14:1; + unsigned long t:1; /* RO */ + unsigned long m:1; /* RW */ + unsigned long rsvd_17_31:15; + unsigned long apic_id:32; /* RW */ } s2; - struct uv3h_gr0_tlb_mmr_read_data_lo_s { - unsigned long vpn:39; /* RO */ - unsigned long asid:24; /* RO */ - unsigned long valid:1; /* RO */ - } s3; - struct uv4h_gr0_tlb_mmr_read_data_lo_s { - unsigned long vpn:39; /* RO */ - unsigned long asid:24; /* RO */ - unsigned long valid:1; /* RO */ - } s4; }; /* ========================================================================= */ /* UVH_GR1_TLB_INT0_CONFIG */ /* ========================================================================= */ -#define UV2H_GR1_TLB_INT0_CONFIG 0x61f00UL -#define UV3H_GR1_TLB_INT0_CONFIG 0x61f00UL -#define UV4H_GR1_TLB_INT0_CONFIG 0x62100UL #define UVH_GR1_TLB_INT0_CONFIG ( \ - is_uv2_hub() ? UV2H_GR1_TLB_INT0_CONFIG : \ - is_uv3_hub() ? UV3H_GR1_TLB_INT0_CONFIG : \ - /*is_uv4_hub*/ UV4H_GR1_TLB_INT0_CONFIG) - -#define UVH_GR1_TLB_INT0_CONFIG_VECTOR_SHFT 0 -#define UVH_GR1_TLB_INT0_CONFIG_DM_SHFT 8 -#define UVH_GR1_TLB_INT0_CONFIG_DESTMODE_SHFT 11 -#define UVH_GR1_TLB_INT0_CONFIG_STATUS_SHFT 12 -#define UVH_GR1_TLB_INT0_CONFIG_P_SHFT 13 -#define UVH_GR1_TLB_INT0_CONFIG_T_SHFT 15 -#define UVH_GR1_TLB_INT0_CONFIG_M_SHFT 16 -#define UVH_GR1_TLB_INT0_CONFIG_APIC_ID_SHFT 32 -#define UVH_GR1_TLB_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL -#define UVH_GR1_TLB_INT0_CONFIG_DM_MASK 0x0000000000000700UL -#define UVH_GR1_TLB_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL -#define UVH_GR1_TLB_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL -#define UVH_GR1_TLB_INT0_CONFIG_P_MASK 0x0000000000002000UL -#define UVH_GR1_TLB_INT0_CONFIG_T_MASK 0x0000000000008000UL -#define UVH_GR1_TLB_INT0_CONFIG_M_MASK 0x0000000000010000UL -#define UVH_GR1_TLB_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL + is_uv(UV4) ? 0x62100UL : \ + is_uv(UV3) ? 0x61f00UL : \ + is_uv(UV2) ? 0x61f00UL : \ + uv_undefined("UVH_GR1_TLB_INT0_CONFIG")) + + +/* UVXH common defines */ +#define UVXH_GR1_TLB_INT0_CONFIG_VECTOR_SHFT 0 +#define UVXH_GR1_TLB_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL +#define UVXH_GR1_TLB_INT0_CONFIG_DM_SHFT 8 +#define UVXH_GR1_TLB_INT0_CONFIG_DM_MASK 0x0000000000000700UL +#define UVXH_GR1_TLB_INT0_CONFIG_DESTMODE_SHFT 11 +#define UVXH_GR1_TLB_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL +#define UVXH_GR1_TLB_INT0_CONFIG_STATUS_SHFT 12 +#define UVXH_GR1_TLB_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL +#define UVXH_GR1_TLB_INT0_CONFIG_P_SHFT 13 +#define UVXH_GR1_TLB_INT0_CONFIG_P_MASK 0x0000000000002000UL +#define UVXH_GR1_TLB_INT0_CONFIG_T_SHFT 15 +#define UVXH_GR1_TLB_INT0_CONFIG_T_MASK 0x0000000000008000UL +#define UVXH_GR1_TLB_INT0_CONFIG_M_SHFT 16 +#define UVXH_GR1_TLB_INT0_CONFIG_M_MASK 0x0000000000010000UL +#define UVXH_GR1_TLB_INT0_CONFIG_APIC_ID_SHFT 32 +#define UVXH_GR1_TLB_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL union uvh_gr1_tlb_int0_config_u { unsigned long v; + + /* UVH common struct */ struct uvh_gr1_tlb_int0_config_s { unsigned long vector_:8; /* RW */ unsigned long dm:3; /* RW */ @@ -1069,39 +2495,97 @@ union uvh_gr1_tlb_int0_config_u { unsigned long rsvd_17_31:15; unsigned long apic_id:32; /* RW */ } s; + + /* UVXH common struct */ + struct uvxh_gr1_tlb_int0_config_s { + unsigned long vector_:8; /* RW */ + unsigned long dm:3; /* RW */ + unsigned long destmode:1; /* RW */ + unsigned long status:1; /* RO */ + unsigned long p:1; /* RO */ + unsigned long rsvd_14:1; + unsigned long t:1; /* RO */ + unsigned long m:1; /* RW */ + unsigned long rsvd_17_31:15; + unsigned long apic_id:32; /* RW */ + } sx; + + /* UV4 unique struct */ + struct uv4h_gr1_tlb_int0_config_s { + unsigned long vector_:8; /* RW */ + unsigned long dm:3; /* RW */ + unsigned long destmode:1; /* RW */ + unsigned long status:1; /* RO */ + unsigned long p:1; /* RO */ + unsigned long rsvd_14:1; + unsigned long t:1; /* RO */ + unsigned long m:1; /* RW */ + unsigned long rsvd_17_31:15; + unsigned long apic_id:32; /* RW */ + } s4; + + /* UV3 unique struct */ + struct uv3h_gr1_tlb_int0_config_s { + unsigned long vector_:8; /* RW */ + unsigned long dm:3; /* RW */ + unsigned long destmode:1; /* RW */ + unsigned long status:1; /* RO */ + unsigned long p:1; /* RO */ + unsigned long rsvd_14:1; + unsigned long t:1; /* RO */ + unsigned long m:1; /* RW */ + unsigned long rsvd_17_31:15; + unsigned long apic_id:32; /* RW */ + } s3; + + /* UV2 unique struct */ + struct uv2h_gr1_tlb_int0_config_s { + unsigned long vector_:8; /* RW */ + unsigned long dm:3; /* RW */ + unsigned long destmode:1; /* RW */ + unsigned long status:1; /* RO */ + unsigned long p:1; /* RO */ + unsigned long rsvd_14:1; + unsigned long t:1; /* RO */ + unsigned long m:1; /* RW */ + unsigned long rsvd_17_31:15; + unsigned long apic_id:32; /* RW */ + } s2; }; /* ========================================================================= */ /* UVH_GR1_TLB_INT1_CONFIG */ /* ========================================================================= */ -#define UV2H_GR1_TLB_INT1_CONFIG 0x61f40UL -#define UV3H_GR1_TLB_INT1_CONFIG 0x61f40UL -#define UV4H_GR1_TLB_INT1_CONFIG 0x62140UL #define UVH_GR1_TLB_INT1_CONFIG ( \ - is_uv2_hub() ? UV2H_GR1_TLB_INT1_CONFIG : \ - is_uv3_hub() ? UV3H_GR1_TLB_INT1_CONFIG : \ - /*is_uv4_hub*/ UV4H_GR1_TLB_INT1_CONFIG) - -#define UVH_GR1_TLB_INT1_CONFIG_VECTOR_SHFT 0 -#define UVH_GR1_TLB_INT1_CONFIG_DM_SHFT 8 -#define UVH_GR1_TLB_INT1_CONFIG_DESTMODE_SHFT 11 -#define UVH_GR1_TLB_INT1_CONFIG_STATUS_SHFT 12 -#define UVH_GR1_TLB_INT1_CONFIG_P_SHFT 13 -#define UVH_GR1_TLB_INT1_CONFIG_T_SHFT 15 -#define UVH_GR1_TLB_INT1_CONFIG_M_SHFT 16 -#define UVH_GR1_TLB_INT1_CONFIG_APIC_ID_SHFT 32 -#define UVH_GR1_TLB_INT1_CONFIG_VECTOR_MASK 0x00000000000000ffUL -#define UVH_GR1_TLB_INT1_CONFIG_DM_MASK 0x0000000000000700UL -#define UVH_GR1_TLB_INT1_CONFIG_DESTMODE_MASK 0x0000000000000800UL -#define UVH_GR1_TLB_INT1_CONFIG_STATUS_MASK 0x0000000000001000UL -#define UVH_GR1_TLB_INT1_CONFIG_P_MASK 0x0000000000002000UL -#define UVH_GR1_TLB_INT1_CONFIG_T_MASK 0x0000000000008000UL -#define UVH_GR1_TLB_INT1_CONFIG_M_MASK 0x0000000000010000UL -#define UVH_GR1_TLB_INT1_CONFIG_APIC_ID_MASK 0xffffffff00000000UL + is_uv(UV4) ? 0x62140UL : \ + is_uv(UV3) ? 0x61f40UL : \ + is_uv(UV2) ? 0x61f40UL : \ + uv_undefined("UVH_GR1_TLB_INT1_CONFIG")) + + +/* UVXH common defines */ +#define UVXH_GR1_TLB_INT1_CONFIG_VECTOR_SHFT 0 +#define UVXH_GR1_TLB_INT1_CONFIG_VECTOR_MASK 0x00000000000000ffUL +#define UVXH_GR1_TLB_INT1_CONFIG_DM_SHFT 8 +#define UVXH_GR1_TLB_INT1_CONFIG_DM_MASK 0x0000000000000700UL +#define UVXH_GR1_TLB_INT1_CONFIG_DESTMODE_SHFT 11 +#define UVXH_GR1_TLB_INT1_CONFIG_DESTMODE_MASK 0x0000000000000800UL +#define UVXH_GR1_TLB_INT1_CONFIG_STATUS_SHFT 12 +#define UVXH_GR1_TLB_INT1_CONFIG_STATUS_MASK 0x0000000000001000UL +#define UVXH_GR1_TLB_INT1_CONFIG_P_SHFT 13 +#define UVXH_GR1_TLB_INT1_CONFIG_P_MASK 0x0000000000002000UL +#define UVXH_GR1_TLB_INT1_CONFIG_T_SHFT 15 +#define UVXH_GR1_TLB_INT1_CONFIG_T_MASK 0x0000000000008000UL +#define UVXH_GR1_TLB_INT1_CONFIG_M_SHFT 16 +#define UVXH_GR1_TLB_INT1_CONFIG_M_MASK 0x0000000000010000UL +#define UVXH_GR1_TLB_INT1_CONFIG_APIC_ID_SHFT 32 +#define UVXH_GR1_TLB_INT1_CONFIG_APIC_ID_MASK 0xffffffff00000000UL union uvh_gr1_tlb_int1_config_u { unsigned long v; + + /* UVH common struct */ struct uvh_gr1_tlb_int1_config_s { unsigned long vector_:8; /* RW */ unsigned long dm:3; /* RW */ @@ -1114,337 +2598,62 @@ union uvh_gr1_tlb_int1_config_u { unsigned long rsvd_17_31:15; unsigned long apic_id:32; /* RW */ } s; -}; -/* ========================================================================= */ -/* UVH_GR1_TLB_MMR_CONTROL */ -/* ========================================================================= */ -#define UV2H_GR1_TLB_MMR_CONTROL 0x1001080UL -#define UV3H_GR1_TLB_MMR_CONTROL 0x1001080UL -#define UV4H_GR1_TLB_MMR_CONTROL 0x701080UL -#define UVH_GR1_TLB_MMR_CONTROL ( \ - is_uv2_hub() ? UV2H_GR1_TLB_MMR_CONTROL : \ - is_uv3_hub() ? UV3H_GR1_TLB_MMR_CONTROL : \ - /*is_uv4_hub*/ UV4H_GR1_TLB_MMR_CONTROL) - -#define UVH_GR1_TLB_MMR_CONTROL_INDEX_SHFT 0 -#define UVH_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 -#define UVH_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20 -#define UVH_GR1_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30 -#define UVH_GR1_TLB_MMR_CONTROL_MMR_READ_SHFT 31 -#define UVH_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL -#define UVH_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL -#define UVH_GR1_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL -#define UVH_GR1_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL - -#define UVXH_GR1_TLB_MMR_CONTROL_INDEX_SHFT 0 -#define UVXH_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 -#define UVXH_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20 -#define UVXH_GR1_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30 -#define UVXH_GR1_TLB_MMR_CONTROL_MMR_READ_SHFT 31 -#define UVXH_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_SHFT 32 -#define UVXH_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL -#define UVXH_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL -#define UVXH_GR1_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL -#define UVXH_GR1_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL -#define UVXH_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_MASK 0x0000000100000000UL - -#define UV2H_GR1_TLB_MMR_CONTROL_INDEX_SHFT 0 -#define UV2H_GR1_TLB_MMR_CONTROL_MEM_SEL_SHFT 12 -#define UV2H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 -#define UV2H_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20 -#define UV2H_GR1_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30 -#define UV2H_GR1_TLB_MMR_CONTROL_MMR_READ_SHFT 31 -#define UV2H_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_SHFT 32 -#define UV2H_GR1_TLB_MMR_CONTROL_MMR_INJ_CON_SHFT 48 -#define UV2H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_SHFT 52 -#define UV2H_GR1_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL -#define UV2H_GR1_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL -#define UV2H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL -#define UV2H_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL -#define UV2H_GR1_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL -#define UV2H_GR1_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL -#define UV2H_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_MASK 0x0000000100000000UL -#define UV2H_GR1_TLB_MMR_CONTROL_MMR_INJ_CON_MASK 0x0001000000000000UL -#define UV2H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_MASK 0x0010000000000000UL - -#define UV3H_GR1_TLB_MMR_CONTROL_INDEX_SHFT 0 -#define UV3H_GR1_TLB_MMR_CONTROL_MEM_SEL_SHFT 12 -#define UV3H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 -#define UV3H_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20 -#define UV3H_GR1_TLB_MMR_CONTROL_ECC_SEL_SHFT 21 -#define UV3H_GR1_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30 -#define UV3H_GR1_TLB_MMR_CONTROL_MMR_READ_SHFT 31 -#define UV3H_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_SHFT 32 -#define UV3H_GR1_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL -#define UV3H_GR1_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL -#define UV3H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL -#define UV3H_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL -#define UV3H_GR1_TLB_MMR_CONTROL_ECC_SEL_MASK 0x0000000000200000UL -#define UV3H_GR1_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL -#define UV3H_GR1_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL -#define UV3H_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_MASK 0x0000000100000000UL - -#define UV4H_GR1_TLB_MMR_CONTROL_INDEX_SHFT 0 -#define UV4H_GR1_TLB_MMR_CONTROL_MEM_SEL_SHFT 13 -#define UV4H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 -#define UV4H_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20 -#define UV4H_GR1_TLB_MMR_CONTROL_ECC_SEL_SHFT 21 -#define UV4H_GR1_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30 -#define UV4H_GR1_TLB_MMR_CONTROL_MMR_READ_SHFT 31 -#define UV4H_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_SHFT 32 -#define UV4H_GR1_TLB_MMR_CONTROL_PAGE_SIZE_SHFT 59 -#define UV4H_GR1_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000001fffUL -#define UV4H_GR1_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000006000UL -#define UV4H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL -#define UV4H_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL -#define UV4H_GR1_TLB_MMR_CONTROL_ECC_SEL_MASK 0x0000000000200000UL -#define UV4H_GR1_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL -#define UV4H_GR1_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL -#define UV4H_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_MASK 0x0000000100000000UL -#define UV4H_GR1_TLB_MMR_CONTROL_PAGE_SIZE_MASK 0xf800000000000000UL - - -union uvh_gr1_tlb_mmr_control_u { - unsigned long v; - struct uvh_gr1_tlb_mmr_control_s { - unsigned long rsvd_0_15:16; - unsigned long auto_valid_en:1; /* RW */ - unsigned long rsvd_17_19:3; - unsigned long mmr_hash_index_en:1; /* RW */ - unsigned long rsvd_21_29:9; - unsigned long mmr_write:1; /* WP */ - unsigned long mmr_read:1; /* WP */ - unsigned long rsvd_32_48:17; - unsigned long rsvd_49_51:3; - unsigned long rsvd_52_63:12; - } s; - struct uvxh_gr1_tlb_mmr_control_s { - unsigned long rsvd_0_15:16; - unsigned long auto_valid_en:1; /* RW */ - unsigned long rsvd_17_19:3; - unsigned long mmr_hash_index_en:1; /* RW */ - unsigned long rsvd_21_29:9; - unsigned long mmr_write:1; /* WP */ - unsigned long mmr_read:1; /* WP */ - unsigned long mmr_op_done:1; /* RW */ - unsigned long rsvd_33_47:15; - unsigned long rsvd_48:1; - unsigned long rsvd_49_51:3; - unsigned long rsvd_52_63:12; + /* UVXH common struct */ + struct uvxh_gr1_tlb_int1_config_s { + unsigned long vector_:8; /* RW */ + unsigned long dm:3; /* RW */ + unsigned long destmode:1; /* RW */ + unsigned long status:1; /* RO */ + unsigned long p:1; /* RO */ + unsigned long rsvd_14:1; + unsigned long t:1; /* RO */ + unsigned long m:1; /* RW */ + unsigned long rsvd_17_31:15; + unsigned long apic_id:32; /* RW */ } sx; - struct uv2h_gr1_tlb_mmr_control_s { - unsigned long index:12; /* RW */ - unsigned long mem_sel:2; /* RW */ - unsigned long rsvd_14_15:2; - unsigned long auto_valid_en:1; /* RW */ - unsigned long rsvd_17_19:3; - unsigned long mmr_hash_index_en:1; /* RW */ - unsigned long rsvd_21_29:9; - unsigned long mmr_write:1; /* WP */ - unsigned long mmr_read:1; /* WP */ - unsigned long mmr_op_done:1; /* RW */ - unsigned long rsvd_33_47:15; - unsigned long mmr_inj_con:1; /* RW */ - unsigned long rsvd_49_51:3; - unsigned long mmr_inj_tlbram:1; /* RW */ - unsigned long rsvd_53_63:11; - } s2; - struct uv3h_gr1_tlb_mmr_control_s { - unsigned long index:12; /* RW */ - unsigned long mem_sel:2; /* RW */ - unsigned long rsvd_14_15:2; - unsigned long auto_valid_en:1; /* RW */ - unsigned long rsvd_17_19:3; - unsigned long mmr_hash_index_en:1; /* RW */ - unsigned long ecc_sel:1; /* RW */ - unsigned long rsvd_22_29:8; - unsigned long mmr_write:1; /* WP */ - unsigned long mmr_read:1; /* WP */ - unsigned long mmr_op_done:1; /* RW */ - unsigned long rsvd_33_47:15; - unsigned long undef_48:1; /* Undefined */ - unsigned long rsvd_49_51:3; - unsigned long undef_52:1; /* Undefined */ - unsigned long rsvd_53_63:11; - } s3; - struct uv4h_gr1_tlb_mmr_control_s { - unsigned long index:13; /* RW */ - unsigned long mem_sel:2; /* RW */ - unsigned long rsvd_15:1; - unsigned long auto_valid_en:1; /* RW */ - unsigned long rsvd_17_19:3; - unsigned long mmr_hash_index_en:1; /* RW */ - unsigned long ecc_sel:1; /* RW */ - unsigned long rsvd_22_29:8; - unsigned long mmr_write:1; /* WP */ - unsigned long mmr_read:1; /* WP */ - unsigned long mmr_op_done:1; /* RW */ - unsigned long rsvd_33_47:15; - unsigned long undef_48:1; /* Undefined */ - unsigned long rsvd_49_51:3; - unsigned long rsvd_52_58:7; - unsigned long page_size:5; /* RW */ + + /* UV4 unique struct */ + struct uv4h_gr1_tlb_int1_config_s { + unsigned long vector_:8; /* RW */ + unsigned long dm:3; /* RW */ + unsigned long destmode:1; /* RW */ + unsigned long status:1; /* RO */ + unsigned long p:1; /* RO */ + unsigned long rsvd_14:1; + unsigned long t:1; /* RO */ + unsigned long m:1; /* RW */ + unsigned long rsvd_17_31:15; + unsigned long apic_id:32; /* RW */ } s4; -}; -/* ========================================================================= */ -/* UVH_GR1_TLB_MMR_READ_DATA_HI */ -/* ========================================================================= */ -#define UV2H_GR1_TLB_MMR_READ_DATA_HI 0x10010a0UL -#define UV3H_GR1_TLB_MMR_READ_DATA_HI 0x10010a0UL -#define UV4H_GR1_TLB_MMR_READ_DATA_HI 0x7010a0UL -#define UVH_GR1_TLB_MMR_READ_DATA_HI ( \ - is_uv2_hub() ? UV2H_GR1_TLB_MMR_READ_DATA_HI : \ - is_uv3_hub() ? UV3H_GR1_TLB_MMR_READ_DATA_HI : \ - /*is_uv4_hub*/ UV4H_GR1_TLB_MMR_READ_DATA_HI) - -#define UVH_GR1_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 - -#define UVXH_GR1_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 - -#define UV2H_GR1_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 -#define UV2H_GR1_TLB_MMR_READ_DATA_HI_GAA_SHFT 41 -#define UV2H_GR1_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43 -#define UV2H_GR1_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44 -#define UV2H_GR1_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL -#define UV2H_GR1_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL -#define UV2H_GR1_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL -#define UV2H_GR1_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL - -#define UV3H_GR1_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 -#define UV3H_GR1_TLB_MMR_READ_DATA_HI_GAA_SHFT 41 -#define UV3H_GR1_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43 -#define UV3H_GR1_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44 -#define UV3H_GR1_TLB_MMR_READ_DATA_HI_AA_EXT_SHFT 45 -#define UV3H_GR1_TLB_MMR_READ_DATA_HI_WAY_ECC_SHFT 55 -#define UV3H_GR1_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL -#define UV3H_GR1_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL -#define UV3H_GR1_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL -#define UV3H_GR1_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL -#define UV3H_GR1_TLB_MMR_READ_DATA_HI_AA_EXT_MASK 0x0000200000000000UL -#define UV3H_GR1_TLB_MMR_READ_DATA_HI_WAY_ECC_MASK 0xff80000000000000UL - -#define UV4H_GR1_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 -#define UV4H_GR1_TLB_MMR_READ_DATA_HI_PNID_SHFT 34 -#define UV4H_GR1_TLB_MMR_READ_DATA_HI_GAA_SHFT 49 -#define UV4H_GR1_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 51 -#define UV4H_GR1_TLB_MMR_READ_DATA_HI_LARGER_SHFT 52 -#define UV4H_GR1_TLB_MMR_READ_DATA_HI_AA_EXT_SHFT 53 -#define UV4H_GR1_TLB_MMR_READ_DATA_HI_WAY_ECC_SHFT 55 -#define UV4H_GR1_TLB_MMR_READ_DATA_HI_PFN_MASK 0x00000003ffffffffUL -#define UV4H_GR1_TLB_MMR_READ_DATA_HI_PNID_MASK 0x0001fffc00000000UL -#define UV4H_GR1_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0006000000000000UL -#define UV4H_GR1_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0008000000000000UL -#define UV4H_GR1_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0010000000000000UL -#define UV4H_GR1_TLB_MMR_READ_DATA_HI_AA_EXT_MASK 0x0020000000000000UL -#define UV4H_GR1_TLB_MMR_READ_DATA_HI_WAY_ECC_MASK 0xff80000000000000UL - - -union uvh_gr1_tlb_mmr_read_data_hi_u { - unsigned long v; - struct uv2h_gr1_tlb_mmr_read_data_hi_s { - unsigned long pfn:41; /* RO */ - unsigned long gaa:2; /* RO */ - unsigned long dirty:1; /* RO */ - unsigned long larger:1; /* RO */ - unsigned long rsvd_45_63:19; - } s2; - struct uv3h_gr1_tlb_mmr_read_data_hi_s { - unsigned long pfn:41; /* RO */ - unsigned long gaa:2; /* RO */ - unsigned long dirty:1; /* RO */ - unsigned long larger:1; /* RO */ - unsigned long aa_ext:1; /* RO */ - unsigned long undef_46_54:9; /* Undefined */ - unsigned long way_ecc:9; /* RO */ + /* UV3 unique struct */ + struct uv3h_gr1_tlb_int1_config_s { + unsigned long vector_:8; /* RW */ + unsigned long dm:3; /* RW */ + unsigned long destmode:1; /* RW */ + unsigned long status:1; /* RO */ + unsigned long p:1; /* RO */ + unsigned long rsvd_14:1; + unsigned long t:1; /* RO */ + unsigned long m:1; /* RW */ + unsigned long rsvd_17_31:15; + unsigned long apic_id:32; /* RW */ } s3; - struct uv4h_gr1_tlb_mmr_read_data_hi_s { - unsigned long pfn:34; /* RO */ - unsigned long pnid:15; /* RO */ - unsigned long gaa:2; /* RO */ - unsigned long dirty:1; /* RO */ - unsigned long larger:1; /* RO */ - unsigned long aa_ext:1; /* RO */ - unsigned long undef_54:1; /* Undefined */ - unsigned long way_ecc:9; /* RO */ - } s4; -}; -/* ========================================================================= */ -/* UVH_GR1_TLB_MMR_READ_DATA_LO */ -/* ========================================================================= */ -#define UV2H_GR1_TLB_MMR_READ_DATA_LO 0x10010a8UL -#define UV3H_GR1_TLB_MMR_READ_DATA_LO 0x10010a8UL -#define UV4H_GR1_TLB_MMR_READ_DATA_LO 0x7010a8UL -#define UVH_GR1_TLB_MMR_READ_DATA_LO ( \ - is_uv2_hub() ? UV2H_GR1_TLB_MMR_READ_DATA_LO : \ - is_uv3_hub() ? UV3H_GR1_TLB_MMR_READ_DATA_LO : \ - /*is_uv4_hub*/ UV4H_GR1_TLB_MMR_READ_DATA_LO) - -#define UVH_GR1_TLB_MMR_READ_DATA_LO_VPN_SHFT 0 -#define UVH_GR1_TLB_MMR_READ_DATA_LO_ASID_SHFT 39 -#define UVH_GR1_TLB_MMR_READ_DATA_LO_VALID_SHFT 63 -#define UVH_GR1_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL -#define UVH_GR1_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL -#define UVH_GR1_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL - -#define UVXH_GR1_TLB_MMR_READ_DATA_LO_VPN_SHFT 0 -#define UVXH_GR1_TLB_MMR_READ_DATA_LO_ASID_SHFT 39 -#define UVXH_GR1_TLB_MMR_READ_DATA_LO_VALID_SHFT 63 -#define UVXH_GR1_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL -#define UVXH_GR1_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL -#define UVXH_GR1_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL - -#define UV2H_GR1_TLB_MMR_READ_DATA_LO_VPN_SHFT 0 -#define UV2H_GR1_TLB_MMR_READ_DATA_LO_ASID_SHFT 39 -#define UV2H_GR1_TLB_MMR_READ_DATA_LO_VALID_SHFT 63 -#define UV2H_GR1_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL -#define UV2H_GR1_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL -#define UV2H_GR1_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL - -#define UV3H_GR1_TLB_MMR_READ_DATA_LO_VPN_SHFT 0 -#define UV3H_GR1_TLB_MMR_READ_DATA_LO_ASID_SHFT 39 -#define UV3H_GR1_TLB_MMR_READ_DATA_LO_VALID_SHFT 63 -#define UV3H_GR1_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL -#define UV3H_GR1_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL -#define UV3H_GR1_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL - -#define UV4H_GR1_TLB_MMR_READ_DATA_LO_VPN_SHFT 0 -#define UV4H_GR1_TLB_MMR_READ_DATA_LO_ASID_SHFT 39 -#define UV4H_GR1_TLB_MMR_READ_DATA_LO_VALID_SHFT 63 -#define UV4H_GR1_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL -#define UV4H_GR1_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL -#define UV4H_GR1_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL - - -union uvh_gr1_tlb_mmr_read_data_lo_u { - unsigned long v; - struct uvh_gr1_tlb_mmr_read_data_lo_s { - unsigned long vpn:39; /* RO */ - unsigned long asid:24; /* RO */ - unsigned long valid:1; /* RO */ - } s; - struct uvxh_gr1_tlb_mmr_read_data_lo_s { - unsigned long vpn:39; /* RO */ - unsigned long asid:24; /* RO */ - unsigned long valid:1; /* RO */ - } sx; - struct uv2h_gr1_tlb_mmr_read_data_lo_s { - unsigned long vpn:39; /* RO */ - unsigned long asid:24; /* RO */ - unsigned long valid:1; /* RO */ + /* UV2 unique struct */ + struct uv2h_gr1_tlb_int1_config_s { + unsigned long vector_:8; /* RW */ + unsigned long dm:3; /* RW */ + unsigned long destmode:1; /* RW */ + unsigned long status:1; /* RO */ + unsigned long p:1; /* RO */ + unsigned long rsvd_14:1; + unsigned long t:1; /* RO */ + unsigned long m:1; /* RW */ + unsigned long rsvd_17_31:15; + unsigned long apic_id:32; /* RW */ } s2; - struct uv3h_gr1_tlb_mmr_read_data_lo_s { - unsigned long vpn:39; /* RO */ - unsigned long asid:24; /* RO */ - unsigned long valid:1; /* RO */ - } s3; - struct uv4h_gr1_tlb_mmr_read_data_lo_s { - unsigned long vpn:39; /* RO */ - unsigned long asid:24; /* RO */ - unsigned long valid:1; /* RO */ - } s4; }; /* ========================================================================= */ @@ -1452,52 +2661,43 @@ union uvh_gr1_tlb_mmr_read_data_lo_u { /* ========================================================================= */ #define UVH_INT_CMPB 0x22080UL +/* UVH common defines*/ #define UVH_INT_CMPB_REAL_TIME_CMPB_SHFT 0 #define UVH_INT_CMPB_REAL_TIME_CMPB_MASK 0x00ffffffffffffffUL union uvh_int_cmpb_u { unsigned long v; + + /* UVH common struct */ struct uvh_int_cmpb_s { unsigned long real_time_cmpb:56; /* RW */ unsigned long rsvd_56_63:8; } s; -}; - -/* ========================================================================= */ -/* UVH_INT_CMPC */ -/* ========================================================================= */ -#define UVH_INT_CMPC 0x22100UL - - -#define UVXH_INT_CMPC_REAL_TIME_CMP_2_SHFT 0 -#define UVXH_INT_CMPC_REAL_TIME_CMP_2_MASK 0x00ffffffffffffffUL - -union uvh_int_cmpc_u { - unsigned long v; - struct uvh_int_cmpc_s { - unsigned long real_time_cmpc:56; /* RW */ + /* UV5 unique struct */ + struct uv5h_int_cmpb_s { + unsigned long real_time_cmpb:56; /* RW */ unsigned long rsvd_56_63:8; - } s; -}; + } s5; -/* ========================================================================= */ -/* UVH_INT_CMPD */ -/* ========================================================================= */ -#define UVH_INT_CMPD 0x22180UL - - -#define UVXH_INT_CMPD_REAL_TIME_CMP_3_SHFT 0 -#define UVXH_INT_CMPD_REAL_TIME_CMP_3_MASK 0x00ffffffffffffffUL + /* UV4 unique struct */ + struct uv4h_int_cmpb_s { + unsigned long real_time_cmpb:56; /* RW */ + unsigned long rsvd_56_63:8; + } s4; + /* UV3 unique struct */ + struct uv3h_int_cmpb_s { + unsigned long real_time_cmpb:56; /* RW */ + unsigned long rsvd_56_63:8; + } s3; -union uvh_int_cmpd_u { - unsigned long v; - struct uvh_int_cmpd_s { - unsigned long real_time_cmpd:56; /* RW */ + /* UV2 unique struct */ + struct uv2h_int_cmpb_s { + unsigned long real_time_cmpb:56; /* RW */ unsigned long rsvd_56_63:8; - } s; + } s2; }; /* ========================================================================= */ @@ -1505,28 +2705,23 @@ union uvh_int_cmpd_u { /* ========================================================================= */ #define UVH_IPI_INT 0x60500UL -#define UV2H_IPI_INT_32 0x348 -#define UV3H_IPI_INT_32 0x348 -#define UV4H_IPI_INT_32 0x268 -#define UVH_IPI_INT_32 ( \ - is_uv2_hub() ? UV2H_IPI_INT_32 : \ - is_uv3_hub() ? UV3H_IPI_INT_32 : \ - /*is_uv4_hub*/ UV4H_IPI_INT_32) - +/* UVH common defines*/ #define UVH_IPI_INT_VECTOR_SHFT 0 -#define UVH_IPI_INT_DELIVERY_MODE_SHFT 8 -#define UVH_IPI_INT_DESTMODE_SHFT 11 -#define UVH_IPI_INT_APIC_ID_SHFT 16 -#define UVH_IPI_INT_SEND_SHFT 63 #define UVH_IPI_INT_VECTOR_MASK 0x00000000000000ffUL +#define UVH_IPI_INT_DELIVERY_MODE_SHFT 8 #define UVH_IPI_INT_DELIVERY_MODE_MASK 0x0000000000000700UL +#define UVH_IPI_INT_DESTMODE_SHFT 11 #define UVH_IPI_INT_DESTMODE_MASK 0x0000000000000800UL +#define UVH_IPI_INT_APIC_ID_SHFT 16 #define UVH_IPI_INT_APIC_ID_MASK 0x0000ffffffff0000UL +#define UVH_IPI_INT_SEND_SHFT 63 #define UVH_IPI_INT_SEND_MASK 0x8000000000000000UL union uvh_ipi_int_u { unsigned long v; + + /* UVH common struct */ struct uvh_ipi_int_s { unsigned long vector_:8; /* RW */ unsigned long delivery_mode:3; /* RW */ @@ -1536,903 +2731,105 @@ union uvh_ipi_int_u { unsigned long rsvd_48_62:15; unsigned long send:1; /* WP */ } s; -}; - -/* ========================================================================= */ -/* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST */ -/* ========================================================================= */ -#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST 0x320050UL -#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST 0x320050UL -#define UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST uv_undefined("UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST") -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST ( \ - is_uv2_hub() ? UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST : \ - is_uv3_hub() ? UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST : \ - /*is_uv4_hub*/ UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST) -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_32 0x9c0 - - -#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_SHFT 4 -#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_SHFT 49 -#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_MASK 0x000007fffffffff0UL -#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_MASK 0x7ffe000000000000UL - -#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_SHFT 4 -#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_SHFT 49 -#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_MASK 0x000007fffffffff0UL -#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_MASK 0x7ffe000000000000UL - - -union uvh_lb_bau_intd_payload_queue_first_u { - unsigned long v; - struct uv2h_lb_bau_intd_payload_queue_first_s { - unsigned long rsvd_0_3:4; - unsigned long address:39; /* RW */ - unsigned long rsvd_43_48:6; - unsigned long node_id:14; /* RW */ - unsigned long rsvd_63:1; - } s2; - struct uv3h_lb_bau_intd_payload_queue_first_s { - unsigned long rsvd_0_3:4; - unsigned long address:39; /* RW */ - unsigned long rsvd_43_48:6; - unsigned long node_id:14; /* RW */ - unsigned long rsvd_63:1; - } s3; -}; - -/* ========================================================================= */ -/* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST */ -/* ========================================================================= */ -#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST 0x320060UL -#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST 0x320060UL -#define UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST uv_undefined("UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST") -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST ( \ - is_uv2_hub() ? UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST : \ - is_uv3_hub() ? UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST : \ - /*is_uv4_hub*/ UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST) -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_32 0x9c8 - - -#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_SHFT 4 -#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_MASK 0x000007fffffffff0UL - -#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_SHFT 4 -#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_MASK 0x000007fffffffff0UL - - -union uvh_lb_bau_intd_payload_queue_last_u { - unsigned long v; - struct uv2h_lb_bau_intd_payload_queue_last_s { - unsigned long rsvd_0_3:4; - unsigned long address:39; /* RW */ - unsigned long rsvd_43_63:21; - } s2; - struct uv3h_lb_bau_intd_payload_queue_last_s { - unsigned long rsvd_0_3:4; - unsigned long address:39; /* RW */ - unsigned long rsvd_43_63:21; - } s3; -}; - -/* ========================================================================= */ -/* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL */ -/* ========================================================================= */ -#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL 0x320070UL -#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL 0x320070UL -#define UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL uv_undefined("UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL") -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL ( \ - is_uv2_hub() ? UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL : \ - is_uv3_hub() ? UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL : \ - /*is_uv4_hub*/ UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL) -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_32 0x9d0 - - -#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_SHFT 4 -#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_MASK 0x000007fffffffff0UL - -#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_SHFT 4 -#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_MASK 0x000007fffffffff0UL + /* UV5 unique struct */ + struct uv5h_ipi_int_s { + unsigned long vector_:8; /* RW */ + unsigned long delivery_mode:3; /* RW */ + unsigned long destmode:1; /* RW */ + unsigned long rsvd_12_15:4; + unsigned long apic_id:32; /* RW */ + unsigned long rsvd_48_62:15; + unsigned long send:1; /* WP */ + } s5; -union uvh_lb_bau_intd_payload_queue_tail_u { - unsigned long v; - struct uv2h_lb_bau_intd_payload_queue_tail_s { - unsigned long rsvd_0_3:4; - unsigned long address:39; /* RW */ - unsigned long rsvd_43_63:21; - } s2; - struct uv3h_lb_bau_intd_payload_queue_tail_s { - unsigned long rsvd_0_3:4; - unsigned long address:39; /* RW */ - unsigned long rsvd_43_63:21; - } s3; -}; + /* UV4 unique struct */ + struct uv4h_ipi_int_s { + unsigned long vector_:8; /* RW */ + unsigned long delivery_mode:3; /* RW */ + unsigned long destmode:1; /* RW */ + unsigned long rsvd_12_15:4; + unsigned long apic_id:32; /* RW */ + unsigned long rsvd_48_62:15; + unsigned long send:1; /* WP */ + } s4; -/* ========================================================================= */ -/* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE */ -/* ========================================================================= */ -#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE 0x320080UL -#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE 0x320080UL -#define UV4H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE uv_undefined("UV4H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE") -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE ( \ - is_uv2_hub() ? UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE : \ - is_uv3_hub() ? UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE : \ - /*is_uv4_hub*/ UV4H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE) -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_32 0xa68 - - -#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_SHFT 0 -#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_SHFT 1 -#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_SHFT 2 -#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_SHFT 3 -#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_SHFT 4 -#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_SHFT 5 -#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_SHFT 6 -#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_SHFT 7 -#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_SHFT 8 -#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_SHFT 9 -#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_SHFT 10 -#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_SHFT 11 -#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_SHFT 12 -#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_SHFT 13 -#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_SHFT 14 -#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_SHFT 15 -#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_MASK 0x0000000000000001UL -#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_MASK 0x0000000000000002UL -#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_MASK 0x0000000000000004UL -#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_MASK 0x0000000000000008UL -#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_MASK 0x0000000000000010UL -#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_MASK 0x0000000000000020UL -#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_MASK 0x0000000000000040UL -#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_MASK 0x0000000000000080UL -#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_MASK 0x0000000000000100UL -#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_MASK 0x0000000000000200UL -#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_MASK 0x0000000000000400UL -#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_MASK 0x0000000000000800UL -#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_MASK 0x0000000000001000UL -#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_MASK 0x0000000000002000UL -#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_MASK 0x0000000000004000UL -#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_MASK 0x0000000000008000UL - -#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_SHFT 0 -#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_SHFT 1 -#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_SHFT 2 -#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_SHFT 3 -#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_SHFT 4 -#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_SHFT 5 -#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_SHFT 6 -#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_SHFT 7 -#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_SHFT 8 -#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_SHFT 9 -#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_SHFT 10 -#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_SHFT 11 -#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_SHFT 12 -#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_SHFT 13 -#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_SHFT 14 -#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_SHFT 15 -#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_MASK 0x0000000000000001UL -#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_MASK 0x0000000000000002UL -#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_MASK 0x0000000000000004UL -#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_MASK 0x0000000000000008UL -#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_MASK 0x0000000000000010UL -#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_MASK 0x0000000000000020UL -#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_MASK 0x0000000000000040UL -#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_MASK 0x0000000000000080UL -#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_MASK 0x0000000000000100UL -#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_MASK 0x0000000000000200UL -#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_MASK 0x0000000000000400UL -#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_MASK 0x0000000000000800UL -#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_MASK 0x0000000000001000UL -#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_MASK 0x0000000000002000UL -#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_MASK 0x0000000000004000UL -#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_MASK 0x0000000000008000UL - - -union uvh_lb_bau_intd_software_acknowledge_u { - unsigned long v; - struct uv2h_lb_bau_intd_software_acknowledge_s { - unsigned long pending_0:1; /* RW */ - unsigned long pending_1:1; /* RW */ - unsigned long pending_2:1; /* RW */ - unsigned long pending_3:1; /* RW */ - unsigned long pending_4:1; /* RW */ - unsigned long pending_5:1; /* RW */ - unsigned long pending_6:1; /* RW */ - unsigned long pending_7:1; /* RW */ - unsigned long timeout_0:1; /* RW */ - unsigned long timeout_1:1; /* RW */ - unsigned long timeout_2:1; /* RW */ - unsigned long timeout_3:1; /* RW */ - unsigned long timeout_4:1; /* RW */ - unsigned long timeout_5:1; /* RW */ - unsigned long timeout_6:1; /* RW */ - unsigned long timeout_7:1; /* RW */ - unsigned long rsvd_16_63:48; - } s2; - struct uv3h_lb_bau_intd_software_acknowledge_s { - unsigned long pending_0:1; /* RW */ - unsigned long pending_1:1; /* RW */ - unsigned long pending_2:1; /* RW */ - unsigned long pending_3:1; /* RW */ - unsigned long pending_4:1; /* RW */ - unsigned long pending_5:1; /* RW */ - unsigned long pending_6:1; /* RW */ - unsigned long pending_7:1; /* RW */ - unsigned long timeout_0:1; /* RW */ - unsigned long timeout_1:1; /* RW */ - unsigned long timeout_2:1; /* RW */ - unsigned long timeout_3:1; /* RW */ - unsigned long timeout_4:1; /* RW */ - unsigned long timeout_5:1; /* RW */ - unsigned long timeout_6:1; /* RW */ - unsigned long timeout_7:1; /* RW */ - unsigned long rsvd_16_63:48; + /* UV3 unique struct */ + struct uv3h_ipi_int_s { + unsigned long vector_:8; /* RW */ + unsigned long delivery_mode:3; /* RW */ + unsigned long destmode:1; /* RW */ + unsigned long rsvd_12_15:4; + unsigned long apic_id:32; /* RW */ + unsigned long rsvd_48_62:15; + unsigned long send:1; /* WP */ } s3; -}; -/* ========================================================================= */ -/* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS */ -/* ========================================================================= */ -#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x320088UL -#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x320088UL -#define UV4H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS uv_undefined("UV4H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS") -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS ( \ - is_uv2_hub() ? UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS : \ - is_uv3_hub() ? UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS : \ - /*is_uv4_hub*/ UV4H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS) -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS_32 0xa70 - - -/* ========================================================================= */ -/* UVH_LB_BAU_MISC_CONTROL */ -/* ========================================================================= */ -#define UV2H_LB_BAU_MISC_CONTROL 0x320170UL -#define UV3H_LB_BAU_MISC_CONTROL 0x320170UL -#define UV4H_LB_BAU_MISC_CONTROL 0xc8170UL -#define UVH_LB_BAU_MISC_CONTROL ( \ - is_uv2_hub() ? UV2H_LB_BAU_MISC_CONTROL : \ - is_uv3_hub() ? UV3H_LB_BAU_MISC_CONTROL : \ - /*is_uv4_hub*/ UV4H_LB_BAU_MISC_CONTROL) - -#define UV2H_LB_BAU_MISC_CONTROL_32 0xa10 -#define UV3H_LB_BAU_MISC_CONTROL_32 0xa10 -#define UV4H_LB_BAU_MISC_CONTROL_32 0xa18 -#define UVH_LB_BAU_MISC_CONTROL_32 ( \ - is_uv2_hub() ? UV2H_LB_BAU_MISC_CONTROL_32 : \ - is_uv3_hub() ? UV3H_LB_BAU_MISC_CONTROL_32 : \ - /*is_uv4_hub*/ UV4H_LB_BAU_MISC_CONTROL_32) - -#define UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0 -#define UVH_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8 -#define UVH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9 -#define UVH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10 -#define UVH_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11 -#define UVH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14 -#define UVH_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20 -#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21 -#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22 -#define UVH_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23 -#define UVH_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24 -#define UVH_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27 -#define UVH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28 -#define UVH_LB_BAU_MISC_CONTROL_FUN_SHFT 48 -#define UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL -#define UVH_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL -#define UVH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL -#define UVH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL -#define UVH_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL -#define UVH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL -#define UVH_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL -#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL -#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL -#define UVH_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL -#define UVH_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL -#define UVH_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL -#define UVH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL -#define UVH_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL - -#define UVXH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0 -#define UVXH_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8 -#define UVXH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9 -#define UVXH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10 -#define UVXH_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11 -#define UVXH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14 -#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20 -#define UVXH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21 -#define UVXH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22 -#define UVXH_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23 -#define UVXH_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24 -#define UVXH_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27 -#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28 -#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_SHFT 29 -#define UVXH_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_SHFT 30 -#define UVXH_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_SHFT 31 -#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_SHFT 32 -#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT 33 -#define UVXH_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_SHFT 34 -#define UVXH_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_SHFT 35 -#define UVXH_LB_BAU_MISC_CONTROL_FUN_SHFT 48 -#define UVXH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL -#define UVXH_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL -#define UVXH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL -#define UVXH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL -#define UVXH_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL -#define UVXH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL -#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL -#define UVXH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL -#define UVXH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL -#define UVXH_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL -#define UVXH_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL -#define UVXH_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL -#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL -#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_MASK 0x0000000020000000UL -#define UVXH_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_MASK 0x0000000040000000UL -#define UVXH_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_MASK 0x0000000080000000UL -#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_MASK 0x0000000100000000UL -#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_MASK 0x0000000200000000UL -#define UVXH_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_MASK 0x0000000400000000UL -#define UVXH_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_MASK 0x0000000800000000UL -#define UVXH_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL - -#define UV2H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0 -#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8 -#define UV2H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9 -#define UV2H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10 -#define UV2H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11 -#define UV2H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14 -#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15 -#define UV2H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT 16 -#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20 -#define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21 -#define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22 -#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23 -#define UV2H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24 -#define UV2H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27 -#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28 -#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_SHFT 29 -#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_SHFT 30 -#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_SHFT 31 -#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_SHFT 32 -#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT 33 -#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_SHFT 34 -#define UV2H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_SHFT 35 -#define UV2H_LB_BAU_MISC_CONTROL_FUN_SHFT 48 -#define UV2H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL -#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL -#define UV2H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL -#define UV2H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL -#define UV2H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL -#define UV2H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL -#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL -#define UV2H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL -#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL -#define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL -#define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL -#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL -#define UV2H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL -#define UV2H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL -#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL -#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_MASK 0x0000000020000000UL -#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_MASK 0x0000000040000000UL -#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_MASK 0x0000000080000000UL -#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_MASK 0x0000000100000000UL -#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_MASK 0x0000000200000000UL -#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_MASK 0x0000000400000000UL -#define UV2H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_MASK 0x0000000800000000UL -#define UV2H_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL - -#define UV3H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0 -#define UV3H_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8 -#define UV3H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9 -#define UV3H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10 -#define UV3H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11 -#define UV3H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14 -#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15 -#define UV3H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT 16 -#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20 -#define UV3H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21 -#define UV3H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22 -#define UV3H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23 -#define UV3H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24 -#define UV3H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27 -#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28 -#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_SHFT 29 -#define UV3H_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_SHFT 30 -#define UV3H_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_SHFT 31 -#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_SHFT 32 -#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT 33 -#define UV3H_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_SHFT 34 -#define UV3H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_SHFT 35 -#define UV3H_LB_BAU_MISC_CONTROL_SUPPRESS_QUIESCE_MSGS_TO_QPI_SHFT 36 -#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_INTD_PREFETCH_HINT_SHFT 37 -#define UV3H_LB_BAU_MISC_CONTROL_THREAD_KILL_TIMEBASE_SHFT 38 -#define UV3H_LB_BAU_MISC_CONTROL_FUN_SHFT 48 -#define UV3H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL -#define UV3H_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL -#define UV3H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL -#define UV3H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL -#define UV3H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL -#define UV3H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL -#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL -#define UV3H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL -#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL -#define UV3H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL -#define UV3H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL -#define UV3H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL -#define UV3H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL -#define UV3H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL -#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL -#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_MASK 0x0000000020000000UL -#define UV3H_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_MASK 0x0000000040000000UL -#define UV3H_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_MASK 0x0000000080000000UL -#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_MASK 0x0000000100000000UL -#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_MASK 0x0000000200000000UL -#define UV3H_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_MASK 0x0000000400000000UL -#define UV3H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_MASK 0x0000000800000000UL -#define UV3H_LB_BAU_MISC_CONTROL_SUPPRESS_QUIESCE_MSGS_TO_QPI_MASK 0x0000001000000000UL -#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_INTD_PREFETCH_HINT_MASK 0x0000002000000000UL -#define UV3H_LB_BAU_MISC_CONTROL_THREAD_KILL_TIMEBASE_MASK 0x00003fc000000000UL -#define UV3H_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL - -#define UV4H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0 -#define UV4H_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8 -#define UV4H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9 -#define UV4H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10 -#define UV4H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11 -#define UV4H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14 -#define UV4H_LB_BAU_MISC_CONTROL_RESERVED_15_19_SHFT 15 -#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20 -#define UV4H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21 -#define UV4H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22 -#define UV4H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23 -#define UV4H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24 -#define UV4H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27 -#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28 -#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_SHFT 29 -#define UV4H_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_SHFT 30 -#define UV4H_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_SHFT 31 -#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_SHFT 32 -#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT 33 -#define UV4H_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_SHFT 34 -#define UV4H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_SHFT 35 -#define UV4H_LB_BAU_MISC_CONTROL_SUPPRESS_QUIESCE_MSGS_TO_QPI_SHFT 36 -#define UV4H_LB_BAU_MISC_CONTROL_RESERVED_37_SHFT 37 -#define UV4H_LB_BAU_MISC_CONTROL_THREAD_KILL_TIMEBASE_SHFT 38 -#define UV4H_LB_BAU_MISC_CONTROL_ADDRESS_INTERLEAVE_SELECT_SHFT 46 -#define UV4H_LB_BAU_MISC_CONTROL_FUN_SHFT 48 -#define UV4H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL -#define UV4H_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL -#define UV4H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL -#define UV4H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL -#define UV4H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL -#define UV4H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL -#define UV4H_LB_BAU_MISC_CONTROL_RESERVED_15_19_MASK 0x00000000000f8000UL -#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL -#define UV4H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL -#define UV4H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL -#define UV4H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL -#define UV4H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL -#define UV4H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL -#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL -#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_MASK 0x0000000020000000UL -#define UV4H_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_MASK 0x0000000040000000UL -#define UV4H_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_MASK 0x0000000080000000UL -#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_MASK 0x0000000100000000UL -#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_MASK 0x0000000200000000UL -#define UV4H_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_MASK 0x0000000400000000UL -#define UV4H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_MASK 0x0000000800000000UL -#define UV4H_LB_BAU_MISC_CONTROL_SUPPRESS_QUIESCE_MSGS_TO_QPI_MASK 0x0000001000000000UL -#define UV4H_LB_BAU_MISC_CONTROL_RESERVED_37_MASK 0x0000002000000000UL -#define UV4H_LB_BAU_MISC_CONTROL_THREAD_KILL_TIMEBASE_MASK 0x00003fc000000000UL -#define UV4H_LB_BAU_MISC_CONTROL_ADDRESS_INTERLEAVE_SELECT_MASK 0x0000400000000000UL -#define UV4H_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL - -#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK \ - uv_undefined("UV4H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK") -#define UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK ( \ - is_uv2_hub() ? UV2H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK : \ - is_uv3_hub() ? UV3H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK : \ - /*is_uv4_hub*/ UV4H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK) -#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT \ - uv_undefined("UV4H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT") -#define UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT ( \ - is_uv2_hub() ? UV2H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT : \ - is_uv3_hub() ? UV3H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT : \ - /*is_uv4_hub*/ UV4H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT) -#define UV4H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK \ - uv_undefined("UV4H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK") -#define UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK ( \ - is_uv2_hub() ? UV2H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK : \ - is_uv3_hub() ? UV3H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK : \ - /*is_uv4_hub*/ UV4H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK) -#define UV4H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT \ - uv_undefined("UV4H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT") -#define UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT ( \ - is_uv2_hub() ? UV2H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT : \ - is_uv3_hub() ? UV3H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT : \ - /*is_uv4_hub*/ UV4H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT) - -union uvh_lb_bau_misc_control_u { - unsigned long v; - struct uvh_lb_bau_misc_control_s { - unsigned long rejection_delay:8; /* RW */ - unsigned long apic_mode:1; /* RW */ - unsigned long force_broadcast:1; /* RW */ - unsigned long force_lock_nop:1; /* RW */ - unsigned long qpi_agent_presence_vector:3; /* RW */ - unsigned long descriptor_fetch_mode:1; /* RW */ - unsigned long rsvd_15_19:5; - unsigned long enable_dual_mapping_mode:1; /* RW */ - unsigned long vga_io_port_decode_enable:1; /* RW */ - unsigned long vga_io_port_16_bit_decode:1; /* RW */ - unsigned long suppress_dest_registration:1; /* RW */ - unsigned long programmed_initial_priority:3; /* RW */ - unsigned long use_incoming_priority:1; /* RW */ - unsigned long enable_programmed_initial_priority:1;/* RW */ - unsigned long rsvd_29_47:19; - unsigned long fun:16; /* RW */ - } s; - struct uvxh_lb_bau_misc_control_s { - unsigned long rejection_delay:8; /* RW */ - unsigned long apic_mode:1; /* RW */ - unsigned long force_broadcast:1; /* RW */ - unsigned long force_lock_nop:1; /* RW */ - unsigned long qpi_agent_presence_vector:3; /* RW */ - unsigned long descriptor_fetch_mode:1; /* RW */ - unsigned long rsvd_15_19:5; - unsigned long enable_dual_mapping_mode:1; /* RW */ - unsigned long vga_io_port_decode_enable:1; /* RW */ - unsigned long vga_io_port_16_bit_decode:1; /* RW */ - unsigned long suppress_dest_registration:1; /* RW */ - unsigned long programmed_initial_priority:3; /* RW */ - unsigned long use_incoming_priority:1; /* RW */ - unsigned long enable_programmed_initial_priority:1;/* RW */ - unsigned long enable_automatic_apic_mode_selection:1;/* RW */ - unsigned long apic_mode_status:1; /* RO */ - unsigned long suppress_interrupts_to_self:1; /* RW */ - unsigned long enable_lock_based_system_flush:1;/* RW */ - unsigned long enable_extended_sb_status:1; /* RW */ - unsigned long suppress_int_prio_udt_to_self:1;/* RW */ - unsigned long use_legacy_descriptor_formats:1;/* RW */ - unsigned long rsvd_36_47:12; - unsigned long fun:16; /* RW */ - } sx; - struct uv2h_lb_bau_misc_control_s { - unsigned long rejection_delay:8; /* RW */ - unsigned long apic_mode:1; /* RW */ - unsigned long force_broadcast:1; /* RW */ - unsigned long force_lock_nop:1; /* RW */ - unsigned long qpi_agent_presence_vector:3; /* RW */ - unsigned long descriptor_fetch_mode:1; /* RW */ - unsigned long enable_intd_soft_ack_mode:1; /* RW */ - unsigned long intd_soft_ack_timeout_period:4; /* RW */ - unsigned long enable_dual_mapping_mode:1; /* RW */ - unsigned long vga_io_port_decode_enable:1; /* RW */ - unsigned long vga_io_port_16_bit_decode:1; /* RW */ - unsigned long suppress_dest_registration:1; /* RW */ - unsigned long programmed_initial_priority:3; /* RW */ - unsigned long use_incoming_priority:1; /* RW */ - unsigned long enable_programmed_initial_priority:1;/* RW */ - unsigned long enable_automatic_apic_mode_selection:1;/* RW */ - unsigned long apic_mode_status:1; /* RO */ - unsigned long suppress_interrupts_to_self:1; /* RW */ - unsigned long enable_lock_based_system_flush:1;/* RW */ - unsigned long enable_extended_sb_status:1; /* RW */ - unsigned long suppress_int_prio_udt_to_self:1;/* RW */ - unsigned long use_legacy_descriptor_formats:1;/* RW */ - unsigned long rsvd_36_47:12; - unsigned long fun:16; /* RW */ + /* UV2 unique struct */ + struct uv2h_ipi_int_s { + unsigned long vector_:8; /* RW */ + unsigned long delivery_mode:3; /* RW */ + unsigned long destmode:1; /* RW */ + unsigned long rsvd_12_15:4; + unsigned long apic_id:32; /* RW */ + unsigned long rsvd_48_62:15; + unsigned long send:1; /* WP */ } s2; - struct uv3h_lb_bau_misc_control_s { - unsigned long rejection_delay:8; /* RW */ - unsigned long apic_mode:1; /* RW */ - unsigned long force_broadcast:1; /* RW */ - unsigned long force_lock_nop:1; /* RW */ - unsigned long qpi_agent_presence_vector:3; /* RW */ - unsigned long descriptor_fetch_mode:1; /* RW */ - unsigned long enable_intd_soft_ack_mode:1; /* RW */ - unsigned long intd_soft_ack_timeout_period:4; /* RW */ - unsigned long enable_dual_mapping_mode:1; /* RW */ - unsigned long vga_io_port_decode_enable:1; /* RW */ - unsigned long vga_io_port_16_bit_decode:1; /* RW */ - unsigned long suppress_dest_registration:1; /* RW */ - unsigned long programmed_initial_priority:3; /* RW */ - unsigned long use_incoming_priority:1; /* RW */ - unsigned long enable_programmed_initial_priority:1;/* RW */ - unsigned long enable_automatic_apic_mode_selection:1;/* RW */ - unsigned long apic_mode_status:1; /* RO */ - unsigned long suppress_interrupts_to_self:1; /* RW */ - unsigned long enable_lock_based_system_flush:1;/* RW */ - unsigned long enable_extended_sb_status:1; /* RW */ - unsigned long suppress_int_prio_udt_to_self:1;/* RW */ - unsigned long use_legacy_descriptor_formats:1;/* RW */ - unsigned long suppress_quiesce_msgs_to_qpi:1; /* RW */ - unsigned long enable_intd_prefetch_hint:1; /* RW */ - unsigned long thread_kill_timebase:8; /* RW */ - unsigned long rsvd_46_47:2; - unsigned long fun:16; /* RW */ - } s3; - struct uv4h_lb_bau_misc_control_s { - unsigned long rejection_delay:8; /* RW */ - unsigned long apic_mode:1; /* RW */ - unsigned long force_broadcast:1; /* RW */ - unsigned long force_lock_nop:1; /* RW */ - unsigned long qpi_agent_presence_vector:3; /* RW */ - unsigned long descriptor_fetch_mode:1; /* RW */ - unsigned long rsvd_15_19:5; - unsigned long enable_dual_mapping_mode:1; /* RW */ - unsigned long vga_io_port_decode_enable:1; /* RW */ - unsigned long vga_io_port_16_bit_decode:1; /* RW */ - unsigned long suppress_dest_registration:1; /* RW */ - unsigned long programmed_initial_priority:3; /* RW */ - unsigned long use_incoming_priority:1; /* RW */ - unsigned long enable_programmed_initial_priority:1;/* RW */ - unsigned long enable_automatic_apic_mode_selection:1;/* RW */ - unsigned long apic_mode_status:1; /* RO */ - unsigned long suppress_interrupts_to_self:1; /* RW */ - unsigned long enable_lock_based_system_flush:1;/* RW */ - unsigned long enable_extended_sb_status:1; /* RW */ - unsigned long suppress_int_prio_udt_to_self:1;/* RW */ - unsigned long use_legacy_descriptor_formats:1;/* RW */ - unsigned long suppress_quiesce_msgs_to_qpi:1; /* RW */ - unsigned long rsvd_37:1; - unsigned long thread_kill_timebase:8; /* RW */ - unsigned long address_interleave_select:1; /* RW */ - unsigned long rsvd_47:1; - unsigned long fun:16; /* RW */ - } s4; -}; - -/* ========================================================================= */ -/* UVH_LB_BAU_SB_ACTIVATION_CONTROL */ -/* ========================================================================= */ -#define UV2H_LB_BAU_SB_ACTIVATION_CONTROL 0x320020UL -#define UV3H_LB_BAU_SB_ACTIVATION_CONTROL 0x320020UL -#define UV4H_LB_BAU_SB_ACTIVATION_CONTROL 0xc8020UL -#define UVH_LB_BAU_SB_ACTIVATION_CONTROL ( \ - is_uv2_hub() ? UV2H_LB_BAU_SB_ACTIVATION_CONTROL : \ - is_uv3_hub() ? UV3H_LB_BAU_SB_ACTIVATION_CONTROL : \ - /*is_uv4_hub*/ UV4H_LB_BAU_SB_ACTIVATION_CONTROL) - -#define UV2H_LB_BAU_SB_ACTIVATION_CONTROL_32 0x9a8 -#define UV3H_LB_BAU_SB_ACTIVATION_CONTROL_32 0x9a8 -#define UV4H_LB_BAU_SB_ACTIVATION_CONTROL_32 0x9c8 -#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_32 ( \ - is_uv2_hub() ? UV2H_LB_BAU_SB_ACTIVATION_CONTROL_32 : \ - is_uv3_hub() ? UV3H_LB_BAU_SB_ACTIVATION_CONTROL_32 : \ - /*is_uv4_hub*/ UV4H_LB_BAU_SB_ACTIVATION_CONTROL_32) - -#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_SHFT 0 -#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT 62 -#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INIT_SHFT 63 -#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_MASK 0x000000000000003fUL -#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_MASK 0x4000000000000000UL -#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INIT_MASK 0x8000000000000000UL - - -union uvh_lb_bau_sb_activation_control_u { - unsigned long v; - struct uvh_lb_bau_sb_activation_control_s { - unsigned long index:6; /* RW */ - unsigned long rsvd_6_61:56; - unsigned long push:1; /* WP */ - unsigned long init:1; /* WP */ - } s; -}; - -/* ========================================================================= */ -/* UVH_LB_BAU_SB_ACTIVATION_STATUS_0 */ -/* ========================================================================= */ -#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_0 0x320030UL -#define UV3H_LB_BAU_SB_ACTIVATION_STATUS_0 0x320030UL -#define UV4H_LB_BAU_SB_ACTIVATION_STATUS_0 0xc8030UL -#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0 ( \ - is_uv2_hub() ? UV2H_LB_BAU_SB_ACTIVATION_STATUS_0 : \ - is_uv3_hub() ? UV3H_LB_BAU_SB_ACTIVATION_STATUS_0 : \ - /*is_uv4_hub*/ UV4H_LB_BAU_SB_ACTIVATION_STATUS_0) - -#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x9b0 -#define UV3H_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x9b0 -#define UV4H_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x9d0 -#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_32 ( \ - is_uv2_hub() ? UV2H_LB_BAU_SB_ACTIVATION_STATUS_0_32 : \ - is_uv3_hub() ? UV3H_LB_BAU_SB_ACTIVATION_STATUS_0_32 : \ - /*is_uv4_hub*/ UV4H_LB_BAU_SB_ACTIVATION_STATUS_0_32) - -#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_SHFT 0 -#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_MASK 0xffffffffffffffffUL - - -union uvh_lb_bau_sb_activation_status_0_u { - unsigned long v; - struct uvh_lb_bau_sb_activation_status_0_s { - unsigned long status:64; /* RW */ - } s; -}; - -/* ========================================================================= */ -/* UVH_LB_BAU_SB_ACTIVATION_STATUS_1 */ -/* ========================================================================= */ -#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_1 0x320040UL -#define UV3H_LB_BAU_SB_ACTIVATION_STATUS_1 0x320040UL -#define UV4H_LB_BAU_SB_ACTIVATION_STATUS_1 0xc8040UL -#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1 ( \ - is_uv2_hub() ? UV2H_LB_BAU_SB_ACTIVATION_STATUS_1 : \ - is_uv3_hub() ? UV3H_LB_BAU_SB_ACTIVATION_STATUS_1 : \ - /*is_uv4_hub*/ UV4H_LB_BAU_SB_ACTIVATION_STATUS_1) - -#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x9b8 -#define UV3H_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x9b8 -#define UV4H_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x9d8 -#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_32 ( \ - is_uv2_hub() ? UV2H_LB_BAU_SB_ACTIVATION_STATUS_1_32 : \ - is_uv3_hub() ? UV3H_LB_BAU_SB_ACTIVATION_STATUS_1_32 : \ - /*is_uv4_hub*/ UV4H_LB_BAU_SB_ACTIVATION_STATUS_1_32) - -#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_SHFT 0 -#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_MASK 0xffffffffffffffffUL - - -union uvh_lb_bau_sb_activation_status_1_u { - unsigned long v; - struct uvh_lb_bau_sb_activation_status_1_s { - unsigned long status:64; /* RW */ - } s; }; /* ========================================================================= */ -/* UVH_LB_BAU_SB_DESCRIPTOR_BASE */ -/* ========================================================================= */ -#define UV2H_LB_BAU_SB_DESCRIPTOR_BASE 0x320010UL -#define UV3H_LB_BAU_SB_DESCRIPTOR_BASE 0x320010UL -#define UV4H_LB_BAU_SB_DESCRIPTOR_BASE 0xc8010UL -#define UVH_LB_BAU_SB_DESCRIPTOR_BASE ( \ - is_uv2_hub() ? UV2H_LB_BAU_SB_DESCRIPTOR_BASE : \ - is_uv3_hub() ? UV3H_LB_BAU_SB_DESCRIPTOR_BASE : \ - /*is_uv4_hub*/ UV4H_LB_BAU_SB_DESCRIPTOR_BASE) - -#define UV2H_LB_BAU_SB_DESCRIPTOR_BASE_32 0x9a0 -#define UV3H_LB_BAU_SB_DESCRIPTOR_BASE_32 0x9a0 -#define UV4H_LB_BAU_SB_DESCRIPTOR_BASE_32 0x9c0 -#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_32 ( \ - is_uv2_hub() ? UV2H_LB_BAU_SB_DESCRIPTOR_BASE_32 : \ - is_uv3_hub() ? UV3H_LB_BAU_SB_DESCRIPTOR_BASE_32 : \ - /*is_uv4_hub*/ UV4H_LB_BAU_SB_DESCRIPTOR_BASE_32) - -#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_SHFT 12 - -#define UV2H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT 49 -#define UV2H_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x000007fffffff000UL -#define UV2H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK 0x7ffe000000000000UL - -#define UV3H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT 49 -#define UV3H_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x000007fffffff000UL -#define UV3H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK 0x7ffe000000000000UL - -#define UV4H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT 49 -#define UV4H_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x00003ffffffff000UL -#define UV4H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK 0x7ffe000000000000UL - -#define UV4AH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT 53 -#define UV4AH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x000ffffffffff000UL -#define UV4AH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK 0xffe0000000000000UL - -#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT ( \ - is_uv2_hub() ? UV2H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT : \ - is_uv3_hub() ? UV3H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT : \ - is_uv4a_hub() ? UV4AH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT : \ - /*is_uv4_hub*/ UV4H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT) - -#define UVH_LB_BAU_SB_DESCRIPTOR_PAGE_ADDRESS_MASK ( \ - is_uv2_hub() ? UV2H_LB_BAU_SB_DESCRIPTOR_PAGE_ADDRESS_MASK : \ - is_uv3_hub() ? UV3H_LB_BAU_SB_DESCRIPTOR_PAGE_ADDRESS_MASK : \ - is_uv4a_hub() ? UV4AH_LB_BAU_SB_DESCRIPTOR_PAGE_ADDRESS_MASK : \ - /*is_uv4_hub*/ UV4H_LB_BAU_SB_DESCRIPTOR_PAGE_ADDRESS_MASK) - -#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK ( \ - is_uv2_hub() ? UV2H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK : \ - is_uv3_hub() ? UV3H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK : \ - is_uv4a_hub() ? UV4AH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK : \ - /*is_uv4_hub*/ UV4H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK) - -/* ========================================================================= */ /* UVH_NODE_ID */ /* ========================================================================= */ #define UVH_NODE_ID 0x0UL -#define UV2H_NODE_ID 0x0UL -#define UV3H_NODE_ID 0x0UL -#define UV4H_NODE_ID 0x0UL +/* UVH common defines*/ #define UVH_NODE_ID_FORCE1_SHFT 0 -#define UVH_NODE_ID_MANUFACTURER_SHFT 1 -#define UVH_NODE_ID_PART_NUMBER_SHFT 12 -#define UVH_NODE_ID_REVISION_SHFT 28 -#define UVH_NODE_ID_NODE_ID_SHFT 32 #define UVH_NODE_ID_FORCE1_MASK 0x0000000000000001UL +#define UVH_NODE_ID_MANUFACTURER_SHFT 1 #define UVH_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL +#define UVH_NODE_ID_PART_NUMBER_SHFT 12 #define UVH_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL +#define UVH_NODE_ID_REVISION_SHFT 28 #define UVH_NODE_ID_REVISION_MASK 0x00000000f0000000UL -#define UVH_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL +#define UVH_NODE_ID_NODE_ID_SHFT 32 +#define UVH_NODE_ID_NI_PORT_SHFT 57 -#define UVXH_NODE_ID_FORCE1_SHFT 0 -#define UVXH_NODE_ID_MANUFACTURER_SHFT 1 -#define UVXH_NODE_ID_PART_NUMBER_SHFT 12 -#define UVXH_NODE_ID_REVISION_SHFT 28 -#define UVXH_NODE_ID_NODE_ID_SHFT 32 -#define UVXH_NODE_ID_NODES_PER_BIT_SHFT 50 -#define UVXH_NODE_ID_NI_PORT_SHFT 57 -#define UVXH_NODE_ID_FORCE1_MASK 0x0000000000000001UL -#define UVXH_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL -#define UVXH_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL -#define UVXH_NODE_ID_REVISION_MASK 0x00000000f0000000UL +/* UVXH common defines */ #define UVXH_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL +#define UVXH_NODE_ID_NODES_PER_BIT_SHFT 50 #define UVXH_NODE_ID_NODES_PER_BIT_MASK 0x01fc000000000000UL #define UVXH_NODE_ID_NI_PORT_MASK 0x3e00000000000000UL -#define UV2H_NODE_ID_FORCE1_SHFT 0 -#define UV2H_NODE_ID_MANUFACTURER_SHFT 1 -#define UV2H_NODE_ID_PART_NUMBER_SHFT 12 -#define UV2H_NODE_ID_REVISION_SHFT 28 -#define UV2H_NODE_ID_NODE_ID_SHFT 32 -#define UV2H_NODE_ID_NODES_PER_BIT_SHFT 50 -#define UV2H_NODE_ID_NI_PORT_SHFT 57 -#define UV2H_NODE_ID_FORCE1_MASK 0x0000000000000001UL -#define UV2H_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL -#define UV2H_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL -#define UV2H_NODE_ID_REVISION_MASK 0x00000000f0000000UL -#define UV2H_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL -#define UV2H_NODE_ID_NODES_PER_BIT_MASK 0x01fc000000000000UL -#define UV2H_NODE_ID_NI_PORT_MASK 0x3e00000000000000UL - -#define UV3H_NODE_ID_FORCE1_SHFT 0 -#define UV3H_NODE_ID_MANUFACTURER_SHFT 1 -#define UV3H_NODE_ID_PART_NUMBER_SHFT 12 -#define UV3H_NODE_ID_REVISION_SHFT 28 -#define UV3H_NODE_ID_NODE_ID_SHFT 32 -#define UV3H_NODE_ID_ROUTER_SELECT_SHFT 48 -#define UV3H_NODE_ID_RESERVED_2_SHFT 49 -#define UV3H_NODE_ID_NODES_PER_BIT_SHFT 50 -#define UV3H_NODE_ID_NI_PORT_SHFT 57 -#define UV3H_NODE_ID_FORCE1_MASK 0x0000000000000001UL -#define UV3H_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL -#define UV3H_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL -#define UV3H_NODE_ID_REVISION_MASK 0x00000000f0000000UL -#define UV3H_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL -#define UV3H_NODE_ID_ROUTER_SELECT_MASK 0x0001000000000000UL -#define UV3H_NODE_ID_RESERVED_2_MASK 0x0002000000000000UL -#define UV3H_NODE_ID_NODES_PER_BIT_MASK 0x01fc000000000000UL -#define UV3H_NODE_ID_NI_PORT_MASK 0x3e00000000000000UL - -#define UV4H_NODE_ID_FORCE1_SHFT 0 -#define UV4H_NODE_ID_MANUFACTURER_SHFT 1 -#define UV4H_NODE_ID_PART_NUMBER_SHFT 12 -#define UV4H_NODE_ID_REVISION_SHFT 28 -#define UV4H_NODE_ID_NODE_ID_SHFT 32 +/* UVYH common defines */ +#define UVYH_NODE_ID_NODE_ID_MASK 0x0000007f00000000UL +#define UVYH_NODE_ID_NI_PORT_MASK 0x7e00000000000000UL + +/* UV4 unique defines */ #define UV4H_NODE_ID_ROUTER_SELECT_SHFT 48 -#define UV4H_NODE_ID_RESERVED_2_SHFT 49 -#define UV4H_NODE_ID_NODES_PER_BIT_SHFT 50 -#define UV4H_NODE_ID_NI_PORT_SHFT 57 -#define UV4H_NODE_ID_FORCE1_MASK 0x0000000000000001UL -#define UV4H_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL -#define UV4H_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL -#define UV4H_NODE_ID_REVISION_MASK 0x00000000f0000000UL -#define UV4H_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL #define UV4H_NODE_ID_ROUTER_SELECT_MASK 0x0001000000000000UL +#define UV4H_NODE_ID_RESERVED_2_SHFT 49 #define UV4H_NODE_ID_RESERVED_2_MASK 0x0002000000000000UL -#define UV4H_NODE_ID_NODES_PER_BIT_MASK 0x01fc000000000000UL -#define UV4H_NODE_ID_NI_PORT_MASK 0x3e00000000000000UL + +/* UV3 unique defines */ +#define UV3H_NODE_ID_ROUTER_SELECT_SHFT 48 +#define UV3H_NODE_ID_ROUTER_SELECT_MASK 0x0001000000000000UL +#define UV3H_NODE_ID_RESERVED_2_SHFT 49 +#define UV3H_NODE_ID_RESERVED_2_MASK 0x0002000000000000UL union uvh_node_id_u { unsigned long v; + + /* UVH common struct */ struct uvh_node_id_s { unsigned long force1:1; /* RO */ unsigned long manufacturer:11; /* RO */ unsigned long part_number:16; /* RO */ unsigned long revision:4; /* RO */ - unsigned long node_id:15; /* RW */ - unsigned long rsvd_47_63:17; + unsigned long rsvd_32_63:32; } s; + + /* UVXH common struct */ struct uvxh_node_id_s { unsigned long force1:1; /* RO */ unsigned long manufacturer:11; /* RO */ @@ -2444,17 +2841,47 @@ union uvh_node_id_u { unsigned long ni_port:5; /* RO */ unsigned long rsvd_62_63:2; } sx; - struct uv2h_node_id_s { + + /* UVYH common struct */ + struct uvyh_node_id_s { + unsigned long force1:1; /* RO */ + unsigned long manufacturer:11; /* RO */ + unsigned long part_number:16; /* RO */ + unsigned long revision:4; /* RO */ + unsigned long node_id:7; /* RW */ + unsigned long rsvd_39_56:18; + unsigned long ni_port:6; /* RO */ + unsigned long rsvd_63:1; + } sy; + + /* UV5 unique struct */ + struct uv5h_node_id_s { + unsigned long force1:1; /* RO */ + unsigned long manufacturer:11; /* RO */ + unsigned long part_number:16; /* RO */ + unsigned long revision:4; /* RO */ + unsigned long node_id:7; /* RW */ + unsigned long rsvd_39_56:18; + unsigned long ni_port:6; /* RO */ + unsigned long rsvd_63:1; + } s5; + + /* UV4 unique struct */ + struct uv4h_node_id_s { unsigned long force1:1; /* RO */ unsigned long manufacturer:11; /* RO */ unsigned long part_number:16; /* RO */ unsigned long revision:4; /* RO */ unsigned long node_id:15; /* RW */ - unsigned long rsvd_47_49:3; + unsigned long rsvd_47:1; + unsigned long router_select:1; /* RO */ + unsigned long rsvd_49:1; unsigned long nodes_per_bit:7; /* RO */ unsigned long ni_port:5; /* RO */ unsigned long rsvd_62_63:2; - } s2; + } s4; + + /* UV3 unique struct */ struct uv3h_node_id_s { unsigned long force1:1; /* RO */ unsigned long manufacturer:11; /* RO */ @@ -2468,186 +2895,569 @@ union uvh_node_id_u { unsigned long ni_port:5; /* RO */ unsigned long rsvd_62_63:2; } s3; - struct uv4h_node_id_s { + + /* UV2 unique struct */ + struct uv2h_node_id_s { unsigned long force1:1; /* RO */ unsigned long manufacturer:11; /* RO */ unsigned long part_number:16; /* RO */ unsigned long revision:4; /* RO */ unsigned long node_id:15; /* RW */ - unsigned long rsvd_47:1; - unsigned long router_select:1; /* RO */ - unsigned long rsvd_49:1; + unsigned long rsvd_47_49:3; unsigned long nodes_per_bit:7; /* RO */ unsigned long ni_port:5; /* RO */ unsigned long rsvd_62_63:2; - } s4; + } s2; +}; + +/* ========================================================================= */ +/* UVH_NODE_PRESENT_0 */ +/* ========================================================================= */ +#define UVH_NODE_PRESENT_0 ( \ + is_uv(UV5) ? 0x1400UL : \ + 0) + + +/* UVYH common defines */ +#define UVYH_NODE_PRESENT_0_NODES_SHFT 0 +#define UVYH_NODE_PRESENT_0_NODES_MASK 0xffffffffffffffffUL + + +union uvh_node_present_0_u { + unsigned long v; + + /* UVH common struct */ + struct uvh_node_present_0_s { + unsigned long nodes:64; /* RW */ + } s; + + /* UVYH common struct */ + struct uvyh_node_present_0_s { + unsigned long nodes:64; /* RW */ + } sy; + + /* UV5 unique struct */ + struct uv5h_node_present_0_s { + unsigned long nodes:64; /* RW */ + } s5; +}; + +/* ========================================================================= */ +/* UVH_NODE_PRESENT_1 */ +/* ========================================================================= */ +#define UVH_NODE_PRESENT_1 ( \ + is_uv(UV5) ? 0x1408UL : \ + 0) + + +/* UVYH common defines */ +#define UVYH_NODE_PRESENT_1_NODES_SHFT 0 +#define UVYH_NODE_PRESENT_1_NODES_MASK 0xffffffffffffffffUL + + +union uvh_node_present_1_u { + unsigned long v; + + /* UVH common struct */ + struct uvh_node_present_1_s { + unsigned long nodes:64; /* RW */ + } s; + + /* UVYH common struct */ + struct uvyh_node_present_1_s { + unsigned long nodes:64; /* RW */ + } sy; + + /* UV5 unique struct */ + struct uv5h_node_present_1_s { + unsigned long nodes:64; /* RW */ + } s5; }; /* ========================================================================= */ /* UVH_NODE_PRESENT_TABLE */ /* ========================================================================= */ -#define UVH_NODE_PRESENT_TABLE 0x1400UL +#define UVH_NODE_PRESENT_TABLE ( \ + is_uv(UV4) ? 0x1400UL : \ + is_uv(UV3) ? 0x1400UL : \ + is_uv(UV2) ? 0x1400UL : \ + 0) -#define UV2H_NODE_PRESENT_TABLE_DEPTH 16 -#define UV3H_NODE_PRESENT_TABLE_DEPTH 16 -#define UV4H_NODE_PRESENT_TABLE_DEPTH 4 #define UVH_NODE_PRESENT_TABLE_DEPTH ( \ - is_uv2_hub() ? UV2H_NODE_PRESENT_TABLE_DEPTH : \ - is_uv3_hub() ? UV3H_NODE_PRESENT_TABLE_DEPTH : \ - /*is_uv4_hub*/ UV4H_NODE_PRESENT_TABLE_DEPTH) + is_uv(UV4) ? 4 : \ + is_uv(UV3) ? 16 : \ + is_uv(UV2) ? 16 : \ + 0) -#define UVH_NODE_PRESENT_TABLE_NODES_SHFT 0 -#define UVH_NODE_PRESENT_TABLE_NODES_MASK 0xffffffffffffffffUL + +/* UVXH common defines */ +#define UVXH_NODE_PRESENT_TABLE_NODES_SHFT 0 +#define UVXH_NODE_PRESENT_TABLE_NODES_MASK 0xffffffffffffffffUL union uvh_node_present_table_u { unsigned long v; + + /* UVH common struct */ struct uvh_node_present_table_s { unsigned long nodes:64; /* RW */ } s; + + /* UVXH common struct */ + struct uvxh_node_present_table_s { + unsigned long nodes:64; /* RW */ + } sx; + + /* UV4 unique struct */ + struct uv4h_node_present_table_s { + unsigned long nodes:64; /* RW */ + } s4; + + /* UV3 unique struct */ + struct uv3h_node_present_table_s { + unsigned long nodes:64; /* RW */ + } s3; + + /* UV2 unique struct */ + struct uv2h_node_present_table_s { + unsigned long nodes:64; /* RW */ + } s2; +}; + +/* ========================================================================= */ +/* UVH_RH10_GAM_ADDR_MAP_CONFIG */ +/* ========================================================================= */ +#define UVH_RH10_GAM_ADDR_MAP_CONFIG ( \ + is_uv(UV5) ? 0x470000UL : \ + 0) + + +/* UVYH common defines */ +#define UVYH_RH10_GAM_ADDR_MAP_CONFIG_N_SKT_SHFT 6 +#define UVYH_RH10_GAM_ADDR_MAP_CONFIG_N_SKT_MASK 0x00000000000001c0UL +#define UVYH_RH10_GAM_ADDR_MAP_CONFIG_LS_ENABLE_SHFT 12 +#define UVYH_RH10_GAM_ADDR_MAP_CONFIG_LS_ENABLE_MASK 0x0000000000001000UL +#define UVYH_RH10_GAM_ADDR_MAP_CONFIG_MK_TME_KEYID_BITS_SHFT 16 +#define UVYH_RH10_GAM_ADDR_MAP_CONFIG_MK_TME_KEYID_BITS_MASK 0x00000000000f0000UL + + +union uvh_rh10_gam_addr_map_config_u { + unsigned long v; + + /* UVH common struct */ + struct uvh_rh10_gam_addr_map_config_s { + unsigned long undef_0_5:6; /* Undefined */ + unsigned long n_skt:3; /* RW */ + unsigned long undef_9_11:3; /* Undefined */ + unsigned long ls_enable:1; /* RW */ + unsigned long undef_13_15:3; /* Undefined */ + unsigned long mk_tme_keyid_bits:4; /* RW */ + unsigned long rsvd_20_63:44; + } s; + + /* UVYH common struct */ + struct uvyh_rh10_gam_addr_map_config_s { + unsigned long undef_0_5:6; /* Undefined */ + unsigned long n_skt:3; /* RW */ + unsigned long undef_9_11:3; /* Undefined */ + unsigned long ls_enable:1; /* RW */ + unsigned long undef_13_15:3; /* Undefined */ + unsigned long mk_tme_keyid_bits:4; /* RW */ + unsigned long rsvd_20_63:44; + } sy; + + /* UV5 unique struct */ + struct uv5h_rh10_gam_addr_map_config_s { + unsigned long undef_0_5:6; /* Undefined */ + unsigned long n_skt:3; /* RW */ + unsigned long undef_9_11:3; /* Undefined */ + unsigned long ls_enable:1; /* RW */ + unsigned long undef_13_15:3; /* Undefined */ + unsigned long mk_tme_keyid_bits:4; /* RW */ + } s5; }; /* ========================================================================= */ -/* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR */ +/* UVH_RH10_GAM_GRU_OVERLAY_CONFIG */ /* ========================================================================= */ -#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR 0x16000c8UL -#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR 0x16000c8UL -#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR 0x4800c8UL -#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR ( \ - is_uv2_hub() ? UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR : \ - is_uv3_hub() ? UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR : \ - /*is_uv4_hub*/ UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR) - -#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_SHFT 24 -#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_SHFT 48 -#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_SHFT 63 -#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_MASK 0x00000000ff000000UL -#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_MASK 0x001f000000000000UL -#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_MASK 0x8000000000000000UL - -#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_SHFT 24 -#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_SHFT 48 -#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_SHFT 63 -#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_MASK 0x00000000ff000000UL -#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_MASK 0x001f000000000000UL -#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_MASK 0x8000000000000000UL - -#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_SHFT 24 -#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_SHFT 48 -#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_SHFT 63 -#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_MASK 0x00000000ff000000UL -#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_MASK 0x001f000000000000UL -#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_MASK 0x8000000000000000UL - -#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_SHFT 24 -#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_SHFT 48 -#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_SHFT 63 -#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_MASK 0x00000000ff000000UL -#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_MASK 0x001f000000000000UL -#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_MASK 0x8000000000000000UL - -#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_SHFT 24 -#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_SHFT 48 -#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_SHFT 63 -#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_MASK 0x00000000ff000000UL -#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_MASK 0x001f000000000000UL -#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_MASK 0x8000000000000000UL - - -union uvh_rh_gam_alias210_overlay_config_0_mmr_u { +#define UVH_RH10_GAM_GRU_OVERLAY_CONFIG ( \ + is_uv(UV5) ? 0x4700b0UL : \ + 0) + + +/* UVYH common defines */ +#define UVYH_RH10_GAM_GRU_OVERLAY_CONFIG_BASE_SHFT 25 +#define UVYH_RH10_GAM_GRU_OVERLAY_CONFIG_BASE_MASK 0x000ffffffe000000UL +#define UVYH_RH10_GAM_GRU_OVERLAY_CONFIG_N_GRU_SHFT 52 +#define UVYH_RH10_GAM_GRU_OVERLAY_CONFIG_N_GRU_MASK 0x0070000000000000UL +#define UVYH_RH10_GAM_GRU_OVERLAY_CONFIG_ENABLE_SHFT 63 +#define UVYH_RH10_GAM_GRU_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL + +#define UVH_RH10_GAM_GRU_OVERLAY_CONFIG_BASE_MASK ( \ + is_uv(UV5) ? 0x000ffffffe000000UL : \ + 0) +#define UVH_RH10_GAM_GRU_OVERLAY_CONFIG_BASE_SHFT ( \ + is_uv(UV5) ? 25 : \ + -1) + +union uvh_rh10_gam_gru_overlay_config_u { unsigned long v; - struct uvh_rh_gam_alias210_overlay_config_0_mmr_s { - unsigned long rsvd_0_23:24; - unsigned long base:8; /* RW */ - unsigned long rsvd_32_47:16; - unsigned long m_alias:5; /* RW */ - unsigned long rsvd_53_62:10; + + /* UVH common struct */ + struct uvh_rh10_gam_gru_overlay_config_s { + unsigned long undef_0_24:25; /* Undefined */ + unsigned long base:27; /* RW */ + unsigned long n_gru:3; /* RW */ + unsigned long undef_55_62:8; /* Undefined */ unsigned long enable:1; /* RW */ } s; - struct uvxh_rh_gam_alias210_overlay_config_0_mmr_s { - unsigned long rsvd_0_23:24; - unsigned long base:8; /* RW */ - unsigned long rsvd_32_47:16; - unsigned long m_alias:5; /* RW */ - unsigned long rsvd_53_62:10; + + /* UVYH common struct */ + struct uvyh_rh10_gam_gru_overlay_config_s { + unsigned long undef_0_24:25; /* Undefined */ + unsigned long base:27; /* RW */ + unsigned long n_gru:3; /* RW */ + unsigned long undef_55_62:8; /* Undefined */ unsigned long enable:1; /* RW */ - } sx; - struct uv2h_rh_gam_alias210_overlay_config_0_mmr_s { - unsigned long rsvd_0_23:24; - unsigned long base:8; /* RW */ - unsigned long rsvd_32_47:16; - unsigned long m_alias:5; /* RW */ - unsigned long rsvd_53_62:10; + } sy; + + /* UV5 unique struct */ + struct uv5h_rh10_gam_gru_overlay_config_s { + unsigned long undef_0_24:25; /* Undefined */ + unsigned long base:27; /* RW */ + unsigned long n_gru:3; /* RW */ + unsigned long undef_55_62:8; /* Undefined */ unsigned long enable:1; /* RW */ - } s2; - struct uv3h_rh_gam_alias210_overlay_config_0_mmr_s { - unsigned long rsvd_0_23:24; - unsigned long base:8; /* RW */ - unsigned long rsvd_32_47:16; - unsigned long m_alias:5; /* RW */ - unsigned long rsvd_53_62:10; + } s5; +}; + +/* ========================================================================= */ +/* UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG0 */ +/* ========================================================================= */ +#define UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG0 ( \ + is_uv(UV5) ? 0x473000UL : \ + 0) + + +/* UVYH common defines */ +#define UVYH_RH10_GAM_MMIOH_OVERLAY_CONFIG0_BASE_SHFT 26 +#define UVYH_RH10_GAM_MMIOH_OVERLAY_CONFIG0_BASE_MASK 0x000ffffffc000000UL +#define UVYH_RH10_GAM_MMIOH_OVERLAY_CONFIG0_M_IO_SHFT 52 +#define UVYH_RH10_GAM_MMIOH_OVERLAY_CONFIG0_M_IO_MASK 0x03f0000000000000UL +#define UVYH_RH10_GAM_MMIOH_OVERLAY_CONFIG0_ENABLE_SHFT 63 +#define UVYH_RH10_GAM_MMIOH_OVERLAY_CONFIG0_ENABLE_MASK 0x8000000000000000UL + +#define UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG0_BASE_MASK ( \ + is_uv(UV5) ? 0x000ffffffc000000UL : \ + 0) +#define UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG0_BASE_SHFT ( \ + is_uv(UV5) ? 26 : \ + -1) + +union uvh_rh10_gam_mmioh_overlay_config0_u { + unsigned long v; + + /* UVH common struct */ + struct uvh_rh10_gam_mmioh_overlay_config0_s { + unsigned long rsvd_0_25:26; + unsigned long base:26; /* RW */ + unsigned long m_io:6; /* RW */ + unsigned long n_io:4; + unsigned long undef_62:1; /* Undefined */ unsigned long enable:1; /* RW */ - } s3; - struct uv4h_rh_gam_alias210_overlay_config_0_mmr_s { - unsigned long rsvd_0_23:24; - unsigned long base:8; /* RW */ - unsigned long rsvd_32_47:16; - unsigned long m_alias:5; /* RW */ - unsigned long rsvd_53_62:10; + } s; + + /* UVYH common struct */ + struct uvyh_rh10_gam_mmioh_overlay_config0_s { + unsigned long rsvd_0_25:26; + unsigned long base:26; /* RW */ + unsigned long m_io:6; /* RW */ + unsigned long n_io:4; + unsigned long undef_62:1; /* Undefined */ unsigned long enable:1; /* RW */ + } sy; + + /* UV5 unique struct */ + struct uv5h_rh10_gam_mmioh_overlay_config0_s { + unsigned long rsvd_0_25:26; + unsigned long base:26; /* RW */ + unsigned long m_io:6; /* RW */ + unsigned long n_io:4; + unsigned long undef_62:1; /* Undefined */ + unsigned long enable:1; /* RW */ + } s5; +}; + +/* ========================================================================= */ +/* UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG1 */ +/* ========================================================================= */ +#define UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG1 ( \ + is_uv(UV5) ? 0x474000UL : \ + 0) + + +/* UVYH common defines */ +#define UVYH_RH10_GAM_MMIOH_OVERLAY_CONFIG1_BASE_SHFT 26 +#define UVYH_RH10_GAM_MMIOH_OVERLAY_CONFIG1_BASE_MASK 0x000ffffffc000000UL +#define UVYH_RH10_GAM_MMIOH_OVERLAY_CONFIG1_M_IO_SHFT 52 +#define UVYH_RH10_GAM_MMIOH_OVERLAY_CONFIG1_M_IO_MASK 0x03f0000000000000UL +#define UVYH_RH10_GAM_MMIOH_OVERLAY_CONFIG1_ENABLE_SHFT 63 +#define UVYH_RH10_GAM_MMIOH_OVERLAY_CONFIG1_ENABLE_MASK 0x8000000000000000UL + +#define UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG1_BASE_MASK ( \ + is_uv(UV5) ? 0x000ffffffc000000UL : \ + 0) +#define UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG1_BASE_SHFT ( \ + is_uv(UV5) ? 26 : \ + -1) + +union uvh_rh10_gam_mmioh_overlay_config1_u { + unsigned long v; + + /* UVH common struct */ + struct uvh_rh10_gam_mmioh_overlay_config1_s { + unsigned long rsvd_0_25:26; + unsigned long base:26; /* RW */ + unsigned long m_io:6; /* RW */ + unsigned long n_io:4; + unsigned long undef_62:1; /* Undefined */ + unsigned long enable:1; /* RW */ + } s; + + /* UVYH common struct */ + struct uvyh_rh10_gam_mmioh_overlay_config1_s { + unsigned long rsvd_0_25:26; + unsigned long base:26; /* RW */ + unsigned long m_io:6; /* RW */ + unsigned long n_io:4; + unsigned long undef_62:1; /* Undefined */ + unsigned long enable:1; /* RW */ + } sy; + + /* UV5 unique struct */ + struct uv5h_rh10_gam_mmioh_overlay_config1_s { + unsigned long rsvd_0_25:26; + unsigned long base:26; /* RW */ + unsigned long m_io:6; /* RW */ + unsigned long n_io:4; + unsigned long undef_62:1; /* Undefined */ + unsigned long enable:1; /* RW */ + } s5; +}; + +/* ========================================================================= */ +/* UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG0 */ +/* ========================================================================= */ +#define UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG0 ( \ + is_uv(UV5) ? 0x473800UL : \ + 0) + +#define UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG0_DEPTH ( \ + is_uv(UV5) ? 128 : \ + 0) + + +/* UVYH common defines */ +#define UVYH_RH10_GAM_MMIOH_REDIRECT_CONFIG0_NASID_SHFT 0 +#define UVYH_RH10_GAM_MMIOH_REDIRECT_CONFIG0_NASID_MASK 0x000000000000007fUL + + +union uvh_rh10_gam_mmioh_redirect_config0_u { + unsigned long v; + + /* UVH common struct */ + struct uvh_rh10_gam_mmioh_redirect_config0_s { + unsigned long nasid:7; /* RW */ + unsigned long rsvd_7_63:57; + } s; + + /* UVYH common struct */ + struct uvyh_rh10_gam_mmioh_redirect_config0_s { + unsigned long nasid:7; /* RW */ + unsigned long rsvd_7_63:57; + } sy; + + /* UV5 unique struct */ + struct uv5h_rh10_gam_mmioh_redirect_config0_s { + unsigned long nasid:7; /* RW */ + unsigned long rsvd_7_63:57; + } s5; +}; + +/* ========================================================================= */ +/* UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG1 */ +/* ========================================================================= */ +#define UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG1 ( \ + is_uv(UV5) ? 0x474800UL : \ + 0) + +#define UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG1_DEPTH ( \ + is_uv(UV5) ? 128 : \ + 0) + + +/* UVYH common defines */ +#define UVYH_RH10_GAM_MMIOH_REDIRECT_CONFIG1_NASID_SHFT 0 +#define UVYH_RH10_GAM_MMIOH_REDIRECT_CONFIG1_NASID_MASK 0x000000000000007fUL + + +union uvh_rh10_gam_mmioh_redirect_config1_u { + unsigned long v; + + /* UVH common struct */ + struct uvh_rh10_gam_mmioh_redirect_config1_s { + unsigned long nasid:7; /* RW */ + unsigned long rsvd_7_63:57; + } s; + + /* UVYH common struct */ + struct uvyh_rh10_gam_mmioh_redirect_config1_s { + unsigned long nasid:7; /* RW */ + unsigned long rsvd_7_63:57; + } sy; + + /* UV5 unique struct */ + struct uv5h_rh10_gam_mmioh_redirect_config1_s { + unsigned long nasid:7; /* RW */ + unsigned long rsvd_7_63:57; + } s5; +}; + +/* ========================================================================= */ +/* UVH_RH10_GAM_MMR_OVERLAY_CONFIG */ +/* ========================================================================= */ +#define UVH_RH10_GAM_MMR_OVERLAY_CONFIG ( \ + is_uv(UV5) ? 0x470090UL : \ + 0) + + +/* UVYH common defines */ +#define UVYH_RH10_GAM_MMR_OVERLAY_CONFIG_BASE_SHFT 25 +#define UVYH_RH10_GAM_MMR_OVERLAY_CONFIG_BASE_MASK 0x000ffffffe000000UL +#define UVYH_RH10_GAM_MMR_OVERLAY_CONFIG_ENABLE_SHFT 63 +#define UVYH_RH10_GAM_MMR_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL + +#define UVH_RH10_GAM_MMR_OVERLAY_CONFIG_BASE_MASK ( \ + is_uv(UV5) ? 0x000ffffffe000000UL : \ + 0) +#define UVH_RH10_GAM_MMR_OVERLAY_CONFIG_BASE_SHFT ( \ + is_uv(UV5) ? 25 : \ + -1) + +union uvh_rh10_gam_mmr_overlay_config_u { + unsigned long v; + + /* UVH common struct */ + struct uvh_rh10_gam_mmr_overlay_config_s { + unsigned long undef_0_24:25; /* Undefined */ + unsigned long base:27; /* RW */ + unsigned long undef_52_62:11; /* Undefined */ + unsigned long enable:1; /* RW */ + } s; + + /* UVYH common struct */ + struct uvyh_rh10_gam_mmr_overlay_config_s { + unsigned long undef_0_24:25; /* Undefined */ + unsigned long base:27; /* RW */ + unsigned long undef_52_62:11; /* Undefined */ + unsigned long enable:1; /* RW */ + } sy; + + /* UV5 unique struct */ + struct uv5h_rh10_gam_mmr_overlay_config_s { + unsigned long undef_0_24:25; /* Undefined */ + unsigned long base:27; /* RW */ + unsigned long undef_52_62:11; /* Undefined */ + unsigned long enable:1; /* RW */ + } s5; +}; + +/* ========================================================================= */ +/* UVH_RH_GAM_ADDR_MAP_CONFIG */ +/* ========================================================================= */ +#define UVH_RH_GAM_ADDR_MAP_CONFIG ( \ + is_uv(UV4) ? 0x480000UL : \ + is_uv(UV3) ? 0x1600000UL : \ + is_uv(UV2) ? 0x1600000UL : \ + 0) + + +/* UVXH common defines */ +#define UVXH_RH_GAM_ADDR_MAP_CONFIG_N_SKT_SHFT 6 +#define UVXH_RH_GAM_ADDR_MAP_CONFIG_N_SKT_MASK 0x00000000000003c0UL + +/* UV3 unique defines */ +#define UV3H_RH_GAM_ADDR_MAP_CONFIG_M_SKT_SHFT 0 +#define UV3H_RH_GAM_ADDR_MAP_CONFIG_M_SKT_MASK 0x000000000000003fUL + +/* UV2 unique defines */ +#define UV2H_RH_GAM_ADDR_MAP_CONFIG_M_SKT_SHFT 0 +#define UV2H_RH_GAM_ADDR_MAP_CONFIG_M_SKT_MASK 0x000000000000003fUL + + +union uvh_rh_gam_addr_map_config_u { + unsigned long v; + + /* UVH common struct */ + struct uvh_rh_gam_addr_map_config_s { + unsigned long rsvd_0_5:6; + unsigned long n_skt:4; /* RW */ + unsigned long rsvd_10_63:54; + } s; + + /* UVXH common struct */ + struct uvxh_rh_gam_addr_map_config_s { + unsigned long rsvd_0_5:6; + unsigned long n_skt:4; /* RW */ + unsigned long rsvd_10_63:54; + } sx; + + /* UV4 unique struct */ + struct uv4h_rh_gam_addr_map_config_s { + unsigned long rsvd_0_5:6; + unsigned long n_skt:4; /* RW */ + unsigned long rsvd_10_63:54; } s4; + + /* UV3 unique struct */ + struct uv3h_rh_gam_addr_map_config_s { + unsigned long m_skt:6; /* RW */ + unsigned long n_skt:4; /* RW */ + unsigned long rsvd_10_63:54; + } s3; + + /* UV2 unique struct */ + struct uv2h_rh_gam_addr_map_config_s { + unsigned long m_skt:6; /* RW */ + unsigned long n_skt:4; /* RW */ + unsigned long rsvd_10_63:54; + } s2; }; /* ========================================================================= */ -/* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR */ +/* UVH_RH_GAM_ALIAS_0_OVERLAY_CONFIG */ /* ========================================================================= */ -#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR 0x16000d8UL -#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR 0x16000d8UL -#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR 0x4800d8UL -#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR ( \ - is_uv2_hub() ? UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR : \ - is_uv3_hub() ? UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR : \ - /*is_uv4_hub*/ UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR) - -#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_SHFT 24 -#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_SHFT 48 -#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_SHFT 63 -#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_MASK 0x00000000ff000000UL -#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_MASK 0x001f000000000000UL -#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_MASK 0x8000000000000000UL - -#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_SHFT 24 -#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_SHFT 48 -#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_SHFT 63 -#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_MASK 0x00000000ff000000UL -#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_MASK 0x001f000000000000UL -#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_MASK 0x8000000000000000UL - -#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_SHFT 24 -#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_SHFT 48 -#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_SHFT 63 -#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_MASK 0x00000000ff000000UL -#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_MASK 0x001f000000000000UL -#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_MASK 0x8000000000000000UL - -#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_SHFT 24 -#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_SHFT 48 -#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_SHFT 63 -#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_MASK 0x00000000ff000000UL -#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_MASK 0x001f000000000000UL -#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_MASK 0x8000000000000000UL - -#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_SHFT 24 -#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_SHFT 48 -#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_SHFT 63 -#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_MASK 0x00000000ff000000UL -#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_MASK 0x001f000000000000UL -#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_MASK 0x8000000000000000UL - - -union uvh_rh_gam_alias210_overlay_config_1_mmr_u { +#define UVH_RH_GAM_ALIAS_0_OVERLAY_CONFIG ( \ + is_uv(UV4) ? 0x4800c8UL : \ + is_uv(UV3) ? 0x16000c8UL : \ + is_uv(UV2) ? 0x16000c8UL : \ + 0) + + +/* UVXH common defines */ +#define UVXH_RH_GAM_ALIAS_0_OVERLAY_CONFIG_BASE_SHFT 24 +#define UVXH_RH_GAM_ALIAS_0_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL +#define UVXH_RH_GAM_ALIAS_0_OVERLAY_CONFIG_M_ALIAS_SHFT 48 +#define UVXH_RH_GAM_ALIAS_0_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL +#define UVXH_RH_GAM_ALIAS_0_OVERLAY_CONFIG_ENABLE_SHFT 63 +#define UVXH_RH_GAM_ALIAS_0_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL + + +union uvh_rh_gam_alias_0_overlay_config_u { unsigned long v; - struct uvh_rh_gam_alias210_overlay_config_1_mmr_s { + + /* UVH common struct */ + struct uvh_rh_gam_alias_0_overlay_config_s { unsigned long rsvd_0_23:24; unsigned long base:8; /* RW */ unsigned long rsvd_32_47:16; @@ -2655,7 +3465,9 @@ union uvh_rh_gam_alias210_overlay_config_1_mmr_u { unsigned long rsvd_53_62:10; unsigned long enable:1; /* RW */ } s; - struct uvxh_rh_gam_alias210_overlay_config_1_mmr_s { + + /* UVXH common struct */ + struct uvxh_rh_gam_alias_0_overlay_config_s { unsigned long rsvd_0_23:24; unsigned long base:8; /* RW */ unsigned long rsvd_32_47:16; @@ -2663,15 +3475,19 @@ union uvh_rh_gam_alias210_overlay_config_1_mmr_u { unsigned long rsvd_53_62:10; unsigned long enable:1; /* RW */ } sx; - struct uv2h_rh_gam_alias210_overlay_config_1_mmr_s { + + /* UV4 unique struct */ + struct uv4h_rh_gam_alias_0_overlay_config_s { unsigned long rsvd_0_23:24; unsigned long base:8; /* RW */ unsigned long rsvd_32_47:16; unsigned long m_alias:5; /* RW */ unsigned long rsvd_53_62:10; unsigned long enable:1; /* RW */ - } s2; - struct uv3h_rh_gam_alias210_overlay_config_1_mmr_s { + } s4; + + /* UV3 unique struct */ + struct uv3h_rh_gam_alias_0_overlay_config_s { unsigned long rsvd_0_23:24; unsigned long base:8; /* RW */ unsigned long rsvd_32_47:16; @@ -2679,66 +3495,96 @@ union uvh_rh_gam_alias210_overlay_config_1_mmr_u { unsigned long rsvd_53_62:10; unsigned long enable:1; /* RW */ } s3; - struct uv4h_rh_gam_alias210_overlay_config_1_mmr_s { + + /* UV2 unique struct */ + struct uv2h_rh_gam_alias_0_overlay_config_s { unsigned long rsvd_0_23:24; unsigned long base:8; /* RW */ unsigned long rsvd_32_47:16; unsigned long m_alias:5; /* RW */ unsigned long rsvd_53_62:10; unsigned long enable:1; /* RW */ + } s2; +}; + +/* ========================================================================= */ +/* UVH_RH_GAM_ALIAS_0_REDIRECT_CONFIG */ +/* ========================================================================= */ +#define UVH_RH_GAM_ALIAS_0_REDIRECT_CONFIG ( \ + is_uv(UV4) ? 0x4800d0UL : \ + is_uv(UV3) ? 0x16000d0UL : \ + is_uv(UV2) ? 0x16000d0UL : \ + 0) + + +/* UVXH common defines */ +#define UVXH_RH_GAM_ALIAS_0_REDIRECT_CONFIG_DEST_BASE_SHFT 24 +#define UVXH_RH_GAM_ALIAS_0_REDIRECT_CONFIG_DEST_BASE_MASK 0x00003fffff000000UL + + +union uvh_rh_gam_alias_0_redirect_config_u { + unsigned long v; + + /* UVH common struct */ + struct uvh_rh_gam_alias_0_redirect_config_s { + unsigned long rsvd_0_23:24; + unsigned long dest_base:22; /* RW */ + unsigned long rsvd_46_63:18; + } s; + + /* UVXH common struct */ + struct uvxh_rh_gam_alias_0_redirect_config_s { + unsigned long rsvd_0_23:24; + unsigned long dest_base:22; /* RW */ + unsigned long rsvd_46_63:18; + } sx; + + /* UV4 unique struct */ + struct uv4h_rh_gam_alias_0_redirect_config_s { + unsigned long rsvd_0_23:24; + unsigned long dest_base:22; /* RW */ + unsigned long rsvd_46_63:18; } s4; + + /* UV3 unique struct */ + struct uv3h_rh_gam_alias_0_redirect_config_s { + unsigned long rsvd_0_23:24; + unsigned long dest_base:22; /* RW */ + unsigned long rsvd_46_63:18; + } s3; + + /* UV2 unique struct */ + struct uv2h_rh_gam_alias_0_redirect_config_s { + unsigned long rsvd_0_23:24; + unsigned long dest_base:22; /* RW */ + unsigned long rsvd_46_63:18; + } s2; }; /* ========================================================================= */ -/* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR */ +/* UVH_RH_GAM_ALIAS_1_OVERLAY_CONFIG */ /* ========================================================================= */ -#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR 0x16000e8UL -#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR 0x16000e8UL -#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR 0x4800e8UL -#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR ( \ - is_uv2_hub() ? UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR : \ - is_uv3_hub() ? UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR : \ - /*is_uv4_hub*/ UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR) - -#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_SHFT 24 -#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_SHFT 48 -#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_SHFT 63 -#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_MASK 0x00000000ff000000UL -#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_MASK 0x001f000000000000UL -#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_MASK 0x8000000000000000UL - -#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_SHFT 24 -#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_SHFT 48 -#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_SHFT 63 -#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_MASK 0x00000000ff000000UL -#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_MASK 0x001f000000000000UL -#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_MASK 0x8000000000000000UL - -#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_SHFT 24 -#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_SHFT 48 -#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_SHFT 63 -#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_MASK 0x00000000ff000000UL -#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_MASK 0x001f000000000000UL -#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_MASK 0x8000000000000000UL - -#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_SHFT 24 -#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_SHFT 48 -#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_SHFT 63 -#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_MASK 0x00000000ff000000UL -#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_MASK 0x001f000000000000UL -#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_MASK 0x8000000000000000UL - -#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_SHFT 24 -#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_SHFT 48 -#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_SHFT 63 -#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_MASK 0x00000000ff000000UL -#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_MASK 0x001f000000000000UL -#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_MASK 0x8000000000000000UL - - -union uvh_rh_gam_alias210_overlay_config_2_mmr_u { +#define UVH_RH_GAM_ALIAS_1_OVERLAY_CONFIG ( \ + is_uv(UV4) ? 0x4800d8UL : \ + is_uv(UV3) ? 0x16000d8UL : \ + is_uv(UV2) ? 0x16000d8UL : \ + 0) + + +/* UVXH common defines */ +#define UVXH_RH_GAM_ALIAS_1_OVERLAY_CONFIG_BASE_SHFT 24 +#define UVXH_RH_GAM_ALIAS_1_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL +#define UVXH_RH_GAM_ALIAS_1_OVERLAY_CONFIG_M_ALIAS_SHFT 48 +#define UVXH_RH_GAM_ALIAS_1_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL +#define UVXH_RH_GAM_ALIAS_1_OVERLAY_CONFIG_ENABLE_SHFT 63 +#define UVXH_RH_GAM_ALIAS_1_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL + + +union uvh_rh_gam_alias_1_overlay_config_u { unsigned long v; - struct uvh_rh_gam_alias210_overlay_config_2_mmr_s { + + /* UVH common struct */ + struct uvh_rh_gam_alias_1_overlay_config_s { unsigned long rsvd_0_23:24; unsigned long base:8; /* RW */ unsigned long rsvd_32_47:16; @@ -2746,7 +3592,9 @@ union uvh_rh_gam_alias210_overlay_config_2_mmr_u { unsigned long rsvd_53_62:10; unsigned long enable:1; /* RW */ } s; - struct uvxh_rh_gam_alias210_overlay_config_2_mmr_s { + + /* UVXH common struct */ + struct uvxh_rh_gam_alias_1_overlay_config_s { unsigned long rsvd_0_23:24; unsigned long base:8; /* RW */ unsigned long rsvd_32_47:16; @@ -2754,15 +3602,19 @@ union uvh_rh_gam_alias210_overlay_config_2_mmr_u { unsigned long rsvd_53_62:10; unsigned long enable:1; /* RW */ } sx; - struct uv2h_rh_gam_alias210_overlay_config_2_mmr_s { + + /* UV4 unique struct */ + struct uv4h_rh_gam_alias_1_overlay_config_s { unsigned long rsvd_0_23:24; unsigned long base:8; /* RW */ unsigned long rsvd_32_47:16; unsigned long m_alias:5; /* RW */ unsigned long rsvd_53_62:10; unsigned long enable:1; /* RW */ - } s2; - struct uv3h_rh_gam_alias210_overlay_config_2_mmr_s { + } s4; + + /* UV3 unique struct */ + struct uv3h_rh_gam_alias_1_overlay_config_s { unsigned long rsvd_0_23:24; unsigned long base:8; /* RW */ unsigned long rsvd_32_47:16; @@ -2770,321 +3622,289 @@ union uvh_rh_gam_alias210_overlay_config_2_mmr_u { unsigned long rsvd_53_62:10; unsigned long enable:1; /* RW */ } s3; - struct uv4h_rh_gam_alias210_overlay_config_2_mmr_s { + + /* UV2 unique struct */ + struct uv2h_rh_gam_alias_1_overlay_config_s { unsigned long rsvd_0_23:24; unsigned long base:8; /* RW */ unsigned long rsvd_32_47:16; unsigned long m_alias:5; /* RW */ unsigned long rsvd_53_62:10; unsigned long enable:1; /* RW */ - } s4; + } s2; }; /* ========================================================================= */ -/* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR */ +/* UVH_RH_GAM_ALIAS_1_REDIRECT_CONFIG */ /* ========================================================================= */ -#define UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x16000d0UL -#define UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x16000d0UL -#define UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x4800d0UL -#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR ( \ - is_uv2_hub() ? UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR : \ - is_uv3_hub() ? UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR : \ - /*is_uv4_hub*/ UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR) - -#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT 24 -#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_MASK 0x00003fffff000000UL - -#define UVXH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT 24 -#define UVXH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_MASK 0x00003fffff000000UL - -#define UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT 24 -#define UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_MASK 0x00003fffff000000UL +#define UVH_RH_GAM_ALIAS_1_REDIRECT_CONFIG ( \ + is_uv(UV4) ? 0x4800e0UL : \ + is_uv(UV3) ? 0x16000e0UL : \ + is_uv(UV2) ? 0x16000e0UL : \ + 0) -#define UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT 24 -#define UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_MASK 0x00003fffff000000UL -#define UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT 24 -#define UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_MASK 0x00003fffff000000UL +/* UVXH common defines */ +#define UVXH_RH_GAM_ALIAS_1_REDIRECT_CONFIG_DEST_BASE_SHFT 24 +#define UVXH_RH_GAM_ALIAS_1_REDIRECT_CONFIG_DEST_BASE_MASK 0x00003fffff000000UL -union uvh_rh_gam_alias210_redirect_config_0_mmr_u { +union uvh_rh_gam_alias_1_redirect_config_u { unsigned long v; - struct uvh_rh_gam_alias210_redirect_config_0_mmr_s { + + /* UVH common struct */ + struct uvh_rh_gam_alias_1_redirect_config_s { unsigned long rsvd_0_23:24; unsigned long dest_base:22; /* RW */ unsigned long rsvd_46_63:18; } s; - struct uvxh_rh_gam_alias210_redirect_config_0_mmr_s { + + /* UVXH common struct */ + struct uvxh_rh_gam_alias_1_redirect_config_s { unsigned long rsvd_0_23:24; unsigned long dest_base:22; /* RW */ unsigned long rsvd_46_63:18; } sx; - struct uv2h_rh_gam_alias210_redirect_config_0_mmr_s { + + /* UV4 unique struct */ + struct uv4h_rh_gam_alias_1_redirect_config_s { unsigned long rsvd_0_23:24; unsigned long dest_base:22; /* RW */ unsigned long rsvd_46_63:18; - } s2; - struct uv3h_rh_gam_alias210_redirect_config_0_mmr_s { + } s4; + + /* UV3 unique struct */ + struct uv3h_rh_gam_alias_1_redirect_config_s { unsigned long rsvd_0_23:24; unsigned long dest_base:22; /* RW */ unsigned long rsvd_46_63:18; } s3; - struct uv4h_rh_gam_alias210_redirect_config_0_mmr_s { + + /* UV2 unique struct */ + struct uv2h_rh_gam_alias_1_redirect_config_s { unsigned long rsvd_0_23:24; unsigned long dest_base:22; /* RW */ unsigned long rsvd_46_63:18; - } s4; + } s2; }; /* ========================================================================= */ -/* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR */ +/* UVH_RH_GAM_ALIAS_2_OVERLAY_CONFIG */ /* ========================================================================= */ -#define UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR 0x16000e0UL -#define UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR 0x16000e0UL -#define UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR 0x4800e0UL -#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR ( \ - is_uv2_hub() ? UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR : \ - is_uv3_hub() ? UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR : \ - /*is_uv4_hub*/ UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR) - -#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_SHFT 24 -#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_MASK 0x00003fffff000000UL - -#define UVXH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_SHFT 24 -#define UVXH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_MASK 0x00003fffff000000UL +#define UVH_RH_GAM_ALIAS_2_OVERLAY_CONFIG ( \ + is_uv(UV4) ? 0x4800e8UL : \ + is_uv(UV3) ? 0x16000e8UL : \ + is_uv(UV2) ? 0x16000e8UL : \ + 0) -#define UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_SHFT 24 -#define UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_MASK 0x00003fffff000000UL -#define UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_SHFT 24 -#define UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_MASK 0x00003fffff000000UL +/* UVXH common defines */ +#define UVXH_RH_GAM_ALIAS_2_OVERLAY_CONFIG_BASE_SHFT 24 +#define UVXH_RH_GAM_ALIAS_2_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL +#define UVXH_RH_GAM_ALIAS_2_OVERLAY_CONFIG_M_ALIAS_SHFT 48 +#define UVXH_RH_GAM_ALIAS_2_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL +#define UVXH_RH_GAM_ALIAS_2_OVERLAY_CONFIG_ENABLE_SHFT 63 +#define UVXH_RH_GAM_ALIAS_2_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL -#define UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_SHFT 24 -#define UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_MASK 0x00003fffff000000UL - -union uvh_rh_gam_alias210_redirect_config_1_mmr_u { +union uvh_rh_gam_alias_2_overlay_config_u { unsigned long v; - struct uvh_rh_gam_alias210_redirect_config_1_mmr_s { + + /* UVH common struct */ + struct uvh_rh_gam_alias_2_overlay_config_s { unsigned long rsvd_0_23:24; - unsigned long dest_base:22; /* RW */ - unsigned long rsvd_46_63:18; + unsigned long base:8; /* RW */ + unsigned long rsvd_32_47:16; + unsigned long m_alias:5; /* RW */ + unsigned long rsvd_53_62:10; + unsigned long enable:1; /* RW */ } s; - struct uvxh_rh_gam_alias210_redirect_config_1_mmr_s { + + /* UVXH common struct */ + struct uvxh_rh_gam_alias_2_overlay_config_s { unsigned long rsvd_0_23:24; - unsigned long dest_base:22; /* RW */ - unsigned long rsvd_46_63:18; + unsigned long base:8; /* RW */ + unsigned long rsvd_32_47:16; + unsigned long m_alias:5; /* RW */ + unsigned long rsvd_53_62:10; + unsigned long enable:1; /* RW */ } sx; - struct uv2h_rh_gam_alias210_redirect_config_1_mmr_s { + + /* UV4 unique struct */ + struct uv4h_rh_gam_alias_2_overlay_config_s { unsigned long rsvd_0_23:24; - unsigned long dest_base:22; /* RW */ - unsigned long rsvd_46_63:18; - } s2; - struct uv3h_rh_gam_alias210_redirect_config_1_mmr_s { + unsigned long base:8; /* RW */ + unsigned long rsvd_32_47:16; + unsigned long m_alias:5; /* RW */ + unsigned long rsvd_53_62:10; + unsigned long enable:1; /* RW */ + } s4; + + /* UV3 unique struct */ + struct uv3h_rh_gam_alias_2_overlay_config_s { unsigned long rsvd_0_23:24; - unsigned long dest_base:22; /* RW */ - unsigned long rsvd_46_63:18; + unsigned long base:8; /* RW */ + unsigned long rsvd_32_47:16; + unsigned long m_alias:5; /* RW */ + unsigned long rsvd_53_62:10; + unsigned long enable:1; /* RW */ } s3; - struct uv4h_rh_gam_alias210_redirect_config_1_mmr_s { + + /* UV2 unique struct */ + struct uv2h_rh_gam_alias_2_overlay_config_s { unsigned long rsvd_0_23:24; - unsigned long dest_base:22; /* RW */ - unsigned long rsvd_46_63:18; - } s4; + unsigned long base:8; /* RW */ + unsigned long rsvd_32_47:16; + unsigned long m_alias:5; /* RW */ + unsigned long rsvd_53_62:10; + unsigned long enable:1; /* RW */ + } s2; }; /* ========================================================================= */ -/* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR */ +/* UVH_RH_GAM_ALIAS_2_REDIRECT_CONFIG */ /* ========================================================================= */ -#define UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR 0x16000f0UL -#define UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR 0x16000f0UL -#define UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR 0x4800f0UL -#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR ( \ - is_uv2_hub() ? UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR : \ - is_uv3_hub() ? UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR : \ - /*is_uv4_hub*/ UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR) - -#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_SHFT 24 -#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_MASK 0x00003fffff000000UL - -#define UVXH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_SHFT 24 -#define UVXH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_MASK 0x00003fffff000000UL - -#define UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_SHFT 24 -#define UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_MASK 0x00003fffff000000UL +#define UVH_RH_GAM_ALIAS_2_REDIRECT_CONFIG ( \ + is_uv(UV4) ? 0x4800f0UL : \ + is_uv(UV3) ? 0x16000f0UL : \ + is_uv(UV2) ? 0x16000f0UL : \ + 0) -#define UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_SHFT 24 -#define UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_MASK 0x00003fffff000000UL -#define UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_SHFT 24 -#define UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_MASK 0x00003fffff000000UL +/* UVXH common defines */ +#define UVXH_RH_GAM_ALIAS_2_REDIRECT_CONFIG_DEST_BASE_SHFT 24 +#define UVXH_RH_GAM_ALIAS_2_REDIRECT_CONFIG_DEST_BASE_MASK 0x00003fffff000000UL -union uvh_rh_gam_alias210_redirect_config_2_mmr_u { +union uvh_rh_gam_alias_2_redirect_config_u { unsigned long v; - struct uvh_rh_gam_alias210_redirect_config_2_mmr_s { + + /* UVH common struct */ + struct uvh_rh_gam_alias_2_redirect_config_s { unsigned long rsvd_0_23:24; unsigned long dest_base:22; /* RW */ unsigned long rsvd_46_63:18; } s; - struct uvxh_rh_gam_alias210_redirect_config_2_mmr_s { + + /* UVXH common struct */ + struct uvxh_rh_gam_alias_2_redirect_config_s { unsigned long rsvd_0_23:24; unsigned long dest_base:22; /* RW */ unsigned long rsvd_46_63:18; } sx; - struct uv2h_rh_gam_alias210_redirect_config_2_mmr_s { + + /* UV4 unique struct */ + struct uv4h_rh_gam_alias_2_redirect_config_s { unsigned long rsvd_0_23:24; unsigned long dest_base:22; /* RW */ unsigned long rsvd_46_63:18; - } s2; - struct uv3h_rh_gam_alias210_redirect_config_2_mmr_s { + } s4; + + /* UV3 unique struct */ + struct uv3h_rh_gam_alias_2_redirect_config_s { unsigned long rsvd_0_23:24; unsigned long dest_base:22; /* RW */ unsigned long rsvd_46_63:18; } s3; - struct uv4h_rh_gam_alias210_redirect_config_2_mmr_s { + + /* UV2 unique struct */ + struct uv2h_rh_gam_alias_2_redirect_config_s { unsigned long rsvd_0_23:24; unsigned long dest_base:22; /* RW */ unsigned long rsvd_46_63:18; - } s4; -}; - -/* ========================================================================= */ -/* UVH_RH_GAM_CONFIG_MMR */ -/* ========================================================================= */ -#define UV2H_RH_GAM_CONFIG_MMR 0x1600000UL -#define UV3H_RH_GAM_CONFIG_MMR 0x1600000UL -#define UV4H_RH_GAM_CONFIG_MMR 0x480000UL -#define UVH_RH_GAM_CONFIG_MMR ( \ - is_uv2_hub() ? UV2H_RH_GAM_CONFIG_MMR : \ - is_uv3_hub() ? UV3H_RH_GAM_CONFIG_MMR : \ - /*is_uv4_hub*/ UV4H_RH_GAM_CONFIG_MMR) - -#define UVH_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6 -#define UVH_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL - -#define UVXH_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6 -#define UVXH_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL - -#define UV2H_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0 -#define UV2H_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6 -#define UV2H_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL -#define UV2H_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL - -#define UV3H_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0 -#define UV3H_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6 -#define UV3H_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL -#define UV3H_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL - -#define UV4H_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6 -#define UV4H_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL - - -union uvh_rh_gam_config_mmr_u { - unsigned long v; - struct uvh_rh_gam_config_mmr_s { - unsigned long rsvd_0_5:6; - unsigned long n_skt:4; /* RW */ - unsigned long rsvd_10_63:54; - } s; - struct uvxh_rh_gam_config_mmr_s { - unsigned long rsvd_0_5:6; - unsigned long n_skt:4; /* RW */ - unsigned long rsvd_10_63:54; - } sx; - struct uv2h_rh_gam_config_mmr_s { - unsigned long m_skt:6; /* RW */ - unsigned long n_skt:4; /* RW */ - unsigned long rsvd_10_63:54; } s2; - struct uv3h_rh_gam_config_mmr_s { - unsigned long m_skt:6; /* RW */ - unsigned long n_skt:4; /* RW */ - unsigned long rsvd_10_63:54; - } s3; - struct uv4h_rh_gam_config_mmr_s { - unsigned long rsvd_0_5:6; - unsigned long n_skt:4; /* RW */ - unsigned long rsvd_10_63:54; - } s4; }; /* ========================================================================= */ -/* UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR */ +/* UVH_RH_GAM_GRU_OVERLAY_CONFIG */ /* ========================================================================= */ -#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL -#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL -#define UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x480010UL -#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR ( \ - is_uv2_hub() ? UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR : \ - is_uv3_hub() ? UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR : \ - /*is_uv4_hub*/ UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR) - -#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52 -#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 -#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL -#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL - -#define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52 -#define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 -#define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL -#define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL - -#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28 -#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52 -#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 -#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL -#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL -#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL - -#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28 -#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52 -#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_MODE_SHFT 62 -#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 -#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL -#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL -#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_MODE_MASK 0x4000000000000000UL -#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL - -#define UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 26 -#define UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52 -#define UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 -#define UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL -#define UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL -#define UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL - -#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK ( \ - is_uv2_hub() ? UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK : \ - is_uv3_hub() ? UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK : \ - /*is_uv4_hub*/ UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK) -#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT ( \ - is_uv2_hub() ? UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT : \ - is_uv3_hub() ? UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT : \ - /*is_uv4_hub*/ UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT) - -union uvh_rh_gam_gru_overlay_config_mmr_u { +#define UVH_RH_GAM_GRU_OVERLAY_CONFIG ( \ + is_uv(UV4) ? 0x480010UL : \ + is_uv(UV3) ? 0x1600010UL : \ + is_uv(UV2) ? 0x1600010UL : \ + 0) + + +/* UVXH common defines */ +#define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_N_GRU_SHFT 52 +#define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_N_GRU_MASK 0x00f0000000000000UL +#define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_ENABLE_SHFT 63 +#define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL + +/* UV4A unique defines */ +#define UV4AH_RH_GAM_GRU_OVERLAY_CONFIG_BASE_SHFT 26 +#define UV4AH_RH_GAM_GRU_OVERLAY_CONFIG_BASE_MASK 0x000ffffffc000000UL + +/* UV4 unique defines */ +#define UV4H_RH_GAM_GRU_OVERLAY_CONFIG_BASE_SHFT 26 +#define UV4H_RH_GAM_GRU_OVERLAY_CONFIG_BASE_MASK 0x00003ffffc000000UL + +/* UV3 unique defines */ +#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_BASE_SHFT 28 +#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_BASE_MASK 0x00003ffff0000000UL +#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MODE_SHFT 62 +#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MODE_MASK 0x4000000000000000UL + +/* UV2 unique defines */ +#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_BASE_SHFT 28 +#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_BASE_MASK 0x00003ffff0000000UL + +#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_BASE_MASK ( \ + is_uv(UV4A) ? 0x000ffffffc000000UL : \ + is_uv(UV4) ? 0x00003ffffc000000UL : \ + is_uv(UV3) ? 0x00003ffff0000000UL : \ + is_uv(UV2) ? 0x00003ffff0000000UL : \ + 0) +#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_BASE_SHFT ( \ + is_uv(UV4) ? 26 : \ + is_uv(UV3) ? 28 : \ + is_uv(UV2) ? 28 : \ + -1) + +union uvh_rh_gam_gru_overlay_config_u { unsigned long v; - struct uvh_rh_gam_gru_overlay_config_mmr_s { - unsigned long rsvd_0_51:52; + + /* UVH common struct */ + struct uvh_rh_gam_gru_overlay_config_s { + unsigned long rsvd_0_45:46; + unsigned long rsvd_46_51:6; unsigned long n_gru:4; /* RW */ unsigned long rsvd_56_62:7; unsigned long enable:1; /* RW */ } s; - struct uvxh_rh_gam_gru_overlay_config_mmr_s { + + /* UVXH common struct */ + struct uvxh_rh_gam_gru_overlay_config_s { unsigned long rsvd_0_45:46; unsigned long rsvd_46_51:6; unsigned long n_gru:4; /* RW */ unsigned long rsvd_56_62:7; unsigned long enable:1; /* RW */ } sx; - struct uv2h_rh_gam_gru_overlay_config_mmr_s { - unsigned long rsvd_0_27:28; - unsigned long base:18; /* RW */ + + /* UV4A unique struct */ + struct uv4ah_rh_gam_gru_overlay_config_s { + unsigned long rsvd_0_24:25; + unsigned long undef_25:1; /* Undefined */ + unsigned long base:26; /* RW */ + unsigned long n_gru:4; /* RW */ + unsigned long rsvd_56_62:7; + unsigned long enable:1; /* RW */ + } s4a; + + /* UV4 unique struct */ + struct uv4h_rh_gam_gru_overlay_config_s { + unsigned long rsvd_0_24:25; + unsigned long undef_25:1; /* Undefined */ + unsigned long base:20; /* RW */ unsigned long rsvd_46_51:6; unsigned long n_gru:4; /* RW */ unsigned long rsvd_56_62:7; unsigned long enable:1; /* RW */ - } s2; - struct uv3h_rh_gam_gru_overlay_config_mmr_s { + } s4; + + /* UV3 unique struct */ + struct uv3h_rh_gam_gru_overlay_config_s { unsigned long rsvd_0_27:28; unsigned long base:18; /* RW */ unsigned long rsvd_46_51:6; @@ -3093,86 +3913,141 @@ union uvh_rh_gam_gru_overlay_config_mmr_u { unsigned long mode:1; /* RW */ unsigned long enable:1; /* RW */ } s3; - struct uv4h_rh_gam_gru_overlay_config_mmr_s { - unsigned long rsvd_0_24:25; - unsigned long undef_25:1; /* Undefined */ - unsigned long base:20; /* RW */ + + /* UV2 unique struct */ + struct uv2h_rh_gam_gru_overlay_config_s { + unsigned long rsvd_0_27:28; + unsigned long base:18; /* RW */ unsigned long rsvd_46_51:6; unsigned long n_gru:4; /* RW */ unsigned long rsvd_56_62:7; unsigned long enable:1; /* RW */ - } s4; + } s2; }; /* ========================================================================= */ -/* UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR */ +/* UVH_RH_GAM_MMIOH_OVERLAY_CONFIG */ /* ========================================================================= */ -#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR uv_undefined("UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR") -#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR 0x1603000UL -#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR 0x483000UL -#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR ( \ - is_uv2_hub() ? UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR : \ - is_uv3_hub() ? UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR : \ - /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR) - - -#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_SHFT 26 -#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_SHFT 46 -#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_ENABLE_SHFT 63 -#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_MASK 0x00003ffffc000000UL -#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_MASK 0x000fc00000000000UL -#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_ENABLE_MASK 0x8000000000000000UL - -#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_SHFT 26 -#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_SHFT 46 -#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_ENABLE_SHFT 63 -#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_MASK 0x00003ffffc000000UL -#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_MASK 0x000fc00000000000UL -#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_ENABLE_MASK 0x8000000000000000UL - -#define UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_SHFT 52 -#define UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_MASK 0x000ffffffc000000UL -#define UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_MASK 0x03f0000000000000UL -#define UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_ENABLE_MASK 0x8000000000000000UL - -#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_SHFT ( \ - is_uv3_hub() ? UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_SHFT : \ - is_uv4a_hub() ? UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_SHFT : \ - /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_SHFT) - -#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_MASK ( \ - is_uv3_hub() ? UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_MASK : \ - is_uv4a_hub() ? UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_MASK : \ - /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_MASK) - -#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_MASK ( \ - is_uv3_hub() ? UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_MASK : \ - is_uv4a_hub() ? UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_MASK : \ - /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_MASK) - -#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_ENABLE_MASK ( \ - is_uv3_hub() ? UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_ENABLE_MASK : \ - is_uv4a_hub() ? UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_ENABLE_MASK : \ - /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_ENABLE_MASK) - -union uvh_rh_gam_mmioh_overlay_config0_mmr_u { +#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG ( \ + is_uv(UV2) ? 0x1600030UL : \ + 0) + + + +/* UV2 unique defines */ +#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_BASE_SHFT 27 +#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_BASE_MASK 0x00003ffff8000000UL +#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_M_IO_SHFT 46 +#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_M_IO_MASK 0x000fc00000000000UL +#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_N_IO_SHFT 52 +#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_N_IO_MASK 0x00f0000000000000UL +#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_ENABLE_SHFT 63 +#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL + +#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_BASE_SHFT ( \ + is_uv(UV2) ? 27 : \ + uv_undefined("UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_BASE_SHFT")) + +union uvh_rh_gam_mmioh_overlay_config_u { unsigned long v; - struct uv3h_rh_gam_mmioh_overlay_config0_mmr_s { + + /* UVH common struct */ + struct uvh_rh_gam_mmioh_overlay_config_s { + unsigned long rsvd_0_26:27; + unsigned long base:19; /* RW */ + unsigned long m_io:6; /* RW */ + unsigned long n_io:4; /* RW */ + unsigned long rsvd_56_62:7; + unsigned long enable:1; /* RW */ + } s; + + /* UVXH common struct */ + struct uvxh_rh_gam_mmioh_overlay_config_s { + unsigned long rsvd_0_26:27; + unsigned long base:19; /* RW */ + unsigned long m_io:6; /* RW */ + unsigned long n_io:4; /* RW */ + unsigned long rsvd_56_62:7; + unsigned long enable:1; /* RW */ + } sx; + + /* UV2 unique struct */ + struct uv2h_rh_gam_mmioh_overlay_config_s { + unsigned long rsvd_0_26:27; + unsigned long base:19; /* RW */ + unsigned long m_io:6; /* RW */ + unsigned long n_io:4; /* RW */ + unsigned long rsvd_56_62:7; + unsigned long enable:1; /* RW */ + } s2; +}; + +/* ========================================================================= */ +/* UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0 */ +/* ========================================================================= */ +#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0 ( \ + is_uv(UV4) ? 0x483000UL : \ + is_uv(UV3) ? 0x1603000UL : \ + 0) + +/* UV4A unique defines */ +#define UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG0_BASE_SHFT 26 +#define UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG0_BASE_MASK 0x000ffffffc000000UL +#define UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG0_M_IO_SHFT 52 +#define UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG0_M_IO_MASK 0x03f0000000000000UL +#define UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG0_ENABLE_SHFT 63 +#define UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG0_ENABLE_MASK 0x8000000000000000UL + +/* UV4 unique defines */ +#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_BASE_SHFT 26 +#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_BASE_MASK 0x00003ffffc000000UL +#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_M_IO_SHFT 46 +#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_M_IO_MASK 0x000fc00000000000UL +#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_ENABLE_SHFT 63 +#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_ENABLE_MASK 0x8000000000000000UL + +/* UV3 unique defines */ +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_BASE_SHFT 26 +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_BASE_MASK 0x00003ffffc000000UL +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_M_IO_SHFT 46 +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_M_IO_MASK 0x000fc00000000000UL +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_ENABLE_SHFT 63 +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_ENABLE_MASK 0x8000000000000000UL + +#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_BASE_MASK ( \ + is_uv(UV4A) ? 0x000ffffffc000000UL : \ + is_uv(UV4) ? 0x00003ffffc000000UL : \ + is_uv(UV3) ? 0x00003ffffc000000UL : \ + 0) +#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_BASE_SHFT ( \ + is_uv(UV4) ? 26 : \ + is_uv(UV3) ? 26 : \ + -1) + +union uvh_rh_gam_mmioh_overlay_config0_u { + unsigned long v; + + /* UVH common struct */ + struct uvh_rh_gam_mmioh_overlay_config0_s { unsigned long rsvd_0_25:26; unsigned long base:20; /* RW */ unsigned long m_io:6; /* RW */ unsigned long n_io:4; unsigned long rsvd_56_62:7; unsigned long enable:1; /* RW */ - } s3; - struct uv4h_rh_gam_mmioh_overlay_config0_mmr_s { + } s; + + /* UVXH common struct */ + struct uvxh_rh_gam_mmioh_overlay_config0_s { unsigned long rsvd_0_25:26; unsigned long base:20; /* RW */ unsigned long m_io:6; /* RW */ unsigned long n_io:4; unsigned long rsvd_56_62:7; unsigned long enable:1; /* RW */ - } s4; + } sx; + + /* UV4A unique struct */ struct uv4ah_rh_gam_mmioh_overlay_config0_mmr_s { unsigned long rsvd_0_25:26; unsigned long base:26; /* RW */ @@ -3181,71 +4056,94 @@ union uvh_rh_gam_mmioh_overlay_config0_mmr_u { unsigned long undef_62:1; /* Undefined */ unsigned long enable:1; /* RW */ } s4a; + + /* UV4 unique struct */ + struct uv4h_rh_gam_mmioh_overlay_config0_s { + unsigned long rsvd_0_25:26; + unsigned long base:20; /* RW */ + unsigned long m_io:6; /* RW */ + unsigned long n_io:4; + unsigned long rsvd_56_62:7; + unsigned long enable:1; /* RW */ + } s4; + + /* UV3 unique struct */ + struct uv3h_rh_gam_mmioh_overlay_config0_s { + unsigned long rsvd_0_25:26; + unsigned long base:20; /* RW */ + unsigned long m_io:6; /* RW */ + unsigned long n_io:4; + unsigned long rsvd_56_62:7; + unsigned long enable:1; /* RW */ + } s3; }; /* ========================================================================= */ -/* UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR */ +/* UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1 */ /* ========================================================================= */ -#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR uv_undefined("UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR") -#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR 0x1603000UL -#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR 0x484000UL -#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR ( \ - is_uv2_hub() ? UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR : \ - is_uv3_hub() ? UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR : \ - /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR) - - -#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_BASE_SHFT 26 -#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_SHFT 46 -#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_ENABLE_SHFT 63 -#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_BASE_MASK 0x00003ffffc000000UL -#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_MASK 0x000fc00000000000UL -#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_ENABLE_MASK 0x8000000000000000UL - -#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_BASE_SHFT 26 -#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_SHFT 46 -#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_ENABLE_SHFT 63 -#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_BASE_MASK 0x00003ffffc000000UL -#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_MASK 0x000fc00000000000UL -#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_ENABLE_MASK 0x8000000000000000UL - -#define UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_SHFT 52 -#define UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_BASE_MASK 0x000ffffffc000000UL -#define UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_MASK 0x03f0000000000000UL - -#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_SHFT ( \ - is_uv3_hub() ? UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_SHFT : \ - is_uv4a_hub() ? UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_SHFT : \ - /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_SHFT) - -#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_BASE_MASK ( \ - is_uv3_hub() ? UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_BASE_MASK : \ - is_uv4a_hub() ? UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_BASE_MASK : \ - /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_BASE_MASK) - -#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_MASK ( \ - is_uv3_hub() ? UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_MASK : \ - is_uv4a_hub() ? UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_MASK : \ - /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_MASK) - -union uvh_rh_gam_mmioh_overlay_config1_mmr_u { +#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1 ( \ + is_uv(UV4) ? 0x484000UL : \ + is_uv(UV3) ? 0x1604000UL : \ + 0) + +/* UV4A unique defines */ +#define UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG1_BASE_SHFT 26 +#define UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG1_BASE_MASK 0x000ffffffc000000UL +#define UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG1_M_IO_SHFT 52 +#define UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG1_M_IO_MASK 0x03f0000000000000UL +#define UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG1_ENABLE_SHFT 63 +#define UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG1_ENABLE_MASK 0x8000000000000000UL + +/* UV4 unique defines */ +#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_BASE_SHFT 26 +#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_BASE_MASK 0x00003ffffc000000UL +#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_M_IO_SHFT 46 +#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_M_IO_MASK 0x000fc00000000000UL +#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_ENABLE_SHFT 63 +#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_ENABLE_MASK 0x8000000000000000UL + +/* UV3 unique defines */ +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_BASE_SHFT 26 +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_BASE_MASK 0x00003ffffc000000UL +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_M_IO_SHFT 46 +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_M_IO_MASK 0x000fc00000000000UL +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_ENABLE_SHFT 63 +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_ENABLE_MASK 0x8000000000000000UL + +#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_BASE_MASK ( \ + is_uv(UV4A) ? 0x000ffffffc000000UL : \ + is_uv(UV4) ? 0x00003ffffc000000UL : \ + is_uv(UV3) ? 0x00003ffffc000000UL : \ + 0) +#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_BASE_SHFT ( \ + is_uv(UV4) ? 26 : \ + is_uv(UV3) ? 26 : \ + -1) + +union uvh_rh_gam_mmioh_overlay_config1_u { unsigned long v; - struct uv3h_rh_gam_mmioh_overlay_config1_mmr_s { + + /* UVH common struct */ + struct uvh_rh_gam_mmioh_overlay_config1_s { unsigned long rsvd_0_25:26; unsigned long base:20; /* RW */ unsigned long m_io:6; /* RW */ unsigned long n_io:4; unsigned long rsvd_56_62:7; unsigned long enable:1; /* RW */ - } s3; - struct uv4h_rh_gam_mmioh_overlay_config1_mmr_s { + } s; + + /* UVXH common struct */ + struct uvxh_rh_gam_mmioh_overlay_config1_s { unsigned long rsvd_0_25:26; unsigned long base:20; /* RW */ unsigned long m_io:6; /* RW */ unsigned long n_io:4; unsigned long rsvd_56_62:7; unsigned long enable:1; /* RW */ - } s4; + } sx; + + /* UV4A unique struct */ struct uv4ah_rh_gam_mmioh_overlay_config1_mmr_s { unsigned long rsvd_0_25:26; unsigned long base:26; /* RW */ @@ -3254,232 +4152,275 @@ union uvh_rh_gam_mmioh_overlay_config1_mmr_u { unsigned long undef_62:1; /* Undefined */ unsigned long enable:1; /* RW */ } s4a; -}; -/* ========================================================================= */ -/* UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR */ -/* ========================================================================= */ -#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR 0x1600030UL -#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR uv_undefined("UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR") -#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR uv_undefined("UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR") -#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR ( \ - is_uv2_hub() ? UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR : \ - is_uv3_hub() ? UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR : \ - /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR) - - -#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 27 -#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46 -#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_SHFT 52 -#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 -#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff8000000UL -#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_MASK 0x000fc00000000000UL -#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK 0x00f0000000000000UL -#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL - - -union uvh_rh_gam_mmioh_overlay_config_mmr_u { - unsigned long v; - struct uv2h_rh_gam_mmioh_overlay_config_mmr_s { - unsigned long rsvd_0_26:27; - unsigned long base:19; /* RW */ + /* UV4 unique struct */ + struct uv4h_rh_gam_mmioh_overlay_config1_s { + unsigned long rsvd_0_25:26; + unsigned long base:20; /* RW */ unsigned long m_io:6; /* RW */ - unsigned long n_io:4; /* RW */ + unsigned long n_io:4; unsigned long rsvd_56_62:7; unsigned long enable:1; /* RW */ - } s2; + } s4; + + /* UV3 unique struct */ + struct uv3h_rh_gam_mmioh_overlay_config1_s { + unsigned long rsvd_0_25:26; + unsigned long base:20; /* RW */ + unsigned long m_io:6; /* RW */ + unsigned long n_io:4; + unsigned long rsvd_56_62:7; + unsigned long enable:1; /* RW */ + } s3; }; /* ========================================================================= */ -/* UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR */ +/* UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0 */ /* ========================================================================= */ -#define UV2H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR uv_undefined("UV2H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR") -#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR 0x1603800UL -#define UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR 0x483800UL -#define UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR ( \ - is_uv2_hub() ? UV2H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR : \ - is_uv3_hub() ? UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR : \ - /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR) +#define UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0 ( \ + is_uv(UV4) ? 0x483800UL : \ + is_uv(UV3) ? 0x1603800UL : \ + 0) -#define UV2H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH uv_undefined("UV2H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH") -#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH 128 -#define UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH 128 -#define UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH ( \ - is_uv2_hub() ? UV2H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH : \ - is_uv3_hub() ? UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH : \ - /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH) +#define UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_DEPTH ( \ + is_uv(UV4) ? 128 : \ + is_uv(UV3) ? 128 : \ + 0) +/* UV4A unique defines */ +#define UV4AH_RH_GAM_MMIOH_REDIRECT_CONFIG0_NASID_SHFT 0 +#define UV4AH_RH_GAM_MMIOH_REDIRECT_CONFIG0_NASID_MASK 0x0000000000000fffUL -#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_NASID_SHFT 0 -#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_NASID_MASK 0x0000000000007fffUL +/* UV4 unique defines */ +#define UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG0_NASID_SHFT 0 +#define UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG0_NASID_MASK 0x0000000000007fffUL -#define UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_NASID_SHFT 0 -#define UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_NASID_MASK 0x0000000000007fffUL +/* UV3 unique defines */ +#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_NASID_SHFT 0 +#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_NASID_MASK 0x0000000000007fffUL -#define UV4AH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_NASID_MASK 0x0000000000000fffUL -#define UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_NASID_MASK ( \ - is_uv3_hub() ? UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_NASID_MASK : \ - is_uv4a_hub() ? UV4AH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_NASID_MASK : \ - /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_NASID_MASK) - -union uvh_rh_gam_mmioh_redirect_config0_mmr_u { +union uvh_rh_gam_mmioh_redirect_config0_u { unsigned long v; - struct uv3h_rh_gam_mmioh_redirect_config0_mmr_s { + + /* UVH common struct */ + struct uvh_rh_gam_mmioh_redirect_config0_s { unsigned long nasid:15; /* RW */ unsigned long rsvd_15_63:49; - } s3; - struct uv4h_rh_gam_mmioh_redirect_config0_mmr_s { + } s; + + /* UVXH common struct */ + struct uvxh_rh_gam_mmioh_redirect_config0_s { unsigned long nasid:15; /* RW */ unsigned long rsvd_15_63:49; - } s4; - struct uv4ah_rh_gam_mmioh_redirect_config0_mmr_s { + } sx; + + struct uv4ah_rh_gam_mmioh_redirect_config0_s { unsigned long nasid:12; /* RW */ unsigned long rsvd_12_63:52; } s4a; + + /* UV4 unique struct */ + struct uv4h_rh_gam_mmioh_redirect_config0_s { + unsigned long nasid:15; /* RW */ + unsigned long rsvd_15_63:49; + } s4; + + /* UV3 unique struct */ + struct uv3h_rh_gam_mmioh_redirect_config0_s { + unsigned long nasid:15; /* RW */ + unsigned long rsvd_15_63:49; + } s3; }; /* ========================================================================= */ -/* UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR */ +/* UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1 */ /* ========================================================================= */ -#define UV2H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR uv_undefined("UV2H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR") -#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR 0x1604800UL -#define UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR 0x484800UL -#define UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR ( \ - is_uv2_hub() ? UV2H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR : \ - is_uv3_hub() ? UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR : \ - /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR) - -#define UV2H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH uv_undefined("UV2H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH") -#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH 128 -#define UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH 128 -#define UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH ( \ - is_uv2_hub() ? UV2H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH : \ - is_uv3_hub() ? UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH : \ - /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH) +#define UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1 ( \ + is_uv(UV4) ? 0x484800UL : \ + is_uv(UV3) ? 0x1604800UL : \ + 0) +#define UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_DEPTH ( \ + is_uv(UV4) ? 128 : \ + is_uv(UV3) ? 128 : \ + 0) -#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_NASID_SHFT 0 -#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_NASID_MASK 0x0000000000007fffUL +/* UV4A unique defines */ +#define UV4AH_RH_GAM_MMIOH_REDIRECT_CONFIG0_NASID_SHFT 0 +#define UV4AH_RH_GAM_MMIOH_REDIRECT_CONFIG0_NASID_MASK 0x0000000000000fffUL -#define UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_NASID_SHFT 0 -#define UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_NASID_MASK 0x0000000000007fffUL +/* UV4 unique defines */ +#define UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG1_NASID_SHFT 0 +#define UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG1_NASID_MASK 0x0000000000007fffUL -#define UV4AH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_NASID_MASK 0x0000000000000fffUL +/* UV3 unique defines */ +#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_NASID_SHFT 0 +#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_NASID_MASK 0x0000000000007fffUL -#define UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_NASID_MASK ( \ - is_uv3_hub() ? UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_NASID_MASK : \ - is_uv4a_hub() ? UV4AH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_NASID_MASK : \ - /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_NASID_MASK) -union uvh_rh_gam_mmioh_redirect_config1_mmr_u { +union uvh_rh_gam_mmioh_redirect_config1_u { unsigned long v; - struct uv3h_rh_gam_mmioh_redirect_config1_mmr_s { + + /* UVH common struct */ + struct uvh_rh_gam_mmioh_redirect_config1_s { unsigned long nasid:15; /* RW */ unsigned long rsvd_15_63:49; - } s3; - struct uv4h_rh_gam_mmioh_redirect_config1_mmr_s { + } s; + + /* UVXH common struct */ + struct uvxh_rh_gam_mmioh_redirect_config1_s { unsigned long nasid:15; /* RW */ unsigned long rsvd_15_63:49; - } s4; - struct uv4ah_rh_gam_mmioh_redirect_config1_mmr_s { + } sx; + + struct uv4ah_rh_gam_mmioh_redirect_config1_s { unsigned long nasid:12; /* RW */ unsigned long rsvd_12_63:52; } s4a; + + /* UV4 unique struct */ + struct uv4h_rh_gam_mmioh_redirect_config1_s { + unsigned long nasid:15; /* RW */ + unsigned long rsvd_15_63:49; + } s4; + + /* UV3 unique struct */ + struct uv3h_rh_gam_mmioh_redirect_config1_s { + unsigned long nasid:15; /* RW */ + unsigned long rsvd_15_63:49; + } s3; }; /* ========================================================================= */ -/* UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR */ +/* UVH_RH_GAM_MMR_OVERLAY_CONFIG */ /* ========================================================================= */ -#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL -#define UV3H_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL -#define UV4H_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x480028UL -#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR ( \ - is_uv2_hub() ? UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR : \ - is_uv3_hub() ? UV3H_RH_GAM_MMR_OVERLAY_CONFIG_MMR : \ - /*is_uv4_hub*/ UV4H_RH_GAM_MMR_OVERLAY_CONFIG_MMR) - -#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26 -#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 -#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL -#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL - -#define UVXH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26 -#define UVXH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 -#define UVXH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL -#define UVXH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL - -#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26 -#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 -#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL -#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL - -#define UV3H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26 -#define UV3H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 -#define UV3H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL -#define UV3H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL - -#define UV4H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26 -#define UV4H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 -#define UV4H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL -#define UV4H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL - - -union uvh_rh_gam_mmr_overlay_config_mmr_u { +#define UVH_RH_GAM_MMR_OVERLAY_CONFIG ( \ + is_uv(UV4) ? 0x480028UL : \ + is_uv(UV3) ? 0x1600028UL : \ + is_uv(UV2) ? 0x1600028UL : \ + 0) + + +/* UVXH common defines */ +#define UVXH_RH_GAM_MMR_OVERLAY_CONFIG_BASE_SHFT 26 +#define UVXH_RH_GAM_MMR_OVERLAY_CONFIG_BASE_MASK ( \ + is_uv(UV4A) ? 0x000ffffffc000000UL : \ + is_uv(UV4) ? 0x00003ffffc000000UL : \ + is_uv(UV3) ? 0x00003ffffc000000UL : \ + is_uv(UV2) ? 0x00003ffffc000000UL : \ + 0) +#define UVXH_RH_GAM_MMR_OVERLAY_CONFIG_ENABLE_SHFT 63 +#define UVXH_RH_GAM_MMR_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL + +/* UV4A unique defines */ +#define UV4AH_RH_GAM_GRU_OVERLAY_CONFIG_BASE_SHFT 26 +#define UV4AH_RH_GAM_GRU_OVERLAY_CONFIG_BASE_MASK 0x000ffffffc000000UL + +#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_BASE_MASK ( \ + is_uv(UV4A) ? 0x000ffffffc000000UL : \ + is_uv(UV4) ? 0x00003ffffc000000UL : \ + is_uv(UV3) ? 0x00003ffffc000000UL : \ + is_uv(UV2) ? 0x00003ffffc000000UL : \ + 0) + +#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_BASE_SHFT ( \ + is_uv(UV4) ? 26 : \ + is_uv(UV3) ? 26 : \ + is_uv(UV2) ? 26 : \ + -1) + +union uvh_rh_gam_mmr_overlay_config_u { unsigned long v; - struct uvh_rh_gam_mmr_overlay_config_mmr_s { + + /* UVH common struct */ + struct uvh_rh_gam_mmr_overlay_config_s { unsigned long rsvd_0_25:26; unsigned long base:20; /* RW */ unsigned long rsvd_46_62:17; unsigned long enable:1; /* RW */ } s; - struct uvxh_rh_gam_mmr_overlay_config_mmr_s { + + /* UVXH common struct */ + struct uvxh_rh_gam_mmr_overlay_config_s { unsigned long rsvd_0_25:26; unsigned long base:20; /* RW */ unsigned long rsvd_46_62:17; unsigned long enable:1; /* RW */ } sx; - struct uv2h_rh_gam_mmr_overlay_config_mmr_s { + + /* UV4 unique struct */ + struct uv4h_rh_gam_mmr_overlay_config_s { unsigned long rsvd_0_25:26; unsigned long base:20; /* RW */ unsigned long rsvd_46_62:17; unsigned long enable:1; /* RW */ - } s2; - struct uv3h_rh_gam_mmr_overlay_config_mmr_s { + } s4; + + /* UV3 unique struct */ + struct uv3h_rh_gam_mmr_overlay_config_s { unsigned long rsvd_0_25:26; unsigned long base:20; /* RW */ unsigned long rsvd_46_62:17; unsigned long enable:1; /* RW */ } s3; - struct uv4h_rh_gam_mmr_overlay_config_mmr_s { + + /* UV2 unique struct */ + struct uv2h_rh_gam_mmr_overlay_config_s { unsigned long rsvd_0_25:26; unsigned long base:20; /* RW */ unsigned long rsvd_46_62:17; unsigned long enable:1; /* RW */ - } s4; + } s2; }; /* ========================================================================= */ /* UVH_RTC */ /* ========================================================================= */ -#define UV2H_RTC 0x340000UL -#define UV3H_RTC 0x340000UL -#define UV4H_RTC 0xe0000UL #define UVH_RTC ( \ - is_uv2_hub() ? UV2H_RTC : \ - is_uv3_hub() ? UV3H_RTC : \ - /*is_uv4_hub*/ UV4H_RTC) + is_uv(UV5) ? 0xe0000UL : \ + is_uv(UV4) ? 0xe0000UL : \ + is_uv(UV3) ? 0x340000UL : \ + is_uv(UV2) ? 0x340000UL : \ + 0) +/* UVH common defines*/ #define UVH_RTC_REAL_TIME_CLOCK_SHFT 0 #define UVH_RTC_REAL_TIME_CLOCK_MASK 0x00ffffffffffffffUL union uvh_rtc_u { unsigned long v; + + /* UVH common struct */ struct uvh_rtc_s { unsigned long real_time_clock:56; /* RW */ unsigned long rsvd_56_63:8; } s; + + /* UV5 unique struct */ + struct uv5h_rtc_s { + unsigned long real_time_clock:56; /* RW */ + unsigned long rsvd_56_63:8; + } s5; + + /* UV4 unique struct */ + struct uv4h_rtc_s { + unsigned long real_time_clock:56; /* RW */ + unsigned long rsvd_56_63:8; + } s4; + + /* UV3 unique struct */ + struct uv3h_rtc_s { + unsigned long real_time_clock:56; /* RW */ + unsigned long rsvd_56_63:8; + } s3; + + /* UV2 unique struct */ + struct uv2h_rtc_s { + unsigned long real_time_clock:56; /* RW */ + unsigned long rsvd_56_63:8; + } s2; }; /* ========================================================================= */ @@ -3487,26 +4428,29 @@ union uvh_rtc_u { /* ========================================================================= */ #define UVH_RTC1_INT_CONFIG 0x615c0UL +/* UVH common defines*/ #define UVH_RTC1_INT_CONFIG_VECTOR_SHFT 0 -#define UVH_RTC1_INT_CONFIG_DM_SHFT 8 -#define UVH_RTC1_INT_CONFIG_DESTMODE_SHFT 11 -#define UVH_RTC1_INT_CONFIG_STATUS_SHFT 12 -#define UVH_RTC1_INT_CONFIG_P_SHFT 13 -#define UVH_RTC1_INT_CONFIG_T_SHFT 15 -#define UVH_RTC1_INT_CONFIG_M_SHFT 16 -#define UVH_RTC1_INT_CONFIG_APIC_ID_SHFT 32 #define UVH_RTC1_INT_CONFIG_VECTOR_MASK 0x00000000000000ffUL +#define UVH_RTC1_INT_CONFIG_DM_SHFT 8 #define UVH_RTC1_INT_CONFIG_DM_MASK 0x0000000000000700UL +#define UVH_RTC1_INT_CONFIG_DESTMODE_SHFT 11 #define UVH_RTC1_INT_CONFIG_DESTMODE_MASK 0x0000000000000800UL +#define UVH_RTC1_INT_CONFIG_STATUS_SHFT 12 #define UVH_RTC1_INT_CONFIG_STATUS_MASK 0x0000000000001000UL +#define UVH_RTC1_INT_CONFIG_P_SHFT 13 #define UVH_RTC1_INT_CONFIG_P_MASK 0x0000000000002000UL +#define UVH_RTC1_INT_CONFIG_T_SHFT 15 #define UVH_RTC1_INT_CONFIG_T_MASK 0x0000000000008000UL +#define UVH_RTC1_INT_CONFIG_M_SHFT 16 #define UVH_RTC1_INT_CONFIG_M_MASK 0x0000000000010000UL +#define UVH_RTC1_INT_CONFIG_APIC_ID_SHFT 32 #define UVH_RTC1_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL union uvh_rtc1_int_config_u { unsigned long v; + + /* UVH common struct */ struct uvh_rtc1_int_config_s { unsigned long vector_:8; /* RW */ unsigned long dm:3; /* RW */ @@ -3519,591 +4463,175 @@ union uvh_rtc1_int_config_u { unsigned long rsvd_17_31:15; unsigned long apic_id:32; /* RW */ } s; -}; - -/* ========================================================================= */ -/* UVH_SCRATCH5 */ -/* ========================================================================= */ -#define UV2H_SCRATCH5 0x2d0200UL -#define UV3H_SCRATCH5 0x2d0200UL -#define UV4H_SCRATCH5 0xb0200UL -#define UVH_SCRATCH5 ( \ - is_uv2_hub() ? UV2H_SCRATCH5 : \ - is_uv3_hub() ? UV3H_SCRATCH5 : \ - /*is_uv4_hub*/ UV4H_SCRATCH5) - -#define UV2H_SCRATCH5_32 0x778 -#define UV3H_SCRATCH5_32 0x778 -#define UV4H_SCRATCH5_32 0x798 -#define UVH_SCRATCH5_32 ( \ - is_uv2_hub() ? UV2H_SCRATCH5_32 : \ - is_uv3_hub() ? UV3H_SCRATCH5_32 : \ - /*is_uv4_hub*/ UV4H_SCRATCH5_32) - -#define UVH_SCRATCH5_SCRATCH5_SHFT 0 -#define UVH_SCRATCH5_SCRATCH5_MASK 0xffffffffffffffffUL - - -union uvh_scratch5_u { - unsigned long v; - struct uvh_scratch5_s { - unsigned long scratch5:64; /* RW, W1CS */ - } s; -}; -/* ========================================================================= */ -/* UVH_SCRATCH5_ALIAS */ -/* ========================================================================= */ -#define UV2H_SCRATCH5_ALIAS 0x2d0208UL -#define UV3H_SCRATCH5_ALIAS 0x2d0208UL -#define UV4H_SCRATCH5_ALIAS 0xb0208UL -#define UVH_SCRATCH5_ALIAS ( \ - is_uv2_hub() ? UV2H_SCRATCH5_ALIAS : \ - is_uv3_hub() ? UV3H_SCRATCH5_ALIAS : \ - /*is_uv4_hub*/ UV4H_SCRATCH5_ALIAS) - -#define UV2H_SCRATCH5_ALIAS_32 0x780 -#define UV3H_SCRATCH5_ALIAS_32 0x780 -#define UV4H_SCRATCH5_ALIAS_32 0x7a0 -#define UVH_SCRATCH5_ALIAS_32 ( \ - is_uv2_hub() ? UV2H_SCRATCH5_ALIAS_32 : \ - is_uv3_hub() ? UV3H_SCRATCH5_ALIAS_32 : \ - /*is_uv4_hub*/ UV4H_SCRATCH5_ALIAS_32) - - -/* ========================================================================= */ -/* UVH_SCRATCH5_ALIAS_2 */ -/* ========================================================================= */ -#define UV2H_SCRATCH5_ALIAS_2 0x2d0210UL -#define UV3H_SCRATCH5_ALIAS_2 0x2d0210UL -#define UV4H_SCRATCH5_ALIAS_2 0xb0210UL -#define UVH_SCRATCH5_ALIAS_2 ( \ - is_uv2_hub() ? UV2H_SCRATCH5_ALIAS_2 : \ - is_uv3_hub() ? UV3H_SCRATCH5_ALIAS_2 : \ - /*is_uv4_hub*/ UV4H_SCRATCH5_ALIAS_2) -#define UVH_SCRATCH5_ALIAS_2_32 0x788 - - -/* ========================================================================= */ -/* UVXH_EVENT_OCCURRED2 */ -/* ========================================================================= */ -#define UVXH_EVENT_OCCURRED2 0x70100UL - -#define UV2H_EVENT_OCCURRED2_32 0xb68 -#define UV3H_EVENT_OCCURRED2_32 0xb68 -#define UV4H_EVENT_OCCURRED2_32 0x608 -#define UVH_EVENT_OCCURRED2_32 ( \ - is_uv2_hub() ? UV2H_EVENT_OCCURRED2_32 : \ - is_uv3_hub() ? UV3H_EVENT_OCCURRED2_32 : \ - /*is_uv4_hub*/ UV4H_EVENT_OCCURRED2_32) - - -#define UV2H_EVENT_OCCURRED2_RTC_0_SHFT 0 -#define UV2H_EVENT_OCCURRED2_RTC_1_SHFT 1 -#define UV2H_EVENT_OCCURRED2_RTC_2_SHFT 2 -#define UV2H_EVENT_OCCURRED2_RTC_3_SHFT 3 -#define UV2H_EVENT_OCCURRED2_RTC_4_SHFT 4 -#define UV2H_EVENT_OCCURRED2_RTC_5_SHFT 5 -#define UV2H_EVENT_OCCURRED2_RTC_6_SHFT 6 -#define UV2H_EVENT_OCCURRED2_RTC_7_SHFT 7 -#define UV2H_EVENT_OCCURRED2_RTC_8_SHFT 8 -#define UV2H_EVENT_OCCURRED2_RTC_9_SHFT 9 -#define UV2H_EVENT_OCCURRED2_RTC_10_SHFT 10 -#define UV2H_EVENT_OCCURRED2_RTC_11_SHFT 11 -#define UV2H_EVENT_OCCURRED2_RTC_12_SHFT 12 -#define UV2H_EVENT_OCCURRED2_RTC_13_SHFT 13 -#define UV2H_EVENT_OCCURRED2_RTC_14_SHFT 14 -#define UV2H_EVENT_OCCURRED2_RTC_15_SHFT 15 -#define UV2H_EVENT_OCCURRED2_RTC_16_SHFT 16 -#define UV2H_EVENT_OCCURRED2_RTC_17_SHFT 17 -#define UV2H_EVENT_OCCURRED2_RTC_18_SHFT 18 -#define UV2H_EVENT_OCCURRED2_RTC_19_SHFT 19 -#define UV2H_EVENT_OCCURRED2_RTC_20_SHFT 20 -#define UV2H_EVENT_OCCURRED2_RTC_21_SHFT 21 -#define UV2H_EVENT_OCCURRED2_RTC_22_SHFT 22 -#define UV2H_EVENT_OCCURRED2_RTC_23_SHFT 23 -#define UV2H_EVENT_OCCURRED2_RTC_24_SHFT 24 -#define UV2H_EVENT_OCCURRED2_RTC_25_SHFT 25 -#define UV2H_EVENT_OCCURRED2_RTC_26_SHFT 26 -#define UV2H_EVENT_OCCURRED2_RTC_27_SHFT 27 -#define UV2H_EVENT_OCCURRED2_RTC_28_SHFT 28 -#define UV2H_EVENT_OCCURRED2_RTC_29_SHFT 29 -#define UV2H_EVENT_OCCURRED2_RTC_30_SHFT 30 -#define UV2H_EVENT_OCCURRED2_RTC_31_SHFT 31 -#define UV2H_EVENT_OCCURRED2_RTC_0_MASK 0x0000000000000001UL -#define UV2H_EVENT_OCCURRED2_RTC_1_MASK 0x0000000000000002UL -#define UV2H_EVENT_OCCURRED2_RTC_2_MASK 0x0000000000000004UL -#define UV2H_EVENT_OCCURRED2_RTC_3_MASK 0x0000000000000008UL -#define UV2H_EVENT_OCCURRED2_RTC_4_MASK 0x0000000000000010UL -#define UV2H_EVENT_OCCURRED2_RTC_5_MASK 0x0000000000000020UL -#define UV2H_EVENT_OCCURRED2_RTC_6_MASK 0x0000000000000040UL -#define UV2H_EVENT_OCCURRED2_RTC_7_MASK 0x0000000000000080UL -#define UV2H_EVENT_OCCURRED2_RTC_8_MASK 0x0000000000000100UL -#define UV2H_EVENT_OCCURRED2_RTC_9_MASK 0x0000000000000200UL -#define UV2H_EVENT_OCCURRED2_RTC_10_MASK 0x0000000000000400UL -#define UV2H_EVENT_OCCURRED2_RTC_11_MASK 0x0000000000000800UL -#define UV2H_EVENT_OCCURRED2_RTC_12_MASK 0x0000000000001000UL -#define UV2H_EVENT_OCCURRED2_RTC_13_MASK 0x0000000000002000UL -#define UV2H_EVENT_OCCURRED2_RTC_14_MASK 0x0000000000004000UL -#define UV2H_EVENT_OCCURRED2_RTC_15_MASK 0x0000000000008000UL -#define UV2H_EVENT_OCCURRED2_RTC_16_MASK 0x0000000000010000UL -#define UV2H_EVENT_OCCURRED2_RTC_17_MASK 0x0000000000020000UL -#define UV2H_EVENT_OCCURRED2_RTC_18_MASK 0x0000000000040000UL -#define UV2H_EVENT_OCCURRED2_RTC_19_MASK 0x0000000000080000UL -#define UV2H_EVENT_OCCURRED2_RTC_20_MASK 0x0000000000100000UL -#define UV2H_EVENT_OCCURRED2_RTC_21_MASK 0x0000000000200000UL -#define UV2H_EVENT_OCCURRED2_RTC_22_MASK 0x0000000000400000UL -#define UV2H_EVENT_OCCURRED2_RTC_23_MASK 0x0000000000800000UL -#define UV2H_EVENT_OCCURRED2_RTC_24_MASK 0x0000000001000000UL -#define UV2H_EVENT_OCCURRED2_RTC_25_MASK 0x0000000002000000UL -#define UV2H_EVENT_OCCURRED2_RTC_26_MASK 0x0000000004000000UL -#define UV2H_EVENT_OCCURRED2_RTC_27_MASK 0x0000000008000000UL -#define UV2H_EVENT_OCCURRED2_RTC_28_MASK 0x0000000010000000UL -#define UV2H_EVENT_OCCURRED2_RTC_29_MASK 0x0000000020000000UL -#define UV2H_EVENT_OCCURRED2_RTC_30_MASK 0x0000000040000000UL -#define UV2H_EVENT_OCCURRED2_RTC_31_MASK 0x0000000080000000UL - -#define UV3H_EVENT_OCCURRED2_RTC_0_SHFT 0 -#define UV3H_EVENT_OCCURRED2_RTC_1_SHFT 1 -#define UV3H_EVENT_OCCURRED2_RTC_2_SHFT 2 -#define UV3H_EVENT_OCCURRED2_RTC_3_SHFT 3 -#define UV3H_EVENT_OCCURRED2_RTC_4_SHFT 4 -#define UV3H_EVENT_OCCURRED2_RTC_5_SHFT 5 -#define UV3H_EVENT_OCCURRED2_RTC_6_SHFT 6 -#define UV3H_EVENT_OCCURRED2_RTC_7_SHFT 7 -#define UV3H_EVENT_OCCURRED2_RTC_8_SHFT 8 -#define UV3H_EVENT_OCCURRED2_RTC_9_SHFT 9 -#define UV3H_EVENT_OCCURRED2_RTC_10_SHFT 10 -#define UV3H_EVENT_OCCURRED2_RTC_11_SHFT 11 -#define UV3H_EVENT_OCCURRED2_RTC_12_SHFT 12 -#define UV3H_EVENT_OCCURRED2_RTC_13_SHFT 13 -#define UV3H_EVENT_OCCURRED2_RTC_14_SHFT 14 -#define UV3H_EVENT_OCCURRED2_RTC_15_SHFT 15 -#define UV3H_EVENT_OCCURRED2_RTC_16_SHFT 16 -#define UV3H_EVENT_OCCURRED2_RTC_17_SHFT 17 -#define UV3H_EVENT_OCCURRED2_RTC_18_SHFT 18 -#define UV3H_EVENT_OCCURRED2_RTC_19_SHFT 19 -#define UV3H_EVENT_OCCURRED2_RTC_20_SHFT 20 -#define UV3H_EVENT_OCCURRED2_RTC_21_SHFT 21 -#define UV3H_EVENT_OCCURRED2_RTC_22_SHFT 22 -#define UV3H_EVENT_OCCURRED2_RTC_23_SHFT 23 -#define UV3H_EVENT_OCCURRED2_RTC_24_SHFT 24 -#define UV3H_EVENT_OCCURRED2_RTC_25_SHFT 25 -#define UV3H_EVENT_OCCURRED2_RTC_26_SHFT 26 -#define UV3H_EVENT_OCCURRED2_RTC_27_SHFT 27 -#define UV3H_EVENT_OCCURRED2_RTC_28_SHFT 28 -#define UV3H_EVENT_OCCURRED2_RTC_29_SHFT 29 -#define UV3H_EVENT_OCCURRED2_RTC_30_SHFT 30 -#define UV3H_EVENT_OCCURRED2_RTC_31_SHFT 31 -#define UV3H_EVENT_OCCURRED2_RTC_0_MASK 0x0000000000000001UL -#define UV3H_EVENT_OCCURRED2_RTC_1_MASK 0x0000000000000002UL -#define UV3H_EVENT_OCCURRED2_RTC_2_MASK 0x0000000000000004UL -#define UV3H_EVENT_OCCURRED2_RTC_3_MASK 0x0000000000000008UL -#define UV3H_EVENT_OCCURRED2_RTC_4_MASK 0x0000000000000010UL -#define UV3H_EVENT_OCCURRED2_RTC_5_MASK 0x0000000000000020UL -#define UV3H_EVENT_OCCURRED2_RTC_6_MASK 0x0000000000000040UL -#define UV3H_EVENT_OCCURRED2_RTC_7_MASK 0x0000000000000080UL -#define UV3H_EVENT_OCCURRED2_RTC_8_MASK 0x0000000000000100UL -#define UV3H_EVENT_OCCURRED2_RTC_9_MASK 0x0000000000000200UL -#define UV3H_EVENT_OCCURRED2_RTC_10_MASK 0x0000000000000400UL -#define UV3H_EVENT_OCCURRED2_RTC_11_MASK 0x0000000000000800UL -#define UV3H_EVENT_OCCURRED2_RTC_12_MASK 0x0000000000001000UL -#define UV3H_EVENT_OCCURRED2_RTC_13_MASK 0x0000000000002000UL -#define UV3H_EVENT_OCCURRED2_RTC_14_MASK 0x0000000000004000UL -#define UV3H_EVENT_OCCURRED2_RTC_15_MASK 0x0000000000008000UL -#define UV3H_EVENT_OCCURRED2_RTC_16_MASK 0x0000000000010000UL -#define UV3H_EVENT_OCCURRED2_RTC_17_MASK 0x0000000000020000UL -#define UV3H_EVENT_OCCURRED2_RTC_18_MASK 0x0000000000040000UL -#define UV3H_EVENT_OCCURRED2_RTC_19_MASK 0x0000000000080000UL -#define UV3H_EVENT_OCCURRED2_RTC_20_MASK 0x0000000000100000UL -#define UV3H_EVENT_OCCURRED2_RTC_21_MASK 0x0000000000200000UL -#define UV3H_EVENT_OCCURRED2_RTC_22_MASK 0x0000000000400000UL -#define UV3H_EVENT_OCCURRED2_RTC_23_MASK 0x0000000000800000UL -#define UV3H_EVENT_OCCURRED2_RTC_24_MASK 0x0000000001000000UL -#define UV3H_EVENT_OCCURRED2_RTC_25_MASK 0x0000000002000000UL -#define UV3H_EVENT_OCCURRED2_RTC_26_MASK 0x0000000004000000UL -#define UV3H_EVENT_OCCURRED2_RTC_27_MASK 0x0000000008000000UL -#define UV3H_EVENT_OCCURRED2_RTC_28_MASK 0x0000000010000000UL -#define UV3H_EVENT_OCCURRED2_RTC_29_MASK 0x0000000020000000UL -#define UV3H_EVENT_OCCURRED2_RTC_30_MASK 0x0000000040000000UL -#define UV3H_EVENT_OCCURRED2_RTC_31_MASK 0x0000000080000000UL + /* UV5 unique struct */ + struct uv5h_rtc1_int_config_s { + unsigned long vector_:8; /* RW */ + unsigned long dm:3; /* RW */ + unsigned long destmode:1; /* RW */ + unsigned long status:1; /* RO */ + unsigned long p:1; /* RO */ + unsigned long rsvd_14:1; + unsigned long t:1; /* RO */ + unsigned long m:1; /* RW */ + unsigned long rsvd_17_31:15; + unsigned long apic_id:32; /* RW */ + } s5; -#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT0_SHFT 0 -#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT1_SHFT 1 -#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT2_SHFT 2 -#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT3_SHFT 3 -#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT4_SHFT 4 -#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT5_SHFT 5 -#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT6_SHFT 6 -#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT7_SHFT 7 -#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT8_SHFT 8 -#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT9_SHFT 9 -#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT10_SHFT 10 -#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT11_SHFT 11 -#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT12_SHFT 12 -#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT13_SHFT 13 -#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT14_SHFT 14 -#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT15_SHFT 15 -#define UV4H_EVENT_OCCURRED2_RTC_INTERVAL_INT_SHFT 16 -#define UV4H_EVENT_OCCURRED2_BAU_DASHBOARD_INT_SHFT 17 -#define UV4H_EVENT_OCCURRED2_RTC_0_SHFT 18 -#define UV4H_EVENT_OCCURRED2_RTC_1_SHFT 19 -#define UV4H_EVENT_OCCURRED2_RTC_2_SHFT 20 -#define UV4H_EVENT_OCCURRED2_RTC_3_SHFT 21 -#define UV4H_EVENT_OCCURRED2_RTC_4_SHFT 22 -#define UV4H_EVENT_OCCURRED2_RTC_5_SHFT 23 -#define UV4H_EVENT_OCCURRED2_RTC_6_SHFT 24 -#define UV4H_EVENT_OCCURRED2_RTC_7_SHFT 25 -#define UV4H_EVENT_OCCURRED2_RTC_8_SHFT 26 -#define UV4H_EVENT_OCCURRED2_RTC_9_SHFT 27 -#define UV4H_EVENT_OCCURRED2_RTC_10_SHFT 28 -#define UV4H_EVENT_OCCURRED2_RTC_11_SHFT 29 -#define UV4H_EVENT_OCCURRED2_RTC_12_SHFT 30 -#define UV4H_EVENT_OCCURRED2_RTC_13_SHFT 31 -#define UV4H_EVENT_OCCURRED2_RTC_14_SHFT 32 -#define UV4H_EVENT_OCCURRED2_RTC_15_SHFT 33 -#define UV4H_EVENT_OCCURRED2_RTC_16_SHFT 34 -#define UV4H_EVENT_OCCURRED2_RTC_17_SHFT 35 -#define UV4H_EVENT_OCCURRED2_RTC_18_SHFT 36 -#define UV4H_EVENT_OCCURRED2_RTC_19_SHFT 37 -#define UV4H_EVENT_OCCURRED2_RTC_20_SHFT 38 -#define UV4H_EVENT_OCCURRED2_RTC_21_SHFT 39 -#define UV4H_EVENT_OCCURRED2_RTC_22_SHFT 40 -#define UV4H_EVENT_OCCURRED2_RTC_23_SHFT 41 -#define UV4H_EVENT_OCCURRED2_RTC_24_SHFT 42 -#define UV4H_EVENT_OCCURRED2_RTC_25_SHFT 43 -#define UV4H_EVENT_OCCURRED2_RTC_26_SHFT 44 -#define UV4H_EVENT_OCCURRED2_RTC_27_SHFT 45 -#define UV4H_EVENT_OCCURRED2_RTC_28_SHFT 46 -#define UV4H_EVENT_OCCURRED2_RTC_29_SHFT 47 -#define UV4H_EVENT_OCCURRED2_RTC_30_SHFT 48 -#define UV4H_EVENT_OCCURRED2_RTC_31_SHFT 49 -#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT0_MASK 0x0000000000000001UL -#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT1_MASK 0x0000000000000002UL -#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT2_MASK 0x0000000000000004UL -#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT3_MASK 0x0000000000000008UL -#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT4_MASK 0x0000000000000010UL -#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT5_MASK 0x0000000000000020UL -#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT6_MASK 0x0000000000000040UL -#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT7_MASK 0x0000000000000080UL -#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT8_MASK 0x0000000000000100UL -#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT9_MASK 0x0000000000000200UL -#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT10_MASK 0x0000000000000400UL -#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT11_MASK 0x0000000000000800UL -#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT12_MASK 0x0000000000001000UL -#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT13_MASK 0x0000000000002000UL -#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT14_MASK 0x0000000000004000UL -#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT15_MASK 0x0000000000008000UL -#define UV4H_EVENT_OCCURRED2_RTC_INTERVAL_INT_MASK 0x0000000000010000UL -#define UV4H_EVENT_OCCURRED2_BAU_DASHBOARD_INT_MASK 0x0000000000020000UL -#define UV4H_EVENT_OCCURRED2_RTC_0_MASK 0x0000000000040000UL -#define UV4H_EVENT_OCCURRED2_RTC_1_MASK 0x0000000000080000UL -#define UV4H_EVENT_OCCURRED2_RTC_2_MASK 0x0000000000100000UL -#define UV4H_EVENT_OCCURRED2_RTC_3_MASK 0x0000000000200000UL -#define UV4H_EVENT_OCCURRED2_RTC_4_MASK 0x0000000000400000UL -#define UV4H_EVENT_OCCURRED2_RTC_5_MASK 0x0000000000800000UL -#define UV4H_EVENT_OCCURRED2_RTC_6_MASK 0x0000000001000000UL -#define UV4H_EVENT_OCCURRED2_RTC_7_MASK 0x0000000002000000UL -#define UV4H_EVENT_OCCURRED2_RTC_8_MASK 0x0000000004000000UL -#define UV4H_EVENT_OCCURRED2_RTC_9_MASK 0x0000000008000000UL -#define UV4H_EVENT_OCCURRED2_RTC_10_MASK 0x0000000010000000UL -#define UV4H_EVENT_OCCURRED2_RTC_11_MASK 0x0000000020000000UL -#define UV4H_EVENT_OCCURRED2_RTC_12_MASK 0x0000000040000000UL -#define UV4H_EVENT_OCCURRED2_RTC_13_MASK 0x0000000080000000UL -#define UV4H_EVENT_OCCURRED2_RTC_14_MASK 0x0000000100000000UL -#define UV4H_EVENT_OCCURRED2_RTC_15_MASK 0x0000000200000000UL -#define UV4H_EVENT_OCCURRED2_RTC_16_MASK 0x0000000400000000UL -#define UV4H_EVENT_OCCURRED2_RTC_17_MASK 0x0000000800000000UL -#define UV4H_EVENT_OCCURRED2_RTC_18_MASK 0x0000001000000000UL -#define UV4H_EVENT_OCCURRED2_RTC_19_MASK 0x0000002000000000UL -#define UV4H_EVENT_OCCURRED2_RTC_20_MASK 0x0000004000000000UL -#define UV4H_EVENT_OCCURRED2_RTC_21_MASK 0x0000008000000000UL -#define UV4H_EVENT_OCCURRED2_RTC_22_MASK 0x0000010000000000UL -#define UV4H_EVENT_OCCURRED2_RTC_23_MASK 0x0000020000000000UL -#define UV4H_EVENT_OCCURRED2_RTC_24_MASK 0x0000040000000000UL -#define UV4H_EVENT_OCCURRED2_RTC_25_MASK 0x0000080000000000UL -#define UV4H_EVENT_OCCURRED2_RTC_26_MASK 0x0000100000000000UL -#define UV4H_EVENT_OCCURRED2_RTC_27_MASK 0x0000200000000000UL -#define UV4H_EVENT_OCCURRED2_RTC_28_MASK 0x0000400000000000UL -#define UV4H_EVENT_OCCURRED2_RTC_29_MASK 0x0000800000000000UL -#define UV4H_EVENT_OCCURRED2_RTC_30_MASK 0x0001000000000000UL -#define UV4H_EVENT_OCCURRED2_RTC_31_MASK 0x0002000000000000UL + /* UV4 unique struct */ + struct uv4h_rtc1_int_config_s { + unsigned long vector_:8; /* RW */ + unsigned long dm:3; /* RW */ + unsigned long destmode:1; /* RW */ + unsigned long status:1; /* RO */ + unsigned long p:1; /* RO */ + unsigned long rsvd_14:1; + unsigned long t:1; /* RO */ + unsigned long m:1; /* RW */ + unsigned long rsvd_17_31:15; + unsigned long apic_id:32; /* RW */ + } s4; -#define UVXH_EVENT_OCCURRED2_RTC_1_MASK ( \ - is_uv2_hub() ? UV2H_EVENT_OCCURRED2_RTC_1_MASK : \ - is_uv3_hub() ? UV3H_EVENT_OCCURRED2_RTC_1_MASK : \ - /*is_uv4_hub*/ UV4H_EVENT_OCCURRED2_RTC_1_MASK) + /* UV3 unique struct */ + struct uv3h_rtc1_int_config_s { + unsigned long vector_:8; /* RW */ + unsigned long dm:3; /* RW */ + unsigned long destmode:1; /* RW */ + unsigned long status:1; /* RO */ + unsigned long p:1; /* RO */ + unsigned long rsvd_14:1; + unsigned long t:1; /* RO */ + unsigned long m:1; /* RW */ + unsigned long rsvd_17_31:15; + unsigned long apic_id:32; /* RW */ + } s3; -union uvh_event_occurred2_u { - unsigned long v; - struct uv2h_event_occurred2_s { - unsigned long rtc_0:1; /* RW */ - unsigned long rtc_1:1; /* RW */ - unsigned long rtc_2:1; /* RW */ - unsigned long rtc_3:1; /* RW */ - unsigned long rtc_4:1; /* RW */ - unsigned long rtc_5:1; /* RW */ - unsigned long rtc_6:1; /* RW */ - unsigned long rtc_7:1; /* RW */ - unsigned long rtc_8:1; /* RW */ - unsigned long rtc_9:1; /* RW */ - unsigned long rtc_10:1; /* RW */ - unsigned long rtc_11:1; /* RW */ - unsigned long rtc_12:1; /* RW */ - unsigned long rtc_13:1; /* RW */ - unsigned long rtc_14:1; /* RW */ - unsigned long rtc_15:1; /* RW */ - unsigned long rtc_16:1; /* RW */ - unsigned long rtc_17:1; /* RW */ - unsigned long rtc_18:1; /* RW */ - unsigned long rtc_19:1; /* RW */ - unsigned long rtc_20:1; /* RW */ - unsigned long rtc_21:1; /* RW */ - unsigned long rtc_22:1; /* RW */ - unsigned long rtc_23:1; /* RW */ - unsigned long rtc_24:1; /* RW */ - unsigned long rtc_25:1; /* RW */ - unsigned long rtc_26:1; /* RW */ - unsigned long rtc_27:1; /* RW */ - unsigned long rtc_28:1; /* RW */ - unsigned long rtc_29:1; /* RW */ - unsigned long rtc_30:1; /* RW */ - unsigned long rtc_31:1; /* RW */ - unsigned long rsvd_32_63:32; + /* UV2 unique struct */ + struct uv2h_rtc1_int_config_s { + unsigned long vector_:8; /* RW */ + unsigned long dm:3; /* RW */ + unsigned long destmode:1; /* RW */ + unsigned long status:1; /* RO */ + unsigned long p:1; /* RO */ + unsigned long rsvd_14:1; + unsigned long t:1; /* RO */ + unsigned long m:1; /* RW */ + unsigned long rsvd_17_31:15; + unsigned long apic_id:32; /* RW */ } s2; - struct uv3h_event_occurred2_s { - unsigned long rtc_0:1; /* RW */ - unsigned long rtc_1:1; /* RW */ - unsigned long rtc_2:1; /* RW */ - unsigned long rtc_3:1; /* RW */ - unsigned long rtc_4:1; /* RW */ - unsigned long rtc_5:1; /* RW */ - unsigned long rtc_6:1; /* RW */ - unsigned long rtc_7:1; /* RW */ - unsigned long rtc_8:1; /* RW */ - unsigned long rtc_9:1; /* RW */ - unsigned long rtc_10:1; /* RW */ - unsigned long rtc_11:1; /* RW */ - unsigned long rtc_12:1; /* RW */ - unsigned long rtc_13:1; /* RW */ - unsigned long rtc_14:1; /* RW */ - unsigned long rtc_15:1; /* RW */ - unsigned long rtc_16:1; /* RW */ - unsigned long rtc_17:1; /* RW */ - unsigned long rtc_18:1; /* RW */ - unsigned long rtc_19:1; /* RW */ - unsigned long rtc_20:1; /* RW */ - unsigned long rtc_21:1; /* RW */ - unsigned long rtc_22:1; /* RW */ - unsigned long rtc_23:1; /* RW */ - unsigned long rtc_24:1; /* RW */ - unsigned long rtc_25:1; /* RW */ - unsigned long rtc_26:1; /* RW */ - unsigned long rtc_27:1; /* RW */ - unsigned long rtc_28:1; /* RW */ - unsigned long rtc_29:1; /* RW */ - unsigned long rtc_30:1; /* RW */ - unsigned long rtc_31:1; /* RW */ - unsigned long rsvd_32_63:32; - } s3; - struct uv4h_event_occurred2_s { - unsigned long message_accelerator_int0:1; /* RW */ - unsigned long message_accelerator_int1:1; /* RW */ - unsigned long message_accelerator_int2:1; /* RW */ - unsigned long message_accelerator_int3:1; /* RW */ - unsigned long message_accelerator_int4:1; /* RW */ - unsigned long message_accelerator_int5:1; /* RW */ - unsigned long message_accelerator_int6:1; /* RW */ - unsigned long message_accelerator_int7:1; /* RW */ - unsigned long message_accelerator_int8:1; /* RW */ - unsigned long message_accelerator_int9:1; /* RW */ - unsigned long message_accelerator_int10:1; /* RW */ - unsigned long message_accelerator_int11:1; /* RW */ - unsigned long message_accelerator_int12:1; /* RW */ - unsigned long message_accelerator_int13:1; /* RW */ - unsigned long message_accelerator_int14:1; /* RW */ - unsigned long message_accelerator_int15:1; /* RW */ - unsigned long rtc_interval_int:1; /* RW */ - unsigned long bau_dashboard_int:1; /* RW */ - unsigned long rtc_0:1; /* RW */ - unsigned long rtc_1:1; /* RW */ - unsigned long rtc_2:1; /* RW */ - unsigned long rtc_3:1; /* RW */ - unsigned long rtc_4:1; /* RW */ - unsigned long rtc_5:1; /* RW */ - unsigned long rtc_6:1; /* RW */ - unsigned long rtc_7:1; /* RW */ - unsigned long rtc_8:1; /* RW */ - unsigned long rtc_9:1; /* RW */ - unsigned long rtc_10:1; /* RW */ - unsigned long rtc_11:1; /* RW */ - unsigned long rtc_12:1; /* RW */ - unsigned long rtc_13:1; /* RW */ - unsigned long rtc_14:1; /* RW */ - unsigned long rtc_15:1; /* RW */ - unsigned long rtc_16:1; /* RW */ - unsigned long rtc_17:1; /* RW */ - unsigned long rtc_18:1; /* RW */ - unsigned long rtc_19:1; /* RW */ - unsigned long rtc_20:1; /* RW */ - unsigned long rtc_21:1; /* RW */ - unsigned long rtc_22:1; /* RW */ - unsigned long rtc_23:1; /* RW */ - unsigned long rtc_24:1; /* RW */ - unsigned long rtc_25:1; /* RW */ - unsigned long rtc_26:1; /* RW */ - unsigned long rtc_27:1; /* RW */ - unsigned long rtc_28:1; /* RW */ - unsigned long rtc_29:1; /* RW */ - unsigned long rtc_30:1; /* RW */ - unsigned long rtc_31:1; /* RW */ - unsigned long rsvd_50_63:14; - } s4; }; /* ========================================================================= */ -/* UVXH_EVENT_OCCURRED2_ALIAS */ +/* UVH_SCRATCH5 */ /* ========================================================================= */ -#define UVXH_EVENT_OCCURRED2_ALIAS 0x70108UL - -#define UV2H_EVENT_OCCURRED2_ALIAS_32 0xb70 -#define UV3H_EVENT_OCCURRED2_ALIAS_32 0xb70 -#define UV4H_EVENT_OCCURRED2_ALIAS_32 0x610 -#define UVH_EVENT_OCCURRED2_ALIAS_32 ( \ - is_uv2_hub() ? UV2H_EVENT_OCCURRED2_ALIAS_32 : \ - is_uv3_hub() ? UV3H_EVENT_OCCURRED2_ALIAS_32 : \ - /*is_uv4_hub*/ UV4H_EVENT_OCCURRED2_ALIAS_32) +#define UVH_SCRATCH5 ( \ + is_uv(UV5) ? 0xb0200UL : \ + is_uv(UV4) ? 0xb0200UL : \ + is_uv(UV3) ? 0x2d0200UL : \ + is_uv(UV2) ? 0x2d0200UL : \ + 0) +#define UV5H_SCRATCH5 0xb0200UL +#define UV4H_SCRATCH5 0xb0200UL +#define UV3H_SCRATCH5 0x2d0200UL +#define UV2H_SCRATCH5 0x2d0200UL +/* UVH common defines*/ +#define UVH_SCRATCH5_SCRATCH5_SHFT 0 +#define UVH_SCRATCH5_SCRATCH5_MASK 0xffffffffffffffffUL -/* ========================================================================= */ -/* UVXH_LB_BAU_SB_ACTIVATION_STATUS_2 */ -/* ========================================================================= */ -#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2 0x320130UL -#define UV3H_LB_BAU_SB_ACTIVATION_STATUS_2 0x320130UL -#define UV4H_LB_BAU_SB_ACTIVATION_STATUS_2 0xc8130UL -#define UVH_LB_BAU_SB_ACTIVATION_STATUS_2 ( \ - is_uv2_hub() ? UV2H_LB_BAU_SB_ACTIVATION_STATUS_2 : \ - is_uv3_hub() ? UV3H_LB_BAU_SB_ACTIVATION_STATUS_2 : \ - /*is_uv4_hub*/ UV4H_LB_BAU_SB_ACTIVATION_STATUS_2) +/* UVXH common defines */ +#define UVXH_SCRATCH5_SCRATCH5_SHFT 0 +#define UVXH_SCRATCH5_SCRATCH5_MASK 0xffffffffffffffffUL -#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_32 0x9f0 -#define UV3H_LB_BAU_SB_ACTIVATION_STATUS_2_32 0x9f0 -#define UV4H_LB_BAU_SB_ACTIVATION_STATUS_2_32 0xa10 -#define UVH_LB_BAU_SB_ACTIVATION_STATUS_2_32 ( \ - is_uv2_hub() ? UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_32 : \ - is_uv3_hub() ? UV3H_LB_BAU_SB_ACTIVATION_STATUS_2_32 : \ - /*is_uv4_hub*/ UV4H_LB_BAU_SB_ACTIVATION_STATUS_2_32) +/* UVYH common defines */ +#define UVYH_SCRATCH5_SCRATCH5_SHFT 0 +#define UVYH_SCRATCH5_SCRATCH5_MASK 0xffffffffffffffffUL -#define UVXH_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_SHFT 0 -#define UVXH_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_MASK 0xffffffffffffffffUL +/* UV5 unique defines */ +#define UV5H_SCRATCH5_SCRATCH5_SHFT 0 +#define UV5H_SCRATCH5_SCRATCH5_MASK 0xffffffffffffffffUL -#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_SHFT 0 -#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_MASK 0xffffffffffffffffUL +/* UV4 unique defines */ +#define UV4H_SCRATCH5_SCRATCH5_SHFT 0 +#define UV4H_SCRATCH5_SCRATCH5_MASK 0xffffffffffffffffUL -#define UV3H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_SHFT 0 -#define UV3H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_MASK 0xffffffffffffffffUL +/* UV3 unique defines */ +#define UV3H_SCRATCH5_SCRATCH5_SHFT 0 +#define UV3H_SCRATCH5_SCRATCH5_MASK 0xffffffffffffffffUL -#define UV4H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_SHFT 0 -#define UV4H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_MASK 0xffffffffffffffffUL +/* UV2 unique defines */ +#define UV2H_SCRATCH5_SCRATCH5_SHFT 0 +#define UV2H_SCRATCH5_SCRATCH5_MASK 0xffffffffffffffffUL -union uvxh_lb_bau_sb_activation_status_2_u { +union uvh_scratch5_u { unsigned long v; - struct uvxh_lb_bau_sb_activation_status_2_s { - unsigned long aux_error:64; /* RW */ - } sx; - struct uv2h_lb_bau_sb_activation_status_2_s { - unsigned long aux_error:64; /* RW */ - } s2; - struct uv3h_lb_bau_sb_activation_status_2_s { - unsigned long aux_error:64; /* RW */ - } s3; - struct uv4h_lb_bau_sb_activation_status_2_s { - unsigned long aux_error:64; /* RW */ - } s4; -}; -/* ========================================================================= */ -/* UV3H_GR0_GAM_GR_CONFIG */ -/* ========================================================================= */ -#define UV3H_GR0_GAM_GR_CONFIG 0xc00028UL - -#define UV3H_GR0_GAM_GR_CONFIG_M_SKT_SHFT 0 -#define UV3H_GR0_GAM_GR_CONFIG_SUBSPACE_SHFT 10 -#define UV3H_GR0_GAM_GR_CONFIG_M_SKT_MASK 0x000000000000003fUL -#define UV3H_GR0_GAM_GR_CONFIG_SUBSPACE_MASK 0x0000000000000400UL + /* UVH common struct */ + struct uvh_scratch5_s { + unsigned long scratch5:64; /* RW */ + } s; -union uv3h_gr0_gam_gr_config_u { - unsigned long v; - struct uv3h_gr0_gam_gr_config_s { - unsigned long m_skt:6; /* RW */ - unsigned long undef_6_9:4; /* Undefined */ - unsigned long subspace:1; /* RW */ - unsigned long reserved:53; - } s3; -}; + /* UVXH common struct */ + struct uvxh_scratch5_s { + unsigned long scratch5:64; /* RW */ + } sx; -/* ========================================================================= */ -/* UV4H_LB_PROC_INTD_QUEUE_FIRST */ -/* ========================================================================= */ -#define UV4H_LB_PROC_INTD_QUEUE_FIRST 0xa4100UL + /* UVYH common struct */ + struct uvyh_scratch5_s { + unsigned long scratch5:64; /* RW */ + } sy; -#define UV4H_LB_PROC_INTD_QUEUE_FIRST_FIRST_PAYLOAD_ADDRESS_SHFT 6 -#define UV4H_LB_PROC_INTD_QUEUE_FIRST_FIRST_PAYLOAD_ADDRESS_MASK 0x00003fffffffffc0UL + /* UV5 unique struct */ + struct uv5h_scratch5_s { + unsigned long scratch5:64; /* RW */ + } s5; -union uv4h_lb_proc_intd_queue_first_u { - unsigned long v; - struct uv4h_lb_proc_intd_queue_first_s { - unsigned long undef_0_5:6; /* Undefined */ - unsigned long first_payload_address:40; /* RW */ + /* UV4 unique struct */ + struct uv4h_scratch5_s { + unsigned long scratch5:64; /* RW */ } s4; -}; -/* ========================================================================= */ -/* UV4H_LB_PROC_INTD_QUEUE_LAST */ -/* ========================================================================= */ -#define UV4H_LB_PROC_INTD_QUEUE_LAST 0xa4108UL - -#define UV4H_LB_PROC_INTD_QUEUE_LAST_LAST_PAYLOAD_ADDRESS_SHFT 5 -#define UV4H_LB_PROC_INTD_QUEUE_LAST_LAST_PAYLOAD_ADDRESS_MASK 0x00003fffffffffe0UL + /* UV3 unique struct */ + struct uv3h_scratch5_s { + unsigned long scratch5:64; /* RW */ + } s3; -union uv4h_lb_proc_intd_queue_last_u { - unsigned long v; - struct uv4h_lb_proc_intd_queue_last_s { - unsigned long undef_0_4:5; /* Undefined */ - unsigned long last_payload_address:41; /* RW */ - } s4; + /* UV2 unique struct */ + struct uv2h_scratch5_s { + unsigned long scratch5:64; /* RW */ + } s2; }; /* ========================================================================= */ -/* UV4H_LB_PROC_INTD_SOFT_ACK_CLEAR */ +/* UVH_SCRATCH5_ALIAS */ /* ========================================================================= */ -#define UV4H_LB_PROC_INTD_SOFT_ACK_CLEAR 0xa4118UL +#define UVH_SCRATCH5_ALIAS ( \ + is_uv(UV5) ? 0xb0208UL : \ + is_uv(UV4) ? 0xb0208UL : \ + is_uv(UV3) ? 0x2d0208UL : \ + is_uv(UV2) ? 0x2d0208UL : \ + 0) +#define UV5H_SCRATCH5_ALIAS 0xb0208UL +#define UV4H_SCRATCH5_ALIAS 0xb0208UL +#define UV3H_SCRATCH5_ALIAS 0x2d0208UL +#define UV2H_SCRATCH5_ALIAS 0x2d0208UL -#define UV4H_LB_PROC_INTD_SOFT_ACK_CLEAR_SOFT_ACK_PENDING_FLAGS_SHFT 0 -#define UV4H_LB_PROC_INTD_SOFT_ACK_CLEAR_SOFT_ACK_PENDING_FLAGS_MASK 0x00000000000000ffUL - -union uv4h_lb_proc_intd_soft_ack_clear_u { - unsigned long v; - struct uv4h_lb_proc_intd_soft_ack_clear_s { - unsigned long soft_ack_pending_flags:8; /* WP */ - } s4; -}; /* ========================================================================= */ -/* UV4H_LB_PROC_INTD_SOFT_ACK_PENDING */ +/* UVH_SCRATCH5_ALIAS_2 */ /* ========================================================================= */ -#define UV4H_LB_PROC_INTD_SOFT_ACK_PENDING 0xa4110UL - -#define UV4H_LB_PROC_INTD_SOFT_ACK_PENDING_SOFT_ACK_FLAGS_SHFT 0 -#define UV4H_LB_PROC_INTD_SOFT_ACK_PENDING_SOFT_ACK_FLAGS_MASK 0x00000000000000ffUL +#define UVH_SCRATCH5_ALIAS_2 ( \ + is_uv(UV5) ? 0xb0210UL : \ + is_uv(UV4) ? 0xb0210UL : \ + is_uv(UV3) ? 0x2d0210UL : \ + is_uv(UV2) ? 0x2d0210UL : \ + 0) +#define UV5H_SCRATCH5_ALIAS_2 0xb0210UL +#define UV4H_SCRATCH5_ALIAS_2 0xb0210UL +#define UV3H_SCRATCH5_ALIAS_2 0x2d0210UL +#define UV2H_SCRATCH5_ALIAS_2 0x2d0210UL -union uv4h_lb_proc_intd_soft_ack_pending_u { - unsigned long v; - struct uv4h_lb_proc_intd_soft_ack_pending_s { - unsigned long soft_ack_flags:8; /* RW */ - } s4; -}; #endif /* _ASM_X86_UV_UV_MMRS_H */ diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index cd7de4b401fe..f8ba5289ecb0 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -52,7 +52,7 @@ #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES VMCS_CONTROL_BIT(VIRT_APIC_ACCESSES) #define SECONDARY_EXEC_ENABLE_EPT VMCS_CONTROL_BIT(EPT) #define SECONDARY_EXEC_DESC VMCS_CONTROL_BIT(DESC_EXITING) -#define SECONDARY_EXEC_RDTSCP VMCS_CONTROL_BIT(RDTSCP) +#define SECONDARY_EXEC_ENABLE_RDTSCP VMCS_CONTROL_BIT(RDTSCP) #define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE VMCS_CONTROL_BIT(VIRTUAL_X2APIC) #define SECONDARY_EXEC_ENABLE_VPID VMCS_CONTROL_BIT(VPID) #define SECONDARY_EXEC_WBINVD_EXITING VMCS_CONTROL_BIT(WBINVD_EXITING) diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 6807153c0410..dde5b3f1e7cd 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -4,29 +4,22 @@ #include <asm/bootparam.h> +struct ghcb; struct mpc_bus; struct mpc_cpu; +struct pt_regs; struct mpc_table; struct cpuinfo_x86; +struct irq_domain; /** * struct x86_init_mpparse - platform specific mpparse ops - * @mpc_record: platform specific mpc record accounting * @setup_ioapic_ids: platform specific ioapic id override - * @mpc_apic_id: platform specific mpc apic id assignment - * @smp_read_mpc_oem: platform specific oem mpc table setup - * @mpc_oem_pci_bus: platform specific pci bus setup (default NULL) - * @mpc_oem_bus_info: platform specific mpc bus info * @find_smp_config: find the smp configuration * @get_smp_config: get the smp configuration */ struct x86_init_mpparse { - void (*mpc_record)(unsigned int mode); void (*setup_ioapic_ids)(void); - int (*mpc_apic_id)(struct mpc_cpu *m); - void (*smp_read_mpc_oem)(struct mpc_table *mpc); - void (*mpc_oem_pci_bus)(struct mpc_bus *m); - void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name); void (*find_smp_config)(void); void (*get_smp_config)(unsigned int early); }; @@ -52,12 +45,14 @@ struct x86_init_resources { * @intr_init: interrupt init code * @intr_mode_select: interrupt delivery mode selection * @intr_mode_init: interrupt delivery mode setup + * @create_pci_msi_domain: Create the PCI/MSI interrupt domain */ struct x86_init_irqs { void (*pre_vector_init)(void); void (*intr_init)(void); void (*intr_mode_select)(void); void (*intr_mode_init)(void); + struct irq_domain *(*create_pci_msi_domain)(void); }; /** @@ -236,10 +231,22 @@ struct x86_legacy_features { /** * struct x86_hyper_runtime - x86 hypervisor specific runtime callbacks * - * @pin_vcpu: pin current vcpu to specified physical cpu (run rarely) + * @pin_vcpu: pin current vcpu to specified physical + * cpu (run rarely) + * @sev_es_hcall_prepare: Load additional hypervisor-specific + * state into the GHCB when doing a VMMCALL under + * SEV-ES. Called from the #VC exception handler. + * @sev_es_hcall_finish: Copies state from the GHCB back into the + * processor (or pt_regs). Also runs checks on the + * state returned from the hypervisor after a + * VMMCALL under SEV-ES. Needs to return 'false' + * if the checks fail. Called from the #VC + * exception handler. */ struct x86_hyper_runtime { void (*pin_vcpu)(int cpu); + void (*sev_es_hcall_prepare)(struct ghcb *ghcb, struct pt_regs *regs); + bool (*sev_es_hcall_finish)(struct ghcb *ghcb, struct pt_regs *regs); }; /** @@ -283,9 +290,6 @@ struct x86_platform_ops { struct pci_dev; struct x86_msi_ops { - int (*setup_msi_irqs)(struct pci_dev *dev, int nvec, int type); - void (*teardown_msi_irq)(unsigned int irq); - void (*teardown_msi_irqs)(struct pci_dev *dev); void (*restore_msi_irqs)(struct pci_dev *dev); }; diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 0780f97c1850..89e5f3d1bba8 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -192,6 +192,26 @@ struct kvm_msr_list { __u32 indices[0]; }; +/* Maximum size of any access bitmap in bytes */ +#define KVM_MSR_FILTER_MAX_BITMAP_SIZE 0x600 + +/* for KVM_X86_SET_MSR_FILTER */ +struct kvm_msr_filter_range { +#define KVM_MSR_FILTER_READ (1 << 0) +#define KVM_MSR_FILTER_WRITE (1 << 1) + __u32 flags; + __u32 nmsrs; /* number of msrs in bitmap */ + __u32 base; /* MSR index the bitmap starts at */ + __u8 *bitmap; /* a 1 bit allows the operations in flags, 0 denies */ +}; + +#define KVM_MSR_FILTER_MAX_RANGES 16 +struct kvm_msr_filter { +#define KVM_MSR_FILTER_DEFAULT_ALLOW (0 << 0) +#define KVM_MSR_FILTER_DEFAULT_DENY (1 << 0) + __u32 flags; + struct kvm_msr_filter_range ranges[KVM_MSR_FILTER_MAX_RANGES]; +}; struct kvm_cpuid_entry { __u32 function; diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 812e9b4c1114..950afebfba88 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -32,6 +32,7 @@ #define KVM_FEATURE_POLL_CONTROL 12 #define KVM_FEATURE_PV_SCHED_YIELD 13 #define KVM_FEATURE_ASYNC_PF_INT 14 +#define KVM_FEATURE_MSI_EXT_DEST_ID 15 #define KVM_HINTS_REALTIME 0 diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h index 2e8a30f06c74..f1d8307454e0 100644 --- a/arch/x86/include/uapi/asm/svm.h +++ b/arch/x86/include/uapi/asm/svm.h @@ -29,6 +29,7 @@ #define SVM_EXIT_WRITE_DR6 0x036 #define SVM_EXIT_WRITE_DR7 0x037 #define SVM_EXIT_EXCP_BASE 0x040 +#define SVM_EXIT_LAST_EXCP 0x05f #define SVM_EXIT_INTR 0x060 #define SVM_EXIT_NMI 0x061 #define SVM_EXIT_SMI 0x062 @@ -76,10 +77,21 @@ #define SVM_EXIT_MWAIT_COND 0x08c #define SVM_EXIT_XSETBV 0x08d #define SVM_EXIT_RDPRU 0x08e +#define SVM_EXIT_INVPCID 0x0a2 #define SVM_EXIT_NPF 0x400 #define SVM_EXIT_AVIC_INCOMPLETE_IPI 0x401 #define SVM_EXIT_AVIC_UNACCELERATED_ACCESS 0x402 +/* SEV-ES software-defined VMGEXIT events */ +#define SVM_VMGEXIT_MMIO_READ 0x80000001 +#define SVM_VMGEXIT_MMIO_WRITE 0x80000002 +#define SVM_VMGEXIT_NMI_COMPLETE 0x80000003 +#define SVM_VMGEXIT_AP_HLT_LOOP 0x80000004 +#define SVM_VMGEXIT_AP_JUMP_TABLE 0x80000005 +#define SVM_VMGEXIT_SET_AP_JUMP_TABLE 0 +#define SVM_VMGEXIT_GET_AP_JUMP_TABLE 1 +#define SVM_VMGEXIT_UNSUPPORTED_EVENT 0x8000ffff + #define SVM_EXIT_ERR -1 #define SVM_EXIT_REASONS \ @@ -171,6 +183,7 @@ { SVM_EXIT_MONITOR, "monitor" }, \ { SVM_EXIT_MWAIT, "mwait" }, \ { SVM_EXIT_XSETBV, "xsetbv" }, \ + { SVM_EXIT_INVPCID, "invpcid" }, \ { SVM_EXIT_NPF, "npf" }, \ { SVM_EXIT_AVIC_INCOMPLETE_IPI, "avic_incomplete_ipi" }, \ { SVM_EXIT_AVIC_UNACCELERATED_ACCESS, "avic_unaccelerated_access" }, \ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index e77261db2391..68608bd892c0 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -20,6 +20,7 @@ CFLAGS_REMOVE_kvmclock.o = -pg CFLAGS_REMOVE_ftrace.o = -pg CFLAGS_REMOVE_early_printk.o = -pg CFLAGS_REMOVE_head64.o = -pg +CFLAGS_REMOVE_sev-es.o = -pg endif KASAN_SANITIZE_head$(BITS).o := n @@ -27,6 +28,7 @@ KASAN_SANITIZE_dumpstack.o := n KASAN_SANITIZE_dumpstack_$(BITS).o := n KASAN_SANITIZE_stacktrace.o := n KASAN_SANITIZE_paravirt.o := n +KASAN_SANITIZE_sev-es.o := n # With some compiler versions the generated code results in boot hangs, caused # by several compilation units. To be safe, disable all instrumentation. @@ -45,6 +47,8 @@ endif # non-deterministic coverage. KCOV_INSTRUMENT := n +CFLAGS_head$(BITS).o += -fno-stack-protector + CFLAGS_irq.o := -I $(srctree)/$(src)/../include/asm/trace obj-y := process_$(BITS).o signal.o @@ -68,6 +72,7 @@ obj-y += tsc.o tsc_msr.o io_delay.o rtc.o obj-y += pci-iommu_table.o obj-y += resource.o obj-y += irqflags.o +obj-y += static_call.o obj-y += process.o obj-y += fpu/ @@ -145,6 +150,7 @@ obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o +obj-$(CONFIG_AMD_MEM_ENCRYPT) += sev-es.o ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index cdaab30880b9..4adbe65afe23 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1103,6 +1103,10 @@ noinstr int poke_int3_handler(struct pt_regs *regs) */ goto out_put; + case RET_INSN_OPCODE: + int3_emulate_ret(regs); + break; + case CALL_INSN_OPCODE: int3_emulate_call(regs, (long)ip + tp->rel32); break; @@ -1277,6 +1281,7 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, switch (tp->opcode) { case INT3_INSN_OPCODE: + case RET_INSN_OPCODE: break; case CALL_INSN_OPCODE: diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index e89031e9c847..9ac696487b13 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -32,6 +32,7 @@ #include <linux/gfp.h> #include <linux/atomic.h> #include <linux/dma-direct.h> +#include <linux/dma-map-ops.h> #include <asm/mtrr.h> #include <asm/proto.h> #include <asm/iommu.h> @@ -96,8 +97,7 @@ static unsigned long alloc_iommu(struct device *dev, int size, base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev), PAGE_SIZE) >> PAGE_SHIFT; - boundary_size = ALIGN((u64)dma_get_seg_boundary(dev) + 1, - PAGE_SIZE) >> PAGE_SHIFT; + boundary_size = dma_get_seg_boundary_nr_pages(dev, PAGE_SHIFT); spin_lock_irqsave(&iommu_bitmap_lock, flags); offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit, @@ -468,7 +468,7 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, { void *vaddr; - vaddr = dma_direct_alloc_pages(dev, size, dma_addr, flag, attrs); + vaddr = dma_direct_alloc(dev, size, dma_addr, flag, attrs); if (!vaddr || !force_iommu || dev->coherent_dma_mask <= DMA_BIT_MASK(24)) return vaddr; @@ -480,7 +480,7 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, goto out_free; return vaddr; out_free: - dma_direct_free_pages(dev, size, vaddr, *dma_addr, attrs); + dma_direct_free(dev, size, vaddr, *dma_addr, attrs); return NULL; } @@ -490,7 +490,7 @@ gart_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_addr, unsigned long attrs) { gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, 0); - dma_direct_free_pages(dev, size, vaddr, dma_addr, attrs); + dma_direct_free(dev, size, vaddr, dma_addr, attrs); } static int no_agp; @@ -678,6 +678,8 @@ static const struct dma_map_ops gart_dma_ops = { .get_sgtable = dma_common_get_sgtable, .dma_supported = dma_direct_supported, .get_required_mask = dma_direct_get_required_mask, + .alloc_pages = dma_direct_alloc_pages, + .free_pages = dma_direct_free_pages, }; static void gart_iommu_shutdown(void) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 5f943b938167..b3eef1d5c903 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1429,6 +1429,9 @@ void __init apic_intr_mode_init(void) break; } + if (x86_platform.apic_post_init) + x86_platform.apic_post_init(); + apic_bsp_setup(upmode); } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 779a89e31c4c..7b3c7e0d4a09 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -860,10 +860,10 @@ void ioapic_set_alloc_attr(struct irq_alloc_info *info, int node, { init_irq_alloc_info(info, NULL); info->type = X86_IRQ_ALLOC_TYPE_IOAPIC; - info->ioapic_node = node; - info->ioapic_trigger = trigger; - info->ioapic_polarity = polarity; - info->ioapic_valid = 1; + info->ioapic.node = node; + info->ioapic.trigger = trigger; + info->ioapic.polarity = polarity; + info->ioapic.valid = 1; } #ifndef CONFIG_ACPI @@ -878,32 +878,32 @@ static void ioapic_copy_alloc_attr(struct irq_alloc_info *dst, copy_irq_alloc_info(dst, src); dst->type = X86_IRQ_ALLOC_TYPE_IOAPIC; - dst->ioapic_id = mpc_ioapic_id(ioapic_idx); - dst->ioapic_pin = pin; - dst->ioapic_valid = 1; - if (src && src->ioapic_valid) { - dst->ioapic_node = src->ioapic_node; - dst->ioapic_trigger = src->ioapic_trigger; - dst->ioapic_polarity = src->ioapic_polarity; + dst->devid = mpc_ioapic_id(ioapic_idx); + dst->ioapic.pin = pin; + dst->ioapic.valid = 1; + if (src && src->ioapic.valid) { + dst->ioapic.node = src->ioapic.node; + dst->ioapic.trigger = src->ioapic.trigger; + dst->ioapic.polarity = src->ioapic.polarity; } else { - dst->ioapic_node = NUMA_NO_NODE; + dst->ioapic.node = NUMA_NO_NODE; if (acpi_get_override_irq(gsi, &trigger, &polarity) >= 0) { - dst->ioapic_trigger = trigger; - dst->ioapic_polarity = polarity; + dst->ioapic.trigger = trigger; + dst->ioapic.polarity = polarity; } else { /* * PCI interrupts are always active low level * triggered. */ - dst->ioapic_trigger = IOAPIC_LEVEL; - dst->ioapic_polarity = IOAPIC_POL_LOW; + dst->ioapic.trigger = IOAPIC_LEVEL; + dst->ioapic.polarity = IOAPIC_POL_LOW; } } } static int ioapic_alloc_attr_node(struct irq_alloc_info *info) { - return (info && info->ioapic_valid) ? info->ioapic_node : NUMA_NO_NODE; + return (info && info->ioapic.valid) ? info->ioapic.node : NUMA_NO_NODE; } static void mp_register_handler(unsigned int irq, unsigned long trigger) @@ -933,14 +933,14 @@ static bool mp_check_pin_attr(int irq, struct irq_alloc_info *info) * pin with real trigger and polarity attributes. */ if (irq < nr_legacy_irqs() && data->count == 1) { - if (info->ioapic_trigger != data->trigger) - mp_register_handler(irq, info->ioapic_trigger); - data->entry.trigger = data->trigger = info->ioapic_trigger; - data->entry.polarity = data->polarity = info->ioapic_polarity; + if (info->ioapic.trigger != data->trigger) + mp_register_handler(irq, info->ioapic.trigger); + data->entry.trigger = data->trigger = info->ioapic.trigger; + data->entry.polarity = data->polarity = info->ioapic.polarity; } - return data->trigger == info->ioapic_trigger && - data->polarity == info->ioapic_polarity; + return data->trigger == info->ioapic.trigger && + data->polarity == info->ioapic.polarity; } static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi, @@ -1002,7 +1002,7 @@ static int alloc_isa_irq_from_domain(struct irq_domain *domain, if (!mp_check_pin_attr(irq, info)) return -EBUSY; if (__add_pin_to_irq_node(irq_data->chip_data, node, ioapic, - info->ioapic_pin)) + info->ioapic.pin)) return -ENOMEM; } else { info->flags |= X86_IRQ_ALLOC_LEGACY; @@ -2092,8 +2092,8 @@ static int mp_alloc_timer_irq(int ioapic, int pin) struct irq_alloc_info info; ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 0, 0); - info.ioapic_id = mpc_ioapic_id(ioapic); - info.ioapic_pin = pin; + info.devid = mpc_ioapic_id(ioapic); + info.ioapic.pin = pin; mutex_lock(&ioapic_mutex); irq = alloc_isa_irq_from_domain(domain, 0, ioapic, pin, &info); mutex_unlock(&ioapic_mutex); @@ -2243,6 +2243,7 @@ static inline void __init check_timer(void) legacy_pic->init(0); legacy_pic->make_irq(0); apic_write(APIC_LVT0, APIC_DM_EXTINT); + legacy_pic->unmask(0); unlock_ExtINT_logic(); @@ -2296,9 +2297,9 @@ static int mp_irqdomain_create(int ioapic) return 0; init_irq_alloc_info(&info, NULL); - info.type = X86_IRQ_ALLOC_TYPE_IOAPIC; - info.ioapic_id = mpc_ioapic_id(ioapic); - parent = irq_remapping_get_ir_irq_domain(&info); + info.type = X86_IRQ_ALLOC_TYPE_IOAPIC_GET_PARENT; + info.devid = mpc_ioapic_id(ioapic); + parent = irq_remapping_get_irq_domain(&info); if (!parent) parent = x86_vector_domain; else @@ -2932,9 +2933,9 @@ int mp_ioapic_registered(u32 gsi_base) static void mp_irqdomain_get_attr(u32 gsi, struct mp_chip_data *data, struct irq_alloc_info *info) { - if (info && info->ioapic_valid) { - data->trigger = info->ioapic_trigger; - data->polarity = info->ioapic_polarity; + if (info && info->ioapic.valid) { + data->trigger = info->ioapic.trigger; + data->polarity = info->ioapic.polarity; } else if (acpi_get_override_irq(gsi, &data->trigger, &data->polarity) < 0) { /* PCI interrupts are always active low level triggered. */ @@ -2980,7 +2981,7 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, return -EINVAL; ioapic = mp_irqdomain_ioapic_idx(domain); - pin = info->ioapic_pin; + pin = info->ioapic.pin; if (irq_find_mapping(domain, (irq_hw_number_t)pin) > 0) return -EEXIST; @@ -2988,7 +2989,7 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, if (!data) return -ENOMEM; - info->ioapic_entry = &data->entry; + info->ioapic.entry = &data->entry; ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info); if (ret < 0) { kfree(data); @@ -2996,7 +2997,7 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, } INIT_LIST_HEAD(&data->irq_2_pin); - irq_data->hwirq = info->ioapic_pin; + irq_data->hwirq = info->ioapic.pin; irq_data->chip = (domain->parent == x86_vector_domain) ? &ioapic_chip : &ioapic_ir_chip; irq_data->chip_data = data; @@ -3006,8 +3007,8 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, add_pin_to_irq_node(data, ioapic_alloc_attr_node(info), ioapic, pin); local_irq_save(flags); - if (info->ioapic_entry) - mp_setup_entry(cfg, data, info->ioapic_entry); + if (info->ioapic.entry) + mp_setup_entry(cfg, data, info->ioapic.entry); mp_register_handler(virq, data->trigger); if (virq < nr_legacy_irqs()) legacy_pic->mask(virq); diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index c2b2911feeef..6313f0a05db7 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -21,7 +21,7 @@ #include <asm/apic.h> #include <asm/irq_remapping.h> -static struct irq_domain *msi_default_domain; +struct irq_domain *x86_pci_msi_default_domain __ro_after_init; static void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg) { @@ -45,7 +45,7 @@ static void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg) MSI_DATA_VECTOR(cfg->vector); } -static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) +void x86_vector_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) { __irq_msi_compose_msg(irqd_cfg(data), msg); } @@ -177,40 +177,10 @@ static struct irq_chip pci_msi_controller = { .irq_mask = pci_msi_mask_irq, .irq_ack = irq_chip_ack_parent, .irq_retrigger = irq_chip_retrigger_hierarchy, - .irq_compose_msi_msg = irq_msi_compose_msg, .irq_set_affinity = msi_set_affinity, .flags = IRQCHIP_SKIP_SET_WAKE, }; -int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) -{ - struct irq_domain *domain; - struct irq_alloc_info info; - - init_irq_alloc_info(&info, NULL); - info.type = X86_IRQ_ALLOC_TYPE_MSI; - info.msi_dev = dev; - - domain = irq_remapping_get_irq_domain(&info); - if (domain == NULL) - domain = msi_default_domain; - if (domain == NULL) - return -ENOSYS; - - return msi_domain_alloc_irqs(domain, &dev->dev, nvec); -} - -void native_teardown_msi_irq(unsigned int irq) -{ - irq_domain_free_irqs(irq, 1); -} - -static irq_hw_number_t pci_msi_get_hwirq(struct msi_domain_info *info, - msi_alloc_info_t *arg) -{ - return arg->msi_hwirq; -} - int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec, msi_alloc_info_t *arg) { @@ -218,11 +188,10 @@ int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec, struct msi_desc *desc = first_pci_msi_entry(pdev); init_irq_alloc_info(arg, NULL); - arg->msi_dev = pdev; if (desc->msi_attrib.is_msix) { - arg->type = X86_IRQ_ALLOC_TYPE_MSIX; + arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSIX; } else { - arg->type = X86_IRQ_ALLOC_TYPE_MSI; + arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSI; arg->flags |= X86_IRQ_ALLOC_CONTIGUOUS_VECTORS; } @@ -230,16 +199,8 @@ int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec, } EXPORT_SYMBOL_GPL(pci_msi_prepare); -void pci_msi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc) -{ - arg->msi_hwirq = pci_msi_domain_calc_hwirq(arg->msi_dev, desc); -} -EXPORT_SYMBOL_GPL(pci_msi_set_desc); - static struct msi_domain_ops pci_msi_domain_ops = { - .get_hwirq = pci_msi_get_hwirq, .msi_prepare = pci_msi_prepare, - .set_desc = pci_msi_set_desc, }; static struct msi_domain_info pci_msi_domain_info = { @@ -251,25 +212,32 @@ static struct msi_domain_info pci_msi_domain_info = { .handler_name = "edge", }; -void __init arch_init_msi_domain(struct irq_domain *parent) +struct irq_domain * __init native_create_pci_msi_domain(void) { struct fwnode_handle *fn; + struct irq_domain *d; if (disable_apic) - return; + return NULL; fn = irq_domain_alloc_named_fwnode("PCI-MSI"); - if (fn) { - msi_default_domain = - pci_msi_create_irq_domain(fn, &pci_msi_domain_info, - parent); - } - if (!msi_default_domain) { + if (!fn) + return NULL; + + d = pci_msi_create_irq_domain(fn, &pci_msi_domain_info, + x86_vector_domain); + if (!d) { irq_domain_free_fwnode(fn); - pr_warn("failed to initialize irqdomain for MSI/MSI-x.\n"); + pr_warn("Failed to initialize PCI-MSI irqdomain.\n"); } else { - msi_default_domain->flags |= IRQ_DOMAIN_MSI_NOMASK_QUIRK; + d->flags |= IRQ_DOMAIN_MSI_NOMASK_QUIRK; } + return d; +} + +void __init x86_create_pci_msi_domain(void) +{ + x86_pci_msi_default_domain = x86_init.irqs.create_pci_msi_domain(); } #ifdef CONFIG_IRQ_REMAP @@ -279,7 +247,6 @@ static struct irq_chip pci_msi_ir_controller = { .irq_mask = pci_msi_mask_irq, .irq_ack = irq_chip_ack_parent, .irq_retrigger = irq_chip_retrigger_hierarchy, - .irq_set_vcpu_affinity = irq_chip_set_vcpu_affinity_parent, .flags = IRQCHIP_SKIP_SET_WAKE, }; @@ -321,35 +288,28 @@ static struct irq_chip dmar_msi_controller = { .irq_ack = irq_chip_ack_parent, .irq_set_affinity = msi_domain_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, - .irq_compose_msi_msg = irq_msi_compose_msg, .irq_write_msi_msg = dmar_msi_write_msg, .flags = IRQCHIP_SKIP_SET_WAKE, }; -static irq_hw_number_t dmar_msi_get_hwirq(struct msi_domain_info *info, - msi_alloc_info_t *arg) -{ - return arg->dmar_id; -} - static int dmar_msi_init(struct irq_domain *domain, struct msi_domain_info *info, unsigned int virq, irq_hw_number_t hwirq, msi_alloc_info_t *arg) { - irq_domain_set_info(domain, virq, arg->dmar_id, info->chip, NULL, - handle_edge_irq, arg->dmar_data, "edge"); + irq_domain_set_info(domain, virq, arg->devid, info->chip, NULL, + handle_edge_irq, arg->data, "edge"); return 0; } static struct msi_domain_ops dmar_msi_domain_ops = { - .get_hwirq = dmar_msi_get_hwirq, .msi_init = dmar_msi_init, }; static struct msi_domain_info dmar_msi_domain_info = { .ops = &dmar_msi_domain_ops, .chip = &dmar_msi_controller, + .flags = MSI_FLAG_USE_DEF_DOM_OPS, }; static struct irq_domain *dmar_get_irq_domain(void) @@ -384,8 +344,9 @@ int dmar_alloc_hwirq(int id, int node, void *arg) init_irq_alloc_info(&info, NULL); info.type = X86_IRQ_ALLOC_TYPE_DMAR; - info.dmar_id = id; - info.dmar_data = arg; + info.devid = id; + info.hwirq = id; + info.data = arg; return irq_domain_alloc_irqs(domain, 1, node, &info); } @@ -419,24 +380,17 @@ static struct irq_chip hpet_msi_controller __ro_after_init = { .irq_ack = irq_chip_ack_parent, .irq_set_affinity = msi_domain_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, - .irq_compose_msi_msg = irq_msi_compose_msg, .irq_write_msi_msg = hpet_msi_write_msg, .flags = IRQCHIP_SKIP_SET_WAKE, }; -static irq_hw_number_t hpet_msi_get_hwirq(struct msi_domain_info *info, - msi_alloc_info_t *arg) -{ - return arg->hpet_index; -} - static int hpet_msi_init(struct irq_domain *domain, struct msi_domain_info *info, unsigned int virq, irq_hw_number_t hwirq, msi_alloc_info_t *arg) { irq_set_status_flags(virq, IRQ_MOVE_PCNTXT); - irq_domain_set_info(domain, virq, arg->hpet_index, info->chip, NULL, - handle_edge_irq, arg->hpet_data, "edge"); + irq_domain_set_info(domain, virq, arg->hwirq, info->chip, NULL, + handle_edge_irq, arg->data, "edge"); return 0; } @@ -448,7 +402,6 @@ static void hpet_msi_free(struct irq_domain *domain, } static struct msi_domain_ops hpet_msi_domain_ops = { - .get_hwirq = hpet_msi_get_hwirq, .msi_init = hpet_msi_init, .msi_free = hpet_msi_free, }; @@ -456,6 +409,7 @@ static struct msi_domain_ops hpet_msi_domain_ops = { static struct msi_domain_info hpet_msi_domain_info = { .ops = &hpet_msi_domain_ops, .chip = &hpet_msi_controller, + .flags = MSI_FLAG_USE_DEF_DOM_OPS, }; struct irq_domain *hpet_create_irq_domain(int hpet_id) @@ -476,9 +430,9 @@ struct irq_domain *hpet_create_irq_domain(int hpet_id) domain_info->data = (void *)(long)hpet_id; init_irq_alloc_info(&info, NULL); - info.type = X86_IRQ_ALLOC_TYPE_HPET; - info.hpet_id = hpet_id; - parent = irq_remapping_get_ir_irq_domain(&info); + info.type = X86_IRQ_ALLOC_TYPE_HPET_GET_PARENT; + info.devid = hpet_id; + parent = irq_remapping_get_irq_domain(&info); if (parent == NULL) parent = x86_vector_domain; else @@ -506,9 +460,9 @@ int hpet_assign_irq(struct irq_domain *domain, struct hpet_channel *hc, init_irq_alloc_info(&info, NULL); info.type = X86_IRQ_ALLOC_TYPE_HPET; - info.hpet_data = hc; - info.hpet_id = hpet_dev_id(domain); - info.hpet_index = dev_num; + info.data = hc; + info.devid = hpet_dev_id(domain); + info.hwirq = dev_num; return irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, &info); } diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 99ee61c9ba54..67b6f7c049ec 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -170,9 +170,6 @@ void __init default_setup_apic_routing(void) if (apic->setup_apic_routing) apic->setup_apic_routing(); - - if (x86_platform.apic_post_init) - x86_platform.apic_post_init(); } void __init generic_apic_probe(void) diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index bd3835d6b535..c46720f185c0 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -32,9 +32,6 @@ void __init default_setup_apic_routing(void) break; } } - - if (x86_platform.apic_post_init) - x86_platform.apic_post_init(); } int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id) diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index f8a56b5dc29f..1eac53632786 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -714,8 +714,6 @@ int __init arch_early_irq_init(void) BUG_ON(x86_vector_domain == NULL); irq_set_default_host(x86_vector_domain); - arch_init_msi_domain(x86_vector_domain); - BUG_ON(!alloc_cpumask_var(&vector_searchmask, GFP_KERNEL)); /* @@ -824,6 +822,7 @@ static struct irq_chip lapic_controller = { .name = "APIC", .irq_ack = apic_ack_edge, .irq_set_affinity = apic_set_affinity, + .irq_compose_msi_msg = x86_vector_msi_compose_msg, .irq_retrigger = apic_retrigger_irq, }; diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 0b6eea3f54e6..714233cee0b5 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -5,6 +5,7 @@ * * SGI UV APIC functions (note: not an Intel compatible APIC) * + * (C) Copyright 2020 Hewlett Packard Enterprise Development LP * Copyright (C) 2007-2014 Silicon Graphics, Inc. All rights reserved. */ #include <linux/crash_dump.h> @@ -29,19 +30,24 @@ static int uv_hubbed_system; static int uv_hubless_system; static u64 gru_start_paddr, gru_end_paddr; static union uvh_apicid uvh_apicid; +static int uv_node_id; -/* Unpack OEM/TABLE ID's to be NULL terminated strings */ +/* Unpack AT/OEM/TABLE ID's to be NULL terminated strings */ +static u8 uv_archtype[UV_AT_SIZE]; static u8 oem_id[ACPI_OEM_ID_SIZE + 1]; static u8 oem_table_id[ACPI_OEM_TABLE_ID_SIZE + 1]; -/* Information derived from CPUID: */ +/* Information derived from CPUID and some UV MMRs */ static struct { unsigned int apicid_shift; unsigned int apicid_mask; unsigned int socketid_shift; /* aka pnode_shift for UV2/3 */ unsigned int pnode_mask; + unsigned int nasid_shift; unsigned int gpa_shift; unsigned int gnode_shift; + unsigned int m_skt; + unsigned int n_skt; } uv_cpuid; static int uv_min_hub_revision_id; @@ -77,6 +83,9 @@ static unsigned long __init uv_early_read_mmr(unsigned long addr) static inline bool is_GRU_range(u64 start, u64 end) { + if (!gru_start_paddr) + return false; + return start >= gru_start_paddr && end <= gru_end_paddr; } @@ -85,43 +94,102 @@ static bool uv_is_untracked_pat_range(u64 start, u64 end) return is_ISA_range(start, end) || is_GRU_range(start, end); } -static int __init early_get_pnodeid(void) +static void __init early_get_pnodeid(void) { - union uvh_node_id_u node_id; - union uvh_rh_gam_config_mmr_u m_n_config; int pnode; - /* Currently, all blades have same revision number */ + uv_cpuid.m_skt = 0; + if (UVH_RH10_GAM_ADDR_MAP_CONFIG) { + union uvh_rh10_gam_addr_map_config_u m_n_config; + + m_n_config.v = uv_early_read_mmr(UVH_RH10_GAM_ADDR_MAP_CONFIG); + uv_cpuid.n_skt = m_n_config.s.n_skt; + uv_cpuid.nasid_shift = 0; + } else if (UVH_RH_GAM_ADDR_MAP_CONFIG) { + union uvh_rh_gam_addr_map_config_u m_n_config; + + m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_ADDR_MAP_CONFIG); + uv_cpuid.n_skt = m_n_config.s.n_skt; + if (is_uv(UV3)) + uv_cpuid.m_skt = m_n_config.s3.m_skt; + if (is_uv(UV2)) + uv_cpuid.m_skt = m_n_config.s2.m_skt; + uv_cpuid.nasid_shift = 1; + } else { + unsigned long GAM_ADDR_MAP_CONFIG = 0; + + WARN(GAM_ADDR_MAP_CONFIG == 0, + "UV: WARN: GAM_ADDR_MAP_CONFIG is not available\n"); + uv_cpuid.n_skt = 0; + uv_cpuid.nasid_shift = 0; + } + + if (is_uv(UV4|UVY)) + uv_cpuid.gnode_shift = 2; /* min partition is 4 sockets */ + + uv_cpuid.pnode_mask = (1 << uv_cpuid.n_skt) - 1; + pnode = (uv_node_id >> uv_cpuid.nasid_shift) & uv_cpuid.pnode_mask; + uv_cpuid.gpa_shift = 46; /* Default unless changed */ + + pr_info("UV: n_skt:%d pnmsk:%x pn:%x\n", + uv_cpuid.n_skt, uv_cpuid.pnode_mask, pnode); +} + +/* Running on a UV Hubbed system, determine which UV Hub Type it is */ +static int __init early_set_hub_type(void) +{ + union uvh_node_id_u node_id; + + /* + * The NODE_ID MMR is always at offset 0. + * Contains the chip part # + revision. + * Node_id field started with 15 bits, + * ... now 7 but upper 8 are masked to 0. + * All blades/nodes have the same part # and hub revision. + */ node_id.v = uv_early_read_mmr(UVH_NODE_ID); - m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_CONFIG_MMR); - uv_min_hub_revision_id = node_id.s.revision; + uv_node_id = node_id.sx.node_id; switch (node_id.s.part_number) { - case UV2_HUB_PART_NUMBER: - case UV2_HUB_PART_NUMBER_X: - uv_min_hub_revision_id += UV2_HUB_REVISION_BASE - 1; + + case UV5_HUB_PART_NUMBER: + uv_min_hub_revision_id = node_id.s.revision + + UV5_HUB_REVISION_BASE; + uv_hub_type_set(UV5); break; + + /* UV4/4A only have a revision difference */ + case UV4_HUB_PART_NUMBER: + uv_min_hub_revision_id = node_id.s.revision + + UV4_HUB_REVISION_BASE; + uv_hub_type_set(UV4); + if (uv_min_hub_revision_id == UV4A_HUB_REVISION_BASE) + uv_hub_type_set(UV4|UV4A); + break; + case UV3_HUB_PART_NUMBER: case UV3_HUB_PART_NUMBER_X: - uv_min_hub_revision_id += UV3_HUB_REVISION_BASE; + uv_min_hub_revision_id = node_id.s.revision + + UV3_HUB_REVISION_BASE; + uv_hub_type_set(UV3); break; - /* Update: UV4A has only a modified revision to indicate HUB fixes */ - case UV4_HUB_PART_NUMBER: - uv_min_hub_revision_id += UV4_HUB_REVISION_BASE - 1; - uv_cpuid.gnode_shift = 2; /* min partition is 4 sockets */ + case UV2_HUB_PART_NUMBER: + case UV2_HUB_PART_NUMBER_X: + uv_min_hub_revision_id = node_id.s.revision + + UV2_HUB_REVISION_BASE - 1; + uv_hub_type_set(UV2); break; + + default: + return 0; } - uv_hub_info->hub_revision = uv_min_hub_revision_id; - uv_cpuid.pnode_mask = (1 << m_n_config.s.n_skt) - 1; - pnode = (node_id.s.node_id >> 1) & uv_cpuid.pnode_mask; - uv_cpuid.gpa_shift = 46; /* Default unless changed */ + pr_info("UV: part#:%x rev:%d rev_id:%d UVtype:0x%x\n", + node_id.s.part_number, node_id.s.revision, + uv_min_hub_revision_id, is_uv(~0)); - pr_info("UV: rev:%d part#:%x nodeid:%04x n_skt:%d pnmsk:%x pn:%x\n", - node_id.s.revision, node_id.s.part_number, node_id.s.node_id, - m_n_config.s.n_skt, uv_cpuid.pnode_mask, pnode); - return pnode; + return 1; } static void __init uv_tsc_check_sync(void) @@ -130,38 +198,41 @@ static void __init uv_tsc_check_sync(void) int sync_state; int mmr_shift; char *state; - bool valid; - /* Accommodate different UV arch BIOSes */ + /* Different returns from different UV BIOS versions */ mmr = uv_early_read_mmr(UVH_TSC_SYNC_MMR); mmr_shift = is_uv2_hub() ? UVH_TSC_SYNC_SHIFT_UV2K : UVH_TSC_SYNC_SHIFT; sync_state = (mmr >> mmr_shift) & UVH_TSC_SYNC_MASK; + /* Check if TSC is valid for all sockets */ switch (sync_state) { case UVH_TSC_SYNC_VALID: state = "in sync"; - valid = true; + mark_tsc_async_resets("UV BIOS"); break; - case UVH_TSC_SYNC_INVALID: - state = "unstable"; - valid = false; + /* If BIOS state unknown, don't do anything */ + case UVH_TSC_SYNC_UNKNOWN: + state = "unknown"; break; + + /* Otherwise, BIOS indicates problem with TSC */ default: - state = "unknown: assuming valid"; - valid = true; + state = "unstable"; + mark_tsc_unstable("UV BIOS"); break; } pr_info("UV: TSC sync state from BIOS:0%d(%s)\n", sync_state, state); - - /* Mark flag that says TSC != 0 is valid for socket 0 */ - if (valid) - mark_tsc_async_resets("UV BIOS"); - else - mark_tsc_unstable("UV BIOS"); } +/* Selector for (4|4A|5) structs */ +#define uvxy_field(sname, field, undef) ( \ + is_uv(UV4A) ? sname.s4a.field : \ + is_uv(UV4) ? sname.s4.field : \ + is_uv(UV3) ? sname.s3.field : \ + undef) + /* [Copied from arch/x86/kernel/cpu/topology.c:detect_extended_topology()] */ #define SMT_LEVEL 0 /* Leaf 0xb SMT level */ @@ -221,29 +292,110 @@ static void __init uv_stringify(int len, char *to, char *from) strncpy(to, from, len-1); } -static int __init uv_acpi_madt_oem_check(char *_oem_id, char *_oem_table_id) +/* Find UV arch type entry in UVsystab */ +static unsigned long __init early_find_archtype(struct uv_systab *st) +{ + int i; + + for (i = 0; st->entry[i].type != UV_SYSTAB_TYPE_UNUSED; i++) { + unsigned long ptr = st->entry[i].offset; + + if (!ptr) + continue; + ptr += (unsigned long)st; + if (st->entry[i].type == UV_SYSTAB_TYPE_ARCH_TYPE) + return ptr; + } + return 0; +} + +/* Validate UV arch type field in UVsystab */ +static int __init decode_arch_type(unsigned long ptr) +{ + struct uv_arch_type_entry *uv_ate = (struct uv_arch_type_entry *)ptr; + int n = strlen(uv_ate->archtype); + + if (n > 0 && n < sizeof(uv_ate->archtype)) { + pr_info("UV: UVarchtype received from BIOS\n"); + uv_stringify(UV_AT_SIZE, uv_archtype, uv_ate->archtype); + return 1; + } + return 0; +} + +/* Determine if UV arch type entry might exist in UVsystab */ +static int __init early_get_arch_type(void) { - int pnodeid; - int uv_apic; + unsigned long uvst_physaddr, uvst_size, ptr; + struct uv_systab *st; + u32 rev; + int ret; + + uvst_physaddr = get_uv_systab_phys(0); + if (!uvst_physaddr) + return 0; + + st = early_memremap_ro(uvst_physaddr, sizeof(struct uv_systab)); + if (!st) { + pr_err("UV: Cannot access UVsystab, remap failed\n"); + return 0; + } + rev = st->revision; + if (rev < UV_SYSTAB_VERSION_UV5) { + early_memunmap(st, sizeof(struct uv_systab)); + return 0; + } + + uvst_size = st->size; + early_memunmap(st, sizeof(struct uv_systab)); + st = early_memremap_ro(uvst_physaddr, uvst_size); + if (!st) { + pr_err("UV: Cannot access UVarchtype, remap failed\n"); + return 0; + } + + ptr = early_find_archtype(st); + if (!ptr) { + early_memunmap(st, uvst_size); + return 0; + } + + ret = decode_arch_type(ptr); + early_memunmap(st, uvst_size); + return ret; +} + +static int __init uv_set_system_type(char *_oem_id) +{ + /* Save OEM_ID passed from ACPI MADT */ uv_stringify(sizeof(oem_id), oem_id, _oem_id); - uv_stringify(sizeof(oem_table_id), oem_table_id, _oem_table_id); - if (strncmp(oem_id, "SGI", 3) != 0) { - if (strncmp(oem_id, "NSGI", 4) != 0) + /* Check if BIOS sent us a UVarchtype */ + if (!early_get_arch_type()) + + /* If not use OEM ID for UVarchtype */ + uv_stringify(UV_AT_SIZE, uv_archtype, _oem_id); + + /* Check if not hubbed */ + if (strncmp(uv_archtype, "SGI", 3) != 0) { + + /* (Not hubbed), check if not hubless */ + if (strncmp(uv_archtype, "NSGI", 4) != 0) + + /* (Not hubless), not a UV */ return 0; - /* UV4 Hubless, CH, (0x11:UV4+Any) */ - if (strncmp(oem_id, "NSGI4", 5) == 0) + /* UV4 Hubless: CH */ + if (strncmp(uv_archtype, "NSGI4", 5) == 0) uv_hubless_system = 0x11; - /* UV3 Hubless, UV300/MC990X w/o hub (0x9:UV3+Any) */ + /* UV3 Hubless: UV300/MC990X w/o hub */ else uv_hubless_system = 0x9; - pr_info("UV: OEM IDs %s/%s, HUBLESS(0x%x)\n", - oem_id, oem_table_id, uv_hubless_system); - + pr_info("UV: OEM IDs %s/%s, SystemType %d, HUBLESS ID %x\n", + oem_id, oem_table_id, uv_system_type, uv_hubless_system); return 0; } @@ -252,60 +404,83 @@ static int __init uv_acpi_madt_oem_check(char *_oem_id, char *_oem_table_id) return 0; } - /* Set up early hub type field in uv_hub_info for Node 0 */ - uv_cpu_info->p_uv_hub_info = &uv_hub_info_node0; + /* Set hubbed type if true */ + uv_hub_info->hub_revision = + !strncmp(uv_archtype, "SGI5", 4) ? UV5_HUB_REVISION_BASE : + !strncmp(uv_archtype, "SGI4", 4) ? UV4_HUB_REVISION_BASE : + !strncmp(uv_archtype, "SGI3", 4) ? UV3_HUB_REVISION_BASE : + !strcmp(uv_archtype, "SGI2") ? UV2_HUB_REVISION_BASE : 0; + + switch (uv_hub_info->hub_revision) { + case UV5_HUB_REVISION_BASE: + uv_hubbed_system = 0x21; + uv_hub_type_set(UV5); + break; - /* - * Determine UV arch type. - * SGI2: UV2000/3000 - * SGI3: UV300 (truncated to 4 chars because of different varieties) - * SGI4: UV400 (truncated to 4 chars because of different varieties) - */ - if (!strncmp(oem_id, "SGI4", 4)) { - uv_hub_info->hub_revision = UV4_HUB_REVISION_BASE; + case UV4_HUB_REVISION_BASE: uv_hubbed_system = 0x11; + uv_hub_type_set(UV4); + break; - } else if (!strncmp(oem_id, "SGI3", 4)) { - uv_hub_info->hub_revision = UV3_HUB_REVISION_BASE; + case UV3_HUB_REVISION_BASE: uv_hubbed_system = 0x9; + uv_hub_type_set(UV3); + break; - } else if (!strcmp(oem_id, "SGI2")) { - uv_hub_info->hub_revision = UV2_HUB_REVISION_BASE; + case UV2_HUB_REVISION_BASE: uv_hubbed_system = 0x5; + uv_hub_type_set(UV2); + break; - } else { - uv_hub_info->hub_revision = 0; - goto badbios; + default: + return 0; } - pnodeid = early_get_pnodeid(); - early_get_apic_socketid_shift(); + /* Get UV hub chip part number & revision */ + early_set_hub_type(); + /* Other UV setup functions */ + early_get_pnodeid(); + early_get_apic_socketid_shift(); x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; x86_platform.nmi_init = uv_nmi_init; + uv_tsc_check_sync(); + + return 1; +} + +/* Called early to probe for the correct APIC driver */ +static int __init uv_acpi_madt_oem_check(char *_oem_id, char *_oem_table_id) +{ + /* Set up early hub info fields for Node 0 */ + uv_cpu_info->p_uv_hub_info = &uv_hub_info_node0; + + /* If not UV, return. */ + if (likely(uv_set_system_type(_oem_id) == 0)) + return 0; + + /* Save and Decode OEM Table ID */ + uv_stringify(sizeof(oem_table_id), oem_table_id, _oem_table_id); - if (!strcmp(oem_table_id, "UVX")) { - /* This is the most common hardware variant: */ + /* This is the most common hardware variant, x2apic mode */ + if (!strcmp(oem_table_id, "UVX")) uv_system_type = UV_X2APIC; - uv_apic = 0; - } else if (!strcmp(oem_table_id, "UVL")) { - /* Only used for very small systems: */ + /* Only used for very small systems, usually 1 chassis, legacy mode */ + else if (!strcmp(oem_table_id, "UVL")) uv_system_type = UV_LEGACY_APIC; - uv_apic = 0; - } else { + else goto badbios; - } - pr_info("UV: OEM IDs %s/%s, System/HUB Types %d/%d, uv_apic %d\n", oem_id, oem_table_id, uv_system_type, uv_min_hub_revision_id, uv_apic); - uv_tsc_check_sync(); + pr_info("UV: OEM IDs %s/%s, System/UVType %d/0x%x, HUB RevID %d\n", + oem_id, oem_table_id, uv_system_type, is_uv(UV_ANY), + uv_min_hub_revision_id); - return uv_apic; + return 0; badbios: - pr_err("UV: OEM_ID:%s OEM_TABLE_ID:%s\n", oem_id, oem_table_id); - pr_err("Current UV Type or BIOS not supported\n"); + pr_err("UV: UVarchtype:%s not supported\n", uv_archtype); BUG(); } @@ -673,12 +848,12 @@ static struct apic apic_x2apic_uv_x __ro_after_init = { }; #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_LENGTH 3 -#define DEST_SHIFT UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT +#define DEST_SHIFT UVXH_RH_GAM_ALIAS_0_REDIRECT_CONFIG_DEST_BASE_SHFT static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) { - union uvh_rh_gam_alias210_overlay_config_2_mmr_u alias; - union uvh_rh_gam_alias210_redirect_config_2_mmr_u redirect; + union uvh_rh_gam_alias_2_overlay_config_u alias; + union uvh_rh_gam_alias_2_redirect_config_u redirect; unsigned long m_redirect; unsigned long m_overlay; int i; @@ -686,16 +861,16 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) for (i = 0; i < UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_LENGTH; i++) { switch (i) { case 0: - m_redirect = UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR; - m_overlay = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR; + m_redirect = UVH_RH_GAM_ALIAS_0_REDIRECT_CONFIG; + m_overlay = UVH_RH_GAM_ALIAS_0_OVERLAY_CONFIG; break; case 1: - m_redirect = UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR; - m_overlay = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR; + m_redirect = UVH_RH_GAM_ALIAS_1_REDIRECT_CONFIG; + m_overlay = UVH_RH_GAM_ALIAS_1_OVERLAY_CONFIG; break; case 2: - m_redirect = UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR; - m_overlay = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR; + m_redirect = UVH_RH_GAM_ALIAS_2_REDIRECT_CONFIG; + m_overlay = UVH_RH_GAM_ALIAS_2_OVERLAY_CONFIG; break; } alias.v = uv_read_local_mmr(m_overlay); @@ -710,6 +885,7 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) } enum map_type {map_wb, map_uc}; +static const char * const mt[] = { "WB", "UC" }; static __init void map_high(char *id, unsigned long base, int pshift, int bshift, int max_pnode, enum map_type map_type) { @@ -721,23 +897,36 @@ static __init void map_high(char *id, unsigned long base, int pshift, int bshift pr_info("UV: Map %s_HI base address NULL\n", id); return; } - pr_debug("UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr, paddr + bytes); if (map_type == map_uc) init_extra_mapping_uc(paddr, bytes); else init_extra_mapping_wb(paddr, bytes); + + pr_info("UV: Map %s_HI 0x%lx - 0x%lx %s (%d segments)\n", + id, paddr, paddr + bytes, mt[map_type], max_pnode + 1); } static __init void map_gru_high(int max_pnode) { - union uvh_rh_gam_gru_overlay_config_mmr_u gru; - int shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT; - unsigned long mask = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK; - unsigned long base; + union uvh_rh_gam_gru_overlay_config_u gru; + unsigned long mask, base; + int shift; + + if (UVH_RH_GAM_GRU_OVERLAY_CONFIG) { + gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG); + shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_BASE_SHFT; + mask = UVH_RH_GAM_GRU_OVERLAY_CONFIG_BASE_MASK; + } else if (UVH_RH10_GAM_GRU_OVERLAY_CONFIG) { + gru.v = uv_read_local_mmr(UVH_RH10_GAM_GRU_OVERLAY_CONFIG); + shift = UVH_RH10_GAM_GRU_OVERLAY_CONFIG_BASE_SHFT; + mask = UVH_RH10_GAM_GRU_OVERLAY_CONFIG_BASE_MASK; + } else { + pr_err("UV: GRU unavailable (no MMR)\n"); + return; + } - gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR); if (!gru.s.enable) { - pr_info("UV: GRU disabled\n"); + pr_info("UV: GRU disabled (by BIOS)\n"); return; } @@ -749,62 +938,104 @@ static __init void map_gru_high(int max_pnode) static __init void map_mmr_high(int max_pnode) { - union uvh_rh_gam_mmr_overlay_config_mmr_u mmr; - int shift = UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT; + unsigned long base; + int shift; + bool enable; + + if (UVH_RH10_GAM_MMR_OVERLAY_CONFIG) { + union uvh_rh10_gam_mmr_overlay_config_u mmr; + + mmr.v = uv_read_local_mmr(UVH_RH10_GAM_MMR_OVERLAY_CONFIG); + enable = mmr.s.enable; + base = mmr.s.base; + shift = UVH_RH10_GAM_MMR_OVERLAY_CONFIG_BASE_SHFT; + } else if (UVH_RH_GAM_MMR_OVERLAY_CONFIG) { + union uvh_rh_gam_mmr_overlay_config_u mmr; + + mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG); + enable = mmr.s.enable; + base = mmr.s.base; + shift = UVH_RH_GAM_MMR_OVERLAY_CONFIG_BASE_SHFT; + } else { + pr_err("UV:%s:RH_GAM_MMR_OVERLAY_CONFIG MMR undefined?\n", + __func__); + return; + } - mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR); - if (mmr.s.enable) - map_high("MMR", mmr.s.base, shift, shift, max_pnode, map_uc); + if (enable) + map_high("MMR", base, shift, shift, max_pnode, map_uc); else pr_info("UV: MMR disabled\n"); } -/* UV3/4 have identical MMIOH overlay configs, UV4A is slightly different */ -static __init void map_mmioh_high_uv34(int index, int min_pnode, int max_pnode) -{ - unsigned long overlay; - unsigned long mmr; - unsigned long base; - unsigned long nasid_mask; - unsigned long m_overlay; - int i, n, shift, m_io, max_io; - int nasid, lnasid, fi, li; - char *id; - - if (index == 0) { - id = "MMIOH0"; - m_overlay = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR; - overlay = uv_read_local_mmr(m_overlay); - base = overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_MASK; - mmr = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR; - m_io = (overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_MASK) - >> UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_SHFT; - shift = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_SHFT; - n = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH; - nasid_mask = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_NASID_MASK; - } else { - id = "MMIOH1"; - m_overlay = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR; - overlay = uv_read_local_mmr(m_overlay); - base = overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_BASE_MASK; - mmr = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR; - m_io = (overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_MASK) - >> UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_SHFT; - shift = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_SHFT; - n = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH; - nasid_mask = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_NASID_MASK; +/* Arch specific ENUM cases */ +enum mmioh_arch { + UV2_MMIOH = -1, + UVY_MMIOH0, UVY_MMIOH1, + UVX_MMIOH0, UVX_MMIOH1, +}; + +/* Calculate and Map MMIOH Regions */ +static void __init calc_mmioh_map(enum mmioh_arch index, + int min_pnode, int max_pnode, + int shift, unsigned long base, int m_io, int n_io) +{ + unsigned long mmr, nasid_mask; + int nasid, min_nasid, max_nasid, lnasid, mapped; + int i, fi, li, n, max_io; + char id[8]; + + /* One (UV2) mapping */ + if (index == UV2_MMIOH) { + strncpy(id, "MMIOH", sizeof(id)); + max_io = max_pnode; + mapped = 0; + goto map_exit; } - pr_info("UV: %s overlay 0x%lx base:0x%lx m_io:%d\n", id, overlay, base, m_io); - if (!(overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_ENABLE_MASK)) { - pr_info("UV: %s disabled\n", id); + + /* small and large MMIOH mappings */ + switch (index) { + case UVY_MMIOH0: + mmr = UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG0; + nasid_mask = UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG0_BASE_MASK; + n = UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG0_DEPTH; + min_nasid = min_pnode; + max_nasid = max_pnode; + mapped = 1; + break; + case UVY_MMIOH1: + mmr = UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG1; + nasid_mask = UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG1_BASE_MASK; + n = UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG1_DEPTH; + min_nasid = min_pnode; + max_nasid = max_pnode; + mapped = 1; + break; + case UVX_MMIOH0: + mmr = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0; + nasid_mask = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_BASE_MASK; + n = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_DEPTH; + min_nasid = min_pnode * 2; + max_nasid = max_pnode * 2; + mapped = 1; + break; + case UVX_MMIOH1: + mmr = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1; + nasid_mask = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_BASE_MASK; + n = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_DEPTH; + min_nasid = min_pnode * 2; + max_nasid = max_pnode * 2; + mapped = 1; + break; + default: + pr_err("UV:%s:Invalid mapping type:%d\n", __func__, index); return; } - /* Convert to NASID: */ - min_pnode *= 2; - max_pnode *= 2; - max_io = lnasid = fi = li = -1; + /* enum values chosen so (index mod 2) is MMIOH 0/1 (low/high) */ + snprintf(id, sizeof(id), "MMIOH%d", index%2); + max_io = lnasid = fi = li = -1; for (i = 0; i < n; i++) { unsigned long m_redirect = mmr + i * 8; unsigned long redirect = uv_read_local_mmr(m_redirect); @@ -814,9 +1045,12 @@ static __init void map_mmioh_high_uv34(int index, int min_pnode, int max_pnode) pr_info("UV: %s redirect base 0x%lx(@0x%lx) 0x%04x\n", id, redirect, m_redirect, nasid); - /* Invalid NASID: */ - if (nasid < min_pnode || max_pnode < nasid) + /* Invalid NASID check */ + if (nasid < min_nasid || max_nasid < nasid) { + pr_err("UV:%s:Invalid NASID:%x (range:%x..%x)\n", + __func__, index, min_nasid, max_nasid); nasid = -1; + } if (nasid == lnasid) { li = i; @@ -839,7 +1073,8 @@ static __init void map_mmioh_high_uv34(int index, int min_pnode, int max_pnode) } addr1 = (base << shift) + f * (1ULL << m_io); addr2 = (base << shift) + (l + 1) * (1ULL << m_io); - pr_info("UV: %s[%03d..%03d] NASID 0x%04x ADDR 0x%016lx - 0x%016lx\n", id, fi, li, lnasid, addr1, addr2); + pr_info("UV: %s[%03d..%03d] NASID 0x%04x ADDR 0x%016lx - 0x%016lx\n", + id, fi, li, lnasid, addr1, addr2); if (max_io < l) max_io = l; } @@ -847,49 +1082,93 @@ static __init void map_mmioh_high_uv34(int index, int min_pnode, int max_pnode) lnasid = nasid; } - pr_info("UV: %s base:0x%lx shift:%d M_IO:%d MAX_IO:%d\n", id, base, shift, m_io, max_io); +map_exit: + pr_info("UV: %s base:0x%lx shift:%d m_io:%d max_io:%d max_pnode:0x%x\n", + id, base, shift, m_io, max_io, max_pnode); - if (max_io >= 0) + if (max_io >= 0 && !mapped) map_high(id, base, shift, m_io, max_io, map_uc); } static __init void map_mmioh_high(int min_pnode, int max_pnode) { - union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh; - unsigned long mmr, base; - int shift, enable, m_io, n_io; + /* UVY flavor */ + if (UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG0) { + union uvh_rh10_gam_mmioh_overlay_config0_u mmioh0; + union uvh_rh10_gam_mmioh_overlay_config1_u mmioh1; + + mmioh0.v = uv_read_local_mmr(UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG0); + if (unlikely(mmioh0.s.enable == 0)) + pr_info("UV: MMIOH0 disabled\n"); + else + calc_mmioh_map(UVY_MMIOH0, min_pnode, max_pnode, + UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG0_BASE_SHFT, + mmioh0.s.base, mmioh0.s.m_io, mmioh0.s.n_io); - if (is_uv3_hub() || is_uv4_hub()) { - /* Map both MMIOH regions: */ - map_mmioh_high_uv34(0, min_pnode, max_pnode); - map_mmioh_high_uv34(1, min_pnode, max_pnode); + mmioh1.v = uv_read_local_mmr(UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG1); + if (unlikely(mmioh1.s.enable == 0)) + pr_info("UV: MMIOH1 disabled\n"); + else + calc_mmioh_map(UVY_MMIOH1, min_pnode, max_pnode, + UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG1_BASE_SHFT, + mmioh1.s.base, mmioh1.s.m_io, mmioh1.s.n_io); return; } + /* UVX flavor */ + if (UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0) { + union uvh_rh_gam_mmioh_overlay_config0_u mmioh0; + union uvh_rh_gam_mmioh_overlay_config1_u mmioh1; + + mmioh0.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0); + if (unlikely(mmioh0.s.enable == 0)) + pr_info("UV: MMIOH0 disabled\n"); + else { + unsigned long base = uvxy_field(mmioh0, base, 0); + int m_io = uvxy_field(mmioh0, m_io, 0); + int n_io = uvxy_field(mmioh0, n_io, 0); + + calc_mmioh_map(UVX_MMIOH0, min_pnode, max_pnode, + UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_BASE_SHFT, + base, m_io, n_io); + } - if (is_uv2_hub()) { - mmr = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR; - shift = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT; - mmioh.v = uv_read_local_mmr(mmr); - enable = !!mmioh.s2.enable; - base = mmioh.s2.base; - m_io = mmioh.s2.m_io; - n_io = mmioh.s2.n_io; - - if (enable) { - max_pnode &= (1 << n_io) - 1; - pr_info("UV: base:0x%lx shift:%d N_IO:%d M_IO:%d max_pnode:0x%x\n", - base, shift, m_io, n_io, max_pnode); - map_high("MMIOH", base, shift, m_io, max_pnode, map_uc); - } else { - pr_info("UV: MMIOH disabled\n"); + mmioh1.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1); + if (unlikely(mmioh1.s.enable == 0)) + pr_info("UV: MMIOH1 disabled\n"); + else { + unsigned long base = uvxy_field(mmioh1, base, 0); + int m_io = uvxy_field(mmioh1, m_io, 0); + int n_io = uvxy_field(mmioh1, n_io, 0); + + calc_mmioh_map(UVX_MMIOH1, min_pnode, max_pnode, + UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_BASE_SHFT, + base, m_io, n_io); } + return; + } + + /* UV2 flavor */ + if (UVH_RH_GAM_MMIOH_OVERLAY_CONFIG) { + union uvh_rh_gam_mmioh_overlay_config_u mmioh; + + mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG); + if (unlikely(mmioh.s2.enable == 0)) + pr_info("UV: MMIOH disabled\n"); + else + calc_mmioh_map(UV2_MMIOH, min_pnode, max_pnode, + UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_BASE_SHFT, + mmioh.s2.base, mmioh.s2.m_io, mmioh.s2.n_io); + return; } } static __init void map_low_mmrs(void) { - init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE); - init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE); + if (UV_GLOBAL_MMR32_BASE) + init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE); + + if (UV_LOCAL_MMR_BASE) + init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE); } static __init void uv_rtc_init(void) @@ -909,85 +1188,6 @@ static __init void uv_rtc_init(void) } } -/* - * percpu heartbeat timer - */ -static void uv_heartbeat(struct timer_list *timer) -{ - unsigned char bits = uv_scir_info->state; - - /* Flip heartbeat bit: */ - bits ^= SCIR_CPU_HEARTBEAT; - - /* Is this CPU idle? */ - if (idle_cpu(raw_smp_processor_id())) - bits &= ~SCIR_CPU_ACTIVITY; - else - bits |= SCIR_CPU_ACTIVITY; - - /* Update system controller interface reg: */ - uv_set_scir_bits(bits); - - /* Enable next timer period: */ - mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL); -} - -static int uv_heartbeat_enable(unsigned int cpu) -{ - while (!uv_cpu_scir_info(cpu)->enabled) { - struct timer_list *timer = &uv_cpu_scir_info(cpu)->timer; - - uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY); - timer_setup(timer, uv_heartbeat, TIMER_PINNED); - timer->expires = jiffies + SCIR_CPU_HB_INTERVAL; - add_timer_on(timer, cpu); - uv_cpu_scir_info(cpu)->enabled = 1; - - /* Also ensure that boot CPU is enabled: */ - cpu = 0; - } - return 0; -} - -#ifdef CONFIG_HOTPLUG_CPU -static int uv_heartbeat_disable(unsigned int cpu) -{ - if (uv_cpu_scir_info(cpu)->enabled) { - uv_cpu_scir_info(cpu)->enabled = 0; - del_timer(&uv_cpu_scir_info(cpu)->timer); - } - uv_set_cpu_scir_bits(cpu, 0xff); - return 0; -} - -static __init void uv_scir_register_cpu_notifier(void) -{ - cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/x2apic-uvx:online", - uv_heartbeat_enable, uv_heartbeat_disable); -} - -#else /* !CONFIG_HOTPLUG_CPU */ - -static __init void uv_scir_register_cpu_notifier(void) -{ -} - -static __init int uv_init_heartbeat(void) -{ - int cpu; - - if (is_uv_system()) { - for_each_online_cpu(cpu) - uv_heartbeat_enable(cpu); - } - - return 0; -} - -late_initcall(uv_init_heartbeat); - -#endif /* !CONFIG_HOTPLUG_CPU */ - /* Direct Legacy VGA I/O traffic to designated IOH */ static int uv_set_vga_state(struct pci_dev *pdev, bool decode, unsigned int command_bits, u32 flags) { @@ -1027,26 +1227,22 @@ struct mn { unsigned char n_lshift; }; +/* Initialize caller's MN struct and fill in values */ static void get_mn(struct mn *mnp) { - union uvh_rh_gam_config_mmr_u m_n_config; - union uv3h_gr0_gam_gr_config_u m_gr_config; - - /* Make sure the whole structure is well initialized: */ memset(mnp, 0, sizeof(*mnp)); - - m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR); - mnp->n_val = m_n_config.s.n_skt; - - if (is_uv4_hub()) { + mnp->n_val = uv_cpuid.n_skt; + if (is_uv(UV4|UVY)) { mnp->m_val = 0; mnp->n_lshift = 0; } else if (is_uv3_hub()) { - mnp->m_val = m_n_config.s3.m_skt; - m_gr_config.v = uv_read_local_mmr(UV3H_GR0_GAM_GR_CONFIG); + union uvyh_gr0_gam_gr_config_u m_gr_config; + + mnp->m_val = uv_cpuid.m_skt; + m_gr_config.v = uv_read_local_mmr(UVH_GR0_GAM_GR_CONFIG); mnp->n_lshift = m_gr_config.s3.m_skt; } else if (is_uv2_hub()) { - mnp->m_val = m_n_config.s2.m_skt; + mnp->m_val = uv_cpuid.m_skt; mnp->n_lshift = mnp->m_val == 40 ? 40 : 39; } mnp->m_shift = mnp->m_val ? 64 - mnp->m_val : 0; @@ -1054,7 +1250,6 @@ static void get_mn(struct mn *mnp) static void __init uv_init_hub_info(struct uv_hub_info_s *hi) { - union uvh_node_id_u node_id; struct mn mn; get_mn(&mn); @@ -1067,7 +1262,9 @@ static void __init uv_init_hub_info(struct uv_hub_info_s *hi) hi->m_shift = mn.m_shift; hi->n_lshift = mn.n_lshift ? mn.n_lshift : 0; hi->hub_revision = uv_hub_info->hub_revision; + hi->hub_type = uv_hub_info->hub_type; hi->pnode_mask = uv_cpuid.pnode_mask; + hi->nasid_shift = uv_cpuid.nasid_shift; hi->min_pnode = _min_pnode; hi->min_socket = _min_socket; hi->pnode_to_socket = _pnode_to_socket; @@ -1076,9 +1273,8 @@ static void __init uv_init_hub_info(struct uv_hub_info_s *hi) hi->gr_table_len = _gr_table_len; hi->gr_table = _gr_table; - node_id.v = uv_read_local_mmr(UVH_NODE_ID); uv_cpuid.gnode_shift = max_t(unsigned int, uv_cpuid.gnode_shift, mn.n_val); - hi->gnode_extra = (node_id.s.node_id & ~((1 << uv_cpuid.gnode_shift) - 1)) >> 1; + hi->gnode_extra = (uv_node_id & ~((1 << uv_cpuid.gnode_shift) - 1)) >> 1; if (mn.m_val) hi->gnode_upper = (u64)hi->gnode_extra << mn.m_val; @@ -1090,7 +1286,9 @@ static void __init uv_init_hub_info(struct uv_hub_info_s *hi) hi->gpa_shift = uv_gp_table->gpa_shift; hi->gpa_mask = (1UL << hi->gpa_shift) - 1; } else { - hi->global_mmr_base = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & ~UV_MMR_ENABLE; + hi->global_mmr_base = + uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG) & + ~UV_MMR_ENABLE; hi->global_mmr_shift = _UV_GLOBAL_MMR64_PNODE_SHIFT; } @@ -1101,7 +1299,11 @@ static void __init uv_init_hub_info(struct uv_hub_info_s *hi) /* Show system specific info: */ pr_info("UV: N:%d M:%d m_shift:%d n_lshift:%d\n", hi->n_val, hi->m_val, hi->m_shift, hi->n_lshift); pr_info("UV: gpa_mask/shift:0x%lx/%d pnode_mask:0x%x apic_pns:%d\n", hi->gpa_mask, hi->gpa_shift, hi->pnode_mask, hi->apic_pnode_shift); - pr_info("UV: mmr_base/shift:0x%lx/%ld gru_base/shift:0x%lx/%ld\n", hi->global_mmr_base, hi->global_mmr_shift, hi->global_gru_base, hi->global_gru_shift); + pr_info("UV: mmr_base/shift:0x%lx/%ld\n", hi->global_mmr_base, hi->global_mmr_shift); + if (hi->global_gru_base) + pr_info("UV: gru_base/shift:0x%lx/%ld\n", + hi->global_gru_base, hi->global_gru_shift); + pr_info("UV: gnode_upper:0x%lx gnode_extra:0x%x\n", hi->gnode_upper, hi->gnode_extra); } @@ -1173,21 +1375,25 @@ static void __init decode_gam_rng_tbl(unsigned long ptr) pr_info("UV: GRT: %d entries, sockets(min:%x,max:%x) pnodes(min:%x,max:%x)\n", index, _min_socket, _max_socket, _min_pnode, _max_pnode); } +/* Walk through UVsystab decoding the fields */ static int __init decode_uv_systab(void) { struct uv_systab *st; int i; - /* If system is uv3 or lower, there is no extended UVsystab */ - if (is_uv_hubbed(0xfffffe) < uv(4) && is_uv_hubless(0xfffffe) < uv(4)) - return 0; /* No extended UVsystab required */ - + /* Get mapped UVsystab pointer */ st = uv_systab; + + /* If UVsystab is version 1, there is no extended UVsystab */ + if (st && st->revision == UV_SYSTAB_VERSION_1) + return 0; + if ((!st) || (st->revision < UV_SYSTAB_VERSION_UV4_LATEST)) { int rev = st ? st->revision : 0; - pr_err("UV: BIOS UVsystab version(%x) mismatch, expecting(%x)\n", rev, UV_SYSTAB_VERSION_UV4_LATEST); - pr_err("UV: Cannot support UV operations, switching to generic PC\n"); + pr_err("UV: BIOS UVsystab mismatch, (%x < %x)\n", + rev, UV_SYSTAB_VERSION_UV4_LATEST); + pr_err("UV: Does not support UV, switch to non-UV x86_64\n"); uv_system_type = UV_NONE; return -EINVAL; @@ -1199,7 +1405,8 @@ static int __init decode_uv_systab(void) if (!ptr) continue; - ptr = ptr + (unsigned long)st; + /* point to payload */ + ptr += (unsigned long)st; switch (st->entry[i].type) { case UV_SYSTAB_TYPE_GAM_PARAMS: @@ -1209,32 +1416,49 @@ static int __init decode_uv_systab(void) case UV_SYSTAB_TYPE_GAM_RNG_TBL: decode_gam_rng_tbl(ptr); break; + + case UV_SYSTAB_TYPE_ARCH_TYPE: + /* already processed in early startup */ + break; + + default: + pr_err("UV:%s:Unrecognized UV_SYSTAB_TYPE:%d, skipped\n", + __func__, st->entry[i].type); + break; } } return 0; } -/* - * Set up physical blade translations from UVH_NODE_PRESENT_TABLE - * .. NB: UVH_NODE_PRESENT_TABLE is going away, - * .. being replaced by GAM Range Table - */ +/* Set up physical blade translations from UVH_NODE_PRESENT_TABLE */ static __init void boot_init_possible_blades(struct uv_hub_info_s *hub_info) { + unsigned long np; int i, uv_pb = 0; - pr_info("UV: NODE_PRESENT_DEPTH = %d\n", UVH_NODE_PRESENT_TABLE_DEPTH); - for (i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) { - unsigned long np; - - np = uv_read_local_mmr(UVH_NODE_PRESENT_TABLE + i * 8); - if (np) + if (UVH_NODE_PRESENT_TABLE) { + pr_info("UV: NODE_PRESENT_DEPTH = %d\n", + UVH_NODE_PRESENT_TABLE_DEPTH); + for (i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) { + np = uv_read_local_mmr(UVH_NODE_PRESENT_TABLE + i * 8); pr_info("UV: NODE_PRESENT(%d) = 0x%016lx\n", i, np); - + uv_pb += hweight64(np); + } + } + if (UVH_NODE_PRESENT_0) { + np = uv_read_local_mmr(UVH_NODE_PRESENT_0); + pr_info("UV: NODE_PRESENT_0 = 0x%016lx\n", np); + uv_pb += hweight64(np); + } + if (UVH_NODE_PRESENT_1) { + np = uv_read_local_mmr(UVH_NODE_PRESENT_1); + pr_info("UV: NODE_PRESENT_1 = 0x%016lx\n", np); uv_pb += hweight64(np); } if (uv_possible_blades != uv_pb) uv_possible_blades = uv_pb; + + pr_info("UV: number nodes/possible blades %d\n", uv_pb); } static void __init build_socket_tables(void) @@ -1253,7 +1477,7 @@ static void __init build_socket_tables(void) pr_info("UV: No UVsystab socket table, ignoring\n"); return; } - pr_crit("UV: Error: UVsystab address translations not available!\n"); + pr_err("UV: Error: UVsystab address translations not available!\n"); BUG(); } @@ -1379,9 +1603,9 @@ static int __maybe_unused proc_hubless_show(struct seq_file *file, void *data) return 0; } -static int __maybe_unused proc_oemid_show(struct seq_file *file, void *data) +static int __maybe_unused proc_archtype_show(struct seq_file *file, void *data) { - seq_printf(file, "%s/%s\n", oem_id, oem_table_id); + seq_printf(file, "%s/%s\n", uv_archtype, oem_table_id); return 0; } @@ -1390,7 +1614,7 @@ static __init void uv_setup_proc_files(int hubless) struct proc_dir_entry *pde; pde = proc_mkdir(UV_PROC_NODE, NULL); - proc_create_single("oemid", 0, pde, proc_oemid_show); + proc_create_single("archtype", 0, pde, proc_archtype_show); if (hubless) proc_create_single("hubless", 0, pde, proc_hubless_show); else @@ -1429,7 +1653,8 @@ static void __init uv_system_init_hub(void) struct uv_hub_info_s hub_info = {0}; int bytes, cpu, nodeid; unsigned short min_pnode = 9999, max_pnode = 0; - char *hub = is_uv4_hub() ? "UV400" : + char *hub = is_uv5_hub() ? "UV500" : + is_uv4_hub() ? "UV400" : is_uv3_hub() ? "UV300" : is_uv2_hub() ? "UV2000/3000" : NULL; @@ -1441,12 +1666,14 @@ static void __init uv_system_init_hub(void) map_low_mmrs(); - /* Get uv_systab for decoding: */ + /* Get uv_systab for decoding, setup UV BIOS calls */ uv_bios_init(); /* If there's an UVsystab problem then abort UV init: */ - if (decode_uv_systab() < 0) + if (decode_uv_systab() < 0) { + pr_err("UV: Mangled UVsystab format\n"); return; + } build_socket_tables(); build_uv_gr_table(); @@ -1517,8 +1744,6 @@ static void __init uv_system_init_hub(void) uv_hub_info_list(numa_node_id)->pnode = pnode; else if (uv_cpu_hub_info(cpu)->pnode == 0xffff) uv_cpu_hub_info(cpu)->pnode = pnode; - - uv_cpu_scir_info(cpu)->offset = uv_scir_offset(apicid); } for_each_node(nodeid) { @@ -1547,7 +1772,6 @@ static void __init uv_system_init_hub(void) uv_nmi_setup(); uv_cpu_init(); - uv_scir_register_cpu_notifier(); uv_setup_proc_files(0); /* Register Legacy VGA I/O redirection handler: */ diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 3ca07ad552ae..70b7154f4bdd 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -38,9 +38,6 @@ static void __used common(void) #endif BLANK(); - OFFSET(TASK_addr_limit, task_struct, thread.addr_limit); - - BLANK(); OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); BLANK(); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index dcc3d943c68f..6062ce586b95 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -614,7 +614,7 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) * If BIOS has not enabled SME then don't advertise the * SME feature (set in scattered.c). * For SEV: If BIOS has not enabled SEV then don't advertise the - * SEV feature (set in scattered.c). + * SEV and SEV_ES feature (set in scattered.c). * * In all cases, since support for SME and SEV requires long mode, * don't advertise the feature under CONFIG_X86_32. @@ -645,6 +645,7 @@ clear_all: setup_clear_cpu_cap(X86_FEATURE_SME); clear_sev: setup_clear_cpu_cap(X86_FEATURE_SEV); + setup_clear_cpu_cap(X86_FEATURE_SEV_ES); } } diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index c5cf336e5077..345f7d905db6 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -65,6 +65,9 @@ static void init_c3(struct cpuinfo_x86 *c) c->x86_cache_alignment = c->x86_clflush_size * 2; set_cpu_cap(c, X86_FEATURE_REP_GOOD); } + + if (c->x86 >= 7) + set_cpu_cap(c, X86_FEATURE_REP_GOOD); } enum { @@ -90,18 +93,15 @@ enum { static void early_init_centaur(struct cpuinfo_x86 *c) { - switch (c->x86) { #ifdef CONFIG_X86_32 - case 5: - /* Emulate MTRRs using Centaur's MCR. */ + /* Emulate MTRRs using Centaur's MCR. */ + if (c->x86 == 5) set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR); - break; #endif - case 6: - if (c->x86_model >= 0xf) - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); - break; - } + if ((c->x86 == 6 && c->x86_model >= 0xf) || + (c->x86 >= 7)) + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + #ifdef CONFIG_X86_64 set_cpu_cap(c, X86_FEATURE_SYSENTER32); #endif @@ -145,9 +145,8 @@ static void init_centaur(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); } - switch (c->x86) { #ifdef CONFIG_X86_32 - case 5: + if (c->x86 == 5) { switch (c->x86_model) { case 4: name = "C6"; @@ -207,12 +206,10 @@ static void init_centaur(struct cpuinfo_x86 *c) c->x86_cache_size = (cc>>24)+(dd>>24); } sprintf(c->x86_model_id, "WinChip %s", name); - break; + } #endif - case 6: + if (c->x86 == 6 || c->x86 >= 7) init_c3(c); - break; - } #ifdef CONFIG_X86_64 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c5d6f17d9b9d..35ad8480c464 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -23,6 +23,7 @@ #include <linux/syscore_ops.h> #include <linux/pgtable.h> +#include <asm/cmdline.h> #include <asm/stackprotector.h> #include <asm/perf_event.h> #include <asm/mmu_context.h> @@ -359,7 +360,7 @@ void native_write_cr0(unsigned long val) unsigned long bits_missing = 0; set_register: - asm volatile("mov %0,%%cr0": "+r" (val), "+m" (__force_order)); + asm volatile("mov %0,%%cr0": "+r" (val) : : "memory"); if (static_branch_likely(&cr_pinning)) { if (unlikely((val & X86_CR0_WP) != X86_CR0_WP)) { @@ -378,7 +379,7 @@ void native_write_cr4(unsigned long val) unsigned long bits_changed = 0; set_register: - asm volatile("mov %0,%%cr4": "+r" (val), "+m" (cr4_pinned_bits)); + asm volatile("mov %0,%%cr4": "+r" (val) : : "memory"); if (static_branch_likely(&cr_pinning)) { if (unlikely((val & cr4_pinned_mask) != cr4_pinned_bits)) { @@ -1221,6 +1222,59 @@ static void detect_nopl(void) } /* + * We parse cpu parameters early because fpu__init_system() is executed + * before parse_early_param(). + */ +static void __init cpu_parse_early_param(void) +{ + char arg[128]; + char *argptr = arg; + int arglen, res, bit; + +#ifdef CONFIG_X86_32 + if (cmdline_find_option_bool(boot_command_line, "no387")) +#ifdef CONFIG_MATH_EMULATION + setup_clear_cpu_cap(X86_FEATURE_FPU); +#else + pr_err("Option 'no387' required CONFIG_MATH_EMULATION enabled.\n"); +#endif + + if (cmdline_find_option_bool(boot_command_line, "nofxsr")) + setup_clear_cpu_cap(X86_FEATURE_FXSR); +#endif + + if (cmdline_find_option_bool(boot_command_line, "noxsave")) + setup_clear_cpu_cap(X86_FEATURE_XSAVE); + + if (cmdline_find_option_bool(boot_command_line, "noxsaveopt")) + setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); + + if (cmdline_find_option_bool(boot_command_line, "noxsaves")) + setup_clear_cpu_cap(X86_FEATURE_XSAVES); + + arglen = cmdline_find_option(boot_command_line, "clearcpuid", arg, sizeof(arg)); + if (arglen <= 0) + return; + + pr_info("Clearing CPUID bits:"); + do { + res = get_option(&argptr, &bit); + if (res == 0 || res == 3) + break; + + /* If the argument was too long, the last bit may be cut off */ + if (res == 1 && arglen >= sizeof(arg)) + break; + + if (bit >= 0 && bit < NCAPINTS * 32) { + pr_cont(" " X86_CAP_FMT, x86_cap_flag(bit)); + setup_clear_cpu_cap(bit); + } + } while (res == 2); + pr_cont("\n"); +} + +/* * Do minimum CPU detection early. * Fields really needed: vendor, cpuid_level, family, model, mask, * cache alignment. @@ -1255,6 +1309,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) get_cpu_cap(c); get_cpu_address_sizes(c); setup_force_cpu_cap(X86_FEATURE_CPUID); + cpu_parse_early_param(); if (this_cpu->c_early_init) this_cpu->c_early_init(c); @@ -1413,15 +1468,7 @@ static void generic_identify(struct cpuinfo_x86 *c) * ESPFIX issue, we can change this. */ #ifdef CONFIG_X86_32 -# ifdef CONFIG_PARAVIRT_XXL - do { - extern void native_iret(void); - if (pv_ops.cpu.iret == native_iret) - set_cpu_bug(c, X86_BUG_ESPFIX); - } while (0); -# else set_cpu_bug(c, X86_BUG_ESPFIX); -# endif #endif } @@ -1829,6 +1876,8 @@ static inline void tss_setup_ist(struct tss_struct *tss) tss->x86_tss.ist[IST_INDEX_NMI] = __this_cpu_ist_top_va(NMI); tss->x86_tss.ist[IST_INDEX_DB] = __this_cpu_ist_top_va(DB); tss->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE); + /* Only mapped when SEV-ES is active */ + tss->x86_tss.ist[IST_INDEX_VC] = __this_cpu_ist_top_va(VC); } #else /* CONFIG_X86_64 */ @@ -1861,6 +1910,29 @@ static inline void tss_setup_io_bitmap(struct tss_struct *tss) } /* + * Setup everything needed to handle exceptions from the IDT, including the IST + * exceptions which use paranoid_entry(). + */ +void cpu_init_exception_handling(void) +{ + struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw); + int cpu = raw_smp_processor_id(); + + /* paranoid_entry() gets the CPU number from the GDT */ + setup_getcpu(cpu); + + /* IST vectors need TSS to be set up. */ + tss_setup_ist(tss); + tss_setup_io_bitmap(tss); + set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); + + load_TR_desc(); + + /* Finally load the IDT */ + load_current_idt(); +} + +/* * cpu_init() initializes state that is per-CPU. Some data is already * initialized (naturally) in the bootstrap process, such as the GDT * and IDT. We reload them nevertheless, this function acts as a diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 9d033693519a..67944128876d 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -38,7 +38,7 @@ struct _tlb_table { #define cpu_dev_register(cpu_devX) \ static const struct cpu_dev *const __cpu_dev_##cpu_devX __used \ - __attribute__((__section__(".x86_cpu_dev.init"))) = \ + __section(".x86_cpu_dev.init") = \ &cpu_devX; extern const struct cpu_dev *const __x86_cpu_dev_start[], diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index 3cbe24ca80ab..d502241995a3 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -69,6 +69,8 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_CQM_MBM_TOTAL, X86_FEATURE_CQM_LLC }, { X86_FEATURE_CQM_MBM_LOCAL, X86_FEATURE_CQM_LLC }, { X86_FEATURE_AVX512_BF16, X86_FEATURE_AVX512VL }, + { X86_FEATURE_ENQCMD, X86_FEATURE_XSAVES }, + { X86_FEATURE_PER_THREAD_MBA, X86_FEATURE_MBA }, {} }; diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 99be063fcb1b..0c6b02dd744c 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -132,49 +132,49 @@ static enum smca_bank_types smca_get_bank_type(unsigned int bank) } static struct smca_hwid smca_hwid_mcatypes[] = { - /* { bank_type, hwid_mcatype, xec_bitmap } */ + /* { bank_type, hwid_mcatype } */ /* Reserved type */ - { SMCA_RESERVED, HWID_MCATYPE(0x00, 0x0), 0x0 }, + { SMCA_RESERVED, HWID_MCATYPE(0x00, 0x0) }, /* ZN Core (HWID=0xB0) MCA types */ - { SMCA_LS, HWID_MCATYPE(0xB0, 0x0), 0x1FFFFF }, - { SMCA_LS_V2, HWID_MCATYPE(0xB0, 0x10), 0xFFFFFF }, - { SMCA_IF, HWID_MCATYPE(0xB0, 0x1), 0x3FFF }, - { SMCA_L2_CACHE, HWID_MCATYPE(0xB0, 0x2), 0xF }, - { SMCA_DE, HWID_MCATYPE(0xB0, 0x3), 0x1FF }, + { SMCA_LS, HWID_MCATYPE(0xB0, 0x0) }, + { SMCA_LS_V2, HWID_MCATYPE(0xB0, 0x10) }, + { SMCA_IF, HWID_MCATYPE(0xB0, 0x1) }, + { SMCA_L2_CACHE, HWID_MCATYPE(0xB0, 0x2) }, + { SMCA_DE, HWID_MCATYPE(0xB0, 0x3) }, /* HWID 0xB0 MCATYPE 0x4 is Reserved */ - { SMCA_EX, HWID_MCATYPE(0xB0, 0x5), 0xFFF }, - { SMCA_FP, HWID_MCATYPE(0xB0, 0x6), 0x7F }, - { SMCA_L3_CACHE, HWID_MCATYPE(0xB0, 0x7), 0xFF }, + { SMCA_EX, HWID_MCATYPE(0xB0, 0x5) }, + { SMCA_FP, HWID_MCATYPE(0xB0, 0x6) }, + { SMCA_L3_CACHE, HWID_MCATYPE(0xB0, 0x7) }, /* Data Fabric MCA types */ - { SMCA_CS, HWID_MCATYPE(0x2E, 0x0), 0x1FF }, - { SMCA_PIE, HWID_MCATYPE(0x2E, 0x1), 0x1F }, - { SMCA_CS_V2, HWID_MCATYPE(0x2E, 0x2), 0x3FFF }, + { SMCA_CS, HWID_MCATYPE(0x2E, 0x0) }, + { SMCA_PIE, HWID_MCATYPE(0x2E, 0x1) }, + { SMCA_CS_V2, HWID_MCATYPE(0x2E, 0x2) }, /* Unified Memory Controller MCA type */ - { SMCA_UMC, HWID_MCATYPE(0x96, 0x0), 0xFF }, + { SMCA_UMC, HWID_MCATYPE(0x96, 0x0) }, /* Parameter Block MCA type */ - { SMCA_PB, HWID_MCATYPE(0x05, 0x0), 0x1 }, + { SMCA_PB, HWID_MCATYPE(0x05, 0x0) }, /* Platform Security Processor MCA type */ - { SMCA_PSP, HWID_MCATYPE(0xFF, 0x0), 0x1 }, - { SMCA_PSP_V2, HWID_MCATYPE(0xFF, 0x1), 0x3FFFF }, + { SMCA_PSP, HWID_MCATYPE(0xFF, 0x0) }, + { SMCA_PSP_V2, HWID_MCATYPE(0xFF, 0x1) }, /* System Management Unit MCA type */ - { SMCA_SMU, HWID_MCATYPE(0x01, 0x0), 0x1 }, - { SMCA_SMU_V2, HWID_MCATYPE(0x01, 0x1), 0x7FF }, + { SMCA_SMU, HWID_MCATYPE(0x01, 0x0) }, + { SMCA_SMU_V2, HWID_MCATYPE(0x01, 0x1) }, /* Microprocessor 5 Unit MCA type */ - { SMCA_MP5, HWID_MCATYPE(0x01, 0x2), 0x3FF }, + { SMCA_MP5, HWID_MCATYPE(0x01, 0x2) }, /* Northbridge IO Unit MCA type */ - { SMCA_NBIO, HWID_MCATYPE(0x18, 0x0), 0x1F }, + { SMCA_NBIO, HWID_MCATYPE(0x18, 0x0) }, /* PCI Express Unit MCA type */ - { SMCA_PCIE, HWID_MCATYPE(0x46, 0x0), 0x1F }, + { SMCA_PCIE, HWID_MCATYPE(0x46, 0x0) }, }; struct smca_bank smca_banks[MAX_NR_BANKS]; diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index f43a78bde670..4102b866e7c0 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -40,7 +40,6 @@ #include <linux/debugfs.h> #include <linux/irq_work.h> #include <linux/export.h> -#include <linux/jump_label.h> #include <linux/set_memory.h> #include <linux/sync_core.h> #include <linux/task_work.h> @@ -373,42 +372,105 @@ static int msr_to_offset(u32 msr) return -1; } +__visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr, + unsigned long error_code, + unsigned long fault_addr) +{ + pr_emerg("MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n", + (unsigned int)regs->cx, regs->ip, (void *)regs->ip); + + show_stack_regs(regs); + + panic("MCA architectural violation!\n"); + + while (true) + cpu_relax(); + + return true; +} + /* MSR access wrappers used for error injection */ -static u64 mce_rdmsrl(u32 msr) +static noinstr u64 mce_rdmsrl(u32 msr) { - u64 v; + DECLARE_ARGS(val, low, high); if (__this_cpu_read(injectm.finished)) { - int offset = msr_to_offset(msr); + int offset; + u64 ret; + + instrumentation_begin(); + offset = msr_to_offset(msr); if (offset < 0) - return 0; - return *(u64 *)((char *)this_cpu_ptr(&injectm) + offset); - } + ret = 0; + else + ret = *(u64 *)((char *)this_cpu_ptr(&injectm) + offset); - if (rdmsrl_safe(msr, &v)) { - WARN_ONCE(1, "mce: Unable to read MSR 0x%x!\n", msr); - /* - * Return zero in case the access faulted. This should - * not happen normally but can happen if the CPU does - * something weird, or if the code is buggy. - */ - v = 0; + instrumentation_end(); + + return ret; } - return v; + /* + * RDMSR on MCA MSRs should not fault. If they do, this is very much an + * architectural violation and needs to be reported to hw vendor. Panic + * the box to not allow any further progress. + */ + asm volatile("1: rdmsr\n" + "2:\n" + _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_rdmsr_fault) + : EAX_EDX_RET(val, low, high) : "c" (msr)); + + + return EAX_EDX_VAL(val, low, high); +} + +__visible bool ex_handler_wrmsr_fault(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr, + unsigned long error_code, + unsigned long fault_addr) +{ + pr_emerg("MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n", + (unsigned int)regs->cx, (unsigned int)regs->dx, (unsigned int)regs->ax, + regs->ip, (void *)regs->ip); + + show_stack_regs(regs); + + panic("MCA architectural violation!\n"); + + while (true) + cpu_relax(); + + return true; } -static void mce_wrmsrl(u32 msr, u64 v) +static noinstr void mce_wrmsrl(u32 msr, u64 v) { + u32 low, high; + if (__this_cpu_read(injectm.finished)) { - int offset = msr_to_offset(msr); + int offset; + + instrumentation_begin(); + offset = msr_to_offset(msr); if (offset >= 0) *(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v; + + instrumentation_end(); + return; } - wrmsrl(msr, v); + + low = (u32)v; + high = (u32)(v >> 32); + + /* See comment in mce_rdmsrl() */ + asm volatile("1: wrmsr\n" + "2:\n" + _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_wrmsr_fault) + : : "c" (msr), "a"(low), "d" (high) : "memory"); } /* @@ -745,7 +807,7 @@ log_it: goto clear_it; mce_read_aux(&m, i); - m.severity = mce_severity(&m, mca_cfg.tolerant, NULL, false); + m.severity = mce_severity(&m, NULL, mca_cfg.tolerant, NULL, false); /* * Don't get the IP here because it's unlikely to * have anything to do with the actual error location. @@ -794,7 +856,7 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, quirk_no_way_out(i, m, regs); m->bank = i; - if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) { + if (mce_severity(m, regs, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) { mce_read_aux(m, i); *msg = tmp; return 1; @@ -872,7 +934,6 @@ static void mce_reign(void) struct mce *m = NULL; int global_worst = 0; char *msg = NULL; - char *nmsg = NULL; /* * This CPU is the Monarch and the other CPUs have run @@ -880,12 +941,10 @@ static void mce_reign(void) * Grade the severity of the errors of all the CPUs. */ for_each_possible_cpu(cpu) { - int severity = mce_severity(&per_cpu(mces_seen, cpu), - mca_cfg.tolerant, - &nmsg, true); - if (severity > global_worst) { - msg = nmsg; - global_worst = severity; + struct mce *mtmp = &per_cpu(mces_seen, cpu); + + if (mtmp->severity > global_worst) { + global_worst = mtmp->severity; m = &per_cpu(mces_seen, cpu); } } @@ -895,8 +954,11 @@ static void mce_reign(void) * This dumps all the mces in the log buffer and stops the * other CPUs. */ - if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) + if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) { + /* call mce_severity() to get "msg" for panic */ + mce_severity(m, NULL, mca_cfg.tolerant, &msg, true); mce_panic("Fatal machine check", m, msg); + } /* * For UC somewhere we let the CPU who detects it handle it. @@ -1105,7 +1167,7 @@ static noinstr bool mce_check_crashing_cpu(void) return false; } -static void __mc_scan_banks(struct mce *m, struct mce *final, +static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final, unsigned long *toclear, unsigned long *valid_banks, int no_way_out, int *worst) { @@ -1140,7 +1202,7 @@ static void __mc_scan_banks(struct mce *m, struct mce *final, /* Set taint even when machine check was not enabled. */ add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); - severity = mce_severity(m, cfg->tolerant, NULL, true); + severity = mce_severity(m, regs, cfg->tolerant, NULL, true); /* * When machine check was for corrected/deferred handler don't @@ -1188,13 +1250,34 @@ static void kill_me_maybe(struct callback_head *cb) if (!p->mce_ripv) flags |= MF_MUST_KILL; - if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags)) { + if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags) && + !(p->mce_kflags & MCE_IN_KERNEL_COPYIN)) { set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page); + sync_core(); return; } - pr_err("Memory error not recovered"); - kill_me_now(cb); + if (p->mce_vaddr != (void __user *)-1l) { + force_sig_mceerr(BUS_MCEERR_AR, p->mce_vaddr, PAGE_SHIFT); + } else { + pr_err("Memory error not recovered"); + kill_me_now(cb); + } +} + +static void queue_task_work(struct mce *m, int kill_it) +{ + current->mce_addr = m->addr; + current->mce_kflags = m->kflags; + current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV); + current->mce_whole_page = whole_page(m); + + if (kill_it) + current->mce_kill_me.func = kill_me_now; + else + current->mce_kill_me.func = kill_me_maybe; + + task_work_add(current, ¤t->mce_kill_me, TWA_RESUME); } /* @@ -1291,7 +1374,7 @@ noinstr void do_machine_check(struct pt_regs *regs) order = mce_start(&no_way_out); } - __mc_scan_banks(&m, final, toclear, valid_banks, no_way_out, &worst); + __mc_scan_banks(&m, regs, final, toclear, valid_banks, no_way_out, &worst); if (!no_way_out) mce_clear_state(toclear); @@ -1313,7 +1396,7 @@ noinstr void do_machine_check(struct pt_regs *regs) * make sure we have the right "msg". */ if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) { - mce_severity(&m, cfg->tolerant, &msg, true); + mce_severity(&m, regs, cfg->tolerant, &msg, true); mce_panic("Local fatal machine check!", &m, msg); } } @@ -1330,25 +1413,16 @@ noinstr void do_machine_check(struct pt_regs *regs) if (worst > 0) irq_work_queue(&mce_irq_work); - mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); - - sync_core(); - if (worst != MCE_AR_SEVERITY && !kill_it) - return; + goto out; /* Fault was in user mode and we need to take some action */ if ((m.cs & 3) == 3) { /* If this triggers there is no way to recover. Die hard. */ BUG_ON(!on_thread_stack() || !user_mode(regs)); - current->mce_addr = m.addr; - current->mce_ripv = !!(m.mcgstatus & MCG_STATUS_RIPV); - current->mce_whole_page = whole_page(&m); - current->mce_kill_me.func = kill_me_maybe; - if (kill_it) - current->mce_kill_me.func = kill_me_now; - task_work_add(current, ¤t->mce_kill_me, true); + queue_task_work(&m, kill_it); + } else { /* * Handle an MCE which has happened in kernel space but from @@ -1363,7 +1437,12 @@ noinstr void do_machine_check(struct pt_regs *regs) if (!fixup_exception(regs, X86_TRAP_MC, 0, 0)) mce_panic("Failed kernel mode recovery", &m, msg); } + + if (m.kflags & MCE_IN_KERNEL_COPYIN) + queue_task_work(&m, kill_it); } +out: + mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); } EXPORT_SYMBOL_GPL(do_machine_check); @@ -1904,6 +1983,8 @@ void (*machine_check_vector)(struct pt_regs *) = unexpected_machine_check; static __always_inline void exc_machine_check_kernel(struct pt_regs *regs) { + bool irq_state; + WARN_ON_ONCE(user_mode(regs)); /* @@ -1914,7 +1995,7 @@ static __always_inline void exc_machine_check_kernel(struct pt_regs *regs) mce_check_crashing_cpu()) return; - nmi_enter(); + irq_state = idtentry_enter_nmi(regs); /* * The call targets are marked noinstr, but objtool can't figure * that out because it's an indirect call. Annotate it. @@ -1925,7 +2006,7 @@ static __always_inline void exc_machine_check_kernel(struct pt_regs *regs) if (regs->flags & X86_EFLAGS_IF) trace_hardirqs_on_prepare(); instrumentation_end(); - nmi_exit(); + idtentry_exit_nmi(regs, irq_state); } static __always_inline void exc_machine_check_user(struct pt_regs *regs) @@ -2062,7 +2143,7 @@ void mce_disable_bank(int bank) and older. * mce=nobootlog Don't log MCEs from before booting. * mce=bios_cmci_threshold Don't program the CMCI threshold - * mce=recovery force enable memcpy_mcsafe() + * mce=recovery force enable copy_mc_fragile() */ static int __init mcheck_enable(char *str) { @@ -2670,13 +2751,10 @@ static void __init mcheck_debugfs_init(void) static void __init mcheck_debugfs_init(void) { } #endif -DEFINE_STATIC_KEY_FALSE(mcsafe_key); -EXPORT_SYMBOL_GPL(mcsafe_key); - static int __init mcheck_late_init(void) { if (mca_cfg.recovery) - static_branch_inc(&mcsafe_key); + enable_copy_mc_fragile(); mcheck_debugfs_init(); diff --git a/arch/x86/kernel/cpu/mce/dev-mcelog.c b/arch/x86/kernel/cpu/mce/dev-mcelog.c index 03e51053592a..100fbeebdc72 100644 --- a/arch/x86/kernel/cpu/mce/dev-mcelog.c +++ b/arch/x86/kernel/cpu/mce/dev-mcelog.c @@ -67,7 +67,9 @@ static int dev_mce_log(struct notifier_block *nb, unsigned long val, unlock: mutex_unlock(&mce_chrdev_read_mutex); - mce->kflags |= MCE_HANDLED_MCELOG; + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + mce->kflags |= MCE_HANDLED_MCELOG; + return NOTIFY_OK; } diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index 6473070b5da4..88dcc79cfb07 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -38,7 +38,8 @@ int mce_gen_pool_add(struct mce *mce); int mce_gen_pool_init(void); struct llist_node *mce_gen_pool_prepare_records(void); -extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp); +extern int (*mce_severity)(struct mce *a, struct pt_regs *regs, + int tolerant, char **msg, bool is_excp); struct dentry *mce_get_debugfs_dir(void); extern mce_banks_t mce_banks_ce_disabled; @@ -185,4 +186,14 @@ extern bool amd_filter_mce(struct mce *m); static inline bool amd_filter_mce(struct mce *m) { return false; }; #endif +__visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr, + unsigned long error_code, + unsigned long fault_addr); + +__visible bool ex_handler_wrmsr_fault(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr, + unsigned long error_code, + unsigned long fault_addr); + #endif /* __X86_MCE_INTERNAL_H__ */ diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c index e1da619add19..83df991314c5 100644 --- a/arch/x86/kernel/cpu/mce/severity.c +++ b/arch/x86/kernel/cpu/mce/severity.c @@ -9,9 +9,14 @@ #include <linux/seq_file.h> #include <linux/init.h> #include <linux/debugfs.h> -#include <asm/mce.h> #include <linux/uaccess.h> +#include <asm/mce.h> +#include <asm/intel-family.h> +#include <asm/traps.h> +#include <asm/insn.h> +#include <asm/insn-eval.h> + #include "internal.h" /* @@ -40,9 +45,14 @@ static struct severity { unsigned char context; unsigned char excp; unsigned char covered; + unsigned char cpu_model; + unsigned char cpu_minstepping; + unsigned char bank_lo, bank_hi; char *msg; } severities[] = { #define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c } +#define BANK_RANGE(l, h) .bank_lo = l, .bank_hi = h +#define MODEL_STEPPING(m, s) .cpu_model = m, .cpu_minstepping = s #define KERNEL .context = IN_KERNEL #define USER .context = IN_USER #define KERNEL_RECOV .context = IN_KERNEL_RECOV @@ -90,14 +100,9 @@ static struct severity { EXCP, KERNEL_RECOV, MCGMASK(MCG_STATUS_RIPV, 0) ), MCESEV( - DEFERRED, "Deferred error", - NOSER, MASK(MCI_STATUS_UC|MCI_STATUS_DEFERRED|MCI_STATUS_POISON, MCI_STATUS_DEFERRED) - ), - MCESEV( KEEP, "Corrected error", NOSER, BITCLR(MCI_STATUS_UC) ), - /* * known AO MCACODs reported via MCE or CMC: * @@ -113,6 +118,18 @@ static struct severity { AO, "Action optional: last level cache writeback error", SER, MASK(MCI_UC_AR|MCACOD, MCI_STATUS_UC|MCACOD_L3WB) ), + /* + * Quirk for Skylake/Cascade Lake. Patrol scrubber may be configured + * to report uncorrected errors using CMCI with a special signature. + * UC=0, MSCOD=0x0010, MCACOD=binary(000X 0000 1100 XXXX) reported + * in one of the memory controller banks. + * Set severity to "AO" for same action as normal patrol scrub error. + */ + MCESEV( + AO, "Uncorrected Patrol Scrub Error", + SER, MASK(MCI_STATUS_UC|MCI_ADDR|0xffffeff0, MCI_ADDR|0x001000c0), + MODEL_STEPPING(INTEL_FAM6_SKYLAKE_X, 4), BANK_RANGE(13, 18) + ), /* ignore OVER for UCNA */ MCESEV( @@ -198,6 +215,47 @@ static struct severity { #define mc_recoverable(mcg) (((mcg) & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) == \ (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) +static bool is_copy_from_user(struct pt_regs *regs) +{ + u8 insn_buf[MAX_INSN_SIZE]; + struct insn insn; + unsigned long addr; + + if (copy_from_kernel_nofault(insn_buf, (void *)regs->ip, MAX_INSN_SIZE)) + return false; + + kernel_insn_init(&insn, insn_buf, MAX_INSN_SIZE); + insn_get_opcode(&insn); + if (!insn.opcode.got) + return false; + + switch (insn.opcode.value) { + /* MOV mem,reg */ + case 0x8A: case 0x8B: + /* MOVZ mem,reg */ + case 0xB60F: case 0xB70F: + insn_get_modrm(&insn); + insn_get_sib(&insn); + if (!insn.modrm.got || !insn.sib.got) + return false; + addr = (unsigned long)insn_get_addr_ref(&insn, regs); + break; + /* REP MOVS */ + case 0xA4: case 0xA5: + addr = regs->si; + break; + default: + return false; + } + + if (fault_in_kernel_space(addr)) + return false; + + current->mce_vaddr = (void __user *)addr; + + return true; +} + /* * If mcgstatus indicated that ip/cs on the stack were * no good, then "m->cs" will be zero and we will have @@ -209,15 +267,25 @@ static struct severity { * distinguish an exception taken in user from from one * taken in the kernel. */ -static int error_context(struct mce *m) +static int error_context(struct mce *m, struct pt_regs *regs) { + enum handler_type t; + if ((m->cs & 3) == 3) return IN_USER; + if (!mc_recoverable(m->mcgstatus)) + return IN_KERNEL; - if (mc_recoverable(m->mcgstatus) && ex_has_fault_handler(m->ip)) { + t = ex_get_fault_handler_type(m->ip); + if (t == EX_HANDLER_FAULT) { m->kflags |= MCE_IN_KERNEL_RECOV; return IN_KERNEL_RECOV; } + if (t == EX_HANDLER_UACCESS && regs && is_copy_from_user(regs)) { + m->kflags |= MCE_IN_KERNEL_RECOV; + m->kflags |= MCE_IN_KERNEL_COPYIN; + return IN_KERNEL_RECOV; + } return IN_KERNEL; } @@ -253,9 +321,10 @@ static int mce_severity_amd_smca(struct mce *m, enum context err_ctx) * See AMD Error Scope Hierarchy table in a newer BKDG. For example * 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features" */ -static int mce_severity_amd(struct mce *m, int tolerant, char **msg, bool is_excp) +static int mce_severity_amd(struct mce *m, struct pt_regs *regs, int tolerant, + char **msg, bool is_excp) { - enum context ctx = error_context(m); + enum context ctx = error_context(m, regs); /* Processor Context Corrupt, no need to fumble too much, die! */ if (m->status & MCI_STATUS_PCC) @@ -305,10 +374,11 @@ static int mce_severity_amd(struct mce *m, int tolerant, char **msg, bool is_exc return MCE_KEEP_SEVERITY; } -static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool is_excp) +static int mce_severity_intel(struct mce *m, struct pt_regs *regs, + int tolerant, char **msg, bool is_excp) { enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP); - enum context ctx = error_context(m); + enum context ctx = error_context(m, regs); struct severity *s; for (s = severities;; s++) { @@ -324,6 +394,12 @@ static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool is_e continue; if (s->excp && excp != s->excp) continue; + if (s->cpu_model && boot_cpu_data.x86_model != s->cpu_model) + continue; + if (s->cpu_minstepping && boot_cpu_data.x86_stepping < s->cpu_minstepping) + continue; + if (s->bank_lo && (m->bank < s->bank_lo || m->bank > s->bank_hi)) + continue; if (msg) *msg = s->msg; s->covered = 1; @@ -336,7 +412,7 @@ static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool is_e } /* Default to mce_severity_intel */ -int (*mce_severity)(struct mce *m, int tolerant, char **msg, bool is_excp) = +int (*mce_severity)(struct mce *m, struct pt_regs *regs, int tolerant, char **msg, bool is_excp) = mce_severity_intel; void __init mcheck_vendor_init_severity(void) diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 31125448b174..05ef1f4550cb 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -55,9 +55,14 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_callback) set_irq_regs(old_regs); } -void hv_setup_vmbus_irq(void (*handler)(void)) +int hv_setup_vmbus_irq(int irq, void (*handler)(void)) { + /* + * The 'irq' argument is ignored on x86/x64 because a hard-coded + * interrupt vector is used for Hyper-V interrupts. + */ vmbus_handler = handler; + return 0; } void hv_remove_vmbus_irq(void) @@ -248,7 +253,7 @@ static void __init ms_hyperv_init_platform(void) hv_host_info_edx >> 24, hv_host_info_edx & 0xFFFFFF); } - if (ms_hyperv.features & HV_X64_ACCESS_FREQUENCY_MSRS && + if (ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS && ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) { x86_platform.calibrate_tsc = hv_get_tsc_khz; x86_platform.calibrate_cpu = hv_get_tsc_khz; @@ -270,7 +275,7 @@ static void __init ms_hyperv_init_platform(void) crash_kexec_post_notifiers = true; #ifdef CONFIG_X86_LOCAL_APIC - if (ms_hyperv.features & HV_X64_ACCESS_FREQUENCY_MSRS && + if (ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS && ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) { /* * Get the APIC frequency. @@ -296,7 +301,7 @@ static void __init ms_hyperv_init_platform(void) machine_ops.shutdown = hv_machine_shutdown; machine_ops.crash_shutdown = hv_machine_crash_shutdown; #endif - if (ms_hyperv.features & HV_X64_ACCESS_TSC_INVARIANT) { + if (ms_hyperv.features & HV_ACCESS_TSC_INVARIANT) { wrmsrl(HV_X64_MSR_TSC_INVARIANT_CONTROL, 0x1); setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); } else { @@ -330,7 +335,7 @@ static void __init ms_hyperv_init_platform(void) alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_hyperv_callback); /* Setup the IDT for reenlightenment notifications */ - if (ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT) { + if (ms_hyperv.features & HV_ACCESS_REENLIGHTENMENT) { alloc_intr_gate(HYPERV_REENLIGHTENMENT_VECTOR, asm_sysvec_hyperv_reenlightenment); } diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 6a9df71c1b9e..e5f4ee8f4c3b 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -168,6 +168,7 @@ struct rdt_resource rdt_resources_all[] = { .name = "MB", .domains = domain_init(RDT_RESOURCE_MBA), .cache_level = 3, + .parse_ctrlval = parse_bw, .format_str = "%d=%*u", .fflags = RFTYPE_RES_MB, }, @@ -254,22 +255,30 @@ static bool __get_mem_config_intel(struct rdt_resource *r) { union cpuid_0x10_3_eax eax; union cpuid_0x10_x_edx edx; - u32 ebx, ecx; + u32 ebx, ecx, max_delay; cpuid_count(0x00000010, 3, &eax.full, &ebx, &ecx, &edx.full); r->num_closid = edx.split.cos_max + 1; - r->membw.max_delay = eax.split.max_delay + 1; + max_delay = eax.split.max_delay + 1; r->default_ctrl = MAX_MBA_BW; + r->membw.arch_needs_linear = true; if (ecx & MBA_IS_LINEAR) { r->membw.delay_linear = true; - r->membw.min_bw = MAX_MBA_BW - r->membw.max_delay; - r->membw.bw_gran = MAX_MBA_BW - r->membw.max_delay; + r->membw.min_bw = MAX_MBA_BW - max_delay; + r->membw.bw_gran = MAX_MBA_BW - max_delay; } else { if (!rdt_get_mb_table(r)) return false; + r->membw.arch_needs_linear = false; } r->data_width = 3; + if (boot_cpu_has(X86_FEATURE_PER_THREAD_MBA)) + r->membw.throttle_mode = THREAD_THROTTLE_PER_THREAD; + else + r->membw.throttle_mode = THREAD_THROTTLE_MAX; + thread_throttle_mode_init(); + r->alloc_capable = true; r->alloc_enabled = true; @@ -288,7 +297,13 @@ static bool __rdt_get_mem_config_amd(struct rdt_resource *r) /* AMD does not use delay */ r->membw.delay_linear = false; + r->membw.arch_needs_linear = false; + /* + * AMD does not use memory delay throttle model to control + * the allocation like Intel does. + */ + r->membw.throttle_mode = THREAD_THROTTLE_UNDEFINED; r->membw.min_bw = 0; r->membw.bw_gran = 1; /* Max value is 2048, Data width should be 4 in decimal */ @@ -346,19 +361,6 @@ static void rdt_get_cdp_l2_config(void) rdt_get_cdp_config(RDT_RESOURCE_L2, RDT_RESOURCE_L2CODE); } -static int get_cache_id(int cpu, int level) -{ - struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu); - int i; - - for (i = 0; i < ci->num_leaves; i++) { - if (ci->info_list[i].level == level) - return ci->info_list[i].id; - } - - return -1; -} - static void mba_wrmsr_amd(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) { @@ -556,13 +558,13 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) */ static void domain_add_cpu(int cpu, struct rdt_resource *r) { - int id = get_cache_id(cpu, r->cache_level); + int id = get_cpu_cacheinfo_id(cpu, r->cache_level); struct list_head *add_pos = NULL; struct rdt_domain *d; d = rdt_find_domain(r, id, &add_pos); if (IS_ERR(d)) { - pr_warn("Could't find cache id for cpu %d\n", cpu); + pr_warn("Couldn't find cache id for CPU %d\n", cpu); return; } @@ -602,12 +604,12 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) static void domain_remove_cpu(int cpu, struct rdt_resource *r) { - int id = get_cache_id(cpu, r->cache_level); + int id = get_cpu_cacheinfo_id(cpu, r->cache_level); struct rdt_domain *d; d = rdt_find_domain(r, id, NULL); if (IS_ERR_OR_NULL(d)) { - pr_warn("Could't find cache id for cpu %d\n", cpu); + pr_warn("Couldn't find cache id for CPU %d\n", cpu); return; } @@ -918,12 +920,12 @@ static __init void rdt_init_res_defs_intel(void) r->rid == RDT_RESOURCE_L3CODE || r->rid == RDT_RESOURCE_L2 || r->rid == RDT_RESOURCE_L2DATA || - r->rid == RDT_RESOURCE_L2CODE) - r->cbm_validate = cbm_validate_intel; - else if (r->rid == RDT_RESOURCE_MBA) { + r->rid == RDT_RESOURCE_L2CODE) { + r->cache.arch_has_sparse_bitmaps = false; + r->cache.arch_has_empty_bitmaps = false; + } else if (r->rid == RDT_RESOURCE_MBA) { r->msr_base = MSR_IA32_MBA_THRTL_BASE; r->msr_update = mba_wrmsr_intel; - r->parse_ctrlval = parse_bw_intel; } } } @@ -938,12 +940,12 @@ static __init void rdt_init_res_defs_amd(void) r->rid == RDT_RESOURCE_L3CODE || r->rid == RDT_RESOURCE_L2 || r->rid == RDT_RESOURCE_L2DATA || - r->rid == RDT_RESOURCE_L2CODE) - r->cbm_validate = cbm_validate_amd; - else if (r->rid == RDT_RESOURCE_MBA) { + r->rid == RDT_RESOURCE_L2CODE) { + r->cache.arch_has_sparse_bitmaps = true; + r->cache.arch_has_empty_bitmaps = true; + } else if (r->rid == RDT_RESOURCE_MBA) { r->msr_base = MSR_IA32_MBA_BW_BASE; r->msr_update = mba_wrmsr_amd; - r->parse_ctrlval = parse_bw_amd; } } } diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 934c8fb8a64a..c877642e8a14 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -23,53 +23,6 @@ /* * Check whether MBA bandwidth percentage value is correct. The value is - * checked against the minimum and maximum bandwidth values specified by - * the hardware. The allocated bandwidth percentage is rounded to the next - * control step available on the hardware. - */ -static bool bw_validate_amd(char *buf, unsigned long *data, - struct rdt_resource *r) -{ - unsigned long bw; - int ret; - - ret = kstrtoul(buf, 10, &bw); - if (ret) { - rdt_last_cmd_printf("Non-decimal digit in MB value %s\n", buf); - return false; - } - - if (bw < r->membw.min_bw || bw > r->default_ctrl) { - rdt_last_cmd_printf("MB value %ld out of range [%d,%d]\n", bw, - r->membw.min_bw, r->default_ctrl); - return false; - } - - *data = roundup(bw, (unsigned long)r->membw.bw_gran); - return true; -} - -int parse_bw_amd(struct rdt_parse_data *data, struct rdt_resource *r, - struct rdt_domain *d) -{ - unsigned long bw_val; - - if (d->have_new_ctrl) { - rdt_last_cmd_printf("Duplicate domain %d\n", d->id); - return -EINVAL; - } - - if (!bw_validate_amd(data->buf, &bw_val, r)) - return -EINVAL; - - d->new_ctrl = bw_val; - d->have_new_ctrl = true; - - return 0; -} - -/* - * Check whether MBA bandwidth percentage value is correct. The value is * checked against the minimum and max bandwidth values specified by the * hardware. The allocated bandwidth percentage is rounded to the next * control step available on the hardware. @@ -82,7 +35,7 @@ static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r) /* * Only linear delay values is supported for current Intel SKUs. */ - if (!r->membw.delay_linear) { + if (!r->membw.delay_linear && r->membw.arch_needs_linear) { rdt_last_cmd_puts("No support for non-linear MB domains\n"); return false; } @@ -104,8 +57,8 @@ static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r) return true; } -int parse_bw_intel(struct rdt_parse_data *data, struct rdt_resource *r, - struct rdt_domain *d) +int parse_bw(struct rdt_parse_data *data, struct rdt_resource *r, + struct rdt_domain *d) { unsigned long bw_val; @@ -123,12 +76,14 @@ int parse_bw_intel(struct rdt_parse_data *data, struct rdt_resource *r, } /* - * Check whether a cache bit mask is valid. The SDM says: + * Check whether a cache bit mask is valid. + * For Intel the SDM says: * Please note that all (and only) contiguous '1' combinations * are allowed (e.g. FFFFH, 0FF0H, 003CH, etc.). * Additionally Haswell requires at least two bits set. + * AMD allows non-contiguous bitmasks. */ -bool cbm_validate_intel(char *buf, u32 *data, struct rdt_resource *r) +static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) { unsigned long first_bit, zero_bit, val; unsigned int cbm_len = r->cache.cbm_len; @@ -140,7 +95,8 @@ bool cbm_validate_intel(char *buf, u32 *data, struct rdt_resource *r) return false; } - if (val == 0 || val > r->default_ctrl) { + if ((!r->cache.arch_has_empty_bitmaps && val == 0) || + val > r->default_ctrl) { rdt_last_cmd_puts("Mask out of range\n"); return false; } @@ -148,7 +104,9 @@ bool cbm_validate_intel(char *buf, u32 *data, struct rdt_resource *r) first_bit = find_first_bit(&val, cbm_len); zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); - if (find_next_bit(&val, cbm_len, zero_bit) < cbm_len) { + /* Are non-contiguous bitmaps allowed? */ + if (!r->cache.arch_has_sparse_bitmaps && + (find_next_bit(&val, cbm_len, zero_bit) < cbm_len)) { rdt_last_cmd_printf("The mask %lx has non-consecutive 1-bits\n", val); return false; } @@ -164,30 +122,6 @@ bool cbm_validate_intel(char *buf, u32 *data, struct rdt_resource *r) } /* - * Check whether a cache bit mask is valid. AMD allows non-contiguous - * bitmasks - */ -bool cbm_validate_amd(char *buf, u32 *data, struct rdt_resource *r) -{ - unsigned long val; - int ret; - - ret = kstrtoul(buf, 16, &val); - if (ret) { - rdt_last_cmd_printf("Non-hex character in the mask %s\n", buf); - return false; - } - - if (val > r->default_ctrl) { - rdt_last_cmd_puts("Mask out of range\n"); - return false; - } - - *data = val; - return true; -} - -/* * Read one cache bit mask (hex). Check that it is valid for the current * resource type. */ @@ -212,7 +146,7 @@ int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r, return -EINVAL; } - if (!r->cbm_validate(data->buf, &cbm_val, r)) + if (!cbm_validate(data->buf, &cbm_val, r)) return -EINVAL; if ((rdtgrp->mode == RDT_MODE_EXCLUSIVE || diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 5ffa32256b3b..80fa997fae60 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -283,7 +283,6 @@ struct rftype { * struct mbm_state - status for each MBM counter in each domain * @chunks: Total data moved (multiply by rdt_group.mon_scale to get bytes) * @prev_msr Value of IA32_QM_CTR for this RMID last time we read it - * @chunks_bw Total local data moved. Used for bandwidth calculation * @prev_bw_msr:Value of previous IA32_QM_CTR for bandwidth counting * @prev_bw The most recent bandwidth in MBps * @delta_bw Difference between the current and previous bandwidth @@ -292,7 +291,6 @@ struct rftype { struct mbm_state { u64 chunks; u64 prev_msr; - u64 chunks_bw; u64 prev_bw_msr; u32 prev_bw; u32 delta_bw; @@ -360,6 +358,8 @@ struct msr_param { * in a cache bit mask * @shareable_bits: Bitmask of shareable resource with other * executing entities + * @arch_has_sparse_bitmaps: True if a bitmap like f00f is valid. + * @arch_has_empty_bitmaps: True if the '0' bitmap is valid. */ struct rdt_cache { unsigned int cbm_len; @@ -367,25 +367,43 @@ struct rdt_cache { unsigned int cbm_idx_mult; unsigned int cbm_idx_offset; unsigned int shareable_bits; + bool arch_has_sparse_bitmaps; + bool arch_has_empty_bitmaps; +}; + +/** + * enum membw_throttle_mode - System's memory bandwidth throttling mode + * @THREAD_THROTTLE_UNDEFINED: Not relevant to the system + * @THREAD_THROTTLE_MAX: Memory bandwidth is throttled at the core + * always using smallest bandwidth percentage + * assigned to threads, aka "max throttling" + * @THREAD_THROTTLE_PER_THREAD: Memory bandwidth is throttled at the thread + */ +enum membw_throttle_mode { + THREAD_THROTTLE_UNDEFINED = 0, + THREAD_THROTTLE_MAX, + THREAD_THROTTLE_PER_THREAD, }; /** * struct rdt_membw - Memory bandwidth allocation related data - * @max_delay: Max throttle delay. Delay is the hardware - * representation for memory bandwidth. * @min_bw: Minimum memory bandwidth percentage user can request * @bw_gran: Granularity at which the memory bandwidth is allocated * @delay_linear: True if memory B/W delay is in linear scale + * @arch_needs_linear: True if we can't configure non-linear resources + * @throttle_mode: Bandwidth throttling mode when threads request + * different memory bandwidths * @mba_sc: True if MBA software controller(mba_sc) is enabled * @mb_map: Mapping of memory B/W percentage to memory B/W delay */ struct rdt_membw { - u32 max_delay; - u32 min_bw; - u32 bw_gran; - u32 delay_linear; - bool mba_sc; - u32 *mb_map; + u32 min_bw; + u32 bw_gran; + u32 delay_linear; + bool arch_needs_linear; + enum membw_throttle_mode throttle_mode; + bool mba_sc; + u32 *mb_map; }; static inline bool is_llc_occupancy_enabled(void) @@ -437,7 +455,6 @@ struct rdt_parse_data { * @cache: Cache allocation related data * @format_str: Per resource format string to show domain value * @parse_ctrlval: Per resource function pointer to parse control values - * @cbm_validate Cache bitmask validate function * @evt_list: List of monitoring events * @num_rmid: Number of RMIDs available * @mon_scale: cqm counter * mon_scale = occupancy in bytes @@ -464,7 +481,6 @@ struct rdt_resource { int (*parse_ctrlval)(struct rdt_parse_data *data, struct rdt_resource *r, struct rdt_domain *d); - bool (*cbm_validate)(char *buf, u32 *data, struct rdt_resource *r); struct list_head evt_list; int num_rmid; unsigned int mon_scale; @@ -474,10 +490,8 @@ struct rdt_resource { int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r, struct rdt_domain *d); -int parse_bw_intel(struct rdt_parse_data *data, struct rdt_resource *r, - struct rdt_domain *d); -int parse_bw_amd(struct rdt_parse_data *data, struct rdt_resource *r, - struct rdt_domain *d); +int parse_bw(struct rdt_parse_data *data, struct rdt_resource *r, + struct rdt_domain *d); extern struct mutex rdtgroup_mutex; @@ -609,8 +623,7 @@ void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms); void cqm_handle_limbo(struct work_struct *work); bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d); void __check_limbo(struct rdt_domain *d, bool force_free); -bool cbm_validate_intel(char *buf, u32 *data, struct rdt_resource *r); -bool cbm_validate_amd(char *buf, u32 *data, struct rdt_resource *r); void rdt_domain_reconfigure_cdp(struct rdt_resource *r); +void __init thread_throttle_mode_init(void); #endif /* _ASM_X86_RESCTRL_INTERNAL_H */ diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 837d7d012b7b..54dffe574e67 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -279,8 +279,7 @@ static void mbm_bw_count(u32 rmid, struct rmid_read *rr) return; chunks = mbm_overflow_count(m->prev_bw_msr, tval, rr->r->mbm_width); - m->chunks_bw += chunks; - m->chunks = m->chunks_bw; + m->chunks += chunks; cur_bw = (chunks * r->mon_scale) >> 20; if (m->delta_comp) @@ -478,19 +477,13 @@ void cqm_handle_limbo(struct work_struct *work) mutex_lock(&rdtgroup_mutex); r = &rdt_resources_all[RDT_RESOURCE_L3]; - d = get_domain_from_cpu(cpu, r); - - if (!d) { - pr_warn_once("Failure to get domain for limbo worker\n"); - goto out_unlock; - } + d = container_of(work, struct rdt_domain, cqm_limbo.work); __check_limbo(d, false); if (has_busy_rmid(r, d)) schedule_delayed_work_on(cpu, &d->cqm_limbo, delay); -out_unlock: mutex_unlock(&rdtgroup_mutex); } @@ -520,10 +513,7 @@ void mbm_handle_overflow(struct work_struct *work) goto out_unlock; r = &rdt_resources_all[RDT_RESOURCE_L3]; - - d = get_domain_from_cpu(cpu, r); - if (!d) - goto out_unlock; + d = container_of(work, struct rdt_domain, mbm_over.work); list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { mbm_update(r, d, prgrp->mon.rmid); diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 3f844f14fc0a..af323e2e3100 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -561,7 +561,7 @@ static int __rdtgroup_move_task(struct task_struct *tsk, * callback has been invoked. */ atomic_inc(&rdtgrp->waitcount); - ret = task_work_add(tsk, &callback->work, true); + ret = task_work_add(tsk, &callback->work, TWA_RESUME); if (ret) { /* * Task is exiting. Drop the refcount and free the callback. @@ -592,6 +592,18 @@ static int __rdtgroup_move_task(struct task_struct *tsk, return ret; } +static bool is_closid_match(struct task_struct *t, struct rdtgroup *r) +{ + return (rdt_alloc_capable && + (r->type == RDTCTRL_GROUP) && (t->closid == r->closid)); +} + +static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r) +{ + return (rdt_mon_capable && + (r->type == RDTMON_GROUP) && (t->rmid == r->mon.rmid)); +} + /** * rdtgroup_tasks_assigned - Test if tasks have been assigned to resource group * @r: Resource group @@ -607,8 +619,7 @@ int rdtgroup_tasks_assigned(struct rdtgroup *r) rcu_read_lock(); for_each_process_thread(p, t) { - if ((r->type == RDTCTRL_GROUP && t->closid == r->closid) || - (r->type == RDTMON_GROUP && t->rmid == r->mon.rmid)) { + if (is_closid_match(t, r) || is_rmid_match(t, r)) { ret = 1; break; } @@ -706,8 +717,7 @@ static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s) rcu_read_lock(); for_each_process_thread(p, t) { - if ((r->type == RDTCTRL_GROUP && t->closid == r->closid) || - (r->type == RDTMON_GROUP && t->rmid == r->mon.rmid)) + if (is_closid_match(t, r) || is_rmid_match(t, r)) seq_printf(s, "%d\n", t->pid); } rcu_read_unlock(); @@ -1017,6 +1027,19 @@ static int max_threshold_occ_show(struct kernfs_open_file *of, return 0; } +static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + if (r->membw.throttle_mode == THREAD_THROTTLE_PER_THREAD) + seq_puts(seq, "per-thread\n"); + else + seq_puts(seq, "max\n"); + + return 0; +} + static ssize_t max_threshold_occ_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { @@ -1513,6 +1536,17 @@ static struct rftype res_common_files[] = { .seq_show = rdt_delay_linear_show, .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, }, + /* + * Platform specific which (if any) capabilities are provided by + * thread_throttle_mode. Defer "fflags" initialization to platform + * discovery. + */ + { + .name = "thread_throttle_mode", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_thread_throttle_mode_show, + }, { .name = "max_threshold_occupancy", .mode = 0644, @@ -1583,7 +1617,7 @@ static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags) lockdep_assert_held(&rdtgroup_mutex); for (rft = rfts; rft < rfts + len; rft++) { - if ((fflags & rft->fflags) == rft->fflags) { + if (rft->fflags && ((fflags & rft->fflags) == rft->fflags)) { ret = rdtgroup_add_file(kn, rft); if (ret) goto error; @@ -1600,6 +1634,33 @@ error: return ret; } +static struct rftype *rdtgroup_get_rftype_by_name(const char *name) +{ + struct rftype *rfts, *rft; + int len; + + rfts = res_common_files; + len = ARRAY_SIZE(res_common_files); + + for (rft = rfts; rft < rfts + len; rft++) { + if (!strcmp(rft->name, name)) + return rft; + } + + return NULL; +} + +void __init thread_throttle_mode_init(void) +{ + struct rftype *rft; + + rft = rdtgroup_get_rftype_by_name("thread_throttle_mode"); + if (!rft) + return; + + rft->fflags = RF_CTRL_INFO | RFTYPE_RES_MB; +} + /** * rdtgroup_kn_mode_restrict - Restrict user access to named resctrl file * @r: The resource group with which the file is associated. @@ -2245,18 +2306,6 @@ static int reset_all_ctrls(struct rdt_resource *r) return 0; } -static bool is_closid_match(struct task_struct *t, struct rdtgroup *r) -{ - return (rdt_alloc_capable && - (r->type == RDTCTRL_GROUP) && (t->closid == r->closid)); -} - -static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r) -{ - return (rdt_mon_capable && - (r->type == RDTMON_GROUP) && (t->rmid == r->mon.rmid)); -} - /* * Move tasks from one to the other group. If @from is NULL, then all tasks * in the systems are moved unconditionally (used for teardown). @@ -3196,7 +3245,7 @@ int __init rdtgroup_init(void) * It may also be ok since that would enable debugging of RDT before * resctrl is mounted. * The reason why the debugfs directory is created here and not in - * rdt_mount() is because rdt_mount() takes rdtgroup_mutex and + * rdt_get_tree() is because rdt_get_tree() takes rdtgroup_mutex and * during the debugfs directory creation also &sb->s_type->i_mutex_key * (the lockdep class of inode->i_rwsem). Other filesystem * interactions (eg. SyS_getdents) have the lock ordering: diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 62b137c3c97a..866c9a9bcdee 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -35,12 +35,15 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 }, { X86_FEATURE_CDP_L2, CPUID_ECX, 2, 0x00000010, 2 }, { X86_FEATURE_MBA, CPUID_EBX, 3, 0x00000010, 0 }, + { X86_FEATURE_PER_THREAD_MBA, CPUID_ECX, 0, 0x00000010, 3 }, { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 }, { X86_FEATURE_SME, CPUID_EAX, 0, 0x8000001f, 0 }, { X86_FEATURE_SEV, CPUID_EAX, 1, 0x8000001f, 0 }, + { X86_FEATURE_SEV_ES, CPUID_EAX, 3, 0x8000001f, 0 }, + { X86_FEATURE_SME_COHERENT, CPUID_EAX, 10, 0x8000001f, 0 }, { 0, 0, 0, 0, 0 } }; diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 9b6fafa69be9..924571fe5864 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -33,6 +33,7 @@ #include <asm/timer.h> #include <asm/apic.h> #include <asm/vmware.h> +#include <asm/svm.h> #undef pr_fmt #define pr_fmt(fmt) "vmware: " fmt @@ -476,10 +477,49 @@ static bool __init vmware_legacy_x2apic_available(void) (eax & (1 << VMWARE_CMD_LEGACY_X2APIC)) != 0; } +#ifdef CONFIG_AMD_MEM_ENCRYPT +static void vmware_sev_es_hcall_prepare(struct ghcb *ghcb, + struct pt_regs *regs) +{ + /* Copy VMWARE specific Hypercall parameters to the GHCB */ + ghcb_set_rip(ghcb, regs->ip); + ghcb_set_rbx(ghcb, regs->bx); + ghcb_set_rcx(ghcb, regs->cx); + ghcb_set_rdx(ghcb, regs->dx); + ghcb_set_rsi(ghcb, regs->si); + ghcb_set_rdi(ghcb, regs->di); + ghcb_set_rbp(ghcb, regs->bp); +} + +static bool vmware_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs) +{ + if (!(ghcb_rbx_is_valid(ghcb) && + ghcb_rcx_is_valid(ghcb) && + ghcb_rdx_is_valid(ghcb) && + ghcb_rsi_is_valid(ghcb) && + ghcb_rdi_is_valid(ghcb) && + ghcb_rbp_is_valid(ghcb))) + return false; + + regs->bx = ghcb->save.rbx; + regs->cx = ghcb->save.rcx; + regs->dx = ghcb->save.rdx; + regs->si = ghcb->save.rsi; + regs->di = ghcb->save.rdi; + regs->bp = ghcb->save.rbp; + + return true; +} +#endif + const __initconst struct hypervisor_x86 x86_hyper_vmware = { - .name = "VMware", - .detect = vmware_platform, - .type = X86_HYPER_VMWARE, - .init.init_platform = vmware_platform_setup, - .init.x2apic_available = vmware_legacy_x2apic_available, + .name = "VMware", + .detect = vmware_platform, + .type = X86_HYPER_VMWARE, + .init.init_platform = vmware_platform_setup, + .init.x2apic_available = vmware_legacy_x2apic_available, +#ifdef CONFIG_AMD_MEM_ENCRYPT + .runtime.sev_es_hcall_prepare = vmware_sev_es_hcall_prepare, + .runtime.sev_es_hcall_finish = vmware_sev_es_hcall_finish, +#endif }; diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index a0e8fc7d85f1..ddffd80f5c52 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -229,8 +229,8 @@ static int dt_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, it = &of_ioapic_type[type_index]; ioapic_set_alloc_attr(&tmp, NUMA_NO_NODE, it->trigger, it->polarity); - tmp.ioapic_id = mpc_ioapic_id(mp_irqdomain_ioapic_idx(domain)); - tmp.ioapic_pin = fwspec->param[0]; + tmp.devid = mpc_ioapic_id(mp_irqdomain_ioapic_idx(domain)); + tmp.ioapic.pin = fwspec->param[0]; return mp_irqdomain_alloc(domain, virq, nr_irqs, &tmp); } diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 48ce44576947..25c06b67e7e0 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -29,8 +29,8 @@ static int die_counter; static struct pt_regs exec_summary_regs; -bool in_task_stack(unsigned long *stack, struct task_struct *task, - struct stack_info *info) +bool noinstr in_task_stack(unsigned long *stack, struct task_struct *task, + struct stack_info *info) { unsigned long *begin = task_stack_page(task); unsigned long *end = task_stack_page(task) + THREAD_SIZE; @@ -46,7 +46,8 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task, return true; } -bool in_entry_stack(unsigned long *stack, struct stack_info *info) +/* Called from get_stack_info_noinstr - so must be noinstr too */ +bool noinstr in_entry_stack(unsigned long *stack, struct stack_info *info) { struct entry_stack *ss = cpu_entry_stack(smp_processor_id()); @@ -115,7 +116,8 @@ void show_opcodes(struct pt_regs *regs, const char *loglvl) unsigned long prologue = regs->ip - PROLOGUE_SIZE; if (copy_code(regs, opcodes, prologue, sizeof(opcodes))) { - printk("%sCode: Bad RIP value.\n", loglvl); + printk("%sCode: Unable to access opcode bytes at RIP 0x%lx.\n", + loglvl, prologue); } else { printk("%sCode: %" __stringify(PROLOGUE_SIZE) "ph <%02x> %" __stringify(EPILOGUE_SIZE) "ph\n", loglvl, opcodes, diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 4a94d38cd141..1dd851397bd9 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -24,11 +24,13 @@ static const char * const exception_stack_names[] = { [ ESTACK_NMI ] = "NMI", [ ESTACK_DB ] = "#DB", [ ESTACK_MCE ] = "#MC", + [ ESTACK_VC ] = "#VC", + [ ESTACK_VC2 ] = "#VC2", }; const char *stack_type_name(enum stack_type type) { - BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); + BUILD_BUG_ON(N_EXCEPTION_STACKS != 6); if (type == STACK_TYPE_IRQ) return "IRQ"; @@ -79,16 +81,18 @@ struct estack_pages estack_pages[CEA_ESTACK_PAGES] ____cacheline_aligned = { EPAGERANGE(NMI), EPAGERANGE(DB), EPAGERANGE(MCE), + EPAGERANGE(VC), + EPAGERANGE(VC2), }; -static bool in_exception_stack(unsigned long *stack, struct stack_info *info) +static __always_inline bool in_exception_stack(unsigned long *stack, struct stack_info *info) { unsigned long begin, end, stk = (unsigned long)stack; const struct estack_pages *ep; struct pt_regs *regs; unsigned int k; - BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); + BUILD_BUG_ON(N_EXCEPTION_STACKS != 6); begin = (unsigned long)__this_cpu_read(cea_exception_stacks); /* @@ -122,7 +126,7 @@ static bool in_exception_stack(unsigned long *stack, struct stack_info *info) return true; } -static bool in_irq_stack(unsigned long *stack, struct stack_info *info) +static __always_inline bool in_irq_stack(unsigned long *stack, struct stack_info *info) { unsigned long *end = (unsigned long *)this_cpu_read(hardirq_stack_ptr); unsigned long *begin = end - (IRQ_STACK_SIZE / sizeof(long)); @@ -147,32 +151,38 @@ static bool in_irq_stack(unsigned long *stack, struct stack_info *info) return true; } -int get_stack_info(unsigned long *stack, struct task_struct *task, - struct stack_info *info, unsigned long *visit_mask) +bool noinstr get_stack_info_noinstr(unsigned long *stack, struct task_struct *task, + struct stack_info *info) { - if (!stack) - goto unknown; - - task = task ? : current; - if (in_task_stack(stack, task, info)) - goto recursion_check; + return true; if (task != current) - goto unknown; + return false; if (in_exception_stack(stack, info)) - goto recursion_check; + return true; if (in_irq_stack(stack, info)) - goto recursion_check; + return true; if (in_entry_stack(stack, info)) - goto recursion_check; + return true; + + return false; +} + +int get_stack_info(unsigned long *stack, struct task_struct *task, + struct stack_info *info, unsigned long *visit_mask) +{ + task = task ? : current; - goto unknown; + if (!stack) + goto unknown; + + if (!get_stack_info_noinstr(stack, task, info)) + goto unknown; -recursion_check: /* * Make sure we don't iterate through any given stack more than once. * If it comes up a second time then there's something wrong going on: diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 983cd53ed4c9..22aad412f965 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -305,6 +305,20 @@ static int __init cpcompare(const void *a, const void *b) return (ap->addr != ap->entry->addr) - (bp->addr != bp->entry->addr); } +static bool e820_nomerge(enum e820_type type) +{ + /* + * These types may indicate distinct platform ranges aligned to + * numa node, protection domain, performance domain, or other + * boundaries. Do not merge them. + */ + if (type == E820_TYPE_PRAM) + return true; + if (type == E820_TYPE_SOFT_RESERVED) + return true; + return false; +} + int __init e820__update_table(struct e820_table *table) { struct e820_entry *entries = table->entries; @@ -380,7 +394,7 @@ int __init e820__update_table(struct e820_table *table) } /* Continue building up new map based on this information: */ - if (current_type != last_type || current_type == E820_TYPE_PRAM) { + if (current_type != last_type || e820_nomerge(current_type)) { if (last_type != 0) { new_entries[new_nr_entries].size = change_point[chg_idx]->addr - last_addr; /* Move forward only if the new size was non-zero: */ diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 61ddc3a5e5c2..701f196d7c68 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -5,7 +5,6 @@ #include <asm/fpu/internal.h> #include <asm/tlbflush.h> #include <asm/setup.h> -#include <asm/cmdline.h> #include <linux/sched.h> #include <linux/sched/task.h> @@ -238,51 +237,11 @@ static void __init fpu__init_system_ctx_switch(void) } /* - * We parse fpu parameters early because fpu__init_system() is executed - * before parse_early_param(). - */ -static void __init fpu__init_parse_early_param(void) -{ - char arg[32]; - char *argptr = arg; - int bit; - -#ifdef CONFIG_X86_32 - if (cmdline_find_option_bool(boot_command_line, "no387")) -#ifdef CONFIG_MATH_EMULATION - setup_clear_cpu_cap(X86_FEATURE_FPU); -#else - pr_err("Option 'no387' required CONFIG_MATH_EMULATION enabled.\n"); -#endif - - if (cmdline_find_option_bool(boot_command_line, "nofxsr")) - setup_clear_cpu_cap(X86_FEATURE_FXSR); -#endif - - if (cmdline_find_option_bool(boot_command_line, "noxsave")) - setup_clear_cpu_cap(X86_FEATURE_XSAVE); - - if (cmdline_find_option_bool(boot_command_line, "noxsaveopt")) - setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); - - if (cmdline_find_option_bool(boot_command_line, "noxsaves")) - setup_clear_cpu_cap(X86_FEATURE_XSAVES); - - if (cmdline_find_option(boot_command_line, "clearcpuid", arg, - sizeof(arg)) && - get_option(&argptr, &bit) && - bit >= 0 && - bit < NCAPINTS * 32) - setup_clear_cpu_cap(bit); -} - -/* * Called on the boot CPU once per system bootup, to set up the initial * FPU state that is later cloned into all processes: */ void __init fpu__init_system(struct cpuinfo_x86 *c) { - fpu__init_parse_early_param(); fpu__init_system_early_generic(c); /* diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 038e19c0019e..5d8047441a0a 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -37,6 +37,7 @@ static const char *xfeature_names[] = "AVX-512 ZMM_Hi256" , "Processor Trace (unused)" , "Protection Keys User registers", + "PASID state", "unknown xstate feature" , }; @@ -51,6 +52,7 @@ static short xsave_cpuid_features[] __initdata = { X86_FEATURE_AVX512F, X86_FEATURE_INTEL_PT, X86_FEATURE_PKU, + X86_FEATURE_ENQCMD, }; /* @@ -318,6 +320,7 @@ static void __init print_xstate_features(void) print_xstate_feature(XFEATURE_MASK_ZMM_Hi256); print_xstate_feature(XFEATURE_MASK_Hi16_ZMM); print_xstate_feature(XFEATURE_MASK_PKRU); + print_xstate_feature(XFEATURE_MASK_PASID); } /* @@ -592,6 +595,7 @@ static void check_xstate_against_struct(int nr) XCHECK_SZ(sz, nr, XFEATURE_ZMM_Hi256, struct avx_512_zmm_uppers_state); XCHECK_SZ(sz, nr, XFEATURE_Hi16_ZMM, struct avx_512_hi16_state); XCHECK_SZ(sz, nr, XFEATURE_PKRU, struct pkru_state); + XCHECK_SZ(sz, nr, XFEATURE_PASID, struct ia32_pasid_state); /* * Make *SURE* to add any feature numbers in below if @@ -601,7 +605,7 @@ static void check_xstate_against_struct(int nr) if ((nr < XFEATURE_YMM) || (nr >= XFEATURE_MAX) || (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR) || - ((nr >= XFEATURE_RSRVD_COMP_10) && (nr <= XFEATURE_LBR))) { + ((nr >= XFEATURE_RSRVD_COMP_11) && (nr <= XFEATURE_LBR))) { WARN_ONCE(1, "no structure for xstate: %d\n", nr); XSTATE_WARN_ON(1); } @@ -1398,3 +1402,60 @@ int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns, return 0; } #endif /* CONFIG_PROC_PID_ARCH_STATUS */ + +#ifdef CONFIG_IOMMU_SUPPORT +void update_pasid(void) +{ + u64 pasid_state; + u32 pasid; + + if (!cpu_feature_enabled(X86_FEATURE_ENQCMD)) + return; + + if (!current->mm) + return; + + pasid = READ_ONCE(current->mm->pasid); + /* Set the valid bit in the PASID MSR/state only for valid pasid. */ + pasid_state = pasid == PASID_DISABLED ? + pasid : pasid | MSR_IA32_PASID_VALID; + + /* + * No need to hold fregs_lock() since the task's fpstate won't + * be changed by others (e.g. ptrace) while the task is being + * switched to or is in IPI. + */ + if (!test_thread_flag(TIF_NEED_FPU_LOAD)) { + /* The MSR is active and can be directly updated. */ + wrmsrl(MSR_IA32_PASID, pasid_state); + } else { + struct fpu *fpu = ¤t->thread.fpu; + struct ia32_pasid_state *ppasid_state; + struct xregs_state *xsave; + + /* + * The CPU's xstate registers are not currently active. Just + * update the PASID state in the memory buffer here. The + * PASID MSR will be loaded when returning to user mode. + */ + xsave = &fpu->state.xsave; + xsave->header.xfeatures |= XFEATURE_MASK_PASID; + ppasid_state = get_xsave_addr(xsave, XFEATURE_PASID); + /* + * Since XFEATURE_MASK_PASID is set in xfeatures, ppasid_state + * won't be NULL and no need to check its value. + * + * Only update the task's PASID state when it's different + * from the mm's pasid. + */ + if (ppasid_state->pasid != pasid_state) { + /* + * Invalid fpregs so that state restoring will pick up + * the PASID state. + */ + __fpu_invalidate_fpregs_state(fpu); + ppasid_state->pasid = pasid_state; + } + } +} +#endif /* CONFIG_IOMMU_SUPPORT */ diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index cbb71c1b574f..05e117137b45 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -36,6 +36,11 @@ #include <asm/microcode.h> #include <asm/kasan.h> #include <asm/fixmap.h> +#include <asm/realmode.h> +#include <asm/desc.h> +#include <asm/extable.h> +#include <asm/trapnr.h> +#include <asm/sev-es.h> /* * Manage page tables very early on. @@ -61,7 +66,25 @@ unsigned long vmemmap_base __ro_after_init = __VMEMMAP_BASE_L4; EXPORT_SYMBOL(vmemmap_base); #endif -#define __head __section(.head.text) +/* + * GDT used on the boot CPU before switching to virtual addresses. + */ +static struct desc_struct startup_gdt[GDT_ENTRIES] = { + [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff), + [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff), + [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc093, 0, 0xfffff), +}; + +/* + * Address needs to be set at runtime because it references the startup_gdt + * while the kernel still uses a direct mapping. + */ +static struct desc_ptr startup_gdt_descr = { + .size = sizeof(startup_gdt), + .address = 0, +}; + +#define __head __section(".head.text") static void __head *fixup_pointer(void *ptr, unsigned long physaddr) { @@ -297,7 +320,7 @@ static void __init reset_early_page_tables(void) } /* Create a new PMD entry */ -int __init __early_make_pgtable(unsigned long address, pmdval_t pmd) +bool __init __early_make_pgtable(unsigned long address, pmdval_t pmd) { unsigned long physaddr = address - __PAGE_OFFSET; pgdval_t pgd, *pgd_p; @@ -307,7 +330,7 @@ int __init __early_make_pgtable(unsigned long address, pmdval_t pmd) /* Invalid address or early pgt is done ? */ if (physaddr >= MAXMEM || read_cr3_pa() != __pa_nodebug(early_top_pgt)) - return -1; + return false; again: pgd_p = &early_top_pgt[pgd_index(address)].pgd; @@ -364,10 +387,10 @@ again: } pmd_p[pmd_index(address)] = pmd; - return 0; + return true; } -int __init early_make_pgtable(unsigned long address) +static bool __init early_make_pgtable(unsigned long address) { unsigned long physaddr = address - __PAGE_OFFSET; pmdval_t pmd; @@ -377,6 +400,19 @@ int __init early_make_pgtable(unsigned long address) return __early_make_pgtable(address, pmd); } +void __init do_early_exception(struct pt_regs *regs, int trapnr) +{ + if (trapnr == X86_TRAP_PF && + early_make_pgtable(native_read_cr2())) + return; + + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT) && + trapnr == X86_TRAP_VC && handle_vc_boot_ghcb(regs)) + return; + + early_fixup_exception(regs, trapnr); +} + /* Don't add a printk in there. printk relies on the PDA which is not initialized yet. */ static void __init clear_bss(void) @@ -489,3 +525,81 @@ void __init x86_64_start_reservations(char *real_mode_data) start_kernel(); } + +/* + * Data structures and code used for IDT setup in head_64.S. The bringup-IDT is + * used until the idt_table takes over. On the boot CPU this happens in + * x86_64_start_kernel(), on secondary CPUs in start_secondary(). In both cases + * this happens in the functions called from head_64.S. + * + * The idt_table can't be used that early because all the code modifying it is + * in idt.c and can be instrumented by tracing or KASAN, which both don't work + * during early CPU bringup. Also the idt_table has the runtime vectors + * configured which require certain CPU state to be setup already (like TSS), + * which also hasn't happened yet in early CPU bringup. + */ +static gate_desc bringup_idt_table[NUM_EXCEPTION_VECTORS] __page_aligned_data; + +static struct desc_ptr bringup_idt_descr = { + .size = (NUM_EXCEPTION_VECTORS * sizeof(gate_desc)) - 1, + .address = 0, /* Set at runtime */ +}; + +static void set_bringup_idt_handler(gate_desc *idt, int n, void *handler) +{ +#ifdef CONFIG_AMD_MEM_ENCRYPT + struct idt_data data; + gate_desc desc; + + init_idt_data(&data, n, handler); + idt_init_desc(&desc, &data); + native_write_idt_entry(idt, n, &desc); +#endif +} + +/* This runs while still in the direct mapping */ +static void startup_64_load_idt(unsigned long physbase) +{ + struct desc_ptr *desc = fixup_pointer(&bringup_idt_descr, physbase); + gate_desc *idt = fixup_pointer(bringup_idt_table, physbase); + + + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) { + void *handler; + + /* VMM Communication Exception */ + handler = fixup_pointer(vc_no_ghcb, physbase); + set_bringup_idt_handler(idt, X86_TRAP_VC, handler); + } + + desc->address = (unsigned long)idt; + native_load_idt(desc); +} + +/* This is used when running on kernel addresses */ +void early_setup_idt(void) +{ + /* VMM Communication Exception */ + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) + set_bringup_idt_handler(bringup_idt_table, X86_TRAP_VC, vc_boot_ghcb); + + bringup_idt_descr.address = (unsigned long)bringup_idt_table; + native_load_idt(&bringup_idt_descr); +} + +/* + * Setup boot CPU state needed before kernel switches to virtual addresses. + */ +void __head startup_64_setup_env(unsigned long physbase) +{ + /* Load GDT */ + startup_gdt_descr.address = (unsigned long)fixup_pointer(startup_gdt, physbase); + native_load_gdt(&startup_gdt_descr); + + /* New GDT is live - reload data segment registers */ + asm volatile("movl %%eax, %%ds\n" + "movl %%eax, %%ss\n" + "movl %%eax, %%es\n" : : "a"(__KERNEL_DS) : "memory"); + + startup_64_load_idt(physbase); +} diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 16da4ac01597..7eb2a1c87969 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -73,6 +73,20 @@ SYM_CODE_START_NOALIGN(startup_64) /* Set up the stack for verify_cpu(), similar to initial_stack below */ leaq (__end_init_task - SIZEOF_PTREGS)(%rip), %rsp + leaq _text(%rip), %rdi + pushq %rsi + call startup_64_setup_env + popq %rsi + + /* Now switch to __KERNEL_CS so IRET works reliably */ + pushq $__KERNEL_CS + leaq .Lon_kernel_cs(%rip), %rax + pushq %rax + lretq + +.Lon_kernel_cs: + UNWIND_HINT_EMPTY + /* Sanitize CPU configuration */ call verify_cpu @@ -112,6 +126,18 @@ SYM_CODE_START(secondary_startup_64) call verify_cpu /* + * The secondary_startup_64_no_verify entry point is only used by + * SEV-ES guests. In those guests the call to verify_cpu() would cause + * #VC exceptions which can not be handled at this stage of secondary + * CPU bringup. + * + * All non SEV-ES systems, especially Intel systems, need to execute + * verify_cpu() above to make sure NX is enabled. + */ +SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) + UNWIND_HINT_EMPTY + + /* * Retrieve the modifier (SME encryption mask if SME is active) to be * added to the initial pgdir entry that will be programmed into CR3. */ @@ -144,33 +170,6 @@ SYM_CODE_START(secondary_startup_64) 1: UNWIND_HINT_EMPTY - /* Check if nx is implemented */ - movl $0x80000001, %eax - cpuid - movl %edx,%edi - - /* Setup EFER (Extended Feature Enable Register) */ - movl $MSR_EFER, %ecx - rdmsr - btsl $_EFER_SCE, %eax /* Enable System Call */ - btl $20,%edi /* No Execute supported? */ - jnc 1f - btsl $_EFER_NX, %eax - btsq $_PAGE_BIT_NX,early_pmd_flags(%rip) -1: wrmsr /* Make changes effective */ - - /* Setup cr0 */ - movl $CR0_STATE, %eax - /* Make changes effective */ - movq %rax, %cr0 - - /* Setup a boot time stack */ - movq initial_stack(%rip), %rsp - - /* zero EFLAGS after setting rsp */ - pushq $0 - popfq - /* * We must switch to a new descriptor in kernel space for the GDT * because soon the kernel won't have access anymore to the userspace @@ -205,6 +204,41 @@ SYM_CODE_START(secondary_startup_64) movl initial_gs+4(%rip),%edx wrmsr + /* + * Setup a boot time stack - Any secondary CPU will have lost its stack + * by now because the cr3-switch above unmaps the real-mode stack + */ + movq initial_stack(%rip), %rsp + + /* Setup and Load IDT */ + pushq %rsi + call early_setup_idt + popq %rsi + + /* Check if nx is implemented */ + movl $0x80000001, %eax + cpuid + movl %edx,%edi + + /* Setup EFER (Extended Feature Enable Register) */ + movl $MSR_EFER, %ecx + rdmsr + btsl $_EFER_SCE, %eax /* Enable System Call */ + btl $20,%edi /* No Execute supported? */ + jnc 1f + btsl $_EFER_NX, %eax + btsq $_PAGE_BIT_NX,early_pmd_flags(%rip) +1: wrmsr /* Make changes effective */ + + /* Setup cr0 */ + movl $CR0_STATE, %eax + /* Make changes effective */ + movq %rax, %cr0 + + /* zero EFLAGS after setting rsp */ + pushq $0 + popfq + /* rsi is pointer to real mode structure with interesting info. pass it to C */ movq %rsi, %rdi @@ -259,11 +293,47 @@ SYM_CODE_START(start_cpu0) SYM_CODE_END(start_cpu0) #endif +#ifdef CONFIG_AMD_MEM_ENCRYPT +/* + * VC Exception handler used during early boot when running on kernel + * addresses, but before the switch to the idt_table can be made. + * The early_idt_handler_array can't be used here because it calls into a lot + * of __init code and this handler is also used during CPU offlining/onlining. + * Therefore this handler ends up in the .text section so that it stays around + * when .init.text is freed. + */ +SYM_CODE_START_NOALIGN(vc_boot_ghcb) + UNWIND_HINT_IRET_REGS offset=8 + + /* Build pt_regs */ + PUSH_AND_CLEAR_REGS + + /* Call C handler */ + movq %rsp, %rdi + movq ORIG_RAX(%rsp), %rsi + movq initial_vc_handler(%rip), %rax + ANNOTATE_RETPOLINE_SAFE + call *%rax + + /* Unwind pt_regs */ + POP_REGS + + /* Remove Error Code */ + addq $8, %rsp + + /* Pure iret required here - don't use INTERRUPT_RETURN */ + iretq +SYM_CODE_END(vc_boot_ghcb) +#endif + /* Both SMP bootup and ACPI suspend change these variables */ __REFDATA .balign 8 SYM_DATA(initial_code, .quad x86_64_start_kernel) SYM_DATA(initial_gs, .quad INIT_PER_CPU_VAR(fixed_percpu_data)) +#ifdef CONFIG_AMD_MEM_ENCRYPT +SYM_DATA(initial_vc_handler, .quad handle_vc_boot_ghcb) +#endif /* * The SIZEOF_PTREGS gap is a convention which helps the in-kernel unwinder @@ -319,22 +389,43 @@ SYM_CODE_START_LOCAL(early_idt_handler_common) pushq %r15 /* pt_regs->r15 */ UNWIND_HINT_REGS - cmpq $14,%rsi /* Page fault? */ - jnz 10f - GET_CR2_INTO(%rdi) /* can clobber %rax if pv */ - call early_make_pgtable - andl %eax,%eax - jz 20f /* All good */ - -10: movq %rsp,%rdi /* RDI = pt_regs; RSI is already trapnr */ - call early_fixup_exception + call do_early_exception -20: decl early_recursion_flag(%rip) jmp restore_regs_and_return_to_kernel SYM_CODE_END(early_idt_handler_common) +#ifdef CONFIG_AMD_MEM_ENCRYPT +/* + * VC Exception handler used during very early boot. The + * early_idt_handler_array can't be used because it returns via the + * paravirtualized INTERRUPT_RETURN and pv-ops don't work that early. + * + * This handler will end up in the .init.text section and not be + * available to boot secondary CPUs. + */ +SYM_CODE_START_NOALIGN(vc_no_ghcb) + UNWIND_HINT_IRET_REGS offset=8 + + /* Build pt_regs */ + PUSH_AND_CLEAR_REGS + + /* Call C handler */ + movq %rsp, %rdi + movq ORIG_RAX(%rsp), %rsi + call do_vc_no_ghcb + + /* Unwind pt_regs */ + POP_REGS + + /* Remove Error Code */ + addq $8, %rsp + + /* Pure iret required here - don't use INTERRUPT_RETURN */ + iretq +SYM_CODE_END(vc_no_ghcb) +#endif #define SYM_DATA_START_PAGE_ALIGNED(name) \ SYM_START(name, SYM_L_GLOBAL, .balign PAGE_SIZE) diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index b98ff620ba77..03aa33b58165 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -442,42 +442,6 @@ int hw_breakpoint_arch_parse(struct perf_event *bp, } /* - * Dump the debug register contents to the user. - * We can't dump our per cpu values because it - * may contain cpu wide breakpoint, something that - * doesn't belong to the current task. - * - * TODO: include non-ptrace user breakpoints (perf) - */ -void aout_dump_debugregs(struct user *dump) -{ - int i; - int dr7 = 0; - struct perf_event *bp; - struct arch_hw_breakpoint *info; - struct thread_struct *thread = ¤t->thread; - - for (i = 0; i < HBP_NUM; i++) { - bp = thread->ptrace_bps[i]; - - if (bp && !bp->attr.disabled) { - dump->u_debugreg[i] = bp->attr.bp_addr; - info = counter_arch_bp(bp); - dr7 |= encode_dr7(i, info->len, info->type); - } else { - dump->u_debugreg[i] = 0; - } - } - - dump->u_debugreg[4] = 0; - dump->u_debugreg[5] = 0; - dump->u_debugreg[6] = current->thread.debugreg6; - - dump->u_debugreg[7] = dr7; -} -EXPORT_SYMBOL_GPL(aout_dump_debugregs); - -/* * Release the user breakpoints used by ptrace */ void flush_ptrace_hw_breakpoint(struct task_struct *tsk) @@ -490,7 +454,7 @@ void flush_ptrace_hw_breakpoint(struct task_struct *tsk) t->ptrace_bps[i] = NULL; } - t->debugreg6 = 0; + t->virtual_dr6 = 0; t->ptrace_dr7 = 0; } @@ -500,7 +464,7 @@ void hw_breakpoint_restore(void) set_debugreg(__this_cpu_read(cpu_debugreg[1]), 1); set_debugreg(__this_cpu_read(cpu_debugreg[2]), 2); set_debugreg(__this_cpu_read(cpu_debugreg[3]), 3); - set_debugreg(current->thread.debugreg6, 6); + set_debugreg(DR6_RESERVED, 6); set_debugreg(__this_cpu_read(cpu_dr7), 7); } EXPORT_SYMBOL_GPL(hw_breakpoint_restore); @@ -523,10 +487,10 @@ EXPORT_SYMBOL_GPL(hw_breakpoint_restore); */ static int hw_breakpoint_handler(struct die_args *args) { - int i, cpu, rc = NOTIFY_STOP; + int i, rc = NOTIFY_STOP; struct perf_event *bp; - unsigned long dr6; unsigned long *dr6_p; + unsigned long dr6; /* The DR6 value is pointed by args->err */ dr6_p = (unsigned long *)ERR_PTR(args->err); @@ -540,14 +504,6 @@ static int hw_breakpoint_handler(struct die_args *args) if ((dr6 & DR_TRAP_BITS) == 0) return NOTIFY_DONE; - /* - * Assert that local interrupts are disabled - * Reset the DRn bits in the virtualized register value. - * The ptrace trigger routine will add in whatever is needed. - */ - current->thread.debugreg6 &= ~DR_TRAP_BITS; - cpu = get_cpu(); - /* Handle all the breakpoints that were triggered */ for (i = 0; i < HBP_NUM; ++i) { if (likely(!(dr6 & (DR_TRAP0 << i)))) @@ -561,7 +517,7 @@ static int hw_breakpoint_handler(struct die_args *args) */ rcu_read_lock(); - bp = per_cpu(bp_per_reg[i], cpu); + bp = this_cpu_read(bp_per_reg[i]); /* * Reset the 'i'th TRAP bit in dr6 to denote completion of * exception handling @@ -592,12 +548,10 @@ static int hw_breakpoint_handler(struct die_args *args) * breakpoints (to generate signals) and b) when the system has * taken exception due to multiple causes */ - if ((current->thread.debugreg6 & DR_TRAP_BITS) || + if ((current->thread.virtual_dr6 & DR_TRAP_BITS) || (dr6 & (~DR_TRAP_BITS))) rc = NOTIFY_DONE; - put_cpu(); - return rc; } diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 7ecf9babf0cb..ee1a283f8e96 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -11,13 +11,6 @@ #include <asm/desc.h> #include <asm/hw_irq.h> -struct idt_data { - unsigned int vector; - unsigned int segment; - struct idt_bits bits; - const void *addr; -}; - #define DPL0 0x0 #define DPL3 0x3 @@ -149,9 +142,6 @@ static const __initconst struct idt_data apic_idts[] = { # ifdef CONFIG_IRQ_WORK INTG(IRQ_WORK_VECTOR, asm_sysvec_irq_work), # endif -# ifdef CONFIG_X86_UV - INTG(UV_BAU_MESSAGE, asm_sysvec_uv_bau_message), -# endif INTG(SPURIOUS_APIC_VECTOR, asm_sysvec_spurious_apic_interrupt), INTG(ERROR_APIC_VECTOR, asm_sysvec_error_interrupt), #endif @@ -178,20 +168,6 @@ bool idt_is_f00f_address(unsigned long address) } #endif -static inline void idt_init_desc(gate_desc *gate, const struct idt_data *d) -{ - unsigned long addr = (unsigned long) d->addr; - - gate->offset_low = (u16) addr; - gate->segment = (u16) d->segment; - gate->bits = d->bits; - gate->offset_middle = (u16) (addr >> 16); -#ifdef CONFIG_X86_64 - gate->offset_high = (u32) (addr >> 32); - gate->reserved = 0; -#endif -} - static __init void idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size, bool sys) { @@ -209,14 +185,7 @@ static __init void set_intr_gate(unsigned int n, const void *addr) { struct idt_data data; - BUG_ON(n > 0xFF); - - memset(&data, 0, sizeof(data)); - data.vector = n; - data.addr = addr; - data.segment = __KERNEL_CS; - data.bits.type = GATE_INTERRUPT; - data.bits.p = 1; + init_idt_data(&data, n, addr); idt_setup_from_table(idt_table, &data, 1, false); } @@ -257,11 +226,14 @@ static const __initconst struct idt_data early_pf_idts[] = { * cpu_init() when the TSS has been initialized. */ static const __initconst struct idt_data ist_idts[] = { - ISTG(X86_TRAP_DB, asm_exc_debug, IST_INDEX_DB), - ISTG(X86_TRAP_NMI, asm_exc_nmi, IST_INDEX_NMI), - ISTG(X86_TRAP_DF, asm_exc_double_fault, IST_INDEX_DF), + ISTG(X86_TRAP_DB, asm_exc_debug, IST_INDEX_DB), + ISTG(X86_TRAP_NMI, asm_exc_nmi, IST_INDEX_NMI), + ISTG(X86_TRAP_DF, asm_exc_double_fault, IST_INDEX_DF), #ifdef CONFIG_X86_MCE - ISTG(X86_TRAP_MC, asm_exc_machine_check, IST_INDEX_MCE), + ISTG(X86_TRAP_MC, asm_exc_machine_check, IST_INDEX_MCE), +#endif +#ifdef CONFIG_AMD_MEM_ENCRYPT + ISTG(X86_TRAP_VC, asm_exc_vmm_communication, IST_INDEX_VC), #endif }; diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 181060247e3c..c5dd50369e2f 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -227,7 +227,7 @@ static __always_inline void handle_irq(struct irq_desc *desc, struct pt_regs *regs) { if (IS_ENABLED(CONFIG_X86_64)) - run_on_irqstack_cond(desc->handle_irq, desc, regs); + run_irq_on_irqstack_cond(desc->handle_irq, desc, regs); else __handle_irq(desc, regs); } diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 1b4fe93a86c5..440eed558558 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -74,5 +74,5 @@ int irq_init_percpu_irqstack(unsigned int cpu) void do_softirq_own_stack(void) { - run_on_irqstack_cond(__do_softirq, NULL, NULL); + run_on_irqstack_cond(__do_softirq, NULL); } diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index c2f02f308ecf..ff7878df96b4 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -629,9 +629,10 @@ static void kgdb_hw_overflow_handler(struct perf_event *event, struct task_struct *tsk = current; int i; - for (i = 0; i < 4; i++) + for (i = 0; i < 4; i++) { if (breakinfo[i].enabled) - tsk->thread.debugreg6 |= (DR_TRAP0 << i); + tsk->thread.virtual_dr6 |= (DR_TRAP0 << i); + } } void kgdb_arch_late(void) diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index fdadc37d72af..547c7abb39f5 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -38,9 +38,9 @@ #include <linux/kdebug.h> #include <linux/kallsyms.h> #include <linux/ftrace.h> -#include <linux/frame.h> #include <linux/kasan.h> #include <linux/moduleloader.h> +#include <linux/objtool.h> #include <linux/vmalloc.h> #include <linux/pgtable.h> @@ -767,124 +767,21 @@ asm( NOKPROBE_SYMBOL(kretprobe_trampoline); STACK_FRAME_NON_STANDARD(kretprobe_trampoline); + /* * Called from kretprobe_trampoline */ __used __visible void *trampoline_handler(struct pt_regs *regs) { - struct kretprobe_instance *ri = NULL; - struct hlist_head *head, empty_rp; - struct hlist_node *tmp; - unsigned long flags, orig_ret_address = 0; - unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline; - kprobe_opcode_t *correct_ret_addr = NULL; - void *frame_pointer; - bool skipped = false; - - /* - * Set a dummy kprobe for avoiding kretprobe recursion. - * Since kretprobe never run in kprobe handler, kprobe must not - * be running at this point. - */ - kprobe_busy_begin(); - - INIT_HLIST_HEAD(&empty_rp); - kretprobe_hash_lock(current, &head, &flags); /* fixup registers */ regs->cs = __KERNEL_CS; #ifdef CONFIG_X86_32 - regs->cs |= get_kernel_rpl(); regs->gs = 0; #endif - /* We use pt_regs->sp for return address holder. */ - frame_pointer = ®s->sp; - regs->ip = trampoline_address; + regs->ip = (unsigned long)&kretprobe_trampoline; regs->orig_ax = ~0UL; - /* - * It is possible to have multiple instances associated with a given - * task either because multiple functions in the call path have - * return probes installed on them, and/or more than one - * return probe was registered for a target function. - * - * We can handle this because: - * - instances are always pushed into the head of the list - * - when multiple return probes are registered for the same - * function, the (chronologically) first instance's ret_addr - * will be the real return address, and all the rest will - * point to kretprobe_trampoline. - */ - hlist_for_each_entry(ri, head, hlist) { - if (ri->task != current) - /* another task is sharing our hash bucket */ - continue; - /* - * Return probes must be pushed on this hash list correct - * order (same as return order) so that it can be popped - * correctly. However, if we find it is pushed it incorrect - * order, this means we find a function which should not be - * probed, because the wrong order entry is pushed on the - * path of processing other kretprobe itself. - */ - if (ri->fp != frame_pointer) { - if (!skipped) - pr_warn("kretprobe is stacked incorrectly. Trying to fixup.\n"); - skipped = true; - continue; - } - - orig_ret_address = (unsigned long)ri->ret_addr; - if (skipped) - pr_warn("%ps must be blacklisted because of incorrect kretprobe order\n", - ri->rp->kp.addr); - - if (orig_ret_address != trampoline_address) - /* - * This is the real return address. Any other - * instances associated with this task are for - * other calls deeper on the call stack - */ - break; - } - - kretprobe_assert(ri, orig_ret_address, trampoline_address); - - correct_ret_addr = ri->ret_addr; - hlist_for_each_entry_safe(ri, tmp, head, hlist) { - if (ri->task != current) - /* another task is sharing our hash bucket */ - continue; - if (ri->fp != frame_pointer) - continue; - - orig_ret_address = (unsigned long)ri->ret_addr; - if (ri->rp && ri->rp->handler) { - __this_cpu_write(current_kprobe, &ri->rp->kp); - ri->ret_addr = correct_ret_addr; - ri->rp->handler(ri, regs); - __this_cpu_write(current_kprobe, &kprobe_busy); - } - - recycle_rp_inst(ri, &empty_rp); - - if (orig_ret_address != trampoline_address) - /* - * This is the real return address. Any other - * instances associated with this task are for - * other calls deeper on the call stack - */ - break; - } - - kretprobe_hash_unlock(current, &flags); - - kprobe_busy_end(); - - hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) { - hlist_del(&ri->hlist); - kfree(ri); - } - return (void *)orig_ret_address; + return (void *)kretprobe_trampoline_handler(regs, &kretprobe_trampoline, ®s->sp); } NOKPROBE_SYMBOL(trampoline_handler); diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 40f380461e6d..041f0b50bc27 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -16,8 +16,9 @@ #include <linux/kdebug.h> #include <linux/kallsyms.h> #include <linux/ftrace.h> -#include <linux/frame.h> +#include <linux/objtool.h> #include <linux/pgtable.h> +#include <linux/static_call.h> #include <asm/text-patching.h> #include <asm/cacheflush.h> @@ -181,7 +182,6 @@ optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs) /* Save skipped registers */ regs->cs = __KERNEL_CS; #ifdef CONFIG_X86_32 - regs->cs |= get_kernel_rpl(); regs->gs = 0; #endif regs->ip = (unsigned long)op->kp.addr + INT3_INSN_SIZE; @@ -210,7 +210,8 @@ static int copy_optimized_instructions(u8 *dest, u8 *src, u8 *real) /* Check whether the address range is reserved */ if (ftrace_text_reserved(src, src + len - 1) || alternatives_text_reserved(src, src + len - 1) || - jump_label_text_reserved(src, src + len - 1)) + jump_label_text_reserved(src, src + len - 1) || + static_call_text_reserved(src, src + len - 1)) return -EBUSY; return len; diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 08320b0b2b27..7f57ede3cb8e 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -36,6 +36,8 @@ #include <asm/hypervisor.h> #include <asm/tlb.h> #include <asm/cpuidle_haltpoll.h> +#include <asm/ptrace.h> +#include <asm/svm.h> DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled); @@ -270,9 +272,8 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt) { struct pt_regs *old_regs = set_irq_regs(regs); u32 token; - irqentry_state_t state; - state = irqentry_enter(regs); + ack_APIC_irq(); inc_irq_stat(irq_hv_callback_count); @@ -283,7 +284,6 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt) wrmsrl(MSR_KVM_ASYNC_PF_ACK, 1); } - irqentry_exit(regs, state); set_irq_regs(old_regs); } @@ -746,13 +746,34 @@ static void __init kvm_init_platform(void) x86_platform.apic_post_init = kvm_apic_init; } +#if defined(CONFIG_AMD_MEM_ENCRYPT) +static void kvm_sev_es_hcall_prepare(struct ghcb *ghcb, struct pt_regs *regs) +{ + /* RAX and CPL are already in the GHCB */ + ghcb_set_rbx(ghcb, regs->bx); + ghcb_set_rcx(ghcb, regs->cx); + ghcb_set_rdx(ghcb, regs->dx); + ghcb_set_rsi(ghcb, regs->si); +} + +static bool kvm_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs) +{ + /* No checking of the return state needed */ + return true; +} +#endif + const __initconst struct hypervisor_x86 x86_hyper_kvm = { - .name = "KVM", - .detect = kvm_detect, - .type = X86_HYPER_KVM, - .init.guest_late_init = kvm_guest_init, - .init.x2apic_available = kvm_para_available, - .init.init_platform = kvm_init_platform, + .name = "KVM", + .detect = kvm_detect, + .type = X86_HYPER_KVM, + .init.guest_late_init = kvm_guest_init, + .init.x2apic_available = kvm_para_available, + .init.init_platform = kvm_init_platform, +#if defined(CONFIG_AMD_MEM_ENCRYPT) + .runtime.sev_es_hcall_prepare = kvm_sev_es_hcall_prepare, + .runtime.sev_es_hcall_finish = kvm_sev_es_hcall_finish, +#endif }; static __init int activate_jump_labels(void) @@ -954,7 +975,7 @@ void arch_haltpoll_disable(unsigned int cpu) if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) return; - /* Enable guest halt poll disables host halt poll */ + /* Disable guest halt poll enables host halt poll */ smp_call_function_single(cpu, kvm_enable_host_haltpoll, NULL, 1); } EXPORT_SYMBOL_GPL(arch_haltpoll_disable); diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index baa21090c9be..8f06449aab27 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -24,7 +24,6 @@ #include <asm/irqdomain.h> #include <asm/mtrr.h> #include <asm/mpspec.h> -#include <asm/io_apic.h> #include <asm/proto.h> #include <asm/bios_ebda.h> #include <asm/e820/api.h> @@ -46,11 +45,6 @@ static int __init mpf_checksum(unsigned char *mp, int len) return sum & 0xFF; } -int __init default_mpc_apic_id(struct mpc_cpu *m) -{ - return m->apicid; -} - static void __init MP_processor_info(struct mpc_cpu *m) { int apicid; @@ -61,7 +55,7 @@ static void __init MP_processor_info(struct mpc_cpu *m) return; } - apicid = x86_init.mpparse.mpc_apic_id(m); + apicid = m->apicid; if (m->cpuflag & CPU_BOOTPROCESSOR) { bootup_cpu = " (Bootup-CPU)"; @@ -73,7 +67,7 @@ static void __init MP_processor_info(struct mpc_cpu *m) } #ifdef CONFIG_X86_IO_APIC -void __init default_mpc_oem_bus_info(struct mpc_bus *m, char *str) +static void __init mpc_oem_bus_info(struct mpc_bus *m, char *str) { memcpy(str, m->bustype, 6); str[6] = 0; @@ -84,7 +78,7 @@ static void __init MP_bus_info(struct mpc_bus *m) { char str[7]; - x86_init.mpparse.mpc_oem_bus_info(m, str); + mpc_oem_bus_info(m, str); #if MAX_MP_BUSSES < 256 if (m->busid >= MAX_MP_BUSSES) { @@ -100,9 +94,6 @@ static void __init MP_bus_info(struct mpc_bus *m) mp_bus_id_to_type[m->busid] = MP_BUS_ISA; #endif } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { - if (x86_init.mpparse.mpc_oem_pci_bus) - x86_init.mpparse.mpc_oem_pci_bus(m); - clear_bit(m->busid, mp_bus_not_pci); #ifdef CONFIG_EISA mp_bus_id_to_type[m->busid] = MP_BUS_PCI; @@ -198,8 +189,6 @@ static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt) 1, mpc, mpc->length, 1); } -void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { } - static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) { char str[16]; @@ -218,14 +207,7 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) if (early) return 1; - if (mpc->oemptr) - x86_init.mpparse.smp_read_mpc_oem(mpc); - - /* - * Now process the configuration blocks. - */ - x86_init.mpparse.mpc_record(0); - + /* Now process the configuration blocks. */ while (count < mpc->length) { switch (*mpt) { case MP_PROCESSOR: @@ -256,7 +238,6 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) count = mpc->length; break; } - x86_init.mpparse.mpc_record(1); } if (!num_processors) diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 49dcfb85e773..c0d409810658 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -80,18 +80,30 @@ static ssize_t msr_read(struct file *file, char __user *buf, static int filter_write(u32 reg) { + /* + * MSRs writes usually happen all at once, and can easily saturate kmsg. + * Only allow one message every 30 seconds. + * + * It's possible to be smarter here and do it (for example) per-MSR, but + * it would certainly be more complex, and this is enough at least to + * avoid saturating the ring buffer. + */ + static DEFINE_RATELIMIT_STATE(fw_rs, 30 * HZ, 1); + switch (allow_writes) { case MSR_WRITES_ON: return 0; case MSR_WRITES_OFF: return -EPERM; default: break; } + if (!__ratelimit(&fw_rs)) + return 0; + if (reg == MSR_IA32_ENERGY_PERF_BIAS) return 0; - pr_err_ratelimited("Write to unrecognized MSR 0x%x by %s\n" - "Please report to x86@kernel.org\n", - reg, current->comm); + pr_err("Write to unrecognized MSR 0x%x by %s (pid: %d). Please report to x86@kernel.org.\n", + reg, current->comm, current->pid); return 0; } diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 4fc9954a9560..4bc77aaf1303 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -33,6 +33,7 @@ #include <asm/reboot.h> #include <asm/cache.h> #include <asm/nospec-branch.h> +#include <asm/sev-es.h> #define CREATE_TRACE_POINTS #include <trace/events/nmi.h> @@ -102,7 +103,6 @@ fs_initcall(nmi_warning_debugfs); static void nmi_check_duration(struct nmiaction *action, u64 duration) { - u64 whole_msecs = READ_ONCE(action->max_duration); int remainder_ns, decimal_msecs; if (duration < nmi_longest_ns || duration < action->max_duration) @@ -110,12 +110,12 @@ static void nmi_check_duration(struct nmiaction *action, u64 duration) action->max_duration = duration; - remainder_ns = do_div(whole_msecs, (1000 * 1000)); + remainder_ns = do_div(duration, (1000 * 1000)); decimal_msecs = remainder_ns / 1000; printk_ratelimited(KERN_INFO "INFO: NMI handler (%ps) took too long to run: %lld.%03d msecs\n", - action->handler, whole_msecs, decimal_msecs); + action->handler, duration, decimal_msecs); } static int nmi_handle(unsigned int type, struct pt_regs *regs) @@ -477,6 +477,12 @@ DEFINE_IDTENTRY_RAW(exc_nmi) { bool irq_state; + /* + * Re-enable NMIs right here when running as an SEV-ES guest. This might + * cause nested NMIs, but those can be handled safely. + */ + sev_es_nmi_complete(); + if (IS_ENABLED(CONFIG_SMP) && arch_cpu_is_offline(smp_processor_id())) return; @@ -488,6 +494,12 @@ DEFINE_IDTENTRY_RAW(exc_nmi) this_cpu_write(nmi_cr2, read_cr2()); nmi_restart: + /* + * Needs to happen before DR7 is accessed, because the hypervisor can + * intercept DR7 reads/writes, turning those into #VC exceptions. + */ + sev_es_ist_enter(regs); + this_cpu_write(nmi_dr7, local_db_save()); irq_state = idtentry_enter_nmi(regs); @@ -501,6 +513,8 @@ nmi_restart: local_db_restore(this_cpu_read(nmi_dr7)); + sev_es_ist_exit(); + if (unlikely(this_cpu_read(nmi_cr2) != read_cr2())) write_cr2(this_cpu_read(nmi_cr2)); if (this_cpu_dec_return(nmi_state)) diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index de2138ba38e5..6c3407ba6ee9 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -263,13 +263,8 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void) struct pv_info pv_info = { .name = "bare hardware", #ifdef CONFIG_PARAVIRT_XXL - .kernel_rpl = 0, - .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */ - -#ifdef CONFIG_X86_64 .extra_user_64bit_cs = __USER_CS, #endif -#endif }; /* 64-bit pagetable entries */ @@ -305,9 +300,7 @@ struct paravirt_patch_template pv_ops = { .cpu.load_idt = native_load_idt, .cpu.store_tr = native_store_tr, .cpu.load_tls = native_load_tls, -#ifdef CONFIG_X86_64 .cpu.load_gs_index = native_load_gs_index, -#endif .cpu.write_ldt_entry = native_write_ldt_entry, .cpu.write_gdt_entry = native_write_gdt_entry, .cpu.write_idt_entry = native_write_idt_entry, @@ -317,9 +310,7 @@ struct paravirt_patch_template pv_ops = { .cpu.load_sp0 = native_load_sp0, -#ifdef CONFIG_X86_64 .cpu.usergs_sysret64 = native_usergs_sysret64, -#endif .cpu.iret = native_iret, .cpu.swapgs = native_swapgs, @@ -369,24 +360,16 @@ struct paravirt_patch_template pv_ops = { .mmu.release_p4d = paravirt_nop, .mmu.set_pte = native_set_pte, - .mmu.set_pte_at = native_set_pte_at, .mmu.set_pmd = native_set_pmd, .mmu.ptep_modify_prot_start = __ptep_modify_prot_start, .mmu.ptep_modify_prot_commit = __ptep_modify_prot_commit, -#if CONFIG_PGTABLE_LEVELS >= 3 -#ifdef CONFIG_X86_PAE - .mmu.set_pte_atomic = native_set_pte_atomic, - .mmu.pte_clear = native_pte_clear, - .mmu.pmd_clear = native_pmd_clear, -#endif .mmu.set_pud = native_set_pud, .mmu.pmd_val = PTE_IDENT, .mmu.make_pmd = PTE_IDENT, -#if CONFIG_PGTABLE_LEVELS >= 4 .mmu.pud_val = PTE_IDENT, .mmu.make_pud = PTE_IDENT, @@ -398,8 +381,6 @@ struct paravirt_patch_template pv_ops = { .mmu.set_pgd = native_set_pgd, #endif /* CONFIG_PGTABLE_LEVELS >= 5 */ -#endif /* CONFIG_PGTABLE_LEVELS >= 4 */ -#endif /* CONFIG_PGTABLE_LEVELS >= 3 */ .mmu.pte_val = PTE_IDENT, .mmu.pgd_val = PTE_IDENT, diff --git a/arch/x86/kernel/paravirt_patch.c b/arch/x86/kernel/paravirt_patch.c index 3eff63c090d2..ace6e334cb39 100644 --- a/arch/x86/kernel/paravirt_patch.c +++ b/arch/x86/kernel/paravirt_patch.c @@ -26,14 +26,10 @@ struct patch_xxl { const unsigned char mmu_read_cr3[3]; const unsigned char mmu_write_cr3[3]; const unsigned char irq_restore_fl[2]; -# ifdef CONFIG_X86_64 const unsigned char cpu_wbinvd[2]; const unsigned char cpu_usergs_sysret64[6]; const unsigned char cpu_swapgs[3]; const unsigned char mov64[3]; -# else - const unsigned char cpu_iret[1]; -# endif }; static const struct patch_xxl patch_data_xxl = { @@ -42,7 +38,6 @@ static const struct patch_xxl patch_data_xxl = { .irq_save_fl = { 0x9c, 0x58 }, // pushf; pop %[re]ax .mmu_read_cr2 = { 0x0f, 0x20, 0xd0 }, // mov %cr2, %[re]ax .mmu_read_cr3 = { 0x0f, 0x20, 0xd8 }, // mov %cr3, %[re]ax -# ifdef CONFIG_X86_64 .mmu_write_cr3 = { 0x0f, 0x22, 0xdf }, // mov %rdi, %cr3 .irq_restore_fl = { 0x57, 0x9d }, // push %rdi; popfq .cpu_wbinvd = { 0x0f, 0x09 }, // wbinvd @@ -50,19 +45,11 @@ static const struct patch_xxl patch_data_xxl = { 0x48, 0x0f, 0x07 }, // swapgs; sysretq .cpu_swapgs = { 0x0f, 0x01, 0xf8 }, // swapgs .mov64 = { 0x48, 0x89, 0xf8 }, // mov %rdi, %rax -# else - .mmu_write_cr3 = { 0x0f, 0x22, 0xd8 }, // mov %eax, %cr3 - .irq_restore_fl = { 0x50, 0x9d }, // push %eax; popf - .cpu_iret = { 0xcf }, // iret -# endif }; unsigned int paravirt_patch_ident_64(void *insn_buff, unsigned int len) { -#ifdef CONFIG_X86_64 return PATCH(xxl, mov64, insn_buff, len); -#endif - return 0; } # endif /* CONFIG_PARAVIRT_XXL */ @@ -98,13 +85,9 @@ unsigned int native_patch(u8 type, void *insn_buff, unsigned long addr, PATCH_CASE(mmu, read_cr3, xxl, insn_buff, len); PATCH_CASE(mmu, write_cr3, xxl, insn_buff, len); -# ifdef CONFIG_X86_64 PATCH_CASE(cpu, usergs_sysret64, xxl, insn_buff, len); PATCH_CASE(cpu, swapgs, xxl, insn_buff, len); PATCH_CASE(cpu, wbinvd, xxl, insn_buff, len); -# else - PATCH_CASE(cpu, iret, xxl, insn_buff, len); -# endif #endif #ifdef CONFIG_PARAVIRT_SPINLOCKS diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 5dcedad21dff..de234e7a8962 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/dma-map-ops.h> #include <linux/dma-direct.h> -#include <linux/dma-debug.h> #include <linux/iommu.h> #include <linux/dmar.h> #include <linux/export.h> diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 13ce616cc7af..ba4593a913fa 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -42,6 +42,7 @@ #include <asm/spec-ctrl.h> #include <asm/io_bitmap.h> #include <asm/proto.h> +#include <asm/frame.h> #include "process.h" @@ -133,7 +134,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg, fork_frame = container_of(childregs, struct fork_frame, regs); frame = &fork_frame->frame; - frame->bp = 0; + frame->bp = encode_frame_pointer(childregs); frame->ret_addr = (unsigned long) ret_from_fork; p->thread.sp = (unsigned long) fork_frame; p->thread.io_bitmap = NULL; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 9afefe325acb..df342bedea88 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -407,7 +407,7 @@ unsigned long x86_gsbase_read_cpu_inactive(void) { unsigned long gsbase; - if (static_cpu_has(X86_FEATURE_FSGSBASE)) { + if (boot_cpu_has(X86_FEATURE_FSGSBASE)) { unsigned long flags; local_irq_save(flags); @@ -422,7 +422,7 @@ unsigned long x86_gsbase_read_cpu_inactive(void) void x86_gsbase_write_cpu_inactive(unsigned long gsbase) { - if (static_cpu_has(X86_FEATURE_FSGSBASE)) { + if (boot_cpu_has(X86_FEATURE_FSGSBASE)) { unsigned long flags; local_irq_save(flags); @@ -439,7 +439,7 @@ unsigned long x86_fsbase_read_task(struct task_struct *task) if (task == current) fsbase = x86_fsbase_read_cpu(); - else if (static_cpu_has(X86_FEATURE_FSGSBASE) || + else if (boot_cpu_has(X86_FEATURE_FSGSBASE) || (task->thread.fsindex == 0)) fsbase = task->thread.fsbase; else @@ -454,7 +454,7 @@ unsigned long x86_gsbase_read_task(struct task_struct *task) if (task == current) gsbase = x86_gsbase_read_cpu_inactive(); - else if (static_cpu_has(X86_FEATURE_FSGSBASE) || + else if (boot_cpu_has(X86_FEATURE_FSGSBASE) || (task->thread.gsindex == 0)) gsbase = task->thread.gsbase; else diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index e7537c5440bb..bedca011459c 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -465,7 +465,7 @@ static void ptrace_triggered(struct perf_event *bp, break; } - thread->debugreg6 |= (DR_TRAP0 << i); + thread->virtual_dr6 |= (DR_TRAP0 << i); } /* @@ -601,7 +601,7 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) if (bp) val = bp->hw.info.address; } else if (n == 6) { - val = thread->debugreg6; + val = thread->virtual_dr6 ^ DR6_RESERVED; /* Flip back to arch polarity */ } else if (n == 7) { val = thread->ptrace_dr7; } @@ -657,7 +657,7 @@ static int ptrace_set_debugreg(struct task_struct *tsk, int n, if (n < HBP_NUM) { rc = ptrace_set_breakpoint_addr(tsk, n, val); } else if (n == 6) { - thread->debugreg6 = val; + thread->virtual_dr6 = val ^ DR6_RESERVED; /* Flip to positive polarity */ rc = 0; } else if (n == 7) { rc = ptrace_write_dr7(tsk, val); diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 1b10717c9321..6d0df6a58873 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -8,6 +8,7 @@ #include <asm/hpet.h> #include <asm/setup.h> +#include <asm/mce.h> #if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI) @@ -624,10 +625,6 @@ static void amd_disable_seq_and_redirect_scrub(struct pci_dev *dev) DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3, amd_disable_seq_and_redirect_scrub); -#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) -#include <linux/jump_label.h> -#include <asm/string_64.h> - /* Ivy Bridge, Haswell, Broadwell */ static void quirk_intel_brickland_xeon_ras_cap(struct pci_dev *pdev) { @@ -636,7 +633,7 @@ static void quirk_intel_brickland_xeon_ras_cap(struct pci_dev *pdev) pci_read_config_dword(pdev, 0x84, &capid0); if (capid0 & 0x10) - static_branch_inc(&mcsafe_key); + enable_copy_mc_fragile(); } /* Skylake */ @@ -653,7 +650,7 @@ static void quirk_intel_purley_xeon_ras_cap(struct pci_dev *pdev) * enabled, so memory machine check recovery is also enabled. */ if ((capid0 & 0xc0) == 0xc0 || (capid5 & 0x1e0)) - static_branch_inc(&mcsafe_key); + enable_copy_mc_fragile(); } DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x0ec3, quirk_intel_brickland_xeon_ras_cap); @@ -661,7 +658,6 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2fc0, quirk_intel_brickland_xeon_ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, quirk_intel_brickland_xeon_ras_cap); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2083, quirk_intel_purley_xeon_ras_cap); #endif -#endif bool x86_apple_machine; EXPORT_SYMBOL(x86_apple_machine); diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index a515e2d230b7..db115943e8bd 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -10,7 +10,7 @@ #include <linux/sched.h> #include <linux/tboot.h> #include <linux/delay.h> -#include <linux/frame.h> +#include <linux/objtool.h> #include <linux/pgtable.h> #include <acpi/reboot.h> #include <asm/io.h> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 3511736fbc74..84f581c91db4 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -7,6 +7,7 @@ */ #include <linux/console.h> #include <linux/crash_dump.h> +#include <linux/dma-map-ops.h> #include <linux/dmi.h> #include <linux/efi.h> #include <linux/init_ohci1394_dma.h> @@ -19,6 +20,8 @@ #include <linux/hugetlb.h> #include <linux/tboot.h> #include <linux/usb/xhci-dbgp.h> +#include <linux/static_call.h> +#include <linux/swiotlb.h> #include <uapi/linux/mount.h> @@ -263,16 +266,12 @@ static void __init relocate_initrd(void) u64 area_size = PAGE_ALIGN(ramdisk_size); /* We need to move the initrd down into directly mapped mem */ - relocated_ramdisk = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), - area_size, PAGE_SIZE); - + relocated_ramdisk = memblock_phys_alloc_range(area_size, PAGE_SIZE, 0, + PFN_PHYS(max_pfn_mapped)); if (!relocated_ramdisk) panic("Cannot find place for new RAMDISK of size %lld\n", ramdisk_size); - /* Note: this includes all the mem currently occupied by - the initrd, we rely on that fact to keep the data intact. */ - memblock_reserve(relocated_ramdisk, area_size); initrd_start = relocated_ramdisk + PAGE_OFFSET; initrd_end = initrd_start + ramdisk_size; printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n", @@ -299,13 +298,13 @@ static void __init early_reserve_initrd(void) memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image); } + static void __init reserve_initrd(void) { /* Assume only end is not page aligned */ u64 ramdisk_image = get_ramdisk_image(); u64 ramdisk_size = get_ramdisk_size(); u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); - u64 mapped_size; if (!boot_params.hdr.type_of_loader || !ramdisk_image || !ramdisk_size) @@ -313,12 +312,6 @@ static void __init reserve_initrd(void) initrd_start = 0; - mapped_size = memblock_mem_size(max_pfn_mapped); - if (ramdisk_size >= (mapped_size>>1)) - panic("initrd too large to handle, " - "disabling initrd (%lld needed, %lld available)\n", - ramdisk_size, mapped_size>>1); - printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", ramdisk_image, ramdisk_end - 1); @@ -430,13 +423,13 @@ static int __init reserve_crashkernel_low(void) { #ifdef CONFIG_X86_64 unsigned long long base, low_base = 0, low_size = 0; - unsigned long total_low_mem; + unsigned long low_mem_limit; int ret; - total_low_mem = memblock_mem_size(1UL << (32 - PAGE_SHIFT)); + low_mem_limit = min(memblock_phys_mem_size(), CRASH_ADDR_LOW_MAX); /* crashkernel=Y,low */ - ret = parse_crashkernel_low(boot_command_line, total_low_mem, &low_size, &base); + ret = parse_crashkernel_low(boot_command_line, low_mem_limit, &low_size, &base); if (ret) { /* * two parts from kernel/dma/swiotlb.c: @@ -454,23 +447,17 @@ static int __init reserve_crashkernel_low(void) return 0; } - low_base = memblock_find_in_range(0, 1ULL << 32, low_size, CRASH_ALIGN); + low_base = memblock_phys_alloc_range(low_size, CRASH_ALIGN, 0, CRASH_ADDR_LOW_MAX); if (!low_base) { pr_err("Cannot reserve %ldMB crashkernel low memory, please try smaller size.\n", (unsigned long)(low_size >> 20)); return -ENOMEM; } - ret = memblock_reserve(low_base, low_size); - if (ret) { - pr_err("%s: Error reserving crashkernel low memblock.\n", __func__); - return ret; - } - - pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (System low RAM: %ldMB)\n", + pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (low RAM limit: %ldMB)\n", (unsigned long)(low_size >> 20), (unsigned long)(low_base >> 20), - (unsigned long)(total_low_mem >> 20)); + (unsigned long)(low_mem_limit >> 20)); crashk_low_res.start = low_base; crashk_low_res.end = low_base + low_size - 1; @@ -514,13 +501,13 @@ static void __init reserve_crashkernel(void) * unless "crashkernel=size[KMG],high" is specified. */ if (!high) - crash_base = memblock_find_in_range(CRASH_ALIGN, - CRASH_ADDR_LOW_MAX, - crash_size, CRASH_ALIGN); + crash_base = memblock_phys_alloc_range(crash_size, + CRASH_ALIGN, CRASH_ALIGN, + CRASH_ADDR_LOW_MAX); if (!crash_base) - crash_base = memblock_find_in_range(CRASH_ALIGN, - CRASH_ADDR_HIGH_MAX, - crash_size, CRASH_ALIGN); + crash_base = memblock_phys_alloc_range(crash_size, + CRASH_ALIGN, CRASH_ALIGN, + CRASH_ADDR_HIGH_MAX); if (!crash_base) { pr_info("crashkernel reservation failed - No suitable area found.\n"); return; @@ -528,19 +515,13 @@ static void __init reserve_crashkernel(void) } else { unsigned long long start; - start = memblock_find_in_range(crash_base, - crash_base + crash_size, - crash_size, 1 << 20); + start = memblock_phys_alloc_range(crash_size, SZ_1M, crash_base, + crash_base + crash_size); if (start != crash_base) { pr_info("crashkernel reservation failed - memory is in use.\n"); return; } } - ret = memblock_reserve(crash_base, crash_size); - if (ret) { - pr_err("%s: Error reserving crashkernel memblock.\n", __func__); - return; - } if (crash_base >= (1ULL << 32) && reserve_crashkernel_low()) { memblock_free(crash_base, crash_size); @@ -849,6 +830,7 @@ void __init setup_arch(char **cmdline_p) early_cpu_init(); arch_init_ideal_nops(); jump_label_init(); + static_call_init(); early_ioremap_init(); setup_olpc_ofw_pgd(); @@ -1077,6 +1059,7 @@ void __init setup_arch(char **cmdline_p) efi_fake_memmap(); efi_find_mirror(); efi_esrt_init(); + efi_mokvar_table_init(); /* * The EFI specification says that boot service code won't be @@ -1218,6 +1201,7 @@ void __init setup_arch(char **cmdline_p) prefill_possible_map(); init_cpu_to_node(); + init_gi_nodes(); io_apic_init_mappings(); diff --git a/arch/x86/kernel/sev-es-shared.c b/arch/x86/kernel/sev-es-shared.c new file mode 100644 index 000000000000..5f83ccaab877 --- /dev/null +++ b/arch/x86/kernel/sev-es-shared.c @@ -0,0 +1,507 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * AMD Encrypted Register State Support + * + * Author: Joerg Roedel <jroedel@suse.de> + * + * This file is not compiled stand-alone. It contains code shared + * between the pre-decompression boot code and the running Linux kernel + * and is included directly into both code-bases. + */ + +#ifndef __BOOT_COMPRESSED +#define error(v) pr_err(v) +#define has_cpuflag(f) boot_cpu_has(f) +#endif + +static bool __init sev_es_check_cpu_features(void) +{ + if (!has_cpuflag(X86_FEATURE_RDRAND)) { + error("RDRAND instruction not supported - no trusted source of randomness available\n"); + return false; + } + + return true; +} + +static void sev_es_terminate(unsigned int reason) +{ + u64 val = GHCB_SEV_TERMINATE; + + /* + * Tell the hypervisor what went wrong - only reason-set 0 is + * currently supported. + */ + val |= GHCB_SEV_TERMINATE_REASON(0, reason); + + /* Request Guest Termination from Hypvervisor */ + sev_es_wr_ghcb_msr(val); + VMGEXIT(); + + while (true) + asm volatile("hlt\n" : : : "memory"); +} + +static bool sev_es_negotiate_protocol(void) +{ + u64 val; + + /* Do the GHCB protocol version negotiation */ + sev_es_wr_ghcb_msr(GHCB_SEV_INFO_REQ); + VMGEXIT(); + val = sev_es_rd_ghcb_msr(); + + if (GHCB_INFO(val) != GHCB_SEV_INFO) + return false; + + if (GHCB_PROTO_MAX(val) < GHCB_PROTO_OUR || + GHCB_PROTO_MIN(val) > GHCB_PROTO_OUR) + return false; + + return true; +} + +static __always_inline void vc_ghcb_invalidate(struct ghcb *ghcb) +{ + memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); +} + +static bool vc_decoding_needed(unsigned long exit_code) +{ + /* Exceptions don't require to decode the instruction */ + return !(exit_code >= SVM_EXIT_EXCP_BASE && + exit_code <= SVM_EXIT_LAST_EXCP); +} + +static enum es_result vc_init_em_ctxt(struct es_em_ctxt *ctxt, + struct pt_regs *regs, + unsigned long exit_code) +{ + enum es_result ret = ES_OK; + + memset(ctxt, 0, sizeof(*ctxt)); + ctxt->regs = regs; + + if (vc_decoding_needed(exit_code)) + ret = vc_decode_insn(ctxt); + + return ret; +} + +static void vc_finish_insn(struct es_em_ctxt *ctxt) +{ + ctxt->regs->ip += ctxt->insn.length; +} + +static enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, + struct es_em_ctxt *ctxt, + u64 exit_code, u64 exit_info_1, + u64 exit_info_2) +{ + enum es_result ret; + + /* Fill in protocol and format specifiers */ + ghcb->protocol_version = GHCB_PROTOCOL_MAX; + ghcb->ghcb_usage = GHCB_DEFAULT_USAGE; + + ghcb_set_sw_exit_code(ghcb, exit_code); + ghcb_set_sw_exit_info_1(ghcb, exit_info_1); + ghcb_set_sw_exit_info_2(ghcb, exit_info_2); + + sev_es_wr_ghcb_msr(__pa(ghcb)); + VMGEXIT(); + + if ((ghcb->save.sw_exit_info_1 & 0xffffffff) == 1) { + u64 info = ghcb->save.sw_exit_info_2; + unsigned long v; + + info = ghcb->save.sw_exit_info_2; + v = info & SVM_EVTINJ_VEC_MASK; + + /* Check if exception information from hypervisor is sane. */ + if ((info & SVM_EVTINJ_VALID) && + ((v == X86_TRAP_GP) || (v == X86_TRAP_UD)) && + ((info & SVM_EVTINJ_TYPE_MASK) == SVM_EVTINJ_TYPE_EXEPT)) { + ctxt->fi.vector = v; + if (info & SVM_EVTINJ_VALID_ERR) + ctxt->fi.error_code = info >> 32; + ret = ES_EXCEPTION; + } else { + ret = ES_VMM_ERROR; + } + } else { + ret = ES_OK; + } + + return ret; +} + +/* + * Boot VC Handler - This is the first VC handler during boot, there is no GHCB + * page yet, so it only supports the MSR based communication with the + * hypervisor and only the CPUID exit-code. + */ +void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) +{ + unsigned int fn = lower_bits(regs->ax, 32); + unsigned long val; + + /* Only CPUID is supported via MSR protocol */ + if (exit_code != SVM_EXIT_CPUID) + goto fail; + + sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EAX)); + VMGEXIT(); + val = sev_es_rd_ghcb_msr(); + if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP) + goto fail; + regs->ax = val >> 32; + + sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EBX)); + VMGEXIT(); + val = sev_es_rd_ghcb_msr(); + if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP) + goto fail; + regs->bx = val >> 32; + + sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_ECX)); + VMGEXIT(); + val = sev_es_rd_ghcb_msr(); + if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP) + goto fail; + regs->cx = val >> 32; + + sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EDX)); + VMGEXIT(); + val = sev_es_rd_ghcb_msr(); + if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP) + goto fail; + regs->dx = val >> 32; + + /* Skip over the CPUID two-byte opcode */ + regs->ip += 2; + + return; + +fail: + sev_es_wr_ghcb_msr(GHCB_SEV_TERMINATE); + VMGEXIT(); + + /* Shouldn't get here - if we do halt the machine */ + while (true) + asm volatile("hlt\n"); +} + +static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt, + void *src, char *buf, + unsigned int data_size, + unsigned int count, + bool backwards) +{ + int i, b = backwards ? -1 : 1; + enum es_result ret = ES_OK; + + for (i = 0; i < count; i++) { + void *s = src + (i * data_size * b); + char *d = buf + (i * data_size); + + ret = vc_read_mem(ctxt, s, d, data_size); + if (ret != ES_OK) + break; + } + + return ret; +} + +static enum es_result vc_insn_string_write(struct es_em_ctxt *ctxt, + void *dst, char *buf, + unsigned int data_size, + unsigned int count, + bool backwards) +{ + int i, s = backwards ? -1 : 1; + enum es_result ret = ES_OK; + + for (i = 0; i < count; i++) { + void *d = dst + (i * data_size * s); + char *b = buf + (i * data_size); + + ret = vc_write_mem(ctxt, d, b, data_size); + if (ret != ES_OK) + break; + } + + return ret; +} + +#define IOIO_TYPE_STR BIT(2) +#define IOIO_TYPE_IN 1 +#define IOIO_TYPE_INS (IOIO_TYPE_IN | IOIO_TYPE_STR) +#define IOIO_TYPE_OUT 0 +#define IOIO_TYPE_OUTS (IOIO_TYPE_OUT | IOIO_TYPE_STR) + +#define IOIO_REP BIT(3) + +#define IOIO_ADDR_64 BIT(9) +#define IOIO_ADDR_32 BIT(8) +#define IOIO_ADDR_16 BIT(7) + +#define IOIO_DATA_32 BIT(6) +#define IOIO_DATA_16 BIT(5) +#define IOIO_DATA_8 BIT(4) + +#define IOIO_SEG_ES (0 << 10) +#define IOIO_SEG_DS (3 << 10) + +static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo) +{ + struct insn *insn = &ctxt->insn; + *exitinfo = 0; + + switch (insn->opcode.bytes[0]) { + /* INS opcodes */ + case 0x6c: + case 0x6d: + *exitinfo |= IOIO_TYPE_INS; + *exitinfo |= IOIO_SEG_ES; + *exitinfo |= (ctxt->regs->dx & 0xffff) << 16; + break; + + /* OUTS opcodes */ + case 0x6e: + case 0x6f: + *exitinfo |= IOIO_TYPE_OUTS; + *exitinfo |= IOIO_SEG_DS; + *exitinfo |= (ctxt->regs->dx & 0xffff) << 16; + break; + + /* IN immediate opcodes */ + case 0xe4: + case 0xe5: + *exitinfo |= IOIO_TYPE_IN; + *exitinfo |= (u64)insn->immediate.value << 16; + break; + + /* OUT immediate opcodes */ + case 0xe6: + case 0xe7: + *exitinfo |= IOIO_TYPE_OUT; + *exitinfo |= (u64)insn->immediate.value << 16; + break; + + /* IN register opcodes */ + case 0xec: + case 0xed: + *exitinfo |= IOIO_TYPE_IN; + *exitinfo |= (ctxt->regs->dx & 0xffff) << 16; + break; + + /* OUT register opcodes */ + case 0xee: + case 0xef: + *exitinfo |= IOIO_TYPE_OUT; + *exitinfo |= (ctxt->regs->dx & 0xffff) << 16; + break; + + default: + return ES_DECODE_FAILED; + } + + switch (insn->opcode.bytes[0]) { + case 0x6c: + case 0x6e: + case 0xe4: + case 0xe6: + case 0xec: + case 0xee: + /* Single byte opcodes */ + *exitinfo |= IOIO_DATA_8; + break; + default: + /* Length determined by instruction parsing */ + *exitinfo |= (insn->opnd_bytes == 2) ? IOIO_DATA_16 + : IOIO_DATA_32; + } + switch (insn->addr_bytes) { + case 2: + *exitinfo |= IOIO_ADDR_16; + break; + case 4: + *exitinfo |= IOIO_ADDR_32; + break; + case 8: + *exitinfo |= IOIO_ADDR_64; + break; + } + + if (insn_has_rep_prefix(insn)) + *exitinfo |= IOIO_REP; + + return ES_OK; +} + +static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +{ + struct pt_regs *regs = ctxt->regs; + u64 exit_info_1, exit_info_2; + enum es_result ret; + + ret = vc_ioio_exitinfo(ctxt, &exit_info_1); + if (ret != ES_OK) + return ret; + + if (exit_info_1 & IOIO_TYPE_STR) { + + /* (REP) INS/OUTS */ + + bool df = ((regs->flags & X86_EFLAGS_DF) == X86_EFLAGS_DF); + unsigned int io_bytes, exit_bytes; + unsigned int ghcb_count, op_count; + unsigned long es_base; + u64 sw_scratch; + + /* + * For the string variants with rep prefix the amount of in/out + * operations per #VC exception is limited so that the kernel + * has a chance to take interrupts and re-schedule while the + * instruction is emulated. + */ + io_bytes = (exit_info_1 >> 4) & 0x7; + ghcb_count = sizeof(ghcb->shared_buffer) / io_bytes; + + op_count = (exit_info_1 & IOIO_REP) ? regs->cx : 1; + exit_info_2 = min(op_count, ghcb_count); + exit_bytes = exit_info_2 * io_bytes; + + es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES); + + /* Read bytes of OUTS into the shared buffer */ + if (!(exit_info_1 & IOIO_TYPE_IN)) { + ret = vc_insn_string_read(ctxt, + (void *)(es_base + regs->si), + ghcb->shared_buffer, io_bytes, + exit_info_2, df); + if (ret) + return ret; + } + + /* + * Issue an VMGEXIT to the HV to consume the bytes from the + * shared buffer or to have it write them into the shared buffer + * depending on the instruction: OUTS or INS. + */ + sw_scratch = __pa(ghcb) + offsetof(struct ghcb, shared_buffer); + ghcb_set_sw_scratch(ghcb, sw_scratch); + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO, + exit_info_1, exit_info_2); + if (ret != ES_OK) + return ret; + + /* Read bytes from shared buffer into the guest's destination. */ + if (exit_info_1 & IOIO_TYPE_IN) { + ret = vc_insn_string_write(ctxt, + (void *)(es_base + regs->di), + ghcb->shared_buffer, io_bytes, + exit_info_2, df); + if (ret) + return ret; + + if (df) + regs->di -= exit_bytes; + else + regs->di += exit_bytes; + } else { + if (df) + regs->si -= exit_bytes; + else + regs->si += exit_bytes; + } + + if (exit_info_1 & IOIO_REP) + regs->cx -= exit_info_2; + + ret = regs->cx ? ES_RETRY : ES_OK; + + } else { + + /* IN/OUT into/from rAX */ + + int bits = (exit_info_1 & 0x70) >> 1; + u64 rax = 0; + + if (!(exit_info_1 & IOIO_TYPE_IN)) + rax = lower_bits(regs->ax, bits); + + ghcb_set_rax(ghcb, rax); + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO, exit_info_1, 0); + if (ret != ES_OK) + return ret; + + if (exit_info_1 & IOIO_TYPE_IN) { + if (!ghcb_rax_is_valid(ghcb)) + return ES_VMM_ERROR; + regs->ax = lower_bits(ghcb->save.rax, bits); + } + } + + return ret; +} + +static enum es_result vc_handle_cpuid(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + struct pt_regs *regs = ctxt->regs; + u32 cr4 = native_read_cr4(); + enum es_result ret; + + ghcb_set_rax(ghcb, regs->ax); + ghcb_set_rcx(ghcb, regs->cx); + + if (cr4 & X86_CR4_OSXSAVE) + /* Safe to read xcr0 */ + ghcb_set_xcr0(ghcb, xgetbv(XCR_XFEATURE_ENABLED_MASK)); + else + /* xgetbv will cause #GP - use reset value for xcr0 */ + ghcb_set_xcr0(ghcb, 1); + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0); + if (ret != ES_OK) + return ret; + + if (!(ghcb_rax_is_valid(ghcb) && + ghcb_rbx_is_valid(ghcb) && + ghcb_rcx_is_valid(ghcb) && + ghcb_rdx_is_valid(ghcb))) + return ES_VMM_ERROR; + + regs->ax = ghcb->save.rax; + regs->bx = ghcb->save.rbx; + regs->cx = ghcb->save.rcx; + regs->dx = ghcb->save.rdx; + + return ES_OK; +} + +static enum es_result vc_handle_rdtsc(struct ghcb *ghcb, + struct es_em_ctxt *ctxt, + unsigned long exit_code) +{ + bool rdtscp = (exit_code == SVM_EXIT_RDTSCP); + enum es_result ret; + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, 0, 0); + if (ret != ES_OK) + return ret; + + if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb) && + (!rdtscp || ghcb_rcx_is_valid(ghcb)))) + return ES_VMM_ERROR; + + ctxt->regs->ax = ghcb->save.rax; + ctxt->regs->dx = ghcb->save.rdx; + if (rdtscp) + ctxt->regs->cx = ghcb->save.rcx; + + return ES_OK; +} diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev-es.c new file mode 100644 index 000000000000..4a96726fbaf8 --- /dev/null +++ b/arch/x86/kernel/sev-es.c @@ -0,0 +1,1404 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2019 SUSE + * + * Author: Joerg Roedel <jroedel@suse.de> + */ + +#define pr_fmt(fmt) "SEV-ES: " fmt + +#include <linux/sched/debug.h> /* For show_regs() */ +#include <linux/percpu-defs.h> +#include <linux/mem_encrypt.h> +#include <linux/lockdep.h> +#include <linux/printk.h> +#include <linux/mm_types.h> +#include <linux/set_memory.h> +#include <linux/memblock.h> +#include <linux/kernel.h> +#include <linux/mm.h> + +#include <asm/cpu_entry_area.h> +#include <asm/stacktrace.h> +#include <asm/sev-es.h> +#include <asm/insn-eval.h> +#include <asm/fpu/internal.h> +#include <asm/processor.h> +#include <asm/realmode.h> +#include <asm/traps.h> +#include <asm/svm.h> +#include <asm/smp.h> +#include <asm/cpu.h> + +#define DR7_RESET_VALUE 0x400 + +/* For early boot hypervisor communication in SEV-ES enabled guests */ +static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE); + +/* + * Needs to be in the .data section because we need it NULL before bss is + * cleared + */ +static struct ghcb __initdata *boot_ghcb; + +/* #VC handler runtime per-CPU data */ +struct sev_es_runtime_data { + struct ghcb ghcb_page; + + /* Physical storage for the per-CPU IST stack of the #VC handler */ + char ist_stack[EXCEPTION_STKSZ] __aligned(PAGE_SIZE); + + /* + * Physical storage for the per-CPU fall-back stack of the #VC handler. + * The fall-back stack is used when it is not safe to switch back to the + * interrupted stack in the #VC entry code. + */ + char fallback_stack[EXCEPTION_STKSZ] __aligned(PAGE_SIZE); + + /* + * Reserve one page per CPU as backup storage for the unencrypted GHCB. + * It is needed when an NMI happens while the #VC handler uses the real + * GHCB, and the NMI handler itself is causing another #VC exception. In + * that case the GHCB content of the first handler needs to be backed up + * and restored. + */ + struct ghcb backup_ghcb; + + /* + * Mark the per-cpu GHCBs as in-use to detect nested #VC exceptions. + * There is no need for it to be atomic, because nothing is written to + * the GHCB between the read and the write of ghcb_active. So it is safe + * to use it when a nested #VC exception happens before the write. + * + * This is necessary for example in the #VC->NMI->#VC case when the NMI + * happens while the first #VC handler uses the GHCB. When the NMI code + * raises a second #VC handler it might overwrite the contents of the + * GHCB written by the first handler. To avoid this the content of the + * GHCB is saved and restored when the GHCB is detected to be in use + * already. + */ + bool ghcb_active; + bool backup_ghcb_active; + + /* + * Cached DR7 value - write it on DR7 writes and return it on reads. + * That value will never make it to the real hardware DR7 as debugging + * is currently unsupported in SEV-ES guests. + */ + unsigned long dr7; +}; + +struct ghcb_state { + struct ghcb *ghcb; +}; + +static DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data); +DEFINE_STATIC_KEY_FALSE(sev_es_enable_key); + +/* Needed in vc_early_forward_exception */ +void do_early_exception(struct pt_regs *regs, int trapnr); + +static void __init setup_vc_stacks(int cpu) +{ + struct sev_es_runtime_data *data; + struct cpu_entry_area *cea; + unsigned long vaddr; + phys_addr_t pa; + + data = per_cpu(runtime_data, cpu); + cea = get_cpu_entry_area(cpu); + + /* Map #VC IST stack */ + vaddr = CEA_ESTACK_BOT(&cea->estacks, VC); + pa = __pa(data->ist_stack); + cea_set_pte((void *)vaddr, pa, PAGE_KERNEL); + + /* Map VC fall-back stack */ + vaddr = CEA_ESTACK_BOT(&cea->estacks, VC2); + pa = __pa(data->fallback_stack); + cea_set_pte((void *)vaddr, pa, PAGE_KERNEL); +} + +static __always_inline bool on_vc_stack(unsigned long sp) +{ + return ((sp >= __this_cpu_ist_bottom_va(VC)) && (sp < __this_cpu_ist_top_va(VC))); +} + +/* + * This function handles the case when an NMI is raised in the #VC exception + * handler entry code. In this case, the IST entry for #VC must be adjusted, so + * that any subsequent #VC exception will not overwrite the stack contents of the + * interrupted #VC handler. + * + * The IST entry is adjusted unconditionally so that it can be also be + * unconditionally adjusted back in sev_es_ist_exit(). Otherwise a nested + * sev_es_ist_exit() call may adjust back the IST entry too early. + */ +void noinstr __sev_es_ist_enter(struct pt_regs *regs) +{ + unsigned long old_ist, new_ist; + + /* Read old IST entry */ + old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); + + /* Make room on the IST stack */ + if (on_vc_stack(regs->sp)) + new_ist = ALIGN_DOWN(regs->sp, 8) - sizeof(old_ist); + else + new_ist = old_ist - sizeof(old_ist); + + /* Store old IST entry */ + *(unsigned long *)new_ist = old_ist; + + /* Set new IST entry */ + this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], new_ist); +} + +void noinstr __sev_es_ist_exit(void) +{ + unsigned long ist; + + /* Read IST entry */ + ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); + + if (WARN_ON(ist == __this_cpu_ist_top_va(VC))) + return; + + /* Read back old IST entry and write it to the TSS */ + this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], *(unsigned long *)ist); +} + +static __always_inline struct ghcb *sev_es_get_ghcb(struct ghcb_state *state) +{ + struct sev_es_runtime_data *data; + struct ghcb *ghcb; + + data = this_cpu_read(runtime_data); + ghcb = &data->ghcb_page; + + if (unlikely(data->ghcb_active)) { + /* GHCB is already in use - save its contents */ + + if (unlikely(data->backup_ghcb_active)) + return NULL; + + /* Mark backup_ghcb active before writing to it */ + data->backup_ghcb_active = true; + + state->ghcb = &data->backup_ghcb; + + /* Backup GHCB content */ + *state->ghcb = *ghcb; + } else { + state->ghcb = NULL; + data->ghcb_active = true; + } + + return ghcb; +} + +static __always_inline void sev_es_put_ghcb(struct ghcb_state *state) +{ + struct sev_es_runtime_data *data; + struct ghcb *ghcb; + + data = this_cpu_read(runtime_data); + ghcb = &data->ghcb_page; + + if (state->ghcb) { + /* Restore GHCB from Backup */ + *ghcb = *state->ghcb; + data->backup_ghcb_active = false; + state->ghcb = NULL; + } else { + data->ghcb_active = false; + } +} + +/* Needed in vc_early_forward_exception */ +void do_early_exception(struct pt_regs *regs, int trapnr); + +static inline u64 sev_es_rd_ghcb_msr(void) +{ + return __rdmsr(MSR_AMD64_SEV_ES_GHCB); +} + +static inline void sev_es_wr_ghcb_msr(u64 val) +{ + u32 low, high; + + low = (u32)(val); + high = (u32)(val >> 32); + + native_wrmsr(MSR_AMD64_SEV_ES_GHCB, low, high); +} + +static int vc_fetch_insn_kernel(struct es_em_ctxt *ctxt, + unsigned char *buffer) +{ + return copy_from_kernel_nofault(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE); +} + +static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt) +{ + char buffer[MAX_INSN_SIZE]; + enum es_result ret; + int res; + + if (user_mode(ctxt->regs)) { + res = insn_fetch_from_user(ctxt->regs, buffer); + if (!res) { + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER; + ctxt->fi.cr2 = ctxt->regs->ip; + return ES_EXCEPTION; + } + + if (!insn_decode(&ctxt->insn, ctxt->regs, buffer, res)) + return ES_DECODE_FAILED; + } else { + res = vc_fetch_insn_kernel(ctxt, buffer); + if (res) { + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.error_code = X86_PF_INSTR; + ctxt->fi.cr2 = ctxt->regs->ip; + return ES_EXCEPTION; + } + + insn_init(&ctxt->insn, buffer, MAX_INSN_SIZE - res, 1); + insn_get_length(&ctxt->insn); + } + + ret = ctxt->insn.immediate.got ? ES_OK : ES_DECODE_FAILED; + + return ret; +} + +static enum es_result vc_write_mem(struct es_em_ctxt *ctxt, + char *dst, char *buf, size_t size) +{ + unsigned long error_code = X86_PF_PROT | X86_PF_WRITE; + char __user *target = (char __user *)dst; + u64 d8; + u32 d4; + u16 d2; + u8 d1; + + switch (size) { + case 1: + memcpy(&d1, buf, 1); + if (put_user(d1, target)) + goto fault; + break; + case 2: + memcpy(&d2, buf, 2); + if (put_user(d2, target)) + goto fault; + break; + case 4: + memcpy(&d4, buf, 4); + if (put_user(d4, target)) + goto fault; + break; + case 8: + memcpy(&d8, buf, 8); + if (put_user(d8, target)) + goto fault; + break; + default: + WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size); + return ES_UNSUPPORTED; + } + + return ES_OK; + +fault: + if (user_mode(ctxt->regs)) + error_code |= X86_PF_USER; + + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.error_code = error_code; + ctxt->fi.cr2 = (unsigned long)dst; + + return ES_EXCEPTION; +} + +static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, + char *src, char *buf, size_t size) +{ + unsigned long error_code = X86_PF_PROT; + char __user *s = (char __user *)src; + u64 d8; + u32 d4; + u16 d2; + u8 d1; + + switch (size) { + case 1: + if (get_user(d1, s)) + goto fault; + memcpy(buf, &d1, 1); + break; + case 2: + if (get_user(d2, s)) + goto fault; + memcpy(buf, &d2, 2); + break; + case 4: + if (get_user(d4, s)) + goto fault; + memcpy(buf, &d4, 4); + break; + case 8: + if (get_user(d8, s)) + goto fault; + memcpy(buf, &d8, 8); + break; + default: + WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size); + return ES_UNSUPPORTED; + } + + return ES_OK; + +fault: + if (user_mode(ctxt->regs)) + error_code |= X86_PF_USER; + + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.error_code = error_code; + ctxt->fi.cr2 = (unsigned long)src; + + return ES_EXCEPTION; +} + +static bool vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt *ctxt, + unsigned long vaddr, phys_addr_t *paddr) +{ + unsigned long va = (unsigned long)vaddr; + unsigned int level; + phys_addr_t pa; + pgd_t *pgd; + pte_t *pte; + + pgd = __va(read_cr3_pa()); + pgd = &pgd[pgd_index(va)]; + pte = lookup_address_in_pgd(pgd, va, &level); + if (!pte) { + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.cr2 = vaddr; + ctxt->fi.error_code = 0; + + if (user_mode(ctxt->regs)) + ctxt->fi.error_code |= X86_PF_USER; + + return false; + } + + pa = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT; + pa |= va & ~page_level_mask(level); + + *paddr = pa; + + return true; +} + +/* Include code shared with pre-decompression boot stage */ +#include "sev-es-shared.c" + +void noinstr __sev_es_nmi_complete(void) +{ + struct ghcb_state state; + struct ghcb *ghcb; + + ghcb = sev_es_get_ghcb(&state); + + vc_ghcb_invalidate(ghcb); + ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_NMI_COMPLETE); + ghcb_set_sw_exit_info_1(ghcb, 0); + ghcb_set_sw_exit_info_2(ghcb, 0); + + sev_es_wr_ghcb_msr(__pa_nodebug(ghcb)); + VMGEXIT(); + + sev_es_put_ghcb(&state); +} + +static u64 get_jump_table_addr(void) +{ + struct ghcb_state state; + unsigned long flags; + struct ghcb *ghcb; + u64 ret = 0; + + local_irq_save(flags); + + ghcb = sev_es_get_ghcb(&state); + + vc_ghcb_invalidate(ghcb); + ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_JUMP_TABLE); + ghcb_set_sw_exit_info_1(ghcb, SVM_VMGEXIT_GET_AP_JUMP_TABLE); + ghcb_set_sw_exit_info_2(ghcb, 0); + + sev_es_wr_ghcb_msr(__pa(ghcb)); + VMGEXIT(); + + if (ghcb_sw_exit_info_1_is_valid(ghcb) && + ghcb_sw_exit_info_2_is_valid(ghcb)) + ret = ghcb->save.sw_exit_info_2; + + sev_es_put_ghcb(&state); + + local_irq_restore(flags); + + return ret; +} + +int sev_es_setup_ap_jump_table(struct real_mode_header *rmh) +{ + u16 startup_cs, startup_ip; + phys_addr_t jump_table_pa; + u64 jump_table_addr; + u16 __iomem *jump_table; + + jump_table_addr = get_jump_table_addr(); + + /* On UP guests there is no jump table so this is not a failure */ + if (!jump_table_addr) + return 0; + + /* Check if AP Jump Table is page-aligned */ + if (jump_table_addr & ~PAGE_MASK) + return -EINVAL; + + jump_table_pa = jump_table_addr & PAGE_MASK; + + startup_cs = (u16)(rmh->trampoline_start >> 4); + startup_ip = (u16)(rmh->sev_es_trampoline_start - + rmh->trampoline_start); + + jump_table = ioremap_encrypted(jump_table_pa, PAGE_SIZE); + if (!jump_table) + return -EIO; + + writew(startup_ip, &jump_table[0]); + writew(startup_cs, &jump_table[1]); + + iounmap(jump_table); + + return 0; +} + +/* + * This is needed by the OVMF UEFI firmware which will use whatever it finds in + * the GHCB MSR as its GHCB to talk to the hypervisor. So make sure the per-cpu + * runtime GHCBs used by the kernel are also mapped in the EFI page-table. + */ +int __init sev_es_efi_map_ghcbs(pgd_t *pgd) +{ + struct sev_es_runtime_data *data; + unsigned long address, pflags; + int cpu; + u64 pfn; + + if (!sev_es_active()) + return 0; + + pflags = _PAGE_NX | _PAGE_RW; + + for_each_possible_cpu(cpu) { + data = per_cpu(runtime_data, cpu); + + address = __pa(&data->ghcb_page); + pfn = address >> PAGE_SHIFT; + + if (kernel_map_pages_in_pgd(pgd, pfn, address, 1, pflags)) + return 1; + } + + return 0; +} + +static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +{ + struct pt_regs *regs = ctxt->regs; + enum es_result ret; + u64 exit_info_1; + + /* Is it a WRMSR? */ + exit_info_1 = (ctxt->insn.opcode.bytes[1] == 0x30) ? 1 : 0; + + ghcb_set_rcx(ghcb, regs->cx); + if (exit_info_1) { + ghcb_set_rax(ghcb, regs->ax); + ghcb_set_rdx(ghcb, regs->dx); + } + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_MSR, exit_info_1, 0); + + if ((ret == ES_OK) && (!exit_info_1)) { + regs->ax = ghcb->save.rax; + regs->dx = ghcb->save.rdx; + } + + return ret; +} + +/* + * This function runs on the first #VC exception after the kernel + * switched to virtual addresses. + */ +static bool __init sev_es_setup_ghcb(void) +{ + /* First make sure the hypervisor talks a supported protocol. */ + if (!sev_es_negotiate_protocol()) + return false; + + /* + * Clear the boot_ghcb. The first exception comes in before the bss + * section is cleared. + */ + memset(&boot_ghcb_page, 0, PAGE_SIZE); + + /* Alright - Make the boot-ghcb public */ + boot_ghcb = &boot_ghcb_page; + + return true; +} + +#ifdef CONFIG_HOTPLUG_CPU +static void sev_es_ap_hlt_loop(void) +{ + struct ghcb_state state; + struct ghcb *ghcb; + + ghcb = sev_es_get_ghcb(&state); + + while (true) { + vc_ghcb_invalidate(ghcb); + ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_HLT_LOOP); + ghcb_set_sw_exit_info_1(ghcb, 0); + ghcb_set_sw_exit_info_2(ghcb, 0); + + sev_es_wr_ghcb_msr(__pa(ghcb)); + VMGEXIT(); + + /* Wakeup signal? */ + if (ghcb_sw_exit_info_2_is_valid(ghcb) && + ghcb->save.sw_exit_info_2) + break; + } + + sev_es_put_ghcb(&state); +} + +/* + * Play_dead handler when running under SEV-ES. This is needed because + * the hypervisor can't deliver an SIPI request to restart the AP. + * Instead the kernel has to issue a VMGEXIT to halt the VCPU until the + * hypervisor wakes it up again. + */ +static void sev_es_play_dead(void) +{ + play_dead_common(); + + /* IRQs now disabled */ + + sev_es_ap_hlt_loop(); + + /* + * If we get here, the VCPU was woken up again. Jump to CPU + * startup code to get it back online. + */ + start_cpu0(); +} +#else /* CONFIG_HOTPLUG_CPU */ +#define sev_es_play_dead native_play_dead +#endif /* CONFIG_HOTPLUG_CPU */ + +#ifdef CONFIG_SMP +static void __init sev_es_setup_play_dead(void) +{ + smp_ops.play_dead = sev_es_play_dead; +} +#else +static inline void sev_es_setup_play_dead(void) { } +#endif + +static void __init alloc_runtime_data(int cpu) +{ + struct sev_es_runtime_data *data; + + data = memblock_alloc(sizeof(*data), PAGE_SIZE); + if (!data) + panic("Can't allocate SEV-ES runtime data"); + + per_cpu(runtime_data, cpu) = data; +} + +static void __init init_ghcb(int cpu) +{ + struct sev_es_runtime_data *data; + int err; + + data = per_cpu(runtime_data, cpu); + + err = early_set_memory_decrypted((unsigned long)&data->ghcb_page, + sizeof(data->ghcb_page)); + if (err) + panic("Can't map GHCBs unencrypted"); + + memset(&data->ghcb_page, 0, sizeof(data->ghcb_page)); + + data->ghcb_active = false; + data->backup_ghcb_active = false; +} + +void __init sev_es_init_vc_handling(void) +{ + int cpu; + + BUILD_BUG_ON(offsetof(struct sev_es_runtime_data, ghcb_page) % PAGE_SIZE); + + if (!sev_es_active()) + return; + + if (!sev_es_check_cpu_features()) + panic("SEV-ES CPU Features missing"); + + /* Enable SEV-ES special handling */ + static_branch_enable(&sev_es_enable_key); + + /* Initialize per-cpu GHCB pages */ + for_each_possible_cpu(cpu) { + alloc_runtime_data(cpu); + init_ghcb(cpu); + setup_vc_stacks(cpu); + } + + sev_es_setup_play_dead(); + + /* Secondary CPUs use the runtime #VC handler */ + initial_vc_handler = (unsigned long)safe_stack_exc_vmm_communication; +} + +static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt) +{ + int trapnr = ctxt->fi.vector; + + if (trapnr == X86_TRAP_PF) + native_write_cr2(ctxt->fi.cr2); + + ctxt->regs->orig_ax = ctxt->fi.error_code; + do_early_exception(ctxt->regs, trapnr); +} + +static long *vc_insn_get_reg(struct es_em_ctxt *ctxt) +{ + long *reg_array; + int offset; + + reg_array = (long *)ctxt->regs; + offset = insn_get_modrm_reg_off(&ctxt->insn, ctxt->regs); + + if (offset < 0) + return NULL; + + offset /= sizeof(long); + + return reg_array + offset; +} + +static long *vc_insn_get_rm(struct es_em_ctxt *ctxt) +{ + long *reg_array; + int offset; + + reg_array = (long *)ctxt->regs; + offset = insn_get_modrm_rm_off(&ctxt->insn, ctxt->regs); + + if (offset < 0) + return NULL; + + offset /= sizeof(long); + + return reg_array + offset; +} +static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt, + unsigned int bytes, bool read) +{ + u64 exit_code, exit_info_1, exit_info_2; + unsigned long ghcb_pa = __pa(ghcb); + phys_addr_t paddr; + void __user *ref; + + ref = insn_get_addr_ref(&ctxt->insn, ctxt->regs); + if (ref == (void __user *)-1L) + return ES_UNSUPPORTED; + + exit_code = read ? SVM_VMGEXIT_MMIO_READ : SVM_VMGEXIT_MMIO_WRITE; + + if (!vc_slow_virt_to_phys(ghcb, ctxt, (unsigned long)ref, &paddr)) { + if (!read) + ctxt->fi.error_code |= X86_PF_WRITE; + + return ES_EXCEPTION; + } + + exit_info_1 = paddr; + /* Can never be greater than 8 */ + exit_info_2 = bytes; + + ghcb_set_sw_scratch(ghcb, ghcb_pa + offsetof(struct ghcb, shared_buffer)); + + return sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, exit_info_1, exit_info_2); +} + +static enum es_result vc_handle_mmio_twobyte_ops(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + struct insn *insn = &ctxt->insn; + unsigned int bytes = 0; + enum es_result ret; + int sign_byte; + long *reg_data; + + switch (insn->opcode.bytes[1]) { + /* MMIO Read w/ zero-extension */ + case 0xb6: + bytes = 1; + fallthrough; + case 0xb7: + if (!bytes) + bytes = 2; + + ret = vc_do_mmio(ghcb, ctxt, bytes, true); + if (ret) + break; + + /* Zero extend based on operand size */ + reg_data = vc_insn_get_reg(ctxt); + if (!reg_data) + return ES_DECODE_FAILED; + + memset(reg_data, 0, insn->opnd_bytes); + + memcpy(reg_data, ghcb->shared_buffer, bytes); + break; + + /* MMIO Read w/ sign-extension */ + case 0xbe: + bytes = 1; + fallthrough; + case 0xbf: + if (!bytes) + bytes = 2; + + ret = vc_do_mmio(ghcb, ctxt, bytes, true); + if (ret) + break; + + /* Sign extend based on operand size */ + reg_data = vc_insn_get_reg(ctxt); + if (!reg_data) + return ES_DECODE_FAILED; + + if (bytes == 1) { + u8 *val = (u8 *)ghcb->shared_buffer; + + sign_byte = (*val & 0x80) ? 0xff : 0x00; + } else { + u16 *val = (u16 *)ghcb->shared_buffer; + + sign_byte = (*val & 0x8000) ? 0xff : 0x00; + } + memset(reg_data, sign_byte, insn->opnd_bytes); + + memcpy(reg_data, ghcb->shared_buffer, bytes); + break; + + default: + ret = ES_UNSUPPORTED; + } + + return ret; +} + +/* + * The MOVS instruction has two memory operands, which raises the + * problem that it is not known whether the access to the source or the + * destination caused the #VC exception (and hence whether an MMIO read + * or write operation needs to be emulated). + * + * Instead of playing games with walking page-tables and trying to guess + * whether the source or destination is an MMIO range, split the move + * into two operations, a read and a write with only one memory operand. + * This will cause a nested #VC exception on the MMIO address which can + * then be handled. + * + * This implementation has the benefit that it also supports MOVS where + * source _and_ destination are MMIO regions. + * + * It will slow MOVS on MMIO down a lot, but in SEV-ES guests it is a + * rare operation. If it turns out to be a performance problem the split + * operations can be moved to memcpy_fromio() and memcpy_toio(). + */ +static enum es_result vc_handle_mmio_movs(struct es_em_ctxt *ctxt, + unsigned int bytes) +{ + unsigned long ds_base, es_base; + unsigned char *src, *dst; + unsigned char buffer[8]; + enum es_result ret; + bool rep; + int off; + + ds_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_DS); + es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES); + + if (ds_base == -1L || es_base == -1L) { + ctxt->fi.vector = X86_TRAP_GP; + ctxt->fi.error_code = 0; + return ES_EXCEPTION; + } + + src = ds_base + (unsigned char *)ctxt->regs->si; + dst = es_base + (unsigned char *)ctxt->regs->di; + + ret = vc_read_mem(ctxt, src, buffer, bytes); + if (ret != ES_OK) + return ret; + + ret = vc_write_mem(ctxt, dst, buffer, bytes); + if (ret != ES_OK) + return ret; + + if (ctxt->regs->flags & X86_EFLAGS_DF) + off = -bytes; + else + off = bytes; + + ctxt->regs->si += off; + ctxt->regs->di += off; + + rep = insn_has_rep_prefix(&ctxt->insn); + if (rep) + ctxt->regs->cx -= 1; + + if (!rep || ctxt->regs->cx == 0) + return ES_OK; + else + return ES_RETRY; +} + +static enum es_result vc_handle_mmio(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + struct insn *insn = &ctxt->insn; + unsigned int bytes = 0; + enum es_result ret; + long *reg_data; + + switch (insn->opcode.bytes[0]) { + /* MMIO Write */ + case 0x88: + bytes = 1; + fallthrough; + case 0x89: + if (!bytes) + bytes = insn->opnd_bytes; + + reg_data = vc_insn_get_reg(ctxt); + if (!reg_data) + return ES_DECODE_FAILED; + + memcpy(ghcb->shared_buffer, reg_data, bytes); + + ret = vc_do_mmio(ghcb, ctxt, bytes, false); + break; + + case 0xc6: + bytes = 1; + fallthrough; + case 0xc7: + if (!bytes) + bytes = insn->opnd_bytes; + + memcpy(ghcb->shared_buffer, insn->immediate1.bytes, bytes); + + ret = vc_do_mmio(ghcb, ctxt, bytes, false); + break; + + /* MMIO Read */ + case 0x8a: + bytes = 1; + fallthrough; + case 0x8b: + if (!bytes) + bytes = insn->opnd_bytes; + + ret = vc_do_mmio(ghcb, ctxt, bytes, true); + if (ret) + break; + + reg_data = vc_insn_get_reg(ctxt); + if (!reg_data) + return ES_DECODE_FAILED; + + /* Zero-extend for 32-bit operation */ + if (bytes == 4) + *reg_data = 0; + + memcpy(reg_data, ghcb->shared_buffer, bytes); + break; + + /* MOVS instruction */ + case 0xa4: + bytes = 1; + fallthrough; + case 0xa5: + if (!bytes) + bytes = insn->opnd_bytes; + + ret = vc_handle_mmio_movs(ctxt, bytes); + break; + /* Two-Byte Opcodes */ + case 0x0f: + ret = vc_handle_mmio_twobyte_ops(ghcb, ctxt); + break; + default: + ret = ES_UNSUPPORTED; + } + + return ret; +} + +static enum es_result vc_handle_dr7_write(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + struct sev_es_runtime_data *data = this_cpu_read(runtime_data); + long val, *reg = vc_insn_get_rm(ctxt); + enum es_result ret; + + if (!reg) + return ES_DECODE_FAILED; + + val = *reg; + + /* Upper 32 bits must be written as zeroes */ + if (val >> 32) { + ctxt->fi.vector = X86_TRAP_GP; + ctxt->fi.error_code = 0; + return ES_EXCEPTION; + } + + /* Clear out other reserved bits and set bit 10 */ + val = (val & 0xffff23ffL) | BIT(10); + + /* Early non-zero writes to DR7 are not supported */ + if (!data && (val & ~DR7_RESET_VALUE)) + return ES_UNSUPPORTED; + + /* Using a value of 0 for ExitInfo1 means RAX holds the value */ + ghcb_set_rax(ghcb, val); + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WRITE_DR7, 0, 0); + if (ret != ES_OK) + return ret; + + if (data) + data->dr7 = val; + + return ES_OK; +} + +static enum es_result vc_handle_dr7_read(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + struct sev_es_runtime_data *data = this_cpu_read(runtime_data); + long *reg = vc_insn_get_rm(ctxt); + + if (!reg) + return ES_DECODE_FAILED; + + if (data) + *reg = data->dr7; + else + *reg = DR7_RESET_VALUE; + + return ES_OK; +} + +static enum es_result vc_handle_wbinvd(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + return sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WBINVD, 0, 0); +} + +static enum es_result vc_handle_rdpmc(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +{ + enum es_result ret; + + ghcb_set_rcx(ghcb, ctxt->regs->cx); + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_RDPMC, 0, 0); + if (ret != ES_OK) + return ret; + + if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb))) + return ES_VMM_ERROR; + + ctxt->regs->ax = ghcb->save.rax; + ctxt->regs->dx = ghcb->save.rdx; + + return ES_OK; +} + +static enum es_result vc_handle_monitor(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + /* + * Treat it as a NOP and do not leak a physical address to the + * hypervisor. + */ + return ES_OK; +} + +static enum es_result vc_handle_mwait(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + /* Treat the same as MONITOR/MONITORX */ + return ES_OK; +} + +static enum es_result vc_handle_vmmcall(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + enum es_result ret; + + ghcb_set_rax(ghcb, ctxt->regs->ax); + ghcb_set_cpl(ghcb, user_mode(ctxt->regs) ? 3 : 0); + + if (x86_platform.hyper.sev_es_hcall_prepare) + x86_platform.hyper.sev_es_hcall_prepare(ghcb, ctxt->regs); + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_VMMCALL, 0, 0); + if (ret != ES_OK) + return ret; + + if (!ghcb_rax_is_valid(ghcb)) + return ES_VMM_ERROR; + + ctxt->regs->ax = ghcb->save.rax; + + /* + * Call sev_es_hcall_finish() after regs->ax is already set. + * This allows the hypervisor handler to overwrite it again if + * necessary. + */ + if (x86_platform.hyper.sev_es_hcall_finish && + !x86_platform.hyper.sev_es_hcall_finish(ghcb, ctxt->regs)) + return ES_VMM_ERROR; + + return ES_OK; +} + +static enum es_result vc_handle_trap_ac(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + /* + * Calling ecx_alignment_check() directly does not work, because it + * enables IRQs and the GHCB is active. Forward the exception and call + * it later from vc_forward_exception(). + */ + ctxt->fi.vector = X86_TRAP_AC; + ctxt->fi.error_code = 0; + return ES_EXCEPTION; +} + +static __always_inline void vc_handle_trap_db(struct pt_regs *regs) +{ + if (user_mode(regs)) + noist_exc_debug(regs); + else + exc_debug(regs); +} + +static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt, + struct ghcb *ghcb, + unsigned long exit_code) +{ + enum es_result result; + + switch (exit_code) { + case SVM_EXIT_READ_DR7: + result = vc_handle_dr7_read(ghcb, ctxt); + break; + case SVM_EXIT_WRITE_DR7: + result = vc_handle_dr7_write(ghcb, ctxt); + break; + case SVM_EXIT_EXCP_BASE + X86_TRAP_AC: + result = vc_handle_trap_ac(ghcb, ctxt); + break; + case SVM_EXIT_RDTSC: + case SVM_EXIT_RDTSCP: + result = vc_handle_rdtsc(ghcb, ctxt, exit_code); + break; + case SVM_EXIT_RDPMC: + result = vc_handle_rdpmc(ghcb, ctxt); + break; + case SVM_EXIT_INVD: + pr_err_ratelimited("#VC exception for INVD??? Seriously???\n"); + result = ES_UNSUPPORTED; + break; + case SVM_EXIT_CPUID: + result = vc_handle_cpuid(ghcb, ctxt); + break; + case SVM_EXIT_IOIO: + result = vc_handle_ioio(ghcb, ctxt); + break; + case SVM_EXIT_MSR: + result = vc_handle_msr(ghcb, ctxt); + break; + case SVM_EXIT_VMMCALL: + result = vc_handle_vmmcall(ghcb, ctxt); + break; + case SVM_EXIT_WBINVD: + result = vc_handle_wbinvd(ghcb, ctxt); + break; + case SVM_EXIT_MONITOR: + result = vc_handle_monitor(ghcb, ctxt); + break; + case SVM_EXIT_MWAIT: + result = vc_handle_mwait(ghcb, ctxt); + break; + case SVM_EXIT_NPF: + result = vc_handle_mmio(ghcb, ctxt); + break; + default: + /* + * Unexpected #VC exception + */ + result = ES_UNSUPPORTED; + } + + return result; +} + +static __always_inline void vc_forward_exception(struct es_em_ctxt *ctxt) +{ + long error_code = ctxt->fi.error_code; + int trapnr = ctxt->fi.vector; + + ctxt->regs->orig_ax = ctxt->fi.error_code; + + switch (trapnr) { + case X86_TRAP_GP: + exc_general_protection(ctxt->regs, error_code); + break; + case X86_TRAP_UD: + exc_invalid_op(ctxt->regs); + break; + case X86_TRAP_AC: + exc_alignment_check(ctxt->regs, error_code); + break; + default: + pr_emerg("Unsupported exception in #VC instruction emulation - can't continue\n"); + BUG(); + } +} + +static __always_inline bool on_vc_fallback_stack(struct pt_regs *regs) +{ + unsigned long sp = (unsigned long)regs; + + return (sp >= __this_cpu_ist_bottom_va(VC2) && sp < __this_cpu_ist_top_va(VC2)); +} + +/* + * Main #VC exception handler. It is called when the entry code was able to + * switch off the IST to a safe kernel stack. + * + * With the current implementation it is always possible to switch to a safe + * stack because #VC exceptions only happen at known places, like intercepted + * instructions or accesses to MMIO areas/IO ports. They can also happen with + * code instrumentation when the hypervisor intercepts #DB, but the critical + * paths are forbidden to be instrumented, so #DB exceptions currently also + * only happen in safe places. + */ +DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication) +{ + struct sev_es_runtime_data *data = this_cpu_read(runtime_data); + struct ghcb_state state; + struct es_em_ctxt ctxt; + enum es_result result; + struct ghcb *ghcb; + + lockdep_assert_irqs_disabled(); + + /* + * Handle #DB before calling into !noinstr code to avoid recursive #DB. + */ + if (error_code == SVM_EXIT_EXCP_BASE + X86_TRAP_DB) { + vc_handle_trap_db(regs); + return; + } + + instrumentation_begin(); + + /* + * This is invoked through an interrupt gate, so IRQs are disabled. The + * code below might walk page-tables for user or kernel addresses, so + * keep the IRQs disabled to protect us against concurrent TLB flushes. + */ + + ghcb = sev_es_get_ghcb(&state); + if (!ghcb) { + /* + * Mark GHCBs inactive so that panic() is able to print the + * message. + */ + data->ghcb_active = false; + data->backup_ghcb_active = false; + + panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use"); + } + + vc_ghcb_invalidate(ghcb); + result = vc_init_em_ctxt(&ctxt, regs, error_code); + + if (result == ES_OK) + result = vc_handle_exitcode(&ctxt, ghcb, error_code); + + sev_es_put_ghcb(&state); + + /* Done - now check the result */ + switch (result) { + case ES_OK: + vc_finish_insn(&ctxt); + break; + case ES_UNSUPPORTED: + pr_err_ratelimited("Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n", + error_code, regs->ip); + goto fail; + case ES_VMM_ERROR: + pr_err_ratelimited("Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n", + error_code, regs->ip); + goto fail; + case ES_DECODE_FAILED: + pr_err_ratelimited("Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n", + error_code, regs->ip); + goto fail; + case ES_EXCEPTION: + vc_forward_exception(&ctxt); + break; + case ES_RETRY: + /* Nothing to do */ + break; + default: + pr_emerg("Unknown result in %s():%d\n", __func__, result); + /* + * Emulating the instruction which caused the #VC exception + * failed - can't continue so print debug information + */ + BUG(); + } + +out: + instrumentation_end(); + + return; + +fail: + if (user_mode(regs)) { + /* + * Do not kill the machine if user-space triggered the + * exception. Send SIGBUS instead and let user-space deal with + * it. + */ + force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)0); + } else { + pr_emerg("PANIC: Unhandled #VC exception in kernel space (result=%d)\n", + result); + + /* Show some debug info */ + show_regs(regs); + + /* Ask hypervisor to sev_es_terminate */ + sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST); + + /* If that fails and we get here - just panic */ + panic("Returned from Terminate-Request to Hypervisor\n"); + } + + goto out; +} + +/* This handler runs on the #VC fall-back stack. It can cause further #VC exceptions */ +DEFINE_IDTENTRY_VC_IST(exc_vmm_communication) +{ + instrumentation_begin(); + panic("Can't handle #VC exception from unsupported context\n"); + instrumentation_end(); +} + +DEFINE_IDTENTRY_VC(exc_vmm_communication) +{ + if (likely(!on_vc_fallback_stack(regs))) + safe_stack_exc_vmm_communication(regs, error_code); + else + ist_exc_vmm_communication(regs, error_code); +} + +bool __init handle_vc_boot_ghcb(struct pt_regs *regs) +{ + unsigned long exit_code = regs->orig_ax; + struct es_em_ctxt ctxt; + enum es_result result; + + /* Do initial setup or terminate the guest */ + if (unlikely(boot_ghcb == NULL && !sev_es_setup_ghcb())) + sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST); + + vc_ghcb_invalidate(boot_ghcb); + + result = vc_init_em_ctxt(&ctxt, regs, exit_code); + if (result == ES_OK) + result = vc_handle_exitcode(&ctxt, boot_ghcb, exit_code); + + /* Done - now check the result */ + switch (result) { + case ES_OK: + vc_finish_insn(&ctxt); + break; + case ES_UNSUPPORTED: + early_printk("PANIC: Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n", + exit_code, regs->ip); + goto fail; + case ES_VMM_ERROR: + early_printk("PANIC: Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n", + exit_code, regs->ip); + goto fail; + case ES_DECODE_FAILED: + early_printk("PANIC: Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n", + exit_code, regs->ip); + goto fail; + case ES_EXCEPTION: + vc_early_forward_exception(&ctxt); + break; + case ES_RETRY: + /* Nothing to do */ + break; + default: + BUG(); + } + + return true; + +fail: + show_regs(regs); + + while (true) + halt(); +} diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index 9ccbf0576cd0..a7f3e12cfbdb 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -27,7 +27,7 @@ static inline void signal_compat_build_tests(void) */ BUILD_BUG_ON(NSIGILL != 11); BUILD_BUG_ON(NSIGFPE != 15); - BUILD_BUG_ON(NSIGSEGV != 7); + BUILD_BUG_ON(NSIGSEGV != 9); BUILD_BUG_ON(NSIGBUS != 5); BUILD_BUG_ON(NSIGTRAP != 5); BUILD_BUG_ON(NSIGCHLD != 6); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index f5ef689dd62a..de776b2e6046 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -227,7 +227,7 @@ static void notrace start_secondary(void *unused) load_cr3(swapper_pg_dir); __flush_tlb_all(); #endif - load_current_idt(); + cpu_init_exception_handling(); cpu_init(); x86_cpuinit.early_percpu_clock_init(); preempt_disable(); diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 2fd698e28e4d..8627fda8d993 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -18,13 +18,13 @@ void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, struct unwind_state state; unsigned long addr; - if (regs && !consume_entry(cookie, regs->ip, false)) + if (regs && !consume_entry(cookie, regs->ip)) return; for (unwind_start(&state, task, regs, NULL); !unwind_done(&state); unwind_next_frame(&state)) { addr = unwind_get_return_address(&state); - if (!addr || !consume_entry(cookie, addr, false)) + if (!addr || !consume_entry(cookie, addr)) break; } } @@ -72,7 +72,7 @@ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, if (!addr) return -EINVAL; - if (!consume_entry(cookie, addr, false)) + if (!consume_entry(cookie, addr)) return -EINVAL; } @@ -114,7 +114,7 @@ void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie, { const void __user *fp = (const void __user *)regs->bp; - if (!consume_entry(cookie, regs->ip, false)) + if (!consume_entry(cookie, regs->ip)) return; while (1) { @@ -128,7 +128,7 @@ void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie, break; if (!frame.ret_addr) break; - if (!consume_entry(cookie, frame.ret_addr, false)) + if (!consume_entry(cookie, frame.ret_addr)) break; fp = frame.next_fp; } diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c new file mode 100644 index 000000000000..ca9a380d9c0b --- /dev/null +++ b/arch/x86/kernel/static_call.c @@ -0,0 +1,98 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/static_call.h> +#include <linux/memory.h> +#include <linux/bug.h> +#include <asm/text-patching.h> + +enum insn_type { + CALL = 0, /* site call */ + NOP = 1, /* site cond-call */ + JMP = 2, /* tramp / site tail-call */ + RET = 3, /* tramp / site cond-tail-call */ +}; + +static void __ref __static_call_transform(void *insn, enum insn_type type, void *func) +{ + int size = CALL_INSN_SIZE; + const void *code; + + switch (type) { + case CALL: + code = text_gen_insn(CALL_INSN_OPCODE, insn, func); + break; + + case NOP: + code = ideal_nops[NOP_ATOMIC5]; + break; + + case JMP: + code = text_gen_insn(JMP32_INSN_OPCODE, insn, func); + break; + + case RET: + code = text_gen_insn(RET_INSN_OPCODE, insn, func); + size = RET_INSN_SIZE; + break; + } + + if (memcmp(insn, code, size) == 0) + return; + + if (unlikely(system_state == SYSTEM_BOOTING)) + return text_poke_early(insn, code, size); + + text_poke_bp(insn, code, size, NULL); +} + +static void __static_call_validate(void *insn, bool tail) +{ + u8 opcode = *(u8 *)insn; + + if (tail) { + if (opcode == JMP32_INSN_OPCODE || + opcode == RET_INSN_OPCODE) + return; + } else { + if (opcode == CALL_INSN_OPCODE || + !memcmp(insn, ideal_nops[NOP_ATOMIC5], 5)) + return; + } + + /* + * If we ever trigger this, our text is corrupt, we'll probably not live long. + */ + WARN_ONCE(1, "unexpected static_call insn opcode 0x%x at %pS\n", opcode, insn); +} + +static inline enum insn_type __sc_insn(bool null, bool tail) +{ + /* + * Encode the following table without branches: + * + * tail null insn + * -----+-------+------ + * 0 | 0 | CALL + * 0 | 1 | NOP + * 1 | 0 | JMP + * 1 | 1 | RET + */ + return 2*tail + null; +} + +void arch_static_call_transform(void *site, void *tramp, void *func, bool tail) +{ + mutex_lock(&text_mutex); + + if (tramp) { + __static_call_validate(tramp, true); + __static_call_transform(tramp, __sc_insn(!func, true), func); + } + + if (IS_ENABLED(CONFIG_HAVE_STATIC_CALL_INLINE) && site) { + __static_call_validate(site, tail); + __static_call_transform(site, __sc_insn(!func, tail), func); + } + + mutex_unlock(&text_mutex); +} +EXPORT_SYMBOL_GPL(arch_static_call_transform); diff --git a/arch/x86/kernel/sys_ia32.c b/arch/x86/kernel/sys_ia32.c index 720cde885042..6cf65397d225 100644 --- a/arch/x86/kernel/sys_ia32.c +++ b/arch/x86/kernel/sys_ia32.c @@ -251,6 +251,6 @@ COMPAT_SYSCALL_DEFINE5(ia32_clone, unsigned long, clone_flags, .tls = tls_val, }; - return _do_fork(&args); + return kernel_clone(&args); } #endif /* CONFIG_IA32_EMULATION */ diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 81a2fb711091..3c70fb34028b 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -43,6 +43,7 @@ #include <asm/stacktrace.h> #include <asm/processor.h> #include <asm/debugreg.h> +#include <asm/realmode.h> #include <asm/text-patching.h> #include <asm/ftrace.h> #include <asm/traps.h> @@ -195,7 +196,7 @@ static __always_inline void __user *error_get_trap_addr(struct pt_regs *regs) DEFINE_IDTENTRY(exc_divide_error) { - do_error_trap(regs, 0, "divide_error", X86_TRAP_DE, SIGFPE, + do_error_trap(regs, 0, "divide error", X86_TRAP_DE, SIGFPE, FPE_INTDIV, error_get_trap_addr(regs)); } @@ -673,6 +674,50 @@ asmlinkage __visible noinstr struct pt_regs *sync_regs(struct pt_regs *eregs) return regs; } +#ifdef CONFIG_AMD_MEM_ENCRYPT +asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *regs) +{ + unsigned long sp, *stack; + struct stack_info info; + struct pt_regs *regs_ret; + + /* + * In the SYSCALL entry path the RSP value comes from user-space - don't + * trust it and switch to the current kernel stack + */ + if (regs->ip >= (unsigned long)entry_SYSCALL_64 && + regs->ip < (unsigned long)entry_SYSCALL_64_safe_stack) { + sp = this_cpu_read(cpu_current_top_of_stack); + goto sync; + } + + /* + * From here on the RSP value is trusted. Now check whether entry + * happened from a safe stack. Not safe are the entry or unknown stacks, + * use the fall-back stack instead in this case. + */ + sp = regs->sp; + stack = (unsigned long *)sp; + + if (!get_stack_info_noinstr(stack, current, &info) || info.type == STACK_TYPE_ENTRY || + info.type >= STACK_TYPE_EXCEPTION_LAST) + sp = __this_cpu_ist_top_va(VC2); + +sync: + /* + * Found a safe stack - switch to it as if the entry didn't happen via + * IST stack. The code below only copies pt_regs, the real switch happens + * in assembly code. + */ + sp = ALIGN_DOWN(sp, 8) - sizeof(*regs_ret); + + regs_ret = (struct pt_regs *)sp; + *regs_ret = *regs; + + return regs_ret; +} +#endif + struct bad_iret_stack { void *error_entry_ret; struct pt_regs regs; @@ -745,9 +790,21 @@ static __always_inline unsigned long debug_read_clear_dr6(void) * Keep it simple: clear DR6 immediately. */ get_debugreg(dr6, 6); - set_debugreg(0, 6); - /* Filter out all the reserved bits which are preset to 1 */ - dr6 &= ~DR6_RESERVED; + set_debugreg(DR6_RESERVED, 6); + dr6 ^= DR6_RESERVED; /* Flip to positive polarity */ + + /* + * Clear the virtual DR6 value, ptrace routines will set bits here for + * things we want signals for. + */ + current->thread.virtual_dr6 = 0; + + /* + * The SDM says "The processor clears the BTF flag when it + * generates a debug exception." Clear TIF_BLOCKSTEP to keep + * TIF_BLOCKSTEP in sync with the hardware BTF flag. + */ + clear_thread_flag(TIF_BLOCKSTEP); return dr6; } @@ -776,74 +833,20 @@ static __always_inline unsigned long debug_read_clear_dr6(void) * * May run on IST stack. */ -static void handle_debug(struct pt_regs *regs, unsigned long dr6, bool user) -{ - struct task_struct *tsk = current; - bool user_icebp; - int si_code; - - /* - * The SDM says "The processor clears the BTF flag when it - * generates a debug exception." Clear TIF_BLOCKSTEP to keep - * TIF_BLOCKSTEP in sync with the hardware BTF flag. - */ - clear_thread_flag(TIF_BLOCKSTEP); - - /* - * If DR6 is zero, no point in trying to handle it. The kernel is - * not using INT1. - */ - if (!user && !dr6) - return; +static bool notify_debug(struct pt_regs *regs, unsigned long *dr6) +{ /* - * If dr6 has no reason to give us about the origin of this trap, - * then it's very likely the result of an icebp/int01 trap. - * User wants a sigtrap for that. + * Notifiers will clear bits in @dr6 to indicate the event has been + * consumed - hw_breakpoint_handler(), single_stop_cont(). + * + * Notifiers will set bits in @virtual_dr6 to indicate the desire + * for signals - ptrace_triggered(), kgdb_hw_overflow_handler(). */ - user_icebp = user && !dr6; - - /* Store the virtualized DR6 value */ - tsk->thread.debugreg6 = dr6; - -#ifdef CONFIG_KPROBES - if (kprobe_debug_handler(regs)) { - return; - } -#endif - - if (notify_die(DIE_DEBUG, "debug", regs, (long)&dr6, 0, - SIGTRAP) == NOTIFY_STOP) { - return; - } - - /* It's safe to allow irq's after DR6 has been saved */ - cond_local_irq_enable(regs); - - if (v8086_mode(regs)) { - handle_vm86_trap((struct kernel_vm86_regs *) regs, 0, - X86_TRAP_DB); - goto out; - } - - if (WARN_ON_ONCE((dr6 & DR_STEP) && !user_mode(regs))) { - /* - * Historical junk that used to handle SYSENTER single-stepping. - * This should be unreachable now. If we survive for a while - * without anyone hitting this warning, we'll turn this into - * an oops. - */ - tsk->thread.debugreg6 &= ~DR_STEP; - set_tsk_thread_flag(tsk, TIF_SINGLESTEP); - regs->flags &= ~X86_EFLAGS_TF; - } - - si_code = get_si_code(tsk->thread.debugreg6); - if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) - send_sigtrap(regs, 0, si_code); + if (notify_die(DIE_DEBUG, "debug", regs, (long)dr6, 0, SIGTRAP) == NOTIFY_STOP) + return true; -out: - cond_local_irq_disable(regs); + return false; } static __always_inline void exc_debug_kernel(struct pt_regs *regs, @@ -877,8 +880,32 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, if ((dr6 & DR_STEP) && is_sysenter_singlestep(regs)) dr6 &= ~DR_STEP; - handle_debug(regs, dr6, false); + if (kprobe_debug_handler(regs)) + goto out; + + /* + * The kernel doesn't use INT1 + */ + if (!dr6) + goto out; + + if (notify_debug(regs, &dr6)) + goto out; + /* + * The kernel doesn't use TF single-step outside of: + * + * - Kprobes, consumed through kprobe_debug_handler() + * - KGDB, consumed through notify_debug() + * + * So if we get here with DR_STEP set, something is wonky. + * + * A known way to trigger this is through QEMU's GDB stub, + * which leaks #DB into the guest and causes IST recursion. + */ + if (WARN_ON_ONCE(dr6 & DR_STEP)) + regs->flags &= ~X86_EFLAGS_TF; +out: instrumentation_end(); idtentry_exit_nmi(regs, irq_state); @@ -888,6 +915,8 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, static __always_inline void exc_debug_user(struct pt_regs *regs, unsigned long dr6) { + bool icebp; + /* * If something gets miswired and we end up here for a kernel mode * #DB, we will malfunction. @@ -906,8 +935,32 @@ static __always_inline void exc_debug_user(struct pt_regs *regs, irqentry_enter_from_user_mode(regs); instrumentation_begin(); - handle_debug(regs, dr6, true); + /* + * If dr6 has no reason to give us about the origin of this trap, + * then it's very likely the result of an icebp/int01 trap. + * User wants a sigtrap for that. + */ + icebp = !dr6; + + if (notify_debug(regs, &dr6)) + goto out; + /* It's safe to allow irq's after DR6 has been saved */ + local_irq_enable(); + + if (v8086_mode(regs)) { + handle_vm86_trap((struct kernel_vm86_regs *)regs, 0, X86_TRAP_DB); + goto out_irq; + } + + /* Add the virtual_dr6 bits for signals. */ + dr6 |= current->thread.virtual_dr6; + if (dr6 & (DR_STEP | DR_TRAP_BITS) || icebp) + send_sigtrap(regs, 0, get_si_code(dr6)); + +out_irq: + local_irq_disable(); +out: instrumentation_end(); irqentry_exit_to_user_mode(regs); } @@ -1074,6 +1127,9 @@ void __init trap_init(void) /* Init cpu_entry_area before IST entries are set up */ setup_cpu_entry_areas(); + /* Init GHCB memory pages when running as an SEV-ES guest */ + sev_es_init_vc_handling(); + idt_setup_traps(); /* diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 49d925043171..f70dffc2771f 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -54,7 +54,7 @@ struct clocksource *art_related_clocksource; struct cyc2ns { struct cyc2ns_data data[2]; /* 0 + 2*16 = 32 */ - seqcount_t seq; /* 32 + 4 = 36 */ + seqcount_latch_t seq; /* 32 + 4 = 36 */ }; /* fits one cacheline */ @@ -73,14 +73,14 @@ __always_inline void cyc2ns_read_begin(struct cyc2ns_data *data) preempt_disable_notrace(); do { - seq = this_cpu_read(cyc2ns.seq.sequence); + seq = this_cpu_read(cyc2ns.seq.seqcount.sequence); idx = seq & 1; data->cyc2ns_offset = this_cpu_read(cyc2ns.data[idx].cyc2ns_offset); data->cyc2ns_mul = this_cpu_read(cyc2ns.data[idx].cyc2ns_mul); data->cyc2ns_shift = this_cpu_read(cyc2ns.data[idx].cyc2ns_shift); - } while (unlikely(seq != this_cpu_read(cyc2ns.seq.sequence))); + } while (unlikely(seq != this_cpu_read(cyc2ns.seq.seqcount.sequence))); } __always_inline void cyc2ns_read_end(void) @@ -186,7 +186,7 @@ static void __init cyc2ns_init_boot_cpu(void) { struct cyc2ns *c2n = this_cpu_ptr(&cyc2ns); - seqcount_init(&c2n->seq); + seqcount_latch_init(&c2n->seq); __set_cyc2ns_scale(tsc_khz, smp_processor_id(), rdtsc()); } @@ -203,7 +203,7 @@ static void __init cyc2ns_init_secondary_cpus(void) for_each_possible_cpu(cpu) { if (cpu != this_cpu) { - seqcount_init(&c2n->seq); + seqcount_latch_init(&c2n->seq); c2n = per_cpu_ptr(&cyc2ns, cpu); c2n->data[0] = data[0]; c2n->data[1] = data[1]; diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c index 8d5cbe1bbb3b..f6225bf22c02 100644 --- a/arch/x86/kernel/umip.c +++ b/arch/x86/kernel/umip.c @@ -45,11 +45,12 @@ * value that, lies close to the top of the kernel memory. The limit for the GDT * and the IDT are set to zero. * - * Given that SLDT and STR are not commonly used in programs that run on WineHQ - * or DOSEMU2, they are not emulated. - * - * The instruction smsw is emulated to return the value that the register CR0 + * The instruction SMSW is emulated to return the value that the register CR0 * has at boot time as set in the head_32. + * SLDT and STR are emulated to return the values that the kernel programmatically + * assigns: + * - SLDT returns (GDT_ENTRY_LDT * 8) if an LDT has been set, 0 if not. + * - STR returns (GDT_ENTRY_TSS * 8). * * Emulation is provided for both 32-bit and 64-bit processes. * @@ -244,16 +245,34 @@ static int emulate_umip_insn(struct insn *insn, int umip_inst, *data_size += UMIP_GDT_IDT_LIMIT_SIZE; memcpy(data, &dummy_limit, UMIP_GDT_IDT_LIMIT_SIZE); - } else if (umip_inst == UMIP_INST_SMSW) { - unsigned long dummy_value = CR0_STATE; + } else if (umip_inst == UMIP_INST_SMSW || umip_inst == UMIP_INST_SLDT || + umip_inst == UMIP_INST_STR) { + unsigned long dummy_value; + + if (umip_inst == UMIP_INST_SMSW) { + dummy_value = CR0_STATE; + } else if (umip_inst == UMIP_INST_STR) { + dummy_value = GDT_ENTRY_TSS * 8; + } else if (umip_inst == UMIP_INST_SLDT) { +#ifdef CONFIG_MODIFY_LDT_SYSCALL + down_read(¤t->mm->context.ldt_usr_sem); + if (current->mm->context.ldt) + dummy_value = GDT_ENTRY_LDT * 8; + else + dummy_value = 0; + up_read(¤t->mm->context.ldt_usr_sem); +#else + dummy_value = 0; +#endif + } /* - * Even though the CR0 register has 4 bytes, the number + * For these 3 instructions, the number * of bytes to be copied in the result buffer is determined * by whether the operand is a register or a memory location. * If operand is a register, return as many bytes as the operand * size. If operand is memory, return only the two least - * siginificant bytes of CR0. + * siginificant bytes. */ if (X86_MODRM_MOD(insn->modrm.value) == 3) *data_size = insn->opnd_bytes; @@ -261,7 +280,6 @@ static int emulate_umip_insn(struct insn *insn, int umip_inst, *data_size = 2; memcpy(data, &dummy_value, *data_size); - /* STR and SLDT are not emulated */ } else { return -EINVAL; } @@ -317,63 +335,28 @@ static void force_sig_info_umip_fault(void __user *addr, struct pt_regs *regs) */ bool fixup_umip_exception(struct pt_regs *regs) { - int not_copied, nr_copied, reg_offset, dummy_data_size, umip_inst; - unsigned long seg_base = 0, *reg_addr; + int nr_copied, reg_offset, dummy_data_size, umip_inst; /* 10 bytes is the maximum size of the result of UMIP instructions */ unsigned char dummy_data[10] = { 0 }; unsigned char buf[MAX_INSN_SIZE]; + unsigned long *reg_addr; void __user *uaddr; struct insn insn; - int seg_defs; if (!regs) return false; - /* - * If not in user-space long mode, a custom code segment could be in - * use. This is true in protected mode (if the process defined a local - * descriptor table), or virtual-8086 mode. In most of the cases - * seg_base will be zero as in USER_CS. - */ - if (!user_64bit_mode(regs)) - seg_base = insn_get_seg_base(regs, INAT_SEG_REG_CS); - - if (seg_base == -1L) - return false; - - not_copied = copy_from_user(buf, (void __user *)(seg_base + regs->ip), - sizeof(buf)); - nr_copied = sizeof(buf) - not_copied; + nr_copied = insn_fetch_from_user(regs, buf); /* - * The copy_from_user above could have failed if user code is protected - * by a memory protection key. Give up on emulation in such a case. - * Should we issue a page fault? + * The insn_fetch_from_user above could have failed if user code + * is protected by a memory protection key. Give up on emulation + * in such a case. Should we issue a page fault? */ if (!nr_copied) return false; - insn_init(&insn, buf, nr_copied, user_64bit_mode(regs)); - - /* - * Override the default operand and address sizes with what is specified - * in the code segment descriptor. The instruction decoder only sets - * the address size it to either 4 or 8 address bytes and does nothing - * for the operand bytes. This OK for most of the cases, but we could - * have special cases where, for instance, a 16-bit code segment - * descriptor is used. - * If there is an address override prefix, the instruction decoder - * correctly updates these values, even for 16-bit defaults. - */ - seg_defs = insn_get_code_seg_params(regs); - if (seg_defs == -EINVAL) - return false; - - insn.addr_bytes = INSN_CODE_SEG_ADDR_SZ(seg_defs); - insn.opnd_bytes = INSN_CODE_SEG_OPND_SZ(seg_defs); - - insn_get_length(&insn); - if (nr_copied < insn.length) + if (!insn_decode(&insn, regs, buf, nr_copied)) return false; umip_inst = identify_insn(&insn); @@ -383,10 +366,6 @@ bool fixup_umip_exception(struct pt_regs *regs) umip_pr_warn(regs, "%s instruction cannot be used by applications.\n", umip_insns[umip_inst]); - /* Do not emulate (spoof) SLDT or STR. */ - if (umip_inst == UMIP_INST_STR || umip_inst == UMIP_INST_SLDT) - return false; - umip_pr_warn(regs, "For now, expensive software emulation returns the result.\n"); if (emulate_umip_insn(&insn, umip_inst, dummy_data, &dummy_data_size, diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index ec88bbe08a32..6a339ce328e0 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only +#include <linux/objtool.h> #include <linux/module.h> #include <linux/sort.h> #include <asm/ptrace.h> @@ -127,12 +128,12 @@ static struct orc_entry null_orc_entry = { .sp_offset = sizeof(long), .sp_reg = ORC_REG_SP, .bp_reg = ORC_REG_UNDEFINED, - .type = ORC_TYPE_CALL + .type = UNWIND_HINT_TYPE_CALL }; /* Fake frame pointer entry -- used as a fallback for generated code */ static struct orc_entry orc_fp_entry = { - .type = ORC_TYPE_CALL, + .type = UNWIND_HINT_TYPE_CALL, .sp_reg = ORC_REG_BP, .sp_offset = 16, .bp_reg = ORC_REG_PREV_SP, @@ -531,7 +532,7 @@ bool unwind_next_frame(struct unwind_state *state) /* Find IP, SP and possibly regs: */ switch (orc->type) { - case ORC_TYPE_CALL: + case UNWIND_HINT_TYPE_CALL: ip_p = sp - sizeof(long); if (!deref_stack_reg(state, ip_p, &state->ip)) @@ -546,7 +547,7 @@ bool unwind_next_frame(struct unwind_state *state) state->signal = false; break; - case ORC_TYPE_REGS: + case UNWIND_HINT_TYPE_REGS: if (!deref_stack_regs(state, sp, &state->ip, &state->sp)) { orc_warn_current("can't access registers at %pB\n", (void *)orig_ip); @@ -559,7 +560,7 @@ bool unwind_next_frame(struct unwind_state *state) state->signal = true; break; - case ORC_TYPE_REGS_IRET: + case UNWIND_HINT_TYPE_REGS_PARTIAL: if (!deref_stack_iret_regs(state, sp, &state->ip, &state->sp)) { orc_warn_current("can't access iret registers at %pB\n", (void *)orig_ip); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 9a03e5b23135..bf9e0adb5b7e 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -136,6 +136,7 @@ SECTIONS ENTRY_TEXT ALIGN_ENTRY_TEXT_END SOFTIRQENTRY_TEXT + STATIC_CALL_TEXT *(.fixup) *(.gnu.warning) @@ -411,10 +412,47 @@ SECTIONS STABS_DEBUG DWARF_DEBUG + ELF_DETAILS DISCARDS -} + /* + * Make sure that the .got.plt is either completely empty or it + * contains only the lazy dispatch entries. + */ + .got.plt (INFO) : { *(.got.plt) } + ASSERT(SIZEOF(.got.plt) == 0 || +#ifdef CONFIG_X86_64 + SIZEOF(.got.plt) == 0x18, +#else + SIZEOF(.got.plt) == 0xc, +#endif + "Unexpected GOT/PLT entries detected!") + + /* + * Sections that should stay zero sized, which is safer to + * explicitly check instead of blindly discarding. + */ + .got : { + *(.got) *(.igot.*) + } + ASSERT(SIZEOF(.got) == 0, "Unexpected GOT entries detected!") + + .plt : { + *(.plt) *(.plt.*) *(.iplt) + } + ASSERT(SIZEOF(.plt) == 0, "Unexpected run-time procedure linkages detected!") + + .rel.dyn : { + *(.rel.*) *(.rel_*) + } + ASSERT(SIZEOF(.rel.dyn) == 0, "Unexpected run-time relocations (.rel) detected!") + + .rela.dyn : { + *(.rela.*) *(.rela_*) + } + ASSERT(SIZEOF(.rela.dyn) == 0, "Unexpected run-time relocations (.rela) detected!") +} #ifdef CONFIG_X86_32 /* diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 123f1c1f1788..a3038d8deb6a 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -24,6 +24,7 @@ #include <asm/tsc.h> #include <asm/iommu.h> #include <asm/mach_traps.h> +#include <asm/irqdomain.h> void x86_init_noop(void) { } void __init x86_init_uint_noop(unsigned int unused) { } @@ -67,11 +68,7 @@ struct x86_init_ops x86_init __initdata = { }, .mpparse = { - .mpc_record = x86_init_uint_noop, .setup_ioapic_ids = x86_init_noop, - .mpc_apic_id = default_mpc_apic_id, - .smp_read_mpc_oem = default_smp_read_mpc_oem, - .mpc_oem_bus_info = default_mpc_oem_bus_info, .find_smp_config = default_find_smp_config, .get_smp_config = default_get_smp_config, }, @@ -80,7 +77,8 @@ struct x86_init_ops x86_init __initdata = { .pre_vector_init = init_ISA_irqs, .intr_init = native_init_IRQ, .intr_mode_select = apic_intr_mode_select, - .intr_mode_init = apic_intr_mode_init + .intr_mode_init = apic_intr_mode_init, + .create_pci_msi_domain = native_create_pci_msi_domain, }, .oem = { @@ -148,28 +146,10 @@ EXPORT_SYMBOL_GPL(x86_platform); #if defined(CONFIG_PCI_MSI) struct x86_msi_ops x86_msi __ro_after_init = { - .setup_msi_irqs = native_setup_msi_irqs, - .teardown_msi_irq = native_teardown_msi_irq, - .teardown_msi_irqs = default_teardown_msi_irqs, .restore_msi_irqs = default_restore_msi_irqs, }; /* MSI arch specific hooks */ -int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) -{ - return x86_msi.setup_msi_irqs(dev, nvec, type); -} - -void arch_teardown_msi_irqs(struct pci_dev *dev) -{ - x86_msi.teardown_msi_irqs(dev); -} - -void arch_teardown_msi_irq(unsigned int irq) -{ - x86_msi.teardown_msi_irq(irq); -} - void arch_restore_msi_irqs(struct pci_dev *dev) { x86_msi.restore_msi_irqs(dev); diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index fbd5bd7a945a..f92dfd8ef10d 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -66,6 +66,7 @@ config KVM_WERROR default y if X86_64 && !KASAN # We use the dependency on !COMPILE_TEST to not be enabled # blindly in allmodconfig or allyesconfig configurations + depends on KVM depends on (X86_64 && !KASAN) || !COMPILE_TEST depends on EXPERT help diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 4a3081e9f4b5..b804444e16d4 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -15,9 +15,11 @@ kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ - hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o + hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o \ + mmu/spte.o mmu/tdp_iter.o mmu/tdp_mmu.o -kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o vmx/evmcs.o vmx/nested.o +kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \ + vmx/evmcs.o vmx/nested.o vmx/posted_intr.o kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o svm/sev.o obj-$(CONFIG_KVM) += kvm.o diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 3fd6eec202d7..06a278b3701d 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -54,7 +54,24 @@ static u32 xstate_required_size(u64 xstate_bv, bool compacted) #define F feature_bit -static int kvm_check_cpuid(struct kvm_vcpu *vcpu) +static inline struct kvm_cpuid_entry2 *cpuid_entry2_find( + struct kvm_cpuid_entry2 *entries, int nent, u32 function, u32 index) +{ + struct kvm_cpuid_entry2 *e; + int i; + + for (i = 0; i < nent; i++) { + e = &entries[i]; + + if (e->function == function && (e->index == index || + !(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX))) + return e; + } + + return NULL; +} + +static int kvm_check_cpuid(struct kvm_cpuid_entry2 *entries, int nent) { struct kvm_cpuid_entry2 *best; @@ -62,7 +79,7 @@ static int kvm_check_cpuid(struct kvm_vcpu *vcpu) * The existing code assumes virtual address is 48-bit or 57-bit in the * canonical address checks; exit if it is ever changed. */ - best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); + best = cpuid_entry2_find(entries, nent, 0x80000008, 0); if (best) { int vaddr_bits = (best->eax & 0xff00) >> 8; @@ -107,6 +124,13 @@ void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) (best->eax & (1 << KVM_FEATURE_PV_UNHALT))) best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT); + /* + * save the feature bitmap to avoid cpuid lookup for every PV + * operation + */ + if (best) + vcpu->arch.pv_cpuid.features = best->eax; + if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) { best = kvm_find_cpuid_entry(vcpu, 0x1, 0); if (best) @@ -121,8 +145,6 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) struct kvm_lapic *apic = vcpu->arch.apic; struct kvm_cpuid_entry2 *best; - kvm_x86_ops.vcpu_after_set_cpuid(vcpu); - best = kvm_find_cpuid_entry(vcpu, 1, 0); if (best && apic) { if (cpuid_entry_has(best, X86_FEATURE_TSC_DEADLINE_TIMER)) @@ -146,7 +168,9 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) kvm_pmu_refresh(vcpu); vcpu->arch.cr4_guest_rsvd_bits = __cr4_reserved_bits(guest_cpuid_has, vcpu); - kvm_x86_ops.update_exception_bitmap(vcpu); + + /* Invoke the vendor callback only after the above state is updated. */ + kvm_x86_ops.vcpu_after_set_cpuid(vcpu); } static int is_efer_nx(void) @@ -186,7 +210,6 @@ int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu) not_found: return 36; } -EXPORT_SYMBOL_GPL(cpuid_query_maxphyaddr); /* when an old userspace process fills a new kernel module */ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, @@ -194,46 +217,53 @@ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry __user *entries) { int r, i; - struct kvm_cpuid_entry *cpuid_entries = NULL; + struct kvm_cpuid_entry *e = NULL; + struct kvm_cpuid_entry2 *e2 = NULL; - r = -E2BIG; if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) - goto out; + return -E2BIG; + if (cpuid->nent) { - cpuid_entries = vmemdup_user(entries, - array_size(sizeof(struct kvm_cpuid_entry), - cpuid->nent)); - if (IS_ERR(cpuid_entries)) { - r = PTR_ERR(cpuid_entries); - goto out; + e = vmemdup_user(entries, array_size(sizeof(*e), cpuid->nent)); + if (IS_ERR(e)) + return PTR_ERR(e); + + e2 = kvmalloc_array(cpuid->nent, sizeof(*e2), GFP_KERNEL_ACCOUNT); + if (!e2) { + r = -ENOMEM; + goto out_free_cpuid; } } for (i = 0; i < cpuid->nent; i++) { - vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; - vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; - vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx; - vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx; - vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx; - vcpu->arch.cpuid_entries[i].index = 0; - vcpu->arch.cpuid_entries[i].flags = 0; - vcpu->arch.cpuid_entries[i].padding[0] = 0; - vcpu->arch.cpuid_entries[i].padding[1] = 0; - vcpu->arch.cpuid_entries[i].padding[2] = 0; + e2[i].function = e[i].function; + e2[i].eax = e[i].eax; + e2[i].ebx = e[i].ebx; + e2[i].ecx = e[i].ecx; + e2[i].edx = e[i].edx; + e2[i].index = 0; + e2[i].flags = 0; + e2[i].padding[0] = 0; + e2[i].padding[1] = 0; + e2[i].padding[2] = 0; } - vcpu->arch.cpuid_nent = cpuid->nent; - r = kvm_check_cpuid(vcpu); + + r = kvm_check_cpuid(e2, cpuid->nent); if (r) { - vcpu->arch.cpuid_nent = 0; - kvfree(cpuid_entries); - goto out; + kvfree(e2); + goto out_free_cpuid; } + kvfree(vcpu->arch.cpuid_entries); + vcpu->arch.cpuid_entries = e2; + vcpu->arch.cpuid_nent = cpuid->nent; + cpuid_fix_nx_cap(vcpu); kvm_update_cpuid_runtime(vcpu); kvm_vcpu_after_set_cpuid(vcpu); - kvfree(cpuid_entries); -out: +out_free_cpuid: + kvfree(e); + return r; } @@ -241,26 +271,32 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries) { + struct kvm_cpuid_entry2 *e2 = NULL; int r; - r = -E2BIG; if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) - goto out; - r = -EFAULT; - if (copy_from_user(&vcpu->arch.cpuid_entries, entries, - cpuid->nent * sizeof(struct kvm_cpuid_entry2))) - goto out; - vcpu->arch.cpuid_nent = cpuid->nent; - r = kvm_check_cpuid(vcpu); + return -E2BIG; + + if (cpuid->nent) { + e2 = vmemdup_user(entries, array_size(sizeof(*e2), cpuid->nent)); + if (IS_ERR(e2)) + return PTR_ERR(e2); + } + + r = kvm_check_cpuid(e2, cpuid->nent); if (r) { - vcpu->arch.cpuid_nent = 0; - goto out; + kvfree(e2); + return r; } + kvfree(vcpu->arch.cpuid_entries); + vcpu->arch.cpuid_entries = e2; + vcpu->arch.cpuid_nent = cpuid->nent; + kvm_update_cpuid_runtime(vcpu); kvm_vcpu_after_set_cpuid(vcpu); -out: - return r; + + return 0; } int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, @@ -371,7 +407,7 @@ void kvm_set_cpu_caps(void) F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) | F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) | F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM) | - F(SERIALIZE) + F(SERIALIZE) | F(TSXLDTRK) ); /* TSC_ADJUST and ARCH_CAPABILITIES are emulated in software. */ @@ -941,17 +977,8 @@ out_free: struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, u32 function, u32 index) { - struct kvm_cpuid_entry2 *e; - int i; - - for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { - e = &vcpu->arch.cpuid_entries[i]; - - if (e->function == function && (e->index == index || - !(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX))) - return e; - } - return NULL; + return cpuid_entry2_find(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent, + function, index); } EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 3a923ae15f2f..bf8577947ed2 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -5,6 +5,7 @@ #include "x86.h" #include <asm/cpu.h> #include <asm/processor.h> +#include <uapi/asm/kvm_para.h> extern u32 kvm_cpu_caps[NCAPINTS] __read_mostly; void kvm_set_cpu_caps(void); @@ -34,6 +35,11 @@ static inline int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) return vcpu->arch.maxphyaddr; } +static inline bool kvm_vcpu_is_illegal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa) +{ + return (gpa >= BIT_ULL(cpuid_maxphyaddr(vcpu))); +} + struct cpuid_reg { u32 function; u32 index; @@ -308,4 +314,13 @@ static inline bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa) return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu)); } +static __always_inline bool guest_pv_has(struct kvm_vcpu *vcpu, + unsigned int kvm_feature) +{ + if (!vcpu->arch.pv_cpuid.enforce) + return true; + + return vcpu->arch.pv_cpuid.features & (1u << kvm_feature); +} + #endif diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5299ef5ff18d..0d917eb70319 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2505,9 +2505,14 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, *reg_write(ctxt, i) = GET_SMSTATE(u32, smstate, 0x7fd0 + i * 4); val = GET_SMSTATE(u32, smstate, 0x7fcc); - ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1); + + if (ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1)) + return X86EMUL_UNHANDLEABLE; + val = GET_SMSTATE(u32, smstate, 0x7fc8); - ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1); + + if (ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1)) + return X86EMUL_UNHANDLEABLE; selector = GET_SMSTATE(u32, smstate, 0x7fc4); set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7f64)); @@ -2560,16 +2565,23 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, ctxt->eflags = GET_SMSTATE(u32, smstate, 0x7f70) | X86_EFLAGS_FIXED; val = GET_SMSTATE(u32, smstate, 0x7f68); - ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1); + + if (ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1)) + return X86EMUL_UNHANDLEABLE; + val = GET_SMSTATE(u32, smstate, 0x7f60); - ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1); + + if (ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1)) + return X86EMUL_UNHANDLEABLE; cr0 = GET_SMSTATE(u64, smstate, 0x7f58); cr3 = GET_SMSTATE(u64, smstate, 0x7f50); cr4 = GET_SMSTATE(u64, smstate, 0x7f48); ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smstate, 0x7f00)); val = GET_SMSTATE(u64, smstate, 0x7ed0); - ctxt->ops->set_msr(ctxt, MSR_EFER, val & ~EFER_LMA); + + if (ctxt->ops->set_msr(ctxt, MSR_EFER, val & ~EFER_LMA)) + return X86EMUL_UNHANDLEABLE; selector = GET_SMSTATE(u32, smstate, 0x7e90); rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7e92) << 8); @@ -3594,7 +3606,7 @@ static int em_rdpid(struct x86_emulate_ctxt *ctxt) u64 tsc_aux = 0; if (ctxt->ops->get_msr(ctxt, MSR_TSC_AUX, &tsc_aux)) - return emulate_gp(ctxt, 0); + return emulate_ud(ctxt); ctxt->dst.val = tsc_aux; return X86EMUL_CONTINUE; } @@ -3689,21 +3701,35 @@ static int em_dr_write(struct x86_emulate_ctxt *ctxt) static int em_wrmsr(struct x86_emulate_ctxt *ctxt) { + u64 msr_index = reg_read(ctxt, VCPU_REGS_RCX); u64 msr_data; + int r; msr_data = (u32)reg_read(ctxt, VCPU_REGS_RAX) | ((u64)reg_read(ctxt, VCPU_REGS_RDX) << 32); - if (ctxt->ops->set_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), msr_data)) + r = ctxt->ops->set_msr(ctxt, msr_index, msr_data); + + if (r == X86EMUL_IO_NEEDED) + return r; + + if (r > 0) return emulate_gp(ctxt, 0); - return X86EMUL_CONTINUE; + return r < 0 ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE; } static int em_rdmsr(struct x86_emulate_ctxt *ctxt) { + u64 msr_index = reg_read(ctxt, VCPU_REGS_RCX); u64 msr_data; + int r; + + r = ctxt->ops->get_msr(ctxt, msr_index, &msr_data); + + if (r == X86EMUL_IO_NEEDED) + return r; - if (ctxt->ops->get_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), &msr_data)) + if (r) return emulate_gp(ctxt, 0); *reg_write(ctxt, VCPU_REGS_RAX) = (u32)msr_data; diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 1d330564eed8..5c7c4060b45c 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -633,6 +633,11 @@ static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config, { union hv_stimer_config new_config = {.as_uint64 = config}, old_config = {.as_uint64 = stimer->config.as_uint64}; + struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer); + struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu); + + if (!synic->active && !host) + return 1; trace_kvm_hv_stimer_set_config(stimer_to_vcpu(stimer)->vcpu_id, stimer->index, config, host); @@ -652,6 +657,12 @@ static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config, static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count, bool host) { + struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer); + struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu); + + if (!synic->active && !host) + return 1; + trace_kvm_hv_stimer_set_count(stimer_to_vcpu(stimer)->vcpu_id, stimer->index, count, host); @@ -2000,20 +2011,20 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, break; case HYPERV_CPUID_FEATURES: - ent->eax |= HV_X64_MSR_VP_RUNTIME_AVAILABLE; + ent->eax |= HV_MSR_VP_RUNTIME_AVAILABLE; ent->eax |= HV_MSR_TIME_REF_COUNT_AVAILABLE; - ent->eax |= HV_X64_MSR_SYNIC_AVAILABLE; + ent->eax |= HV_MSR_SYNIC_AVAILABLE; ent->eax |= HV_MSR_SYNTIMER_AVAILABLE; - ent->eax |= HV_X64_MSR_APIC_ACCESS_AVAILABLE; - ent->eax |= HV_X64_MSR_HYPERCALL_AVAILABLE; - ent->eax |= HV_X64_MSR_VP_INDEX_AVAILABLE; - ent->eax |= HV_X64_MSR_RESET_AVAILABLE; + ent->eax |= HV_MSR_APIC_ACCESS_AVAILABLE; + ent->eax |= HV_MSR_HYPERCALL_AVAILABLE; + ent->eax |= HV_MSR_VP_INDEX_AVAILABLE; + ent->eax |= HV_MSR_RESET_AVAILABLE; ent->eax |= HV_MSR_REFERENCE_TSC_AVAILABLE; - ent->eax |= HV_X64_ACCESS_FREQUENCY_MSRS; - ent->eax |= HV_X64_ACCESS_REENLIGHTENMENT; + ent->eax |= HV_ACCESS_FREQUENCY_MSRS; + ent->eax |= HV_ACCESS_REENLIGHTENMENT; - ent->ebx |= HV_X64_POST_MESSAGES; - ent->ebx |= HV_X64_SIGNAL_EVENTS; + ent->ebx |= HV_POST_MESSAGES; + ent->ebx |= HV_SIGNAL_EVENTS; ent->edx |= HV_FEATURE_FREQUENCY_MSRS_AVAILABLE; ent->edx |= HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE; diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index d057376bd3d3..698969e18fe3 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -197,12 +197,9 @@ static void ioapic_lazy_update_eoi(struct kvm_ioapic *ioapic, int irq) /* * If no longer has pending EOI in LAPICs, update - * EOI for this vetor. + * EOI for this vector. */ rtc_irq_eoi(ioapic, vcpu, entry->fields.vector); - kvm_ioapic_update_eoi_one(vcpu, ioapic, - entry->fields.trig_mode, - irq); break; } } diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index cfe83d4ae625..a889563ad02d 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -7,7 +7,7 @@ #define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS #define KVM_POSSIBLE_CR4_GUEST_BITS \ (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ - | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_PGE | X86_CR4_TSD) + | X86_CR4_OSXMMEXCPT | X86_CR4_PGE | X86_CR4_TSD | X86_CR4_FSGSBASE) #define BUILD_KVM_GPR_ACCESSORS(lname, uname) \ static __always_inline unsigned long kvm_##lname##_read(struct kvm_vcpu *vcpu)\ diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 35cca2e0c802..105e7859d1f2 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -310,6 +310,12 @@ static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id) atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); } +static inline void kvm_apic_set_dfr(struct kvm_lapic *apic, u32 val) +{ + kvm_lapic_set_reg(apic, APIC_DFR, val); + atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); +} + static inline u32 kvm_apic_calc_x2apic_ldr(u32 id) { return ((id >> 4) << 16) | (1 << (id & 0xf)); @@ -488,6 +494,12 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) } } +void kvm_apic_clear_irr(struct kvm_vcpu *vcpu, int vec) +{ + apic_clear_irr(vec, vcpu->arch.apic); +} +EXPORT_SYMBOL_GPL(kvm_apic_clear_irr); + static inline void apic_set_isr(int vec, struct kvm_lapic *apic) { struct kvm_vcpu *vcpu; @@ -1576,9 +1588,6 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) struct kvm_lapic *apic = vcpu->arch.apic; u64 guest_tsc, tsc_deadline; - if (apic->lapic_timer.expired_tscdeadline == 0) - return; - tsc_deadline = apic->lapic_timer.expired_tscdeadline; apic->lapic_timer.expired_tscdeadline = 0; guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); @@ -1593,7 +1602,10 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) { - if (lapic_timer_int_injected(vcpu)) + if (lapic_in_kernel(vcpu) && + vcpu->arch.apic->lapic_timer.expired_tscdeadline && + vcpu->arch.apic->lapic_timer.timer_advance_ns && + lapic_timer_int_injected(vcpu)) __kvm_wait_lapic_expire(vcpu); } EXPORT_SYMBOL_GPL(kvm_wait_lapic_expire); @@ -1629,14 +1641,15 @@ static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn) } if (kvm_use_posted_timer_interrupt(apic->vcpu)) { - if (apic->lapic_timer.timer_advance_ns) - __kvm_wait_lapic_expire(vcpu); + kvm_wait_lapic_expire(vcpu); kvm_apic_inject_pending_timer_irqs(apic); return; } atomic_inc(&apic->lapic_timer.pending); - kvm_set_pending_timer(vcpu); + kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); + if (from_timer_fn) + kvm_vcpu_kick(vcpu); } static void start_sw_tscdeadline(struct kvm_lapic *apic) @@ -1984,10 +1997,9 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) break; case APIC_DFR: - if (!apic_x2apic_mode(apic)) { - kvm_lapic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF); - atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); - } else + if (!apic_x2apic_mode(apic)) + kvm_apic_set_dfr(apic, val | 0x0FFFFFFF); + else ret = 1; break; @@ -2183,8 +2195,7 @@ u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!lapic_in_kernel(vcpu) || - !apic_lvtt_tscdeadline(apic)) + if (!kvm_apic_present(vcpu) || !apic_lvtt_tscdeadline(apic)) return 0; return apic->lapic_timer.tscdeadline; @@ -2194,8 +2205,7 @@ void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!kvm_apic_present(vcpu) || apic_lvtt_oneshot(apic) || - apic_lvtt_period(apic)) + if (!kvm_apic_present(vcpu) || !apic_lvtt_tscdeadline(apic)) return; hrtimer_cancel(&apic->lapic_timer.timer); @@ -2303,7 +2313,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0)); - kvm_lapic_set_reg(apic, APIC_DFR, 0xffffffffU); + kvm_apic_set_dfr(apic, 0xffffffffU); apic_set_spiv(apic, 0xff); kvm_lapic_set_reg(apic, APIC_TASKPRI, 0); if (!apic_x2apic_mode(apic)) @@ -2461,6 +2471,7 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) __apic_update_ppr(apic, &ppr); return apic_has_interrupt_for_ppr(apic, ppr); } +EXPORT_SYMBOL_GPL(kvm_apic_has_interrupt); int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) { diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 754f29beb83e..4fb86e3a9dd3 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -89,6 +89,7 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int shorthand, unsigned int dest, int dest_mode); int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); +void kvm_apic_clear_irr(struct kvm_vcpu *vcpu, int vec); bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr); bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr); void kvm_apic_update_ppr(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 5efc6081ca13..9c4a9c8e43d9 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -155,11 +155,6 @@ static inline bool is_write_protection(struct kvm_vcpu *vcpu) return kvm_read_cr0_bits(vcpu, X86_CR0_WP); } -static inline bool kvm_mmu_is_illegal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa) -{ - return (gpa >= BIT_ULL(cpuid_maxphyaddr(vcpu))); -} - /* * Check if a given access (described through the I/D, W/R and U/S bits of a * page fault error code pfec) causes a permission fault with the given PTE diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 43fdb0c12a5d..17587f496ec7 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -19,10 +19,12 @@ #include "ioapic.h" #include "mmu.h" #include "mmu_internal.h" +#include "tdp_mmu.h" #include "x86.h" #include "kvm_cache_regs.h" #include "kvm_emulate.h" #include "cpuid.h" +#include "spte.h" #include <linux/kvm_host.h> #include <linux/types.h> @@ -45,7 +47,6 @@ #include <asm/page.h> #include <asm/memtype.h> #include <asm/cmpxchg.h> -#include <asm/e820/api.h> #include <asm/io.h> #include <asm/vmx.h> #include <asm/kvm_page_track.h> @@ -64,12 +65,12 @@ static uint __read_mostly nx_huge_pages_recovery_ratio = 60; static int set_nx_huge_pages(const char *val, const struct kernel_param *kp); static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp); -static struct kernel_param_ops nx_huge_pages_ops = { +static const struct kernel_param_ops nx_huge_pages_ops = { .set = set_nx_huge_pages, .get = param_get_bool, }; -static struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = { +static const struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = { .set = set_nx_huge_pages_recovery_ratio, .get = param_get_uint, }; @@ -104,45 +105,13 @@ enum { AUDIT_POST_SYNC }; -#undef MMU_DEBUG - #ifdef MMU_DEBUG -static bool dbg = 0; +bool dbg = 0; module_param(dbg, bool, 0644); - -#define pgprintk(x...) do { if (dbg) printk(x); } while (0) -#define rmap_printk(x...) do { if (dbg) printk(x); } while (0) -#define MMU_WARN_ON(x) WARN_ON(x) -#else -#define pgprintk(x...) do { } while (0) -#define rmap_printk(x...) do { } while (0) -#define MMU_WARN_ON(x) do { } while (0) #endif #define PTE_PREFETCH_NUM 8 -#define PT_FIRST_AVAIL_BITS_SHIFT 10 -#define PT64_SECOND_AVAIL_BITS_SHIFT 54 - -/* - * The mask used to denote special SPTEs, which can be either MMIO SPTEs or - * Access Tracking SPTEs. - */ -#define SPTE_SPECIAL_MASK (3ULL << 52) -#define SPTE_AD_ENABLED_MASK (0ULL << 52) -#define SPTE_AD_DISABLED_MASK (1ULL << 52) -#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52) -#define SPTE_MMIO_MASK (3ULL << 52) - -#define PT64_LEVEL_BITS 9 - -#define PT64_LEVEL_SHIFT(level) \ - (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS) - -#define PT64_INDEX(address, level)\ - (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) - - #define PT32_LEVEL_BITS 10 #define PT32_LEVEL_SHIFT(level) \ @@ -156,18 +125,6 @@ module_param(dbg, bool, 0644); (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) -#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK -#define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1)) -#else -#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) -#endif -#define PT64_LVL_ADDR_MASK(level) \ - (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ - * PT64_LEVEL_BITS))) - 1)) -#define PT64_LVL_OFFSET_MASK(level) \ - (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ - * PT64_LEVEL_BITS))) - 1)) - #define PT32_BASE_ADDR_MASK PAGE_MASK #define PT32_DIR_BASE_ADDR_MASK \ (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1)) @@ -175,42 +132,11 @@ module_param(dbg, bool, 0644); (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ * PT32_LEVEL_BITS))) - 1)) -#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \ - | shadow_x_mask | shadow_nx_mask | shadow_me_mask) - -#define ACC_EXEC_MASK 1 -#define ACC_WRITE_MASK PT_WRITABLE_MASK -#define ACC_USER_MASK PT_USER_MASK -#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) - -/* The mask for the R/X bits in EPT PTEs */ -#define PT64_EPT_READABLE_MASK 0x1ull -#define PT64_EPT_EXECUTABLE_MASK 0x4ull - #include <trace/events/kvm.h> -#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) -#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1)) - -#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) - /* make pte_list_desc fit well in cache line */ #define PTE_LIST_EXT 3 -/* - * Return values of handle_mmio_page_fault and mmu.page_fault: - * RET_PF_RETRY: let CPU fault again on the address. - * RET_PF_EMULATE: mmio page fault, emulate the instruction directly. - * - * For handle_mmio_page_fault only: - * RET_PF_INVALID: the spte is invalid, let the real page fault path update it. - */ -enum { - RET_PF_RETRY = 0, - RET_PF_EMULATE = 1, - RET_PF_INVALID = 2, -}; - struct pte_list_desc { u64 *sptes[PTE_LIST_EXT]; struct pte_list_desc *more; @@ -242,65 +168,10 @@ struct kvm_shadow_walk_iterator { __shadow_walk_next(&(_walker), spte)) static struct kmem_cache *pte_list_desc_cache; -static struct kmem_cache *mmu_page_header_cache; +struct kmem_cache *mmu_page_header_cache; static struct percpu_counter kvm_total_used_mmu_pages; -static u64 __read_mostly shadow_nx_mask; -static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ -static u64 __read_mostly shadow_user_mask; -static u64 __read_mostly shadow_accessed_mask; -static u64 __read_mostly shadow_dirty_mask; -static u64 __read_mostly shadow_mmio_value; -static u64 __read_mostly shadow_mmio_access_mask; -static u64 __read_mostly shadow_present_mask; -static u64 __read_mostly shadow_me_mask; - -/* - * SPTEs used by MMUs without A/D bits are marked with SPTE_AD_DISABLED_MASK; - * shadow_acc_track_mask is the set of bits to be cleared in non-accessed - * pages. - */ -static u64 __read_mostly shadow_acc_track_mask; - -/* - * The mask/shift to use for saving the original R/X bits when marking the PTE - * as not-present for access tracking purposes. We do not save the W bit as the - * PTEs being access tracked also need to be dirty tracked, so the W bit will be - * restored only when a write is attempted to the page. - */ -static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK | - PT64_EPT_EXECUTABLE_MASK; -static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT; - -/* - * This mask must be set on all non-zero Non-Present or Reserved SPTEs in order - * to guard against L1TF attacks. - */ -static u64 __read_mostly shadow_nonpresent_or_rsvd_mask; - -/* - * The number of high-order 1 bits to use in the mask above. - */ -static const u64 shadow_nonpresent_or_rsvd_mask_len = 5; - -/* - * In some cases, we need to preserve the GFN of a non-present or reserved - * SPTE when we usurp the upper five bits of the physical address space to - * defend against L1TF, e.g. for MMIO SPTEs. To preserve the GFN, we'll - * shift bits of the GFN that overlap with shadow_nonpresent_or_rsvd_mask - * left into the reserved bits, i.e. the GFN in the SPTE will be split into - * high and low parts. This mask covers the lower bits of the GFN. - */ -static u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; - -/* - * The number of non-reserved physical address bits irrespective of features - * that repurpose legal bits, e.g. MKTME. - */ -static u8 __read_mostly shadow_phys_bits; - static void mmu_spte_set(u64 *sptep, u64 spte); -static bool is_executable_pte(u64 spte); static union kvm_mmu_page_role kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu); @@ -325,7 +196,7 @@ static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm, kvm_flush_remote_tlbs(kvm); } -static void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, +void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, u64 start_gfn, u64 pages) { struct kvm_tlb_range range; @@ -336,143 +207,17 @@ static void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, kvm_flush_remote_tlbs_with_range(kvm, &range); } -void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask) -{ - BUG_ON((u64)(unsigned)access_mask != access_mask); - WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << shadow_nonpresent_or_rsvd_mask_len)); - WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask); - shadow_mmio_value = mmio_value | SPTE_MMIO_MASK; - shadow_mmio_access_mask = access_mask; -} -EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); - -static bool is_mmio_spte(u64 spte) -{ - return (spte & SPTE_SPECIAL_MASK) == SPTE_MMIO_MASK; -} - -static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) -{ - return sp->role.ad_disabled; -} - -static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu) -{ - /* - * When using the EPT page-modification log, the GPAs in the log - * would come from L2 rather than L1. Therefore, we need to rely - * on write protection to record dirty pages. This also bypasses - * PML, since writes now result in a vmexit. - */ - return vcpu->arch.mmu == &vcpu->arch.guest_mmu; -} - -static inline bool spte_ad_enabled(u64 spte) -{ - MMU_WARN_ON(is_mmio_spte(spte)); - return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_DISABLED_MASK; -} - -static inline bool spte_ad_need_write_protect(u64 spte) -{ - MMU_WARN_ON(is_mmio_spte(spte)); - return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK; -} - -static bool is_nx_huge_page_enabled(void) +bool is_nx_huge_page_enabled(void) { return READ_ONCE(nx_huge_pages); } -static inline u64 spte_shadow_accessed_mask(u64 spte) -{ - MMU_WARN_ON(is_mmio_spte(spte)); - return spte_ad_enabled(spte) ? shadow_accessed_mask : 0; -} - -static inline u64 spte_shadow_dirty_mask(u64 spte) -{ - MMU_WARN_ON(is_mmio_spte(spte)); - return spte_ad_enabled(spte) ? shadow_dirty_mask : 0; -} - -static inline bool is_access_track_spte(u64 spte) -{ - return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0; -} - -/* - * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of - * the memslots generation and is derived as follows: - * - * Bits 0-8 of the MMIO generation are propagated to spte bits 3-11 - * Bits 9-18 of the MMIO generation are propagated to spte bits 52-61 - * - * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in - * the MMIO generation number, as doing so would require stealing a bit from - * the "real" generation number and thus effectively halve the maximum number - * of MMIO generations that can be handled before encountering a wrap (which - * requires a full MMU zap). The flag is instead explicitly queried when - * checking for MMIO spte cache hits. - */ -#define MMIO_SPTE_GEN_MASK GENMASK_ULL(17, 0) - -#define MMIO_SPTE_GEN_LOW_START 3 -#define MMIO_SPTE_GEN_LOW_END 11 -#define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \ - MMIO_SPTE_GEN_LOW_START) - -#define MMIO_SPTE_GEN_HIGH_START PT64_SECOND_AVAIL_BITS_SHIFT -#define MMIO_SPTE_GEN_HIGH_END 62 -#define MMIO_SPTE_GEN_HIGH_MASK GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \ - MMIO_SPTE_GEN_HIGH_START) - -static u64 generation_mmio_spte_mask(u64 gen) -{ - u64 mask; - - WARN_ON(gen & ~MMIO_SPTE_GEN_MASK); - BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK | MMIO_SPTE_GEN_LOW_MASK) & SPTE_SPECIAL_MASK); - - mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK; - mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK; - return mask; -} - -static u64 get_mmio_spte_generation(u64 spte) -{ - u64 gen; - - gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START; - gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START; - return gen; -} - -static u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access) -{ - - u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK; - u64 mask = generation_mmio_spte_mask(gen); - u64 gpa = gfn << PAGE_SHIFT; - - access &= shadow_mmio_access_mask; - mask |= shadow_mmio_value | access; - mask |= gpa | shadow_nonpresent_or_rsvd_mask; - mask |= (gpa & shadow_nonpresent_or_rsvd_mask) - << shadow_nonpresent_or_rsvd_mask_len; - - return mask; -} - static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, unsigned int access) { u64 mask = make_mmio_spte(vcpu, gfn, access); - unsigned int gen = get_mmio_spte_generation(mask); - - access = mask & ACC_ALL; - trace_mark_mmio_spte(sptep, gfn, access, gen); + trace_mark_mmio_spte(sptep, gfn, mask); mmu_spte_set(sptep, mask); } @@ -521,7 +266,7 @@ static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, struct x86_exception *exception) { /* Check if guest physical address doesn't exceed guest maximum */ - if (kvm_mmu_is_illegal_gpa(vcpu, gpa)) { + if (kvm_vcpu_is_illegal_gpa(vcpu, gpa)) { exception->error_code |= PFERR_RSVD_MASK; return UNMAPPED_GVA; } @@ -529,90 +274,6 @@ static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, return gpa; } -/* - * Sets the shadow PTE masks used by the MMU. - * - * Assumptions: - * - Setting either @accessed_mask or @dirty_mask requires setting both - * - At least one of @accessed_mask or @acc_track_mask must be set - */ -void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, - u64 acc_track_mask, u64 me_mask) -{ - BUG_ON(!dirty_mask != !accessed_mask); - BUG_ON(!accessed_mask && !acc_track_mask); - BUG_ON(acc_track_mask & SPTE_SPECIAL_MASK); - - shadow_user_mask = user_mask; - shadow_accessed_mask = accessed_mask; - shadow_dirty_mask = dirty_mask; - shadow_nx_mask = nx_mask; - shadow_x_mask = x_mask; - shadow_present_mask = p_mask; - shadow_acc_track_mask = acc_track_mask; - shadow_me_mask = me_mask; -} -EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); - -static u8 kvm_get_shadow_phys_bits(void) -{ - /* - * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected - * in CPU detection code, but the processor treats those reduced bits as - * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at - * the physical address bits reported by CPUID. - */ - if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008)) - return cpuid_eax(0x80000008) & 0xff; - - /* - * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with - * custom CPUID. Proceed with whatever the kernel found since these features - * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008). - */ - return boot_cpu_data.x86_phys_bits; -} - -static void kvm_mmu_reset_all_pte_masks(void) -{ - u8 low_phys_bits; - - shadow_user_mask = 0; - shadow_accessed_mask = 0; - shadow_dirty_mask = 0; - shadow_nx_mask = 0; - shadow_x_mask = 0; - shadow_present_mask = 0; - shadow_acc_track_mask = 0; - - shadow_phys_bits = kvm_get_shadow_phys_bits(); - - /* - * If the CPU has 46 or less physical address bits, then set an - * appropriate mask to guard against L1TF attacks. Otherwise, it is - * assumed that the CPU is not vulnerable to L1TF. - * - * Some Intel CPUs address the L1 cache using more PA bits than are - * reported by CPUID. Use the PA width of the L1 cache when possible - * to achieve more effective mitigation, e.g. if system RAM overlaps - * the most significant bits of legal physical address space. - */ - shadow_nonpresent_or_rsvd_mask = 0; - low_phys_bits = boot_cpu_data.x86_phys_bits; - if (boot_cpu_has_bug(X86_BUG_L1TF) && - !WARN_ON_ONCE(boot_cpu_data.x86_cache_bits >= - 52 - shadow_nonpresent_or_rsvd_mask_len)) { - low_phys_bits = boot_cpu_data.x86_cache_bits - - shadow_nonpresent_or_rsvd_mask_len; - shadow_nonpresent_or_rsvd_mask = - rsvd_bits(low_phys_bits, boot_cpu_data.x86_cache_bits - 1); - } - - shadow_nonpresent_or_rsvd_lower_gfn_mask = - GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT); -} - static int is_cpuid_PSE36(void) { return 1; @@ -623,35 +284,6 @@ static int is_nx(struct kvm_vcpu *vcpu) return vcpu->arch.efer & EFER_NX; } -static int is_shadow_present_pte(u64 pte) -{ - return (pte != 0) && !is_mmio_spte(pte); -} - -static int is_large_pte(u64 pte) -{ - return pte & PT_PAGE_SIZE_MASK; -} - -static int is_last_spte(u64 pte, int level) -{ - if (level == PG_LEVEL_4K) - return 1; - if (is_large_pte(pte)) - return 1; - return 0; -} - -static bool is_executable_pte(u64 spte) -{ - return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask; -} - -static kvm_pfn_t spte_to_pfn(u64 pte) -{ - return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; -} - static gfn_t pse36_gfn_delta(u32 gpte) { int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT; @@ -796,12 +428,6 @@ retry: } #endif -static bool spte_can_locklessly_be_made_writable(u64 spte) -{ - return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) == - (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE); -} - static bool spte_has_volatile_bits(u64 spte) { if (!is_shadow_present_pte(spte)) @@ -826,21 +452,6 @@ static bool spte_has_volatile_bits(u64 spte) return false; } -static bool is_accessed_spte(u64 spte) -{ - u64 accessed_mask = spte_shadow_accessed_mask(spte); - - return accessed_mask ? spte & accessed_mask - : !is_access_track_spte(spte); -} - -static bool is_dirty_spte(u64 spte) -{ - u64 dirty_mask = spte_shadow_dirty_mask(spte); - - return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK; -} - /* Rules for using mmu_spte_set: * Set the sptep from nonpresent to present. * Note: the sptep being assigned *must* be either not present @@ -976,34 +587,6 @@ static u64 mmu_spte_get_lockless(u64 *sptep) return __get_spte_lockless(sptep); } -static u64 mark_spte_for_access_track(u64 spte) -{ - if (spte_ad_enabled(spte)) - return spte & ~shadow_accessed_mask; - - if (is_access_track_spte(spte)) - return spte; - - /* - * Making an Access Tracking PTE will result in removal of write access - * from the PTE. So, verify that we will be able to restore the write - * access in the fast page fault path later on. - */ - WARN_ONCE((spte & PT_WRITABLE_MASK) && - !spte_can_locklessly_be_made_writable(spte), - "kvm: Writable SPTE is not locklessly dirty-trackable\n"); - - WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask << - shadow_acc_track_saved_bits_shift), - "kvm: Access Tracking saved bit locations are not zero\n"); - - spte |= (spte & shadow_acc_track_saved_bits_mask) << - shadow_acc_track_saved_bits_shift; - spte &= ~shadow_acc_track_mask; - - return spte; -} - /* Restore an acc-track PTE back to a regular PTE */ static u64 restore_acc_track_spte(u64 spte) { @@ -1193,7 +776,7 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_mmu_gfn_disallow_lpage(slot, gfn); } -static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) +void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) { if (sp->lpage_disallowed) return; @@ -1221,7 +804,7 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_mmu_gfn_allow_lpage(slot, gfn); } -static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) +void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) { --kvm->stat.nx_lpage_splits; sp->lpage_disallowed = false; @@ -1640,6 +1223,9 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, { struct kvm_rmap_head *rmap_head; + if (kvm->arch.tdp_mmu_enabled) + kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, + slot->base_gfn + gfn_offset, mask, true); while (mask) { rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), PG_LEVEL_4K, slot); @@ -1666,6 +1252,9 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, { struct kvm_rmap_head *rmap_head; + if (kvm->arch.tdp_mmu_enabled) + kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, + slot->base_gfn + gfn_offset, mask, false); while (mask) { rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), PG_LEVEL_4K, slot); @@ -1710,6 +1299,10 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, write_protected |= __rmap_write_protect(kvm, rmap_head, true); } + if (kvm->arch.tdp_mmu_enabled) + write_protected |= + kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn); + return write_protected; } @@ -1769,13 +1362,8 @@ restart: pte_list_remove(rmap_head, sptep); goto restart; } else { - new_spte = *sptep & ~PT64_BASE_ADDR_MASK; - new_spte |= (u64)new_pfn << PAGE_SHIFT; - - new_spte &= ~PT_WRITABLE_MASK; - new_spte &= ~SPTE_HOST_WRITEABLE; - - new_spte = mark_spte_for_access_track(new_spte); + new_spte = kvm_mmu_changed_pte_notifier_make_spte( + *sptep, new_pfn); mmu_spte_clear_track_bits(sptep); mmu_spte_set(sptep, new_spte); @@ -1919,12 +1507,26 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, unsigned flags) { - return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp); + int r; + + r = kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp); + + if (kvm->arch.tdp_mmu_enabled) + r |= kvm_tdp_mmu_zap_hva_range(kvm, start, end); + + return r; } int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) { - return kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp); + int r; + + r = kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp); + + if (kvm->arch.tdp_mmu_enabled) + r |= kvm_tdp_mmu_set_spte_hva(kvm, hva, &pte); + + return r; } static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, @@ -1973,12 +1575,24 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) { - return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp); + int young = false; + + young = kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp); + if (kvm->arch.tdp_mmu_enabled) + young |= kvm_tdp_mmu_age_hva_range(kvm, start, end); + + return young; } int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) { - return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp); + int young = false; + + young = kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp); + if (kvm->arch.tdp_mmu_enabled) + young |= kvm_tdp_mmu_test_age_hva(kvm, hva); + + return young; } #ifdef MMU_DEBUG @@ -2469,7 +2083,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, } if (sp->unsync_children) - kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); + kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); __clear_sp_write_flooding_count(sp); @@ -2577,13 +2191,7 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); - spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK | - shadow_user_mask | shadow_x_mask | shadow_me_mask; - - if (sp_ad_disabled(sp)) - spte |= SPTE_AD_DISABLED_MASK; - else - spte |= shadow_accessed_mask; + spte = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp)); mmu_spte_set(sptep, spte); @@ -2615,8 +2223,9 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, } } -static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, - u64 *spte) +/* Returns the number of zapped non-leaf child shadow pages. */ +static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, + u64 *spte, struct list_head *invalid_list) { u64 pte; struct kvm_mmu_page *child; @@ -2630,23 +2239,34 @@ static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, } else { child = to_shadow_page(pte & PT64_BASE_ADDR_MASK); drop_parent_pte(child, spte); - } - return true; - } - if (is_mmio_spte(pte)) + /* + * Recursively zap nested TDP SPs, parentless SPs are + * unlikely to be used again in the near future. This + * avoids retaining a large number of stale nested SPs. + */ + if (tdp_enabled && invalid_list && + child->role.guest_mode && !child->parent_ptes.val) + return kvm_mmu_prepare_zap_page(kvm, child, + invalid_list); + } + } else if (is_mmio_spte(pte)) { mmu_spte_clear_no_track(spte); - - return false; + } + return 0; } -static void kvm_mmu_page_unlink_children(struct kvm *kvm, - struct kvm_mmu_page *sp) +static int kvm_mmu_page_unlink_children(struct kvm *kvm, + struct kvm_mmu_page *sp, + struct list_head *invalid_list) { + int zapped = 0; unsigned i; for (i = 0; i < PT64_ENT_PER_PAGE; ++i) - mmu_page_zap_pte(kvm, sp, sp->spt + i); + zapped += mmu_page_zap_pte(kvm, sp, sp->spt + i, invalid_list); + + return zapped; } static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) @@ -2692,7 +2312,7 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm, trace_kvm_mmu_prepare_zap_page(sp); ++kvm->stat.mmu_shadow_zapped; *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list); - kvm_mmu_page_unlink_children(kvm, sp); + *nr_zapped += kvm_mmu_page_unlink_children(kvm, sp, invalid_list); kvm_mmu_unlink_parents(kvm, sp); /* Zapping children means active_mmu_pages has become unstable. */ @@ -2885,8 +2505,8 @@ static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) kvm_mmu_mark_parents_unsync(sp); } -static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, - bool can_unsync) +bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, + bool can_unsync) { struct kvm_mmu_page *sp; @@ -2946,132 +2566,42 @@ static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, return false; } -static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) -{ - if (pfn_valid(pfn)) - return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)) && - /* - * Some reserved pages, such as those from NVDIMM - * DAX devices, are not for MMIO, and can be mapped - * with cached memory type for better performance. - * However, the above check misconceives those pages - * as MMIO, and results in KVM mapping them with UC - * memory type, which would hurt the performance. - * Therefore, we check the host memory type in addition - * and only treat UC/UC-/WC pages as MMIO. - */ - (!pat_enabled() || pat_pfn_immune_to_uc_mtrr(pfn)); - - return !e820__mapped_raw_any(pfn_to_hpa(pfn), - pfn_to_hpa(pfn + 1) - 1, - E820_TYPE_RAM); -} - -/* Bits which may be returned by set_spte() */ -#define SET_SPTE_WRITE_PROTECTED_PT BIT(0) -#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1) - static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned int pte_access, int level, gfn_t gfn, kvm_pfn_t pfn, bool speculative, bool can_unsync, bool host_writable) { - u64 spte = 0; - int ret = 0; + u64 spte; struct kvm_mmu_page *sp; + int ret; if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access)) return 0; sp = sptep_to_sp(sptep); - if (sp_ad_disabled(sp)) - spte |= SPTE_AD_DISABLED_MASK; - else if (kvm_vcpu_ad_need_write_protect(vcpu)) - spte |= SPTE_AD_WRPROT_ONLY_MASK; - - /* - * For the EPT case, shadow_present_mask is 0 if hardware - * supports exec-only page table entries. In that case, - * ACC_USER_MASK and shadow_user_mask are used to represent - * read access. See FNAME(gpte_access) in paging_tmpl.h. - */ - spte |= shadow_present_mask; - if (!speculative) - spte |= spte_shadow_accessed_mask(spte); - if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) && - is_nx_huge_page_enabled()) { - pte_access &= ~ACC_EXEC_MASK; - } + ret = make_spte(vcpu, pte_access, level, gfn, pfn, *sptep, speculative, + can_unsync, host_writable, sp_ad_disabled(sp), &spte); - if (pte_access & ACC_EXEC_MASK) - spte |= shadow_x_mask; - else - spte |= shadow_nx_mask; - - if (pte_access & ACC_USER_MASK) - spte |= shadow_user_mask; - - if (level > PG_LEVEL_4K) - spte |= PT_PAGE_SIZE_MASK; - if (tdp_enabled) - spte |= kvm_x86_ops.get_mt_mask(vcpu, gfn, - kvm_is_mmio_pfn(pfn)); - - if (host_writable) - spte |= SPTE_HOST_WRITEABLE; - else - pte_access &= ~ACC_WRITE_MASK; - - if (!kvm_is_mmio_pfn(pfn)) - spte |= shadow_me_mask; - - spte |= (u64)pfn << PAGE_SHIFT; - - if (pte_access & ACC_WRITE_MASK) { - spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE; - - /* - * Optimization: for pte sync, if spte was writable the hash - * lookup is unnecessary (and expensive). Write protection - * is responsibility of mmu_get_page / kvm_sync_page. - * Same reasoning can be applied to dirty page accounting. - */ - if (!can_unsync && is_writable_pte(*sptep)) - goto set_pte; - - if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { - pgprintk("%s: found shadow page for %llx, marking ro\n", - __func__, gfn); - ret |= SET_SPTE_WRITE_PROTECTED_PT; - pte_access &= ~ACC_WRITE_MASK; - spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE); - } - } - - if (pte_access & ACC_WRITE_MASK) { + if (spte & PT_WRITABLE_MASK) kvm_vcpu_mark_page_dirty(vcpu, gfn); - spte |= spte_shadow_dirty_mask(spte); - } - if (speculative) - spte = mark_spte_for_access_track(spte); - -set_pte: - if (mmu_spte_update(sptep, spte)) + if (*sptep == spte) + ret |= SET_SPTE_SPURIOUS; + else if (mmu_spte_update(sptep, spte)) ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH; return ret; } static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, - unsigned int pte_access, int write_fault, int level, + unsigned int pte_access, bool write_fault, int level, gfn_t gfn, kvm_pfn_t pfn, bool speculative, bool host_writable) { int was_rmapped = 0; int rmap_count; int set_spte_ret; - int ret = RET_PF_RETRY; + int ret = RET_PF_FIXED; bool flush = false; pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__, @@ -3113,6 +2643,15 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (unlikely(is_mmio_spte(*sptep))) ret = RET_PF_EMULATE; + /* + * The fault is fully spurious if and only if the new SPTE and old SPTE + * are identical, and emulation is not required. + */ + if ((set_spte_ret & SET_SPTE_SPURIOUS) && ret == RET_PF_FIXED) { + WARN_ON_ONCE(!was_rmapped); + return RET_PF_SPURIOUS; + } + pgprintk("%s: setting spte %llx\n", __func__, *sptep); trace_kvm_mmu_set_spte(level, gfn, sptep); if (!was_rmapped && is_large_pte(*sptep)) @@ -3161,7 +2700,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, return -1; for (i = 0; i < ret; i++, gfn++, start++) { - mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn, + mmu_set_spte(vcpu, start, access, false, sp->role.level, gfn, page_to_pfn(pages[i]), true, true); put_page(pages[i]); } @@ -3239,8 +2778,9 @@ static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn, return level; } -static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, - int max_level, kvm_pfn_t *pfnp) +int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, + int max_level, kvm_pfn_t *pfnp, + bool huge_page_disallowed, int *req_level) { struct kvm_memory_slot *slot; struct kvm_lpage_info *linfo; @@ -3248,6 +2788,8 @@ static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t mask; int level; + *req_level = PG_LEVEL_4K; + if (unlikely(max_level == PG_LEVEL_4K)) return PG_LEVEL_4K; @@ -3272,7 +2814,14 @@ static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, if (level == PG_LEVEL_4K) return level; - level = min(level, max_level); + *req_level = level = min(level, max_level); + + /* + * Enforce the iTLB multihit workaround after capturing the requested + * level, which will be used to do precise, accurate accounting. + */ + if (huge_page_disallowed) + return PG_LEVEL_4K; /* * mmu_notifier_retry() was successful and mmu_lock is held, so @@ -3285,14 +2834,12 @@ static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, return level; } -static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it, - gfn_t gfn, kvm_pfn_t *pfnp, int *levelp) +void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level, + kvm_pfn_t *pfnp, int *goal_levelp) { - int level = *levelp; - u64 spte = *it.sptep; + int level = *goal_levelp; - if (it.level == level && level > PG_LEVEL_4K && - is_nx_huge_page_enabled() && + if (cur_level == level && level > PG_LEVEL_4K && is_shadow_present_pte(spte) && !is_large_pte(spte)) { /* @@ -3302,26 +2849,32 @@ static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it, * patching back for them into pfn the next 9 bits of * the address. */ - u64 page_mask = KVM_PAGES_PER_HPAGE(level) - KVM_PAGES_PER_HPAGE(level - 1); + u64 page_mask = KVM_PAGES_PER_HPAGE(level) - + KVM_PAGES_PER_HPAGE(level - 1); *pfnp |= gfn & page_mask; - (*levelp)--; + (*goal_levelp)--; } } -static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, +static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, int map_writable, int max_level, kvm_pfn_t pfn, - bool prefault, bool account_disallowed_nx_lpage) + bool prefault, bool is_tdp) { + bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(); + bool write = error_code & PFERR_WRITE_MASK; + bool exec = error_code & PFERR_FETCH_MASK; + bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled; struct kvm_shadow_walk_iterator it; struct kvm_mmu_page *sp; - int level, ret; + int level, req_level, ret; gfn_t gfn = gpa >> PAGE_SHIFT; gfn_t base_gfn = gfn; if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) return RET_PF_RETRY; - level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn); + level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn, + huge_page_disallowed, &req_level); trace_kvm_mmu_spte_requested(gpa, level, pfn); for_each_shadow_entry(vcpu, gpa, it) { @@ -3329,7 +2882,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, * We cannot overwrite existing page tables with an NX * large page, as the leaf could be executable. */ - disallowed_hugepage_adjust(it, gfn, &pfn, &level); + if (nx_huge_page_workaround_enabled) + disallowed_hugepage_adjust(*it.sptep, gfn, it.level, + &pfn, &level); base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); if (it.level == level) @@ -3341,7 +2896,8 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, it.level - 1, true, ACC_ALL); link_shadow_page(vcpu, it.sptep, sp); - if (account_disallowed_nx_lpage) + if (is_tdp && huge_page_disallowed && + req_level >= it.level) account_huge_nx_page(vcpu->kvm, sp); } } @@ -3349,6 +2905,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL, write, level, base_gfn, pfn, prefault, map_writable); + if (ret == RET_PF_SPURIOUS) + return ret; + direct_pte_prefetch(vcpu, it.sptep); ++vcpu->stat.pf_fixed; return ret; @@ -3479,21 +3038,19 @@ static bool is_access_allowed(u32 fault_err_code, u64 spte) } /* - * Return value: - * - true: let the vcpu to access on the same address again. - * - false: let the real page fault path to fix it. + * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS. */ -static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - u32 error_code) +static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + u32 error_code) { struct kvm_shadow_walk_iterator iterator; struct kvm_mmu_page *sp; - bool fault_handled = false; + int ret = RET_PF_INVALID; u64 spte = 0ull; uint retry_count = 0; if (!page_fault_can_be_fast(error_code)) - return false; + return ret; walk_shadow_page_lockless_begin(vcpu); @@ -3519,7 +3076,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * they are always ACC_ALL. */ if (is_access_allowed(error_code, spte)) { - fault_handled = true; + ret = RET_PF_SPURIOUS; break; } @@ -3562,11 +3119,11 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * since the gfn is not stable for indirect shadow page. See * Documentation/virt/kvm/locking.rst to get more detail. */ - fault_handled = fast_pf_fix_direct_spte(vcpu, sp, - iterator.sptep, spte, - new_spte); - if (fault_handled) + if (fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte, + new_spte)) { + ret = RET_PF_FIXED; break; + } if (++retry_count > 4) { printk_once(KERN_WARNING @@ -3577,10 +3134,10 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, } while (true); trace_fast_page_fault(vcpu, cr2_or_gpa, error_code, iterator.sptep, - spte, fault_handled); + spte, ret); walk_shadow_page_lockless_end(vcpu); - return fault_handled; + return ret; } static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, @@ -3592,9 +3149,13 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, return; sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK); - --sp->root_count; - if (!sp->root_count && sp->role.invalid) - kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); + + if (kvm_mmu_put_root(kvm, sp)) { + if (sp->tdp_mmu_page) + kvm_tdp_mmu_free_root(kvm, sp); + else if (sp->role.invalid) + kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); + } *root_hpa = INVALID_PAGE; } @@ -3603,6 +3164,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, ulong roots_to_free) { + struct kvm *kvm = vcpu->kvm; int i; LIST_HEAD(invalid_list); bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT; @@ -3620,22 +3182,21 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, return; } - spin_lock(&vcpu->kvm->mmu_lock); + spin_lock(&kvm->mmu_lock); for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) - mmu_free_root_page(vcpu->kvm, &mmu->prev_roots[i].hpa, + mmu_free_root_page(kvm, &mmu->prev_roots[i].hpa, &invalid_list); if (free_active_root) { if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) { - mmu_free_root_page(vcpu->kvm, &mmu->root_hpa, - &invalid_list); + mmu_free_root_page(kvm, &mmu->root_hpa, &invalid_list); } else { for (i = 0; i < 4; ++i) if (mmu->pae_root[i] != 0) - mmu_free_root_page(vcpu->kvm, + mmu_free_root_page(kvm, &mmu->pae_root[i], &invalid_list); mmu->root_hpa = INVALID_PAGE; @@ -3643,8 +3204,8 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, mmu->root_pgd = 0; } - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); - spin_unlock(&vcpu->kvm->mmu_lock); + kvm_mmu_commit_zap_page(kvm, &invalid_list); + spin_unlock(&kvm->mmu_lock); } EXPORT_SYMBOL_GPL(kvm_mmu_free_roots); @@ -3684,8 +3245,16 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) hpa_t root; unsigned i; - if (shadow_root_level >= PT64_ROOT_4LEVEL) { - root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true); + if (vcpu->kvm->arch.tdp_mmu_enabled) { + root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu); + + if (!VALID_PAGE(root)) + return -ENOSPC; + vcpu->arch.mmu->root_hpa = root; + } else if (shadow_root_level >= PT64_ROOT_4LEVEL) { + root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, + true); + if (!VALID_PAGE(root)) return -ENOSPC; vcpu->arch.mmu->root_hpa = root; @@ -3910,54 +3479,82 @@ static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct) return vcpu_match_mmio_gva(vcpu, addr); } -/* return true if reserved bit is detected on spte. */ -static bool -walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) +/* + * Return the level of the lowest level SPTE added to sptes. + * That SPTE may be non-present. + */ +static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes) { struct kvm_shadow_walk_iterator iterator; - u64 sptes[PT64_ROOT_MAX_LEVEL], spte = 0ull; - struct rsvd_bits_validate *rsvd_check; - int root, leaf; - bool reserved = false; + int leaf = vcpu->arch.mmu->root_level; + u64 spte; - rsvd_check = &vcpu->arch.mmu->shadow_zero_check; walk_shadow_page_lockless_begin(vcpu); - for (shadow_walk_init(&iterator, vcpu, addr), - leaf = root = iterator.level; + for (shadow_walk_init(&iterator, vcpu, addr); shadow_walk_okay(&iterator); __shadow_walk_next(&iterator, spte)) { + leaf = iterator.level; spte = mmu_spte_get_lockless(iterator.sptep); sptes[leaf - 1] = spte; - leaf--; if (!is_shadow_present_pte(spte)) break; + } + + walk_shadow_page_lockless_end(vcpu); + + return leaf; +} + +/* return true if reserved bit is detected on spte. */ +static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) +{ + u64 sptes[PT64_ROOT_MAX_LEVEL]; + struct rsvd_bits_validate *rsvd_check; + int root = vcpu->arch.mmu->root_level; + int leaf; + int level; + bool reserved = false; + + if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) { + *sptep = 0ull; + return reserved; + } + + if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) + leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes); + else + leaf = get_walk(vcpu, addr, sptes); + + rsvd_check = &vcpu->arch.mmu->shadow_zero_check; + + for (level = root; level >= leaf; level--) { + if (!is_shadow_present_pte(sptes[level - 1])) + break; /* * Use a bitwise-OR instead of a logical-OR to aggregate the * reserved bit and EPT's invalid memtype/XWR checks to avoid * adding a Jcc in the loop. */ - reserved |= __is_bad_mt_xwr(rsvd_check, spte) | - __is_rsvd_bits_set(rsvd_check, spte, iterator.level); + reserved |= __is_bad_mt_xwr(rsvd_check, sptes[level - 1]) | + __is_rsvd_bits_set(rsvd_check, sptes[level - 1], + level); } - walk_shadow_page_lockless_end(vcpu); - if (reserved) { pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n", __func__, addr); - while (root > leaf) { + for (level = root; level >= leaf; level--) pr_err("------ spte 0x%llx level %d.\n", - sptes[root - 1], root); - root--; - } + sptes[level - 1], level); } - *sptep = spte; + *sptep = sptes[leaf - 1]; + return reserved; } @@ -3969,7 +3566,7 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) if (mmio_info_in_cache(vcpu, addr, direct)) return RET_PF_EMULATE; - reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte); + reserved = get_mmio_spte(vcpu, addr, &spte); if (WARN_ON(reserved)) return -EINVAL; @@ -4080,8 +3677,6 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, bool prefault, int max_level, bool is_tdp) { bool write = error_code & PFERR_WRITE_MASK; - bool exec = error_code & PFERR_FETCH_MASK; - bool lpage_disallowed = exec && is_nx_huge_page_enabled(); bool map_writable; gfn_t gfn = gpa >> PAGE_SHIFT; @@ -4092,16 +3687,16 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, if (page_fault_handle_page_track(vcpu, error_code, gfn)) return RET_PF_EMULATE; - if (fast_page_fault(vcpu, gpa, error_code)) - return RET_PF_RETRY; + if (!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) { + r = fast_page_fault(vcpu, gpa, error_code); + if (r != RET_PF_INVALID) + return r; + } r = mmu_topup_memory_caches(vcpu, false); if (r) return r; - if (lpage_disallowed) - max_level = PG_LEVEL_4K; - mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); @@ -4118,8 +3713,13 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, r = make_mmu_pages_available(vcpu); if (r) goto out_unlock; - r = __direct_map(vcpu, gpa, write, map_writable, max_level, pfn, - prefault, is_tdp && lpage_disallowed); + + if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) + r = kvm_tdp_mmu_map(vcpu, gpa, error_code, map_writable, max_level, + pfn, prefault); + else + r = __direct_map(vcpu, gpa, error_code, map_writable, max_level, pfn, + prefault, is_tdp); out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); @@ -4292,7 +3892,13 @@ static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, */ vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); - __clear_sp_write_flooding_count(to_shadow_page(vcpu->arch.mmu->root_hpa)); + /* + * If this is a direct root page, it doesn't have a write flooding + * count. Otherwise, clear the write flooding count. + */ + if (!new_role.direct) + __clear_sp_write_flooding_count( + to_shadow_page(vcpu->arch.mmu->root_hpa)); } void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, bool skip_tlb_flush, @@ -5400,7 +5006,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, u32 base_role = vcpu->arch.mmu->mmu_role.base.word; entry = *spte; - mmu_page_zap_pte(vcpu->kvm, sp, spte); + mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL); if (gentry && !((sp->role.word ^ base_role) & ~role_ign.word) && rmap_can_add(vcpu)) @@ -5450,13 +5056,14 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, if (r == RET_PF_INVALID) { r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, lower_32_bits(error_code), false); - WARN_ON(r == RET_PF_INVALID); + if (WARN_ON_ONCE(r == RET_PF_INVALID)) + return -EIO; } - if (r == RET_PF_RETRY) - return 1; if (r < 0) return r; + if (r != RET_PF_EMULATE) + return 1; /* * Before emulating the instruction, check if the error code @@ -5485,18 +5092,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu)) emulation_type |= EMULTYPE_ALLOW_RETRY_PF; emulate: - /* - * On AMD platforms, under certain conditions insn_len may be zero on #NPF. - * This can happen if a guest gets a page-fault on data access but the HW - * table walker is not able to read the instruction page (e.g instruction - * page is not present in memory). In those cases we simply restart the - * guest, with the exception of AMD Erratum 1096 which is unrecoverable. - */ - if (unlikely(insn && !insn_len)) { - if (!kvm_x86_ops.need_emulation_on_page_fault(vcpu)) - return 1; - } - return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn, insn_len); } @@ -5682,11 +5277,17 @@ static void free_mmu_pages(struct kvm_mmu *mmu) free_page((unsigned long)mmu->lm_root); } -static int alloc_mmu_pages(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) +static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) { struct page *page; int i; + mmu->root_hpa = INVALID_PAGE; + mmu->root_pgd = 0; + mmu->translate_gpa = translate_gpa; + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; + /* * When using PAE paging, the four PDPTEs are treated as 'root' pages, * while the PDP table is a per-vCPU construct that's allocated at MMU @@ -5712,7 +5313,6 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) int kvm_mmu_create(struct kvm_vcpu *vcpu) { - uint i; int ret; vcpu->arch.mmu_pte_list_desc_cache.kmem_cache = pte_list_desc_cache; @@ -5726,25 +5326,13 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu) vcpu->arch.mmu = &vcpu->arch.root_mmu; vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; - vcpu->arch.root_mmu.root_hpa = INVALID_PAGE; - vcpu->arch.root_mmu.root_pgd = 0; - vcpu->arch.root_mmu.translate_gpa = translate_gpa; - for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) - vcpu->arch.root_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; - - vcpu->arch.guest_mmu.root_hpa = INVALID_PAGE; - vcpu->arch.guest_mmu.root_pgd = 0; - vcpu->arch.guest_mmu.translate_gpa = translate_gpa; - for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) - vcpu->arch.guest_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; - vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa; - ret = alloc_mmu_pages(vcpu, &vcpu->arch.guest_mmu); + ret = __kvm_mmu_create(vcpu, &vcpu->arch.guest_mmu); if (ret) return ret; - ret = alloc_mmu_pages(vcpu, &vcpu->arch.root_mmu); + ret = __kvm_mmu_create(vcpu, &vcpu->arch.root_mmu); if (ret) goto fail_allocate_root; @@ -5841,6 +5429,10 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm) kvm_reload_remote_mmus(kvm); kvm_zap_obsolete_pages(kvm); + + if (kvm->arch.tdp_mmu_enabled) + kvm_tdp_mmu_zap_all(kvm); + spin_unlock(&kvm->mmu_lock); } @@ -5860,6 +5452,8 @@ void kvm_mmu_init_vm(struct kvm *kvm) { struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; + kvm_mmu_init_tdp_mmu(kvm); + node->track_write = kvm_mmu_pte_write; node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot; kvm_page_track_register_notifier(kvm, node); @@ -5870,6 +5464,8 @@ void kvm_mmu_uninit_vm(struct kvm *kvm) struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; kvm_page_track_unregister_notifier(kvm, node); + + kvm_mmu_uninit_tdp_mmu(kvm); } void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) @@ -5877,6 +5473,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) struct kvm_memslots *slots; struct kvm_memory_slot *memslot; int i; + bool flush; spin_lock(&kvm->mmu_lock); for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { @@ -5896,6 +5493,12 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) } } + if (kvm->arch.tdp_mmu_enabled) { + flush = kvm_tdp_mmu_zap_gfn_range(kvm, gfn_start, gfn_end); + if (flush) + kvm_flush_remote_tlbs(kvm); + } + spin_unlock(&kvm->mmu_lock); } @@ -5914,6 +5517,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, spin_lock(&kvm->mmu_lock); flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect, start_level, KVM_MAX_HUGEPAGE_LEVEL, false); + if (kvm->arch.tdp_mmu_enabled) + flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_4K); spin_unlock(&kvm->mmu_lock); /* @@ -5977,6 +5582,9 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, spin_lock(&kvm->mmu_lock); slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot, kvm_mmu_zap_collapsible_spte, true); + + if (kvm->arch.tdp_mmu_enabled) + kvm_tdp_mmu_zap_collapsible_sptes(kvm, memslot); spin_unlock(&kvm->mmu_lock); } @@ -6002,6 +5610,8 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, spin_lock(&kvm->mmu_lock); flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false); + if (kvm->arch.tdp_mmu_enabled) + flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot); spin_unlock(&kvm->mmu_lock); /* @@ -6023,6 +5633,8 @@ void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, spin_lock(&kvm->mmu_lock); flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect, false); + if (kvm->arch.tdp_mmu_enabled) + flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_2M); spin_unlock(&kvm->mmu_lock); if (flush) @@ -6037,6 +5649,8 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm, spin_lock(&kvm->mmu_lock); flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false); + if (kvm->arch.tdp_mmu_enabled) + flush |= kvm_tdp_mmu_slot_set_dirty(kvm, memslot); spin_unlock(&kvm->mmu_lock); if (flush) @@ -6062,6 +5676,10 @@ restart: } kvm_mmu_commit_zap_page(kvm, &invalid_list); + + if (kvm->arch.tdp_mmu_enabled) + kvm_tdp_mmu_zap_all(kvm); + spin_unlock(&kvm->mmu_lock); } @@ -6357,7 +5975,10 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) ratio = READ_ONCE(nx_huge_pages_recovery_ratio); to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0; - while (to_zap && !list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) { + for ( ; to_zap; --to_zap) { + if (list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) + break; + /* * We use a separate list instead of just using active_mmu_pages * because the number of lpage_disallowed pages is expected to @@ -6367,15 +5988,20 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) struct kvm_mmu_page, lpage_disallowed_link); WARN_ON_ONCE(!sp->lpage_disallowed); - kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); - WARN_ON_ONCE(sp->lpage_disallowed); + if (sp->tdp_mmu_page) + kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn, + sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level)); + else { + kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); + WARN_ON_ONCE(sp->lpage_disallowed); + } - if (!--to_zap || need_resched() || spin_needbreak(&kvm->mmu_lock)) { + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { kvm_mmu_commit_zap_page(kvm, &invalid_list); - if (to_zap) - cond_resched_lock(&kvm->mmu_lock); + cond_resched_lock(&kvm->mmu_lock); } } + kvm_mmu_commit_zap_page(kvm, &invalid_list); spin_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, rcu_idx); diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 3acf3b8eb469..bfc6389edc28 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -3,9 +3,23 @@ #define __KVM_X86_MMU_INTERNAL_H #include <linux/types.h> - +#include <linux/kvm_host.h> #include <asm/kvm_host.h> +#undef MMU_DEBUG + +#ifdef MMU_DEBUG +extern bool dbg; + +#define pgprintk(x...) do { if (dbg) printk(x); } while (0) +#define rmap_printk(x...) do { if (dbg) printk(x); } while (0) +#define MMU_WARN_ON(x) WARN_ON(x) +#else +#define pgprintk(x...) do { } while (0) +#define rmap_printk(x...) do { } while (0) +#define MMU_WARN_ON(x) do { } while (0) +#endif + struct kvm_mmu_page { struct list_head link; struct hlist_node hash_link; @@ -41,8 +55,12 @@ struct kvm_mmu_page { /* Number of writes since the last time traversal visited this page. */ atomic_t write_flooding_count; + + bool tdp_mmu_page; }; +extern struct kmem_cache *mmu_page_header_cache; + static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page) { struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); @@ -55,9 +73,77 @@ static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep) return to_shadow_page(__pa(sptep)); } +static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu) +{ + /* + * When using the EPT page-modification log, the GPAs in the log + * would come from L2 rather than L1. Therefore, we need to rely + * on write protection to record dirty pages. This also bypasses + * PML, since writes now result in a vmexit. + */ + return vcpu->arch.mmu == &vcpu->arch.guest_mmu; +} + +bool is_nx_huge_page_enabled(void); +bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, + bool can_unsync); + void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn); +void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, + u64 start_gfn, u64 pages); + +static inline void kvm_mmu_get_root(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + BUG_ON(!sp->root_count); + lockdep_assert_held(&kvm->mmu_lock); + + ++sp->root_count; +} + +static inline bool kvm_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + lockdep_assert_held(&kvm->mmu_lock); + --sp->root_count; + + return !sp->root_count; +} + +/* + * Return values of handle_mmio_page_fault, mmu.page_fault, and fast_page_fault(). + * + * RET_PF_RETRY: let CPU fault again on the address. + * RET_PF_EMULATE: mmio page fault, emulate the instruction directly. + * RET_PF_INVALID: the spte is invalid, let the real page fault path update it. + * RET_PF_FIXED: The faulting entry has been fixed. + * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU. + */ +enum { + RET_PF_RETRY = 0, + RET_PF_EMULATE, + RET_PF_INVALID, + RET_PF_FIXED, + RET_PF_SPURIOUS, +}; + +/* Bits which may be returned by set_spte() */ +#define SET_SPTE_WRITE_PROTECTED_PT BIT(0) +#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1) +#define SET_SPTE_SPURIOUS BIT(2) + +int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, + int max_level, kvm_pfn_t *pfnp, + bool huge_page_disallowed, int *req_level); +void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level, + kvm_pfn_t *pfnp, int *goal_levelp); + +bool is_nx_huge_page_enabled(void); + +void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); + +void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp); +void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp); #endif /* __KVM_X86_MMU_INTERNAL_H */ diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h index 9d15bc0c535b..213699b27b44 100644 --- a/arch/x86/kvm/mmu/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -202,8 +202,8 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page, TRACE_EVENT( mark_mmio_spte, - TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access, unsigned int gen), - TP_ARGS(sptep, gfn, access, gen), + TP_PROTO(u64 *sptep, gfn_t gfn, u64 spte), + TP_ARGS(sptep, gfn, spte), TP_STRUCT__entry( __field(void *, sptep) @@ -215,8 +215,8 @@ TRACE_EVENT( TP_fast_assign( __entry->sptep = sptep; __entry->gfn = gfn; - __entry->access = access; - __entry->gen = gen; + __entry->access = spte & ACC_ALL; + __entry->gen = get_mmio_spte_generation(spte); ), TP_printk("sptep:%p gfn %llx access %x gen %x", __entry->sptep, @@ -244,14 +244,11 @@ TRACE_EVENT( __entry->access) ); -#define __spte_satisfied(__spte) \ - (__entry->retry && is_writable_pte(__entry->__spte)) - TRACE_EVENT( fast_page_fault, TP_PROTO(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u32 error_code, - u64 *sptep, u64 old_spte, bool retry), - TP_ARGS(vcpu, cr2_or_gpa, error_code, sptep, old_spte, retry), + u64 *sptep, u64 old_spte, int ret), + TP_ARGS(vcpu, cr2_or_gpa, error_code, sptep, old_spte, ret), TP_STRUCT__entry( __field(int, vcpu_id) @@ -260,7 +257,7 @@ TRACE_EVENT( __field(u64 *, sptep) __field(u64, old_spte) __field(u64, new_spte) - __field(bool, retry) + __field(int, ret) ), TP_fast_assign( @@ -270,7 +267,7 @@ TRACE_EVENT( __entry->sptep = sptep; __entry->old_spte = old_spte; __entry->new_spte = *sptep; - __entry->retry = retry; + __entry->ret = ret; ), TP_printk("vcpu %d gva %llx error_code %s sptep %p old %#llx" @@ -278,7 +275,7 @@ TRACE_EVENT( __entry->cr2_or_gpa, __print_flags(__entry->error_code, "|", kvm_mmu_trace_pferr_flags), __entry->sptep, __entry->old_spte, __entry->new_spte, - __spte_satisfied(old_spte), __spte_satisfied(new_spte) + __entry->ret == RET_PF_SPURIOUS, __entry->ret == RET_PF_FIXED ) ); diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c index a84a141a2ad2..8443a675715b 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -229,7 +229,8 @@ void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, return; idx = srcu_read_lock(&head->track_srcu); - hlist_for_each_entry_rcu(n, &head->track_notifier_list, node) + hlist_for_each_entry_srcu(n, &head->track_notifier_list, node, + srcu_read_lock_held(&head->track_srcu)) if (n->track_write) n->track_write(vcpu, gpa, new, bytes, n); srcu_read_unlock(&head->track_srcu, idx); @@ -254,7 +255,8 @@ void kvm_page_track_flush_slot(struct kvm *kvm, struct kvm_memory_slot *slot) return; idx = srcu_read_lock(&head->track_srcu); - hlist_for_each_entry_rcu(n, &head->track_notifier_list, node) + hlist_for_each_entry_srcu(n, &head->track_notifier_list, node, + srcu_read_lock_held(&head->track_srcu)) if (n->track_flush_slot) n->track_flush_slot(kvm, slot, n); srcu_read_unlock(&head->track_srcu, idx); diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 4dd6b1e5b8cf..50e268eb8e1a 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -550,7 +550,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, * we call mmu_set_spte() with host_writable = true because * pte_prefetch_gfn_to_pfn always gets a writable pfn. */ - mmu_set_spte(vcpu, spte, pte_access, 0, PG_LEVEL_4K, gfn, pfn, + mmu_set_spte(vcpu, spte, pte_access, false, PG_LEVEL_4K, gfn, pfn, true, true); kvm_release_pfn_clean(pfn); @@ -625,15 +625,18 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, * emulate this operation, return 1 to indicate this case. */ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, - struct guest_walker *gw, - int write_fault, int max_level, - kvm_pfn_t pfn, bool map_writable, bool prefault, - bool lpage_disallowed) + struct guest_walker *gw, u32 error_code, + int max_level, kvm_pfn_t pfn, bool map_writable, + bool prefault) { + bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(); + bool write_fault = error_code & PFERR_WRITE_MASK; + bool exec = error_code & PFERR_FETCH_MASK; + bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled; struct kvm_mmu_page *sp = NULL; struct kvm_shadow_walk_iterator it; unsigned direct_access, access = gw->pt_access; - int top_level, hlevel, ret; + int top_level, level, req_level, ret; gfn_t base_gfn = gw->gfn; direct_access = gw->pte_access; @@ -679,7 +682,8 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, link_shadow_page(vcpu, it.sptep, sp); } - hlevel = kvm_mmu_hugepage_adjust(vcpu, gw->gfn, max_level, &pfn); + level = kvm_mmu_hugepage_adjust(vcpu, gw->gfn, max_level, &pfn, + huge_page_disallowed, &req_level); trace_kvm_mmu_spte_requested(addr, gw->level, pfn); @@ -690,10 +694,12 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, * We cannot overwrite existing page tables with an NX * large page, as the leaf could be executable. */ - disallowed_hugepage_adjust(it, gw->gfn, &pfn, &hlevel); + if (nx_huge_page_workaround_enabled) + disallowed_hugepage_adjust(*it.sptep, gw->gfn, it.level, + &pfn, &level); base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); - if (it.level == hlevel) + if (it.level == level) break; validate_direct_spte(vcpu, it.sptep, direct_access); @@ -704,13 +710,16 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, sp = kvm_mmu_get_page(vcpu, base_gfn, addr, it.level - 1, true, direct_access); link_shadow_page(vcpu, it.sptep, sp); - if (lpage_disallowed) + if (huge_page_disallowed && req_level >= it.level) account_huge_nx_page(vcpu->kvm, sp); } } ret = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault, it.level, base_gfn, pfn, prefault, map_writable); + if (ret == RET_PF_SPURIOUS) + return ret; + FNAME(pte_prefetch)(vcpu, gw, it.sptep); ++vcpu->stat.pf_fixed; return ret; @@ -738,7 +747,7 @@ out_gpte_changed: */ static bool FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu, - struct guest_walker *walker, int user_fault, + struct guest_walker *walker, bool user_fault, bool *write_fault_to_shadow_pgtable) { int level; @@ -776,15 +785,13 @@ FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu, static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, bool prefault) { - int write_fault = error_code & PFERR_WRITE_MASK; - int user_fault = error_code & PFERR_USER_MASK; + bool write_fault = error_code & PFERR_WRITE_MASK; + bool user_fault = error_code & PFERR_USER_MASK; struct guest_walker walker; int r; kvm_pfn_t pfn; unsigned long mmu_seq; bool map_writable, is_self_change_mapping; - bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && - is_nx_huge_page_enabled(); int max_level; pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); @@ -825,7 +832,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu, &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable); - if (lpage_disallowed || is_self_change_mapping) + if (is_self_change_mapping) max_level = PG_LEVEL_4K; else max_level = walker.level; @@ -869,8 +876,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, r = make_mmu_pages_available(vcpu); if (r) goto out_unlock; - r = FNAME(fetch)(vcpu, addr, &walker, write_fault, max_level, pfn, - map_writable, prefault, lpage_disallowed); + r = FNAME(fetch)(vcpu, addr, &walker, error_code, max_level, pfn, + map_writable, prefault); kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); out_unlock: @@ -895,6 +902,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) { struct kvm_shadow_walk_iterator iterator; struct kvm_mmu_page *sp; + u64 old_spte; int level; u64 *sptep; @@ -917,7 +925,8 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) sptep = iterator.sptep; sp = sptep_to_sp(sptep); - if (is_last_spte(*sptep, level)) { + old_spte = *sptep; + if (is_last_spte(old_spte, level)) { pt_element_t gpte; gpa_t pte_gpa; @@ -927,7 +936,8 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) pte_gpa = FNAME(get_level1_sp_gpa)(sp); pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); - if (mmu_page_zap_pte(vcpu->kvm, sp, sptep)) + mmu_page_zap_pte(vcpu->kvm, sp, sptep, NULL); + if (is_shadow_present_pte(old_spte)) kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c new file mode 100644 index 000000000000..d9c5665a55e9 --- /dev/null +++ b/arch/x86/kvm/mmu/spte.c @@ -0,0 +1,318 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Kernel-based Virtual Machine driver for Linux + * + * Macros and functions to access KVM PTEs (also known as SPTEs) + * + * Copyright (C) 2006 Qumranet, Inc. + * Copyright 2020 Red Hat, Inc. and/or its affiliates. + */ + + +#include <linux/kvm_host.h> +#include "mmu.h" +#include "mmu_internal.h" +#include "x86.h" +#include "spte.h" + +#include <asm/e820/api.h> + +u64 __read_mostly shadow_nx_mask; +u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ +u64 __read_mostly shadow_user_mask; +u64 __read_mostly shadow_accessed_mask; +u64 __read_mostly shadow_dirty_mask; +u64 __read_mostly shadow_mmio_value; +u64 __read_mostly shadow_mmio_access_mask; +u64 __read_mostly shadow_present_mask; +u64 __read_mostly shadow_me_mask; +u64 __read_mostly shadow_acc_track_mask; + +u64 __read_mostly shadow_nonpresent_or_rsvd_mask; +u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; + +u8 __read_mostly shadow_phys_bits; + +static u64 generation_mmio_spte_mask(u64 gen) +{ + u64 mask; + + WARN_ON(gen & ~MMIO_SPTE_GEN_MASK); + BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK | MMIO_SPTE_GEN_LOW_MASK) & SPTE_SPECIAL_MASK); + + mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK; + mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK; + return mask; +} + +u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access) +{ + u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK; + u64 mask = generation_mmio_spte_mask(gen); + u64 gpa = gfn << PAGE_SHIFT; + + access &= shadow_mmio_access_mask; + mask |= shadow_mmio_value | access; + mask |= gpa | shadow_nonpresent_or_rsvd_mask; + mask |= (gpa & shadow_nonpresent_or_rsvd_mask) + << shadow_nonpresent_or_rsvd_mask_len; + + return mask; +} + +static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) +{ + if (pfn_valid(pfn)) + return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)) && + /* + * Some reserved pages, such as those from NVDIMM + * DAX devices, are not for MMIO, and can be mapped + * with cached memory type for better performance. + * However, the above check misconceives those pages + * as MMIO, and results in KVM mapping them with UC + * memory type, which would hurt the performance. + * Therefore, we check the host memory type in addition + * and only treat UC/UC-/WC pages as MMIO. + */ + (!pat_enabled() || pat_pfn_immune_to_uc_mtrr(pfn)); + + return !e820__mapped_raw_any(pfn_to_hpa(pfn), + pfn_to_hpa(pfn + 1) - 1, + E820_TYPE_RAM); +} + +int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, + gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool speculative, + bool can_unsync, bool host_writable, bool ad_disabled, + u64 *new_spte) +{ + u64 spte = 0; + int ret = 0; + + if (ad_disabled) + spte |= SPTE_AD_DISABLED_MASK; + else if (kvm_vcpu_ad_need_write_protect(vcpu)) + spte |= SPTE_AD_WRPROT_ONLY_MASK; + + /* + * For the EPT case, shadow_present_mask is 0 if hardware + * supports exec-only page table entries. In that case, + * ACC_USER_MASK and shadow_user_mask are used to represent + * read access. See FNAME(gpte_access) in paging_tmpl.h. + */ + spte |= shadow_present_mask; + if (!speculative) + spte |= spte_shadow_accessed_mask(spte); + + if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) && + is_nx_huge_page_enabled()) { + pte_access &= ~ACC_EXEC_MASK; + } + + if (pte_access & ACC_EXEC_MASK) + spte |= shadow_x_mask; + else + spte |= shadow_nx_mask; + + if (pte_access & ACC_USER_MASK) + spte |= shadow_user_mask; + + if (level > PG_LEVEL_4K) + spte |= PT_PAGE_SIZE_MASK; + if (tdp_enabled) + spte |= kvm_x86_ops.get_mt_mask(vcpu, gfn, + kvm_is_mmio_pfn(pfn)); + + if (host_writable) + spte |= SPTE_HOST_WRITEABLE; + else + pte_access &= ~ACC_WRITE_MASK; + + if (!kvm_is_mmio_pfn(pfn)) + spte |= shadow_me_mask; + + spte |= (u64)pfn << PAGE_SHIFT; + + if (pte_access & ACC_WRITE_MASK) { + spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE; + + /* + * Optimization: for pte sync, if spte was writable the hash + * lookup is unnecessary (and expensive). Write protection + * is responsibility of mmu_get_page / kvm_sync_page. + * Same reasoning can be applied to dirty page accounting. + */ + if (!can_unsync && is_writable_pte(old_spte)) + goto out; + + if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { + pgprintk("%s: found shadow page for %llx, marking ro\n", + __func__, gfn); + ret |= SET_SPTE_WRITE_PROTECTED_PT; + pte_access &= ~ACC_WRITE_MASK; + spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE); + } + } + + if (pte_access & ACC_WRITE_MASK) + spte |= spte_shadow_dirty_mask(spte); + + if (speculative) + spte = mark_spte_for_access_track(spte); + +out: + *new_spte = spte; + return ret; +} + +u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled) +{ + u64 spte; + + spte = __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK | + shadow_user_mask | shadow_x_mask | shadow_me_mask; + + if (ad_disabled) + spte |= SPTE_AD_DISABLED_MASK; + else + spte |= shadow_accessed_mask; + + return spte; +} + +u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn) +{ + u64 new_spte; + + new_spte = old_spte & ~PT64_BASE_ADDR_MASK; + new_spte |= (u64)new_pfn << PAGE_SHIFT; + + new_spte &= ~PT_WRITABLE_MASK; + new_spte &= ~SPTE_HOST_WRITEABLE; + + new_spte = mark_spte_for_access_track(new_spte); + + return new_spte; +} + +static u8 kvm_get_shadow_phys_bits(void) +{ + /* + * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected + * in CPU detection code, but the processor treats those reduced bits as + * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at + * the physical address bits reported by CPUID. + */ + if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008)) + return cpuid_eax(0x80000008) & 0xff; + + /* + * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with + * custom CPUID. Proceed with whatever the kernel found since these features + * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008). + */ + return boot_cpu_data.x86_phys_bits; +} + +u64 mark_spte_for_access_track(u64 spte) +{ + if (spte_ad_enabled(spte)) + return spte & ~shadow_accessed_mask; + + if (is_access_track_spte(spte)) + return spte; + + /* + * Making an Access Tracking PTE will result in removal of write access + * from the PTE. So, verify that we will be able to restore the write + * access in the fast page fault path later on. + */ + WARN_ONCE((spte & PT_WRITABLE_MASK) && + !spte_can_locklessly_be_made_writable(spte), + "kvm: Writable SPTE is not locklessly dirty-trackable\n"); + + WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask << + shadow_acc_track_saved_bits_shift), + "kvm: Access Tracking saved bit locations are not zero\n"); + + spte |= (spte & shadow_acc_track_saved_bits_mask) << + shadow_acc_track_saved_bits_shift; + spte &= ~shadow_acc_track_mask; + + return spte; +} + +void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask) +{ + BUG_ON((u64)(unsigned)access_mask != access_mask); + WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << shadow_nonpresent_or_rsvd_mask_len)); + WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask); + shadow_mmio_value = mmio_value | SPTE_MMIO_MASK; + shadow_mmio_access_mask = access_mask; +} +EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); + +/* + * Sets the shadow PTE masks used by the MMU. + * + * Assumptions: + * - Setting either @accessed_mask or @dirty_mask requires setting both + * - At least one of @accessed_mask or @acc_track_mask must be set + */ +void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, + u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, + u64 acc_track_mask, u64 me_mask) +{ + BUG_ON(!dirty_mask != !accessed_mask); + BUG_ON(!accessed_mask && !acc_track_mask); + BUG_ON(acc_track_mask & SPTE_SPECIAL_MASK); + + shadow_user_mask = user_mask; + shadow_accessed_mask = accessed_mask; + shadow_dirty_mask = dirty_mask; + shadow_nx_mask = nx_mask; + shadow_x_mask = x_mask; + shadow_present_mask = p_mask; + shadow_acc_track_mask = acc_track_mask; + shadow_me_mask = me_mask; +} +EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); + +void kvm_mmu_reset_all_pte_masks(void) +{ + u8 low_phys_bits; + + shadow_user_mask = 0; + shadow_accessed_mask = 0; + shadow_dirty_mask = 0; + shadow_nx_mask = 0; + shadow_x_mask = 0; + shadow_present_mask = 0; + shadow_acc_track_mask = 0; + + shadow_phys_bits = kvm_get_shadow_phys_bits(); + + /* + * If the CPU has 46 or less physical address bits, then set an + * appropriate mask to guard against L1TF attacks. Otherwise, it is + * assumed that the CPU is not vulnerable to L1TF. + * + * Some Intel CPUs address the L1 cache using more PA bits than are + * reported by CPUID. Use the PA width of the L1 cache when possible + * to achieve more effective mitigation, e.g. if system RAM overlaps + * the most significant bits of legal physical address space. + */ + shadow_nonpresent_or_rsvd_mask = 0; + low_phys_bits = boot_cpu_data.x86_phys_bits; + if (boot_cpu_has_bug(X86_BUG_L1TF) && + !WARN_ON_ONCE(boot_cpu_data.x86_cache_bits >= + 52 - shadow_nonpresent_or_rsvd_mask_len)) { + low_phys_bits = boot_cpu_data.x86_cache_bits + - shadow_nonpresent_or_rsvd_mask_len; + shadow_nonpresent_or_rsvd_mask = + rsvd_bits(low_phys_bits, boot_cpu_data.x86_cache_bits - 1); + } + + shadow_nonpresent_or_rsvd_lower_gfn_mask = + GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT); +} diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h new file mode 100644 index 000000000000..4ecf40e0b8fe --- /dev/null +++ b/arch/x86/kvm/mmu/spte.h @@ -0,0 +1,252 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#ifndef KVM_X86_MMU_SPTE_H +#define KVM_X86_MMU_SPTE_H + +#include "mmu_internal.h" + +#define PT_FIRST_AVAIL_BITS_SHIFT 10 +#define PT64_SECOND_AVAIL_BITS_SHIFT 54 + +/* + * The mask used to denote special SPTEs, which can be either MMIO SPTEs or + * Access Tracking SPTEs. + */ +#define SPTE_SPECIAL_MASK (3ULL << 52) +#define SPTE_AD_ENABLED_MASK (0ULL << 52) +#define SPTE_AD_DISABLED_MASK (1ULL << 52) +#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52) +#define SPTE_MMIO_MASK (3ULL << 52) + +#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK +#define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1)) +#else +#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) +#endif +#define PT64_LVL_ADDR_MASK(level) \ + (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ + * PT64_LEVEL_BITS))) - 1)) +#define PT64_LVL_OFFSET_MASK(level) \ + (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ + * PT64_LEVEL_BITS))) - 1)) + +#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \ + | shadow_x_mask | shadow_nx_mask | shadow_me_mask) + +#define ACC_EXEC_MASK 1 +#define ACC_WRITE_MASK PT_WRITABLE_MASK +#define ACC_USER_MASK PT_USER_MASK +#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) + +/* The mask for the R/X bits in EPT PTEs */ +#define PT64_EPT_READABLE_MASK 0x1ull +#define PT64_EPT_EXECUTABLE_MASK 0x4ull + +#define PT64_LEVEL_BITS 9 + +#define PT64_LEVEL_SHIFT(level) \ + (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS) + +#define PT64_INDEX(address, level)\ + (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) +#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) + + +#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) +#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1)) + +/* + * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of + * the memslots generation and is derived as follows: + * + * Bits 0-8 of the MMIO generation are propagated to spte bits 3-11 + * Bits 9-18 of the MMIO generation are propagated to spte bits 52-61 + * + * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in + * the MMIO generation number, as doing so would require stealing a bit from + * the "real" generation number and thus effectively halve the maximum number + * of MMIO generations that can be handled before encountering a wrap (which + * requires a full MMU zap). The flag is instead explicitly queried when + * checking for MMIO spte cache hits. + */ +#define MMIO_SPTE_GEN_MASK GENMASK_ULL(17, 0) + +#define MMIO_SPTE_GEN_LOW_START 3 +#define MMIO_SPTE_GEN_LOW_END 11 +#define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \ + MMIO_SPTE_GEN_LOW_START) + +#define MMIO_SPTE_GEN_HIGH_START PT64_SECOND_AVAIL_BITS_SHIFT +#define MMIO_SPTE_GEN_HIGH_END 62 +#define MMIO_SPTE_GEN_HIGH_MASK GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \ + MMIO_SPTE_GEN_HIGH_START) + +extern u64 __read_mostly shadow_nx_mask; +extern u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ +extern u64 __read_mostly shadow_user_mask; +extern u64 __read_mostly shadow_accessed_mask; +extern u64 __read_mostly shadow_dirty_mask; +extern u64 __read_mostly shadow_mmio_value; +extern u64 __read_mostly shadow_mmio_access_mask; +extern u64 __read_mostly shadow_present_mask; +extern u64 __read_mostly shadow_me_mask; + +/* + * SPTEs used by MMUs without A/D bits are marked with SPTE_AD_DISABLED_MASK; + * shadow_acc_track_mask is the set of bits to be cleared in non-accessed + * pages. + */ +extern u64 __read_mostly shadow_acc_track_mask; + +/* + * This mask must be set on all non-zero Non-Present or Reserved SPTEs in order + * to guard against L1TF attacks. + */ +extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask; + +/* + * The mask/shift to use for saving the original R/X bits when marking the PTE + * as not-present for access tracking purposes. We do not save the W bit as the + * PTEs being access tracked also need to be dirty tracked, so the W bit will be + * restored only when a write is attempted to the page. + */ +static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK | + PT64_EPT_EXECUTABLE_MASK; +static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT; + +/* + * The number of high-order 1 bits to use in the mask above. + */ +static const u64 shadow_nonpresent_or_rsvd_mask_len = 5; + +/* + * In some cases, we need to preserve the GFN of a non-present or reserved + * SPTE when we usurp the upper five bits of the physical address space to + * defend against L1TF, e.g. for MMIO SPTEs. To preserve the GFN, we'll + * shift bits of the GFN that overlap with shadow_nonpresent_or_rsvd_mask + * left into the reserved bits, i.e. the GFN in the SPTE will be split into + * high and low parts. This mask covers the lower bits of the GFN. + */ +extern u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; + +/* + * The number of non-reserved physical address bits irrespective of features + * that repurpose legal bits, e.g. MKTME. + */ +extern u8 __read_mostly shadow_phys_bits; + +static inline bool is_mmio_spte(u64 spte) +{ + return (spte & SPTE_SPECIAL_MASK) == SPTE_MMIO_MASK; +} + +static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) +{ + return sp->role.ad_disabled; +} + +static inline bool spte_ad_enabled(u64 spte) +{ + MMU_WARN_ON(is_mmio_spte(spte)); + return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_DISABLED_MASK; +} + +static inline bool spte_ad_need_write_protect(u64 spte) +{ + MMU_WARN_ON(is_mmio_spte(spte)); + return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK; +} + +static inline u64 spte_shadow_accessed_mask(u64 spte) +{ + MMU_WARN_ON(is_mmio_spte(spte)); + return spte_ad_enabled(spte) ? shadow_accessed_mask : 0; +} + +static inline u64 spte_shadow_dirty_mask(u64 spte) +{ + MMU_WARN_ON(is_mmio_spte(spte)); + return spte_ad_enabled(spte) ? shadow_dirty_mask : 0; +} + +static inline bool is_access_track_spte(u64 spte) +{ + return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0; +} + +static inline int is_shadow_present_pte(u64 pte) +{ + return (pte != 0) && !is_mmio_spte(pte); +} + +static inline int is_large_pte(u64 pte) +{ + return pte & PT_PAGE_SIZE_MASK; +} + +static inline int is_last_spte(u64 pte, int level) +{ + if (level == PG_LEVEL_4K) + return 1; + if (is_large_pte(pte)) + return 1; + return 0; +} + +static inline bool is_executable_pte(u64 spte) +{ + return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask; +} + +static inline kvm_pfn_t spte_to_pfn(u64 pte) +{ + return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; +} + +static inline bool is_accessed_spte(u64 spte) +{ + u64 accessed_mask = spte_shadow_accessed_mask(spte); + + return accessed_mask ? spte & accessed_mask + : !is_access_track_spte(spte); +} + +static inline bool is_dirty_spte(u64 spte) +{ + u64 dirty_mask = spte_shadow_dirty_mask(spte); + + return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK; +} + +static inline bool spte_can_locklessly_be_made_writable(u64 spte) +{ + return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) == + (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE); +} + +static inline u64 get_mmio_spte_generation(u64 spte) +{ + u64 gen; + + gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START; + gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START; + return gen; +} + +/* Bits which may be returned by set_spte() */ +#define SET_SPTE_WRITE_PROTECTED_PT BIT(0) +#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1) +#define SET_SPTE_SPURIOUS BIT(2) + +int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, + gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool speculative, + bool can_unsync, bool host_writable, bool ad_disabled, + u64 *new_spte); +u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled); +u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access); +u64 mark_spte_for_access_track(u64 spte); +u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn); + +void kvm_mmu_reset_all_pte_masks(void); + +#endif diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c new file mode 100644 index 000000000000..87b7e16911db --- /dev/null +++ b/arch/x86/kvm/mmu/tdp_iter.c @@ -0,0 +1,182 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "mmu_internal.h" +#include "tdp_iter.h" +#include "spte.h" + +/* + * Recalculates the pointer to the SPTE for the current GFN and level and + * reread the SPTE. + */ +static void tdp_iter_refresh_sptep(struct tdp_iter *iter) +{ + iter->sptep = iter->pt_path[iter->level - 1] + + SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level); + iter->old_spte = READ_ONCE(*iter->sptep); +} + +static gfn_t round_gfn_for_level(gfn_t gfn, int level) +{ + return gfn & -KVM_PAGES_PER_HPAGE(level); +} + +/* + * Sets a TDP iterator to walk a pre-order traversal of the paging structure + * rooted at root_pt, starting with the walk to translate goal_gfn. + */ +void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level, + int min_level, gfn_t goal_gfn) +{ + WARN_ON(root_level < 1); + WARN_ON(root_level > PT64_ROOT_MAX_LEVEL); + + iter->goal_gfn = goal_gfn; + iter->root_level = root_level; + iter->min_level = min_level; + iter->level = root_level; + iter->pt_path[iter->level - 1] = root_pt; + + iter->gfn = round_gfn_for_level(iter->goal_gfn, iter->level); + tdp_iter_refresh_sptep(iter); + + iter->valid = true; +} + +/* + * Given an SPTE and its level, returns a pointer containing the host virtual + * address of the child page table referenced by the SPTE. Returns null if + * there is no such entry. + */ +u64 *spte_to_child_pt(u64 spte, int level) +{ + /* + * There's no child entry if this entry isn't present or is a + * last-level entry. + */ + if (!is_shadow_present_pte(spte) || is_last_spte(spte, level)) + return NULL; + + return __va(spte_to_pfn(spte) << PAGE_SHIFT); +} + +/* + * Steps down one level in the paging structure towards the goal GFN. Returns + * true if the iterator was able to step down a level, false otherwise. + */ +static bool try_step_down(struct tdp_iter *iter) +{ + u64 *child_pt; + + if (iter->level == iter->min_level) + return false; + + /* + * Reread the SPTE before stepping down to avoid traversing into page + * tables that are no longer linked from this entry. + */ + iter->old_spte = READ_ONCE(*iter->sptep); + + child_pt = spte_to_child_pt(iter->old_spte, iter->level); + if (!child_pt) + return false; + + iter->level--; + iter->pt_path[iter->level - 1] = child_pt; + iter->gfn = round_gfn_for_level(iter->goal_gfn, iter->level); + tdp_iter_refresh_sptep(iter); + + return true; +} + +/* + * Steps to the next entry in the current page table, at the current page table + * level. The next entry could point to a page backing guest memory or another + * page table, or it could be non-present. Returns true if the iterator was + * able to step to the next entry in the page table, false if the iterator was + * already at the end of the current page table. + */ +static bool try_step_side(struct tdp_iter *iter) +{ + /* + * Check if the iterator is already at the end of the current page + * table. + */ + if (SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level) == + (PT64_ENT_PER_PAGE - 1)) + return false; + + iter->gfn += KVM_PAGES_PER_HPAGE(iter->level); + iter->goal_gfn = iter->gfn; + iter->sptep++; + iter->old_spte = READ_ONCE(*iter->sptep); + + return true; +} + +/* + * Tries to traverse back up a level in the paging structure so that the walk + * can continue from the next entry in the parent page table. Returns true on a + * successful step up, false if already in the root page. + */ +static bool try_step_up(struct tdp_iter *iter) +{ + if (iter->level == iter->root_level) + return false; + + iter->level++; + iter->gfn = round_gfn_for_level(iter->gfn, iter->level); + tdp_iter_refresh_sptep(iter); + + return true; +} + +/* + * Step to the next SPTE in a pre-order traversal of the paging structure. + * To get to the next SPTE, the iterator either steps down towards the goal + * GFN, if at a present, non-last-level SPTE, or over to a SPTE mapping a + * highter GFN. + * + * The basic algorithm is as follows: + * 1. If the current SPTE is a non-last-level SPTE, step down into the page + * table it points to. + * 2. If the iterator cannot step down, it will try to step to the next SPTE + * in the current page of the paging structure. + * 3. If the iterator cannot step to the next entry in the current page, it will + * try to step up to the parent paging structure page. In this case, that + * SPTE will have already been visited, and so the iterator must also step + * to the side again. + */ +void tdp_iter_next(struct tdp_iter *iter) +{ + if (try_step_down(iter)) + return; + + do { + if (try_step_side(iter)) + return; + } while (try_step_up(iter)); + iter->valid = false; +} + +/* + * Restart the walk over the paging structure from the root, starting from the + * highest gfn the iterator had previously reached. Assumes that the entire + * paging structure, except the root page, may have been completely torn down + * and rebuilt. + */ +void tdp_iter_refresh_walk(struct tdp_iter *iter) +{ + gfn_t goal_gfn = iter->goal_gfn; + + if (iter->gfn > goal_gfn) + goal_gfn = iter->gfn; + + tdp_iter_start(iter, iter->pt_path[iter->root_level - 1], + iter->root_level, iter->min_level, goal_gfn); +} + +u64 *tdp_iter_root_pt(struct tdp_iter *iter) +{ + return iter->pt_path[iter->root_level - 1]; +} + diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h new file mode 100644 index 000000000000..47170d0dc98e --- /dev/null +++ b/arch/x86/kvm/mmu/tdp_iter.h @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: GPL-2.0 + +#ifndef __KVM_X86_MMU_TDP_ITER_H +#define __KVM_X86_MMU_TDP_ITER_H + +#include <linux/kvm_host.h> + +#include "mmu.h" + +/* + * A TDP iterator performs a pre-order walk over a TDP paging structure. + */ +struct tdp_iter { + /* + * The iterator will traverse the paging structure towards the mapping + * for this GFN. + */ + gfn_t goal_gfn; + /* Pointers to the page tables traversed to reach the current SPTE */ + u64 *pt_path[PT64_ROOT_MAX_LEVEL]; + /* A pointer to the current SPTE */ + u64 *sptep; + /* The lowest GFN mapped by the current SPTE */ + gfn_t gfn; + /* The level of the root page given to the iterator */ + int root_level; + /* The lowest level the iterator should traverse to */ + int min_level; + /* The iterator's current level within the paging structure */ + int level; + /* A snapshot of the value at sptep */ + u64 old_spte; + /* + * Whether the iterator has a valid state. This will be false if the + * iterator walks off the end of the paging structure. + */ + bool valid; +}; + +/* + * Iterates over every SPTE mapping the GFN range [start, end) in a + * preorder traversal. + */ +#define for_each_tdp_pte_min_level(iter, root, root_level, min_level, start, end) \ + for (tdp_iter_start(&iter, root, root_level, min_level, start); \ + iter.valid && iter.gfn < end; \ + tdp_iter_next(&iter)) + +#define for_each_tdp_pte(iter, root, root_level, start, end) \ + for_each_tdp_pte_min_level(iter, root, root_level, PG_LEVEL_4K, start, end) + +u64 *spte_to_child_pt(u64 pte, int level); + +void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level, + int min_level, gfn_t goal_gfn); +void tdp_iter_next(struct tdp_iter *iter); +void tdp_iter_refresh_walk(struct tdp_iter *iter); +u64 *tdp_iter_root_pt(struct tdp_iter *iter); + +#endif /* __KVM_X86_MMU_TDP_ITER_H */ diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c new file mode 100644 index 000000000000..27e381c9da6c --- /dev/null +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -0,0 +1,1157 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "mmu.h" +#include "mmu_internal.h" +#include "mmutrace.h" +#include "tdp_iter.h" +#include "tdp_mmu.h" +#include "spte.h" + +#ifdef CONFIG_X86_64 +static bool __read_mostly tdp_mmu_enabled = false; +module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644); +#endif + +static bool is_tdp_mmu_enabled(void) +{ +#ifdef CONFIG_X86_64 + return tdp_enabled && READ_ONCE(tdp_mmu_enabled); +#else + return false; +#endif /* CONFIG_X86_64 */ +} + +/* Initializes the TDP MMU for the VM, if enabled. */ +void kvm_mmu_init_tdp_mmu(struct kvm *kvm) +{ + if (!is_tdp_mmu_enabled()) + return; + + /* This should not be changed for the lifetime of the VM. */ + kvm->arch.tdp_mmu_enabled = true; + + INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots); + INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages); +} + +void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) +{ + if (!kvm->arch.tdp_mmu_enabled) + return; + + WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots)); +} + +#define for_each_tdp_mmu_root(_kvm, _root) \ + list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) + +bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa) +{ + struct kvm_mmu_page *sp; + + sp = to_shadow_page(hpa); + + return sp->tdp_mmu_page && sp->root_count; +} + +static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + gfn_t start, gfn_t end, bool can_yield); + +void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root) +{ + gfn_t max_gfn = 1ULL << (boot_cpu_data.x86_phys_bits - PAGE_SHIFT); + + lockdep_assert_held(&kvm->mmu_lock); + + WARN_ON(root->root_count); + WARN_ON(!root->tdp_mmu_page); + + list_del(&root->link); + + zap_gfn_range(kvm, root, 0, max_gfn, false); + + free_page((unsigned long)root->spt); + kmem_cache_free(mmu_page_header_cache, root); +} + +static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu, + int level) +{ + union kvm_mmu_page_role role; + + role = vcpu->arch.mmu->mmu_role.base; + role.level = level; + role.direct = true; + role.gpte_is_8_bytes = true; + role.access = ACC_ALL; + + return role; +} + +static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn, + int level) +{ + struct kvm_mmu_page *sp; + + sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); + sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache); + set_page_private(virt_to_page(sp->spt), (unsigned long)sp); + + sp->role.word = page_role_for_level(vcpu, level).word; + sp->gfn = gfn; + sp->tdp_mmu_page = true; + + return sp; +} + +static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu) +{ + union kvm_mmu_page_role role; + struct kvm *kvm = vcpu->kvm; + struct kvm_mmu_page *root; + + role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level); + + spin_lock(&kvm->mmu_lock); + + /* Check for an existing root before allocating a new one. */ + for_each_tdp_mmu_root(kvm, root) { + if (root->role.word == role.word) { + kvm_mmu_get_root(kvm, root); + spin_unlock(&kvm->mmu_lock); + return root; + } + } + + root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level); + root->root_count = 1; + + list_add(&root->link, &kvm->arch.tdp_mmu_roots); + + spin_unlock(&kvm->mmu_lock); + + return root; +} + +hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu_page *root; + + root = get_tdp_mmu_vcpu_root(vcpu); + if (!root) + return INVALID_PAGE; + + return __pa(root->spt); +} + +static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, + u64 old_spte, u64 new_spte, int level); + +static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp) +{ + return sp->role.smm ? 1 : 0; +} + +static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level) +{ + bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); + + if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level)) + return; + + if (is_accessed_spte(old_spte) && + (!is_accessed_spte(new_spte) || pfn_changed)) + kvm_set_pfn_accessed(spte_to_pfn(old_spte)); +} + +static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn, + u64 old_spte, u64 new_spte, int level) +{ + bool pfn_changed; + struct kvm_memory_slot *slot; + + if (level > PG_LEVEL_4K) + return; + + pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); + + if ((!is_writable_pte(old_spte) || pfn_changed) && + is_writable_pte(new_spte)) { + slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn); + mark_page_dirty_in_slot(slot, gfn); + } +} + +/** + * handle_changed_spte - handle bookkeeping associated with an SPTE change + * @kvm: kvm instance + * @as_id: the address space of the paging structure the SPTE was a part of + * @gfn: the base GFN that was mapped by the SPTE + * @old_spte: The value of the SPTE before the change + * @new_spte: The value of the SPTE after the change + * @level: the level of the PT the SPTE is part of in the paging structure + * + * Handle bookkeeping that might result from the modification of a SPTE. + * This function must be called for all TDP SPTE modifications. + */ +static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, + u64 old_spte, u64 new_spte, int level) +{ + bool was_present = is_shadow_present_pte(old_spte); + bool is_present = is_shadow_present_pte(new_spte); + bool was_leaf = was_present && is_last_spte(old_spte, level); + bool is_leaf = is_present && is_last_spte(new_spte, level); + bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); + u64 *pt; + struct kvm_mmu_page *sp; + u64 old_child_spte; + int i; + + WARN_ON(level > PT64_ROOT_MAX_LEVEL); + WARN_ON(level < PG_LEVEL_4K); + WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1)); + + /* + * If this warning were to trigger it would indicate that there was a + * missing MMU notifier or a race with some notifier handler. + * A present, leaf SPTE should never be directly replaced with another + * present leaf SPTE pointing to a differnt PFN. A notifier handler + * should be zapping the SPTE before the main MM's page table is + * changed, or the SPTE should be zeroed, and the TLBs flushed by the + * thread before replacement. + */ + if (was_leaf && is_leaf && pfn_changed) { + pr_err("Invalid SPTE change: cannot replace a present leaf\n" + "SPTE with another present leaf SPTE mapping a\n" + "different PFN!\n" + "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d", + as_id, gfn, old_spte, new_spte, level); + + /* + * Crash the host to prevent error propagation and guest data + * courruption. + */ + BUG(); + } + + if (old_spte == new_spte) + return; + + /* + * The only times a SPTE should be changed from a non-present to + * non-present state is when an MMIO entry is installed/modified/ + * removed. In that case, there is nothing to do here. + */ + if (!was_present && !is_present) { + /* + * If this change does not involve a MMIO SPTE, it is + * unexpected. Log the change, though it should not impact the + * guest since both the former and current SPTEs are nonpresent. + */ + if (WARN_ON(!is_mmio_spte(old_spte) && !is_mmio_spte(new_spte))) + pr_err("Unexpected SPTE change! Nonpresent SPTEs\n" + "should not be replaced with another,\n" + "different nonpresent SPTE, unless one or both\n" + "are MMIO SPTEs.\n" + "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d", + as_id, gfn, old_spte, new_spte, level); + return; + } + + + if (was_leaf && is_dirty_spte(old_spte) && + (!is_dirty_spte(new_spte) || pfn_changed)) + kvm_set_pfn_dirty(spte_to_pfn(old_spte)); + + /* + * Recursively handle child PTs if the change removed a subtree from + * the paging structure. + */ + if (was_present && !was_leaf && (pfn_changed || !is_present)) { + pt = spte_to_child_pt(old_spte, level); + sp = sptep_to_sp(pt); + + list_del(&sp->link); + + if (sp->lpage_disallowed) + unaccount_huge_nx_page(kvm, sp); + + for (i = 0; i < PT64_ENT_PER_PAGE; i++) { + old_child_spte = READ_ONCE(*(pt + i)); + WRITE_ONCE(*(pt + i), 0); + handle_changed_spte(kvm, as_id, + gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)), + old_child_spte, 0, level - 1); + } + + kvm_flush_remote_tlbs_with_address(kvm, gfn, + KVM_PAGES_PER_HPAGE(level)); + + free_page((unsigned long)pt); + kmem_cache_free(mmu_page_header_cache, sp); + } +} + +static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, + u64 old_spte, u64 new_spte, int level) +{ + __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level); + handle_changed_spte_acc_track(old_spte, new_spte, level); + handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte, + new_spte, level); +} + +static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, + u64 new_spte, bool record_acc_track, + bool record_dirty_log) +{ + u64 *root_pt = tdp_iter_root_pt(iter); + struct kvm_mmu_page *root = sptep_to_sp(root_pt); + int as_id = kvm_mmu_page_as_id(root); + + WRITE_ONCE(*iter->sptep, new_spte); + + __handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte, + iter->level); + if (record_acc_track) + handle_changed_spte_acc_track(iter->old_spte, new_spte, + iter->level); + if (record_dirty_log) + handle_changed_spte_dirty_log(kvm, as_id, iter->gfn, + iter->old_spte, new_spte, + iter->level); +} + +static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, + u64 new_spte) +{ + __tdp_mmu_set_spte(kvm, iter, new_spte, true, true); +} + +static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm, + struct tdp_iter *iter, + u64 new_spte) +{ + __tdp_mmu_set_spte(kvm, iter, new_spte, false, true); +} + +static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm, + struct tdp_iter *iter, + u64 new_spte) +{ + __tdp_mmu_set_spte(kvm, iter, new_spte, true, false); +} + +#define tdp_root_for_each_pte(_iter, _root, _start, _end) \ + for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end) + +#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \ + tdp_root_for_each_pte(_iter, _root, _start, _end) \ + if (!is_shadow_present_pte(_iter.old_spte) || \ + !is_last_spte(_iter.old_spte, _iter.level)) \ + continue; \ + else + +#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \ + for_each_tdp_pte(_iter, __va(_mmu->root_hpa), \ + _mmu->shadow_root_level, _start, _end) + +/* + * Flush the TLB if the process should drop kvm->mmu_lock. + * Return whether the caller still needs to flush the tlb. + */ +static bool tdp_mmu_iter_flush_cond_resched(struct kvm *kvm, struct tdp_iter *iter) +{ + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { + kvm_flush_remote_tlbs(kvm); + cond_resched_lock(&kvm->mmu_lock); + tdp_iter_refresh_walk(iter); + return false; + } else { + return true; + } +} + +static void tdp_mmu_iter_cond_resched(struct kvm *kvm, struct tdp_iter *iter) +{ + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { + cond_resched_lock(&kvm->mmu_lock); + tdp_iter_refresh_walk(iter); + } +} + +/* + * Tears down the mappings for the range of gfns, [start, end), and frees the + * non-root pages mapping GFNs strictly within that range. Returns true if + * SPTEs have been cleared and a TLB flush is needed before releasing the + * MMU lock. + * If can_yield is true, will release the MMU lock and reschedule if the + * scheduler needs the CPU or there is contention on the MMU lock. If this + * function cannot yield, it will not release the MMU lock or reschedule and + * the caller must ensure it does not supply too large a GFN range, or the + * operation can cause a soft lockup. + */ +static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + gfn_t start, gfn_t end, bool can_yield) +{ + struct tdp_iter iter; + bool flush_needed = false; + + tdp_root_for_each_pte(iter, root, start, end) { + if (!is_shadow_present_pte(iter.old_spte)) + continue; + + /* + * If this is a non-last-level SPTE that covers a larger range + * than should be zapped, continue, and zap the mappings at a + * lower level. + */ + if ((iter.gfn < start || + iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) && + !is_last_spte(iter.old_spte, iter.level)) + continue; + + tdp_mmu_set_spte(kvm, &iter, 0); + + if (can_yield) + flush_needed = tdp_mmu_iter_flush_cond_resched(kvm, &iter); + else + flush_needed = true; + } + return flush_needed; +} + +/* + * Tears down the mappings for the range of gfns, [start, end), and frees the + * non-root pages mapping GFNs strictly within that range. Returns true if + * SPTEs have been cleared and a TLB flush is needed before releasing the + * MMU lock. + */ +bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end) +{ + struct kvm_mmu_page *root; + bool flush = false; + + for_each_tdp_mmu_root(kvm, root) { + /* + * Take a reference on the root so that it cannot be freed if + * this thread releases the MMU lock and yields in this loop. + */ + kvm_mmu_get_root(kvm, root); + + flush |= zap_gfn_range(kvm, root, start, end, true); + + kvm_mmu_put_root(kvm, root); + } + + return flush; +} + +void kvm_tdp_mmu_zap_all(struct kvm *kvm) +{ + gfn_t max_gfn = 1ULL << (boot_cpu_data.x86_phys_bits - PAGE_SHIFT); + bool flush; + + flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn); + if (flush) + kvm_flush_remote_tlbs(kvm); +} + +/* + * Installs a last-level SPTE to handle a TDP page fault. + * (NPT/EPT violation/misconfiguration) + */ +static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, + int map_writable, + struct tdp_iter *iter, + kvm_pfn_t pfn, bool prefault) +{ + u64 new_spte; + int ret = 0; + int make_spte_ret = 0; + + if (unlikely(is_noslot_pfn(pfn))) { + new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL); + trace_mark_mmio_spte(iter->sptep, iter->gfn, new_spte); + } else + make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn, + pfn, iter->old_spte, prefault, true, + map_writable, !shadow_accessed_mask, + &new_spte); + + if (new_spte == iter->old_spte) + ret = RET_PF_SPURIOUS; + else + tdp_mmu_set_spte(vcpu->kvm, iter, new_spte); + + /* + * If the page fault was caused by a write but the page is write + * protected, emulation is needed. If the emulation was skipped, + * the vCPU would have the same fault again. + */ + if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) { + if (write) + ret = RET_PF_EMULATE; + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); + } + + /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */ + if (unlikely(is_mmio_spte(new_spte))) + ret = RET_PF_EMULATE; + + trace_kvm_mmu_set_spte(iter->level, iter->gfn, iter->sptep); + if (!prefault) + vcpu->stat.pf_fixed++; + + return ret; +} + +/* + * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing + * page tables and SPTEs to translate the faulting guest physical address. + */ +int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, + int map_writable, int max_level, kvm_pfn_t pfn, + bool prefault) +{ + bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(); + bool write = error_code & PFERR_WRITE_MASK; + bool exec = error_code & PFERR_FETCH_MASK; + bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled; + struct kvm_mmu *mmu = vcpu->arch.mmu; + struct tdp_iter iter; + struct kvm_mmu_page *sp; + u64 *child_pt; + u64 new_spte; + int ret; + gfn_t gfn = gpa >> PAGE_SHIFT; + int level; + int req_level; + + if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) + return RET_PF_RETRY; + if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa))) + return RET_PF_RETRY; + + level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn, + huge_page_disallowed, &req_level); + + trace_kvm_mmu_spte_requested(gpa, level, pfn); + tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { + if (nx_huge_page_workaround_enabled) + disallowed_hugepage_adjust(iter.old_spte, gfn, + iter.level, &pfn, &level); + + if (iter.level == level) + break; + + /* + * If there is an SPTE mapping a large page at a higher level + * than the target, that SPTE must be cleared and replaced + * with a non-leaf SPTE. + */ + if (is_shadow_present_pte(iter.old_spte) && + is_large_pte(iter.old_spte)) { + tdp_mmu_set_spte(vcpu->kvm, &iter, 0); + + kvm_flush_remote_tlbs_with_address(vcpu->kvm, iter.gfn, + KVM_PAGES_PER_HPAGE(iter.level)); + + /* + * The iter must explicitly re-read the spte here + * because the new value informs the !present + * path below. + */ + iter.old_spte = READ_ONCE(*iter.sptep); + } + + if (!is_shadow_present_pte(iter.old_spte)) { + sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level); + list_add(&sp->link, &vcpu->kvm->arch.tdp_mmu_pages); + child_pt = sp->spt; + clear_page(child_pt); + new_spte = make_nonleaf_spte(child_pt, + !shadow_accessed_mask); + + trace_kvm_mmu_get_page(sp, true); + if (huge_page_disallowed && req_level >= iter.level) + account_huge_nx_page(vcpu->kvm, sp); + + tdp_mmu_set_spte(vcpu->kvm, &iter, new_spte); + } + } + + if (WARN_ON(iter.level != level)) + return RET_PF_RETRY; + + ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter, + pfn, prefault); + + return ret; +} + +static int kvm_tdp_mmu_handle_hva_range(struct kvm *kvm, unsigned long start, + unsigned long end, unsigned long data, + int (*handler)(struct kvm *kvm, struct kvm_memory_slot *slot, + struct kvm_mmu_page *root, gfn_t start, + gfn_t end, unsigned long data)) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + struct kvm_mmu_page *root; + int ret = 0; + int as_id; + + for_each_tdp_mmu_root(kvm, root) { + /* + * Take a reference on the root so that it cannot be freed if + * this thread releases the MMU lock and yields in this loop. + */ + kvm_mmu_get_root(kvm, root); + + as_id = kvm_mmu_page_as_id(root); + slots = __kvm_memslots(kvm, as_id); + kvm_for_each_memslot(memslot, slots) { + unsigned long hva_start, hva_end; + gfn_t gfn_start, gfn_end; + + hva_start = max(start, memslot->userspace_addr); + hva_end = min(end, memslot->userspace_addr + + (memslot->npages << PAGE_SHIFT)); + if (hva_start >= hva_end) + continue; + /* + * {gfn(page) | page intersects with [hva_start, hva_end)} = + * {gfn_start, gfn_start+1, ..., gfn_end-1}. + */ + gfn_start = hva_to_gfn_memslot(hva_start, memslot); + gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); + + ret |= handler(kvm, memslot, root, gfn_start, + gfn_end, data); + } + + kvm_mmu_put_root(kvm, root); + } + + return ret; +} + +static int zap_gfn_range_hva_wrapper(struct kvm *kvm, + struct kvm_memory_slot *slot, + struct kvm_mmu_page *root, gfn_t start, + gfn_t end, unsigned long unused) +{ + return zap_gfn_range(kvm, root, start, end, false); +} + +int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start, + unsigned long end) +{ + return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0, + zap_gfn_range_hva_wrapper); +} + +/* + * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero + * if any of the GFNs in the range have been accessed. + */ +static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot, + struct kvm_mmu_page *root, gfn_t start, gfn_t end, + unsigned long unused) +{ + struct tdp_iter iter; + int young = 0; + u64 new_spte = 0; + + tdp_root_for_each_leaf_pte(iter, root, start, end) { + /* + * If we have a non-accessed entry we don't need to change the + * pte. + */ + if (!is_accessed_spte(iter.old_spte)) + continue; + + new_spte = iter.old_spte; + + if (spte_ad_enabled(new_spte)) { + clear_bit((ffs(shadow_accessed_mask) - 1), + (unsigned long *)&new_spte); + } else { + /* + * Capture the dirty status of the page, so that it doesn't get + * lost when the SPTE is marked for access tracking. + */ + if (is_writable_pte(new_spte)) + kvm_set_pfn_dirty(spte_to_pfn(new_spte)); + + new_spte = mark_spte_for_access_track(new_spte); + } + new_spte &= ~shadow_dirty_mask; + + tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte); + young = 1; + } + + return young; +} + +int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start, + unsigned long end) +{ + return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0, + age_gfn_range); +} + +static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, + struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused, + unsigned long unused2) +{ + struct tdp_iter iter; + + tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) + if (is_accessed_spte(iter.old_spte)) + return 1; + + return 0; +} + +int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva) +{ + return kvm_tdp_mmu_handle_hva_range(kvm, hva, hva + 1, 0, + test_age_gfn); +} + +/* + * Handle the changed_pte MMU notifier for the TDP MMU. + * data is a pointer to the new pte_t mapping the HVA specified by the MMU + * notifier. + * Returns non-zero if a flush is needed before releasing the MMU lock. + */ +static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot, + struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused, + unsigned long data) +{ + struct tdp_iter iter; + pte_t *ptep = (pte_t *)data; + kvm_pfn_t new_pfn; + u64 new_spte; + int need_flush = 0; + + WARN_ON(pte_huge(*ptep)); + + new_pfn = pte_pfn(*ptep); + + tdp_root_for_each_pte(iter, root, gfn, gfn + 1) { + if (iter.level != PG_LEVEL_4K) + continue; + + if (!is_shadow_present_pte(iter.old_spte)) + break; + + tdp_mmu_set_spte(kvm, &iter, 0); + + kvm_flush_remote_tlbs_with_address(kvm, iter.gfn, 1); + + if (!pte_write(*ptep)) { + new_spte = kvm_mmu_changed_pte_notifier_make_spte( + iter.old_spte, new_pfn); + + tdp_mmu_set_spte(kvm, &iter, new_spte); + } + + need_flush = 1; + } + + if (need_flush) + kvm_flush_remote_tlbs_with_address(kvm, gfn, 1); + + return 0; +} + +int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address, + pte_t *host_ptep) +{ + return kvm_tdp_mmu_handle_hva_range(kvm, address, address + 1, + (unsigned long)host_ptep, + set_tdp_spte); +} + +/* + * Remove write access from all the SPTEs mapping GFNs [start, end). If + * skip_4k is set, SPTEs that map 4k pages, will not be write-protected. + * Returns true if an SPTE has been changed and the TLBs need to be flushed. + */ +static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + gfn_t start, gfn_t end, int min_level) +{ + struct tdp_iter iter; + u64 new_spte; + bool spte_set = false; + + BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL); + + for_each_tdp_pte_min_level(iter, root->spt, root->role.level, + min_level, start, end) { + if (!is_shadow_present_pte(iter.old_spte) || + !is_last_spte(iter.old_spte, iter.level)) + continue; + + new_spte = iter.old_spte & ~PT_WRITABLE_MASK; + + tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); + spte_set = true; + + tdp_mmu_iter_cond_resched(kvm, &iter); + } + return spte_set; +} + +/* + * Remove write access from all the SPTEs mapping GFNs in the memslot. Will + * only affect leaf SPTEs down to min_level. + * Returns true if an SPTE has been changed and the TLBs need to be flushed. + */ +bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot, + int min_level) +{ + struct kvm_mmu_page *root; + int root_as_id; + bool spte_set = false; + + for_each_tdp_mmu_root(kvm, root) { + root_as_id = kvm_mmu_page_as_id(root); + if (root_as_id != slot->as_id) + continue; + + /* + * Take a reference on the root so that it cannot be freed if + * this thread releases the MMU lock and yields in this loop. + */ + kvm_mmu_get_root(kvm, root); + + spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn, + slot->base_gfn + slot->npages, min_level); + + kvm_mmu_put_root(kvm, root); + } + + return spte_set; +} + +/* + * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If + * AD bits are enabled, this will involve clearing the dirty bit on each SPTE. + * If AD bits are not enabled, this will require clearing the writable bit on + * each SPTE. Returns true if an SPTE has been changed and the TLBs need to + * be flushed. + */ +static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + gfn_t start, gfn_t end) +{ + struct tdp_iter iter; + u64 new_spte; + bool spte_set = false; + + tdp_root_for_each_leaf_pte(iter, root, start, end) { + if (spte_ad_need_write_protect(iter.old_spte)) { + if (is_writable_pte(iter.old_spte)) + new_spte = iter.old_spte & ~PT_WRITABLE_MASK; + else + continue; + } else { + if (iter.old_spte & shadow_dirty_mask) + new_spte = iter.old_spte & ~shadow_dirty_mask; + else + continue; + } + + tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); + spte_set = true; + + tdp_mmu_iter_cond_resched(kvm, &iter); + } + return spte_set; +} + +/* + * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If + * AD bits are enabled, this will involve clearing the dirty bit on each SPTE. + * If AD bits are not enabled, this will require clearing the writable bit on + * each SPTE. Returns true if an SPTE has been changed and the TLBs need to + * be flushed. + */ +bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot) +{ + struct kvm_mmu_page *root; + int root_as_id; + bool spte_set = false; + + for_each_tdp_mmu_root(kvm, root) { + root_as_id = kvm_mmu_page_as_id(root); + if (root_as_id != slot->as_id) + continue; + + /* + * Take a reference on the root so that it cannot be freed if + * this thread releases the MMU lock and yields in this loop. + */ + kvm_mmu_get_root(kvm, root); + + spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn, + slot->base_gfn + slot->npages); + + kvm_mmu_put_root(kvm, root); + } + + return spte_set; +} + +/* + * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is + * set in mask, starting at gfn. The given memslot is expected to contain all + * the GFNs represented by set bits in the mask. If AD bits are enabled, + * clearing the dirty status will involve clearing the dirty bit on each SPTE + * or, if AD bits are not enabled, clearing the writable bit on each SPTE. + */ +static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, + gfn_t gfn, unsigned long mask, bool wrprot) +{ + struct tdp_iter iter; + u64 new_spte; + + tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask), + gfn + BITS_PER_LONG) { + if (!mask) + break; + + if (iter.level > PG_LEVEL_4K || + !(mask & (1UL << (iter.gfn - gfn)))) + continue; + + if (wrprot || spte_ad_need_write_protect(iter.old_spte)) { + if (is_writable_pte(iter.old_spte)) + new_spte = iter.old_spte & ~PT_WRITABLE_MASK; + else + continue; + } else { + if (iter.old_spte & shadow_dirty_mask) + new_spte = iter.old_spte & ~shadow_dirty_mask; + else + continue; + } + + tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); + + mask &= ~(1UL << (iter.gfn - gfn)); + } +} + +/* + * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is + * set in mask, starting at gfn. The given memslot is expected to contain all + * the GFNs represented by set bits in the mask. If AD bits are enabled, + * clearing the dirty status will involve clearing the dirty bit on each SPTE + * or, if AD bits are not enabled, clearing the writable bit on each SPTE. + */ +void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn, unsigned long mask, + bool wrprot) +{ + struct kvm_mmu_page *root; + int root_as_id; + + lockdep_assert_held(&kvm->mmu_lock); + for_each_tdp_mmu_root(kvm, root) { + root_as_id = kvm_mmu_page_as_id(root); + if (root_as_id != slot->as_id) + continue; + + clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot); + } +} + +/* + * Set the dirty status of all the SPTEs mapping GFNs in the memslot. This is + * only used for PML, and so will involve setting the dirty bit on each SPTE. + * Returns true if an SPTE has been changed and the TLBs need to be flushed. + */ +static bool set_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + gfn_t start, gfn_t end) +{ + struct tdp_iter iter; + u64 new_spte; + bool spte_set = false; + + tdp_root_for_each_pte(iter, root, start, end) { + if (!is_shadow_present_pte(iter.old_spte)) + continue; + + new_spte = iter.old_spte | shadow_dirty_mask; + + tdp_mmu_set_spte(kvm, &iter, new_spte); + spte_set = true; + + tdp_mmu_iter_cond_resched(kvm, &iter); + } + + return spte_set; +} + +/* + * Set the dirty status of all the SPTEs mapping GFNs in the memslot. This is + * only used for PML, and so will involve setting the dirty bit on each SPTE. + * Returns true if an SPTE has been changed and the TLBs need to be flushed. + */ +bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot) +{ + struct kvm_mmu_page *root; + int root_as_id; + bool spte_set = false; + + for_each_tdp_mmu_root(kvm, root) { + root_as_id = kvm_mmu_page_as_id(root); + if (root_as_id != slot->as_id) + continue; + + /* + * Take a reference on the root so that it cannot be freed if + * this thread releases the MMU lock and yields in this loop. + */ + kvm_mmu_get_root(kvm, root); + + spte_set |= set_dirty_gfn_range(kvm, root, slot->base_gfn, + slot->base_gfn + slot->npages); + + kvm_mmu_put_root(kvm, root); + } + return spte_set; +} + +/* + * Clear non-leaf entries (and free associated page tables) which could + * be replaced by large mappings, for GFNs within the slot. + */ +static void zap_collapsible_spte_range(struct kvm *kvm, + struct kvm_mmu_page *root, + gfn_t start, gfn_t end) +{ + struct tdp_iter iter; + kvm_pfn_t pfn; + bool spte_set = false; + + tdp_root_for_each_pte(iter, root, start, end) { + if (!is_shadow_present_pte(iter.old_spte) || + is_last_spte(iter.old_spte, iter.level)) + continue; + + pfn = spte_to_pfn(iter.old_spte); + if (kvm_is_reserved_pfn(pfn) || + !PageTransCompoundMap(pfn_to_page(pfn))) + continue; + + tdp_mmu_set_spte(kvm, &iter, 0); + + spte_set = tdp_mmu_iter_flush_cond_resched(kvm, &iter); + } + + if (spte_set) + kvm_flush_remote_tlbs(kvm); +} + +/* + * Clear non-leaf entries (and free associated page tables) which could + * be replaced by large mappings, for GFNs within the slot. + */ +void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, + const struct kvm_memory_slot *slot) +{ + struct kvm_mmu_page *root; + int root_as_id; + + for_each_tdp_mmu_root(kvm, root) { + root_as_id = kvm_mmu_page_as_id(root); + if (root_as_id != slot->as_id) + continue; + + /* + * Take a reference on the root so that it cannot be freed if + * this thread releases the MMU lock and yields in this loop. + */ + kvm_mmu_get_root(kvm, root); + + zap_collapsible_spte_range(kvm, root, slot->base_gfn, + slot->base_gfn + slot->npages); + + kvm_mmu_put_root(kvm, root); + } +} + +/* + * Removes write access on the last level SPTE mapping this GFN and unsets the + * SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted. + * Returns true if an SPTE was set and a TLB flush is needed. + */ +static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, + gfn_t gfn) +{ + struct tdp_iter iter; + u64 new_spte; + bool spte_set = false; + + tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) { + if (!is_writable_pte(iter.old_spte)) + break; + + new_spte = iter.old_spte & + ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE); + + tdp_mmu_set_spte(kvm, &iter, new_spte); + spte_set = true; + } + + return spte_set; +} + +/* + * Removes write access on the last level SPTE mapping this GFN and unsets the + * SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted. + * Returns true if an SPTE was set and a TLB flush is needed. + */ +bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, + struct kvm_memory_slot *slot, gfn_t gfn) +{ + struct kvm_mmu_page *root; + int root_as_id; + bool spte_set = false; + + lockdep_assert_held(&kvm->mmu_lock); + for_each_tdp_mmu_root(kvm, root) { + root_as_id = kvm_mmu_page_as_id(root); + if (root_as_id != slot->as_id) + continue; + + spte_set |= write_protect_gfn(kvm, root, gfn); + } + return spte_set; +} + +/* + * Return the level of the lowest level SPTE added to sptes. + * That SPTE may be non-present. + */ +int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes) +{ + struct tdp_iter iter; + struct kvm_mmu *mmu = vcpu->arch.mmu; + int leaf = vcpu->arch.mmu->shadow_root_level; + gfn_t gfn = addr >> PAGE_SHIFT; + + tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { + leaf = iter.level; + sptes[leaf - 1] = iter.old_spte; + } + + return leaf; +} diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h new file mode 100644 index 000000000000..556e065503f6 --- /dev/null +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0 + +#ifndef __KVM_X86_MMU_TDP_MMU_H +#define __KVM_X86_MMU_TDP_MMU_H + +#include <linux/kvm_host.h> + +void kvm_mmu_init_tdp_mmu(struct kvm *kvm); +void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm); + +bool is_tdp_mmu_root(struct kvm *kvm, hpa_t root); +hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu); +void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root); + +bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end); +void kvm_tdp_mmu_zap_all(struct kvm *kvm); + +int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, + int map_writable, int max_level, kvm_pfn_t pfn, + bool prefault); + +int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start, + unsigned long end); + +int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start, + unsigned long end); +int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva); + +int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address, + pte_t *host_ptep); + +bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot, + int min_level); +bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, + struct kvm_memory_slot *slot); +void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn, unsigned long mask, + bool wrprot); +bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot); +void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, + const struct kvm_memory_slot *slot); + +bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, + struct kvm_memory_slot *slot, gfn_t gfn); + +int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes); +#endif /* __KVM_X86_MMU_TDP_MMU_H */ diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index ac830cd50830..8c550999ace0 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -153,20 +153,18 @@ int avic_vm_init(struct kvm *kvm) return 0; /* Allocating physical APIC ID table (4KB) */ - p_page = alloc_page(GFP_KERNEL_ACCOUNT); + p_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!p_page) goto free_avic; kvm_svm->avic_physical_id_table_page = p_page; - clear_page(page_address(p_page)); /* Allocating logical APIC ID table (4KB) */ - l_page = alloc_page(GFP_KERNEL_ACCOUNT); + l_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!l_page) goto free_avic; kvm_svm->avic_logical_id_table_page = l_page; - clear_page(page_address(l_page)); spin_lock_irqsave(&svm_vm_data_hash_lock, flags); again: @@ -868,6 +866,7 @@ int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq, * - Tell IOMMU to use legacy mode for this interrupt. * - Retrieve ga_tag of prior interrupt remapping data. */ + pi.prev_ga_tag = 0; pi.is_guest_mode = false; ret = irq_set_vcpu_affinity(host_irq, &pi); diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index fb68467e6049..9e4c226dbf7d 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -98,6 +98,7 @@ static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu) void recalc_intercepts(struct vcpu_svm *svm) { struct vmcb_control_area *c, *h, *g; + unsigned int i; vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); @@ -108,42 +109,37 @@ void recalc_intercepts(struct vcpu_svm *svm) h = &svm->nested.hsave->control; g = &svm->nested.ctl; - svm->nested.host_intercept_exceptions = h->intercept_exceptions; - - c->intercept_cr = h->intercept_cr; - c->intercept_dr = h->intercept_dr; - c->intercept_exceptions = h->intercept_exceptions; - c->intercept = h->intercept; + for (i = 0; i < MAX_INTERCEPT; i++) + c->intercepts[i] = h->intercepts[i]; if (g->int_ctl & V_INTR_MASKING_MASK) { /* We only want the cr8 intercept bits of L1 */ - c->intercept_cr &= ~(1U << INTERCEPT_CR8_READ); - c->intercept_cr &= ~(1U << INTERCEPT_CR8_WRITE); + vmcb_clr_intercept(c, INTERCEPT_CR8_READ); + vmcb_clr_intercept(c, INTERCEPT_CR8_WRITE); /* * Once running L2 with HF_VINTR_MASK, EFLAGS.IF does not * affect any interrupt we may want to inject; therefore, * interrupt window vmexits are irrelevant to L0. */ - c->intercept &= ~(1ULL << INTERCEPT_VINTR); + vmcb_clr_intercept(c, INTERCEPT_VINTR); } /* We don't want to see VMMCALLs from a nested guest */ - c->intercept &= ~(1ULL << INTERCEPT_VMMCALL); + vmcb_clr_intercept(c, INTERCEPT_VMMCALL); - c->intercept_cr |= g->intercept_cr; - c->intercept_dr |= g->intercept_dr; - c->intercept_exceptions |= g->intercept_exceptions; - c->intercept |= g->intercept; + for (i = 0; i < MAX_INTERCEPT; i++) + c->intercepts[i] |= g->intercepts[i]; } static void copy_vmcb_control_area(struct vmcb_control_area *dst, struct vmcb_control_area *from) { - dst->intercept_cr = from->intercept_cr; - dst->intercept_dr = from->intercept_dr; - dst->intercept_exceptions = from->intercept_exceptions; - dst->intercept = from->intercept; + unsigned int i; + + for (i = 0; i < MAX_INTERCEPT; i++) + dst->intercepts[i] = from->intercepts[i]; + dst->iopm_base_pa = from->iopm_base_pa; dst->msrpm_base_pa = from->msrpm_base_pa; dst->tsc_offset = from->tsc_offset; @@ -176,7 +172,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) */ int i; - if (!(svm->nested.ctl.intercept & (1ULL << INTERCEPT_MSR_PROT))) + if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))) return true; for (i = 0; i < MSRPM_OFFSETS; i++) { @@ -200,9 +196,23 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) return true; } +static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + if (!nested_svm_vmrun_msrpm(svm)) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = + KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + return false; + } + + return true; +} + static bool nested_vmcb_check_controls(struct vmcb_control_area *control) { - if ((control->intercept & (1ULL << INTERCEPT_VMRUN)) == 0) + if ((vmcb_is_intercept(control, INTERCEPT_VMRUN)) == 0) return false; if (control->asid == 0) @@ -215,41 +225,39 @@ static bool nested_vmcb_check_controls(struct vmcb_control_area *control) return true; } -static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb) +static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb12) { - bool nested_vmcb_lma; - if ((vmcb->save.efer & EFER_SVME) == 0) + bool vmcb12_lma; + + if ((vmcb12->save.efer & EFER_SVME) == 0) return false; - if (((vmcb->save.cr0 & X86_CR0_CD) == 0) && - (vmcb->save.cr0 & X86_CR0_NW)) + if (((vmcb12->save.cr0 & X86_CR0_CD) == 0) && (vmcb12->save.cr0 & X86_CR0_NW)) return false; - if (!kvm_dr6_valid(vmcb->save.dr6) || !kvm_dr7_valid(vmcb->save.dr7)) + if (!kvm_dr6_valid(vmcb12->save.dr6) || !kvm_dr7_valid(vmcb12->save.dr7)) return false; - nested_vmcb_lma = - (vmcb->save.efer & EFER_LME) && - (vmcb->save.cr0 & X86_CR0_PG); + vmcb12_lma = (vmcb12->save.efer & EFER_LME) && (vmcb12->save.cr0 & X86_CR0_PG); - if (!nested_vmcb_lma) { - if (vmcb->save.cr4 & X86_CR4_PAE) { - if (vmcb->save.cr3 & MSR_CR3_LEGACY_PAE_RESERVED_MASK) + if (!vmcb12_lma) { + if (vmcb12->save.cr4 & X86_CR4_PAE) { + if (vmcb12->save.cr3 & MSR_CR3_LEGACY_PAE_RESERVED_MASK) return false; } else { - if (vmcb->save.cr3 & MSR_CR3_LEGACY_RESERVED_MASK) + if (vmcb12->save.cr3 & MSR_CR3_LEGACY_RESERVED_MASK) return false; } } else { - if (!(vmcb->save.cr4 & X86_CR4_PAE) || - !(vmcb->save.cr0 & X86_CR0_PE) || - (vmcb->save.cr3 & MSR_CR3_LONG_RESERVED_MASK)) + if (!(vmcb12->save.cr4 & X86_CR4_PAE) || + !(vmcb12->save.cr0 & X86_CR0_PE) || + (vmcb12->save.cr3 & MSR_CR3_LONG_MBZ_MASK)) return false; } - if (kvm_valid_cr4(&svm->vcpu, vmcb->save.cr4)) + if (kvm_valid_cr4(&svm->vcpu, vmcb12->save.cr4)) return false; - return nested_vmcb_check_controls(&vmcb->control); + return nested_vmcb_check_controls(&vmcb12->control); } static void load_nested_vmcb_control(struct vcpu_svm *svm, @@ -296,7 +304,7 @@ void sync_nested_vmcb_control(struct vcpu_svm *svm) * EXIT_INT_INFO. */ static void nested_vmcb_save_pending_event(struct vcpu_svm *svm, - struct vmcb *nested_vmcb) + struct vmcb *vmcb12) { struct kvm_vcpu *vcpu = &svm->vcpu; u32 exit_int_info = 0; @@ -308,7 +316,7 @@ static void nested_vmcb_save_pending_event(struct vcpu_svm *svm, if (vcpu->arch.exception.has_error_code) { exit_int_info |= SVM_EVTINJ_VALID_ERR; - nested_vmcb->control.exit_int_info_err = + vmcb12->control.exit_int_info_err = vcpu->arch.exception.error_code; } @@ -325,7 +333,7 @@ static void nested_vmcb_save_pending_event(struct vcpu_svm *svm, exit_int_info |= SVM_EVTINJ_TYPE_INTR; } - nested_vmcb->control.exit_int_info = exit_int_info; + vmcb12->control.exit_int_info = exit_int_info; } static inline bool nested_npt_enabled(struct vcpu_svm *svm) @@ -364,31 +372,31 @@ static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, return 0; } -static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *nested_vmcb) +static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12) { /* Load the nested guest state */ - svm->vmcb->save.es = nested_vmcb->save.es; - svm->vmcb->save.cs = nested_vmcb->save.cs; - svm->vmcb->save.ss = nested_vmcb->save.ss; - svm->vmcb->save.ds = nested_vmcb->save.ds; - svm->vmcb->save.gdtr = nested_vmcb->save.gdtr; - svm->vmcb->save.idtr = nested_vmcb->save.idtr; - kvm_set_rflags(&svm->vcpu, nested_vmcb->save.rflags); - svm_set_efer(&svm->vcpu, nested_vmcb->save.efer); - svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0); - svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4); - svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2; - kvm_rax_write(&svm->vcpu, nested_vmcb->save.rax); - kvm_rsp_write(&svm->vcpu, nested_vmcb->save.rsp); - kvm_rip_write(&svm->vcpu, nested_vmcb->save.rip); + svm->vmcb->save.es = vmcb12->save.es; + svm->vmcb->save.cs = vmcb12->save.cs; + svm->vmcb->save.ss = vmcb12->save.ss; + svm->vmcb->save.ds = vmcb12->save.ds; + svm->vmcb->save.gdtr = vmcb12->save.gdtr; + svm->vmcb->save.idtr = vmcb12->save.idtr; + kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags); + svm_set_efer(&svm->vcpu, vmcb12->save.efer); + svm_set_cr0(&svm->vcpu, vmcb12->save.cr0); + svm_set_cr4(&svm->vcpu, vmcb12->save.cr4); + svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = vmcb12->save.cr2; + kvm_rax_write(&svm->vcpu, vmcb12->save.rax); + kvm_rsp_write(&svm->vcpu, vmcb12->save.rsp); + kvm_rip_write(&svm->vcpu, vmcb12->save.rip); /* In case we don't even reach vcpu_run, the fields are not updated */ - svm->vmcb->save.rax = nested_vmcb->save.rax; - svm->vmcb->save.rsp = nested_vmcb->save.rsp; - svm->vmcb->save.rip = nested_vmcb->save.rip; - svm->vmcb->save.dr7 = nested_vmcb->save.dr7; - svm->vcpu.arch.dr6 = nested_vmcb->save.dr6; - svm->vmcb->save.cpl = nested_vmcb->save.cpl; + svm->vmcb->save.rax = vmcb12->save.rax; + svm->vmcb->save.rsp = vmcb12->save.rsp; + svm->vmcb->save.rip = vmcb12->save.rip; + svm->vmcb->save.dr7 = vmcb12->save.dr7; + svm->vcpu.arch.dr6 = vmcb12->save.dr6; + svm->vmcb->save.cpl = vmcb12->save.cpl; } static void nested_prepare_vmcb_control(struct vcpu_svm *svm) @@ -426,17 +434,17 @@ static void nested_prepare_vmcb_control(struct vcpu_svm *svm) vmcb_mark_all_dirty(svm->vmcb); } -int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, - struct vmcb *nested_vmcb) +int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa, + struct vmcb *vmcb12) { int ret; - svm->nested.vmcb = vmcb_gpa; - load_nested_vmcb_control(svm, &nested_vmcb->control); - nested_prepare_vmcb_save(svm, nested_vmcb); + svm->nested.vmcb12_gpa = vmcb12_gpa; + load_nested_vmcb_control(svm, &vmcb12->control); + nested_prepare_vmcb_save(svm, vmcb12); nested_prepare_vmcb_control(svm); - ret = nested_svm_load_cr3(&svm->vcpu, nested_vmcb->save.cr3, + ret = nested_svm_load_cr3(&svm->vcpu, vmcb12->save.cr3, nested_npt_enabled(svm)); if (ret) return ret; @@ -449,19 +457,19 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, int nested_svm_vmrun(struct vcpu_svm *svm) { int ret; - struct vmcb *nested_vmcb; + struct vmcb *vmcb12; struct vmcb *hsave = svm->nested.hsave; struct vmcb *vmcb = svm->vmcb; struct kvm_host_map map; - u64 vmcb_gpa; + u64 vmcb12_gpa; if (is_smm(&svm->vcpu)) { kvm_queue_exception(&svm->vcpu, UD_VECTOR); return 1; } - vmcb_gpa = svm->vmcb->save.rax; - ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb_gpa), &map); + vmcb12_gpa = svm->vmcb->save.rax; + ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb12_gpa), &map); if (ret == -EINVAL) { kvm_inject_gp(&svm->vcpu, 0); return 1; @@ -471,26 +479,31 @@ int nested_svm_vmrun(struct vcpu_svm *svm) ret = kvm_skip_emulated_instruction(&svm->vcpu); - nested_vmcb = map.hva; + vmcb12 = map.hva; - if (!nested_vmcb_checks(svm, nested_vmcb)) { - nested_vmcb->control.exit_code = SVM_EXIT_ERR; - nested_vmcb->control.exit_code_hi = 0; - nested_vmcb->control.exit_info_1 = 0; - nested_vmcb->control.exit_info_2 = 0; + if (WARN_ON_ONCE(!svm->nested.initialized)) + return -EINVAL; + + if (!nested_vmcb_checks(svm, vmcb12)) { + vmcb12->control.exit_code = SVM_EXIT_ERR; + vmcb12->control.exit_code_hi = 0; + vmcb12->control.exit_info_1 = 0; + vmcb12->control.exit_info_2 = 0; goto out; } - trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa, - nested_vmcb->save.rip, - nested_vmcb->control.int_ctl, - nested_vmcb->control.event_inj, - nested_vmcb->control.nested_ctl); + trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb12_gpa, + vmcb12->save.rip, + vmcb12->control.int_ctl, + vmcb12->control.event_inj, + vmcb12->control.nested_ctl); - trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff, - nested_vmcb->control.intercept_cr >> 16, - nested_vmcb->control.intercept_exceptions, - nested_vmcb->control.intercept); + trace_kvm_nested_intercepts(vmcb12->control.intercepts[INTERCEPT_CR] & 0xffff, + vmcb12->control.intercepts[INTERCEPT_CR] >> 16, + vmcb12->control.intercepts[INTERCEPT_EXCEPTION], + vmcb12->control.intercepts[INTERCEPT_WORD3], + vmcb12->control.intercepts[INTERCEPT_WORD4], + vmcb12->control.intercepts[INTERCEPT_WORD5]); /* Clear internal status */ kvm_clear_exception_queue(&svm->vcpu); @@ -522,7 +535,7 @@ int nested_svm_vmrun(struct vcpu_svm *svm) svm->nested.nested_run_pending = 1; - if (enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb)) + if (enter_svm_guest_mode(svm, vmcb12_gpa, vmcb12)) goto out_exit_err; if (nested_svm_vmrun_msrpm(svm)) @@ -563,75 +576,77 @@ void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) int nested_svm_vmexit(struct vcpu_svm *svm) { int rc; - struct vmcb *nested_vmcb; + struct vmcb *vmcb12; struct vmcb *hsave = svm->nested.hsave; struct vmcb *vmcb = svm->vmcb; struct kvm_host_map map; - rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->nested.vmcb), &map); + rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map); if (rc) { if (rc == -EINVAL) kvm_inject_gp(&svm->vcpu, 0); return 1; } - nested_vmcb = map.hva; + vmcb12 = map.hva; /* Exit Guest-Mode */ leave_guest_mode(&svm->vcpu); - svm->nested.vmcb = 0; + svm->nested.vmcb12_gpa = 0; WARN_ON_ONCE(svm->nested.nested_run_pending); /* in case we halted in L2 */ svm->vcpu.arch.mp_state = KVM_MP_STATE_RUNNABLE; /* Give the current vmcb to the guest */ - svm_set_gif(svm, false); - nested_vmcb->save.es = vmcb->save.es; - nested_vmcb->save.cs = vmcb->save.cs; - nested_vmcb->save.ss = vmcb->save.ss; - nested_vmcb->save.ds = vmcb->save.ds; - nested_vmcb->save.gdtr = vmcb->save.gdtr; - nested_vmcb->save.idtr = vmcb->save.idtr; - nested_vmcb->save.efer = svm->vcpu.arch.efer; - nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu); - nested_vmcb->save.cr3 = kvm_read_cr3(&svm->vcpu); - nested_vmcb->save.cr2 = vmcb->save.cr2; - nested_vmcb->save.cr4 = svm->vcpu.arch.cr4; - nested_vmcb->save.rflags = kvm_get_rflags(&svm->vcpu); - nested_vmcb->save.rip = kvm_rip_read(&svm->vcpu); - nested_vmcb->save.rsp = kvm_rsp_read(&svm->vcpu); - nested_vmcb->save.rax = kvm_rax_read(&svm->vcpu); - nested_vmcb->save.dr7 = vmcb->save.dr7; - nested_vmcb->save.dr6 = svm->vcpu.arch.dr6; - nested_vmcb->save.cpl = vmcb->save.cpl; - - nested_vmcb->control.int_state = vmcb->control.int_state; - nested_vmcb->control.exit_code = vmcb->control.exit_code; - nested_vmcb->control.exit_code_hi = vmcb->control.exit_code_hi; - nested_vmcb->control.exit_info_1 = vmcb->control.exit_info_1; - nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2; - - if (nested_vmcb->control.exit_code != SVM_EXIT_ERR) - nested_vmcb_save_pending_event(svm, nested_vmcb); + vmcb12->save.es = vmcb->save.es; + vmcb12->save.cs = vmcb->save.cs; + vmcb12->save.ss = vmcb->save.ss; + vmcb12->save.ds = vmcb->save.ds; + vmcb12->save.gdtr = vmcb->save.gdtr; + vmcb12->save.idtr = vmcb->save.idtr; + vmcb12->save.efer = svm->vcpu.arch.efer; + vmcb12->save.cr0 = kvm_read_cr0(&svm->vcpu); + vmcb12->save.cr3 = kvm_read_cr3(&svm->vcpu); + vmcb12->save.cr2 = vmcb->save.cr2; + vmcb12->save.cr4 = svm->vcpu.arch.cr4; + vmcb12->save.rflags = kvm_get_rflags(&svm->vcpu); + vmcb12->save.rip = kvm_rip_read(&svm->vcpu); + vmcb12->save.rsp = kvm_rsp_read(&svm->vcpu); + vmcb12->save.rax = kvm_rax_read(&svm->vcpu); + vmcb12->save.dr7 = vmcb->save.dr7; + vmcb12->save.dr6 = svm->vcpu.arch.dr6; + vmcb12->save.cpl = vmcb->save.cpl; + + vmcb12->control.int_state = vmcb->control.int_state; + vmcb12->control.exit_code = vmcb->control.exit_code; + vmcb12->control.exit_code_hi = vmcb->control.exit_code_hi; + vmcb12->control.exit_info_1 = vmcb->control.exit_info_1; + vmcb12->control.exit_info_2 = vmcb->control.exit_info_2; + + if (vmcb12->control.exit_code != SVM_EXIT_ERR) + nested_vmcb_save_pending_event(svm, vmcb12); if (svm->nrips_enabled) - nested_vmcb->control.next_rip = vmcb->control.next_rip; + vmcb12->control.next_rip = vmcb->control.next_rip; - nested_vmcb->control.int_ctl = svm->nested.ctl.int_ctl; - nested_vmcb->control.tlb_ctl = svm->nested.ctl.tlb_ctl; - nested_vmcb->control.event_inj = svm->nested.ctl.event_inj; - nested_vmcb->control.event_inj_err = svm->nested.ctl.event_inj_err; + vmcb12->control.int_ctl = svm->nested.ctl.int_ctl; + vmcb12->control.tlb_ctl = svm->nested.ctl.tlb_ctl; + vmcb12->control.event_inj = svm->nested.ctl.event_inj; + vmcb12->control.event_inj_err = svm->nested.ctl.event_inj_err; - nested_vmcb->control.pause_filter_count = + vmcb12->control.pause_filter_count = svm->vmcb->control.pause_filter_count; - nested_vmcb->control.pause_filter_thresh = + vmcb12->control.pause_filter_thresh = svm->vmcb->control.pause_filter_thresh; /* Restore the original control entries */ copy_vmcb_control_area(&vmcb->control, &hsave->control); + /* On vmexit the GIF is set to false */ + svm_set_gif(svm, false); + svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset = svm->vcpu.arch.l1_tsc_offset; @@ -657,11 +672,11 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb_mark_all_dirty(svm->vmcb); - trace_kvm_nested_vmexit_inject(nested_vmcb->control.exit_code, - nested_vmcb->control.exit_info_1, - nested_vmcb->control.exit_info_2, - nested_vmcb->control.exit_int_info, - nested_vmcb->control.exit_int_info_err, + trace_kvm_nested_vmexit_inject(vmcb12->control.exit_code, + vmcb12->control.exit_info_1, + vmcb12->control.exit_info_2, + vmcb12->control.exit_int_info, + vmcb12->control.exit_int_info_err, KVM_ISA_SVM); kvm_vcpu_unmap(&svm->vcpu, &map, true); @@ -686,6 +701,45 @@ int nested_svm_vmexit(struct vcpu_svm *svm) return 0; } +int svm_allocate_nested(struct vcpu_svm *svm) +{ + struct page *hsave_page; + + if (svm->nested.initialized) + return 0; + + hsave_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!hsave_page) + return -ENOMEM; + svm->nested.hsave = page_address(hsave_page); + + svm->nested.msrpm = svm_vcpu_alloc_msrpm(); + if (!svm->nested.msrpm) + goto err_free_hsave; + svm_vcpu_init_msrpm(&svm->vcpu, svm->nested.msrpm); + + svm->nested.initialized = true; + return 0; + +err_free_hsave: + __free_page(hsave_page); + return -ENOMEM; +} + +void svm_free_nested(struct vcpu_svm *svm) +{ + if (!svm->nested.initialized) + return; + + svm_vcpu_free_msrpm(svm->nested.msrpm); + svm->nested.msrpm = NULL; + + __free_page(virt_to_page(svm->nested.hsave)); + svm->nested.hsave = NULL; + + svm->nested.initialized = false; +} + /* * Forcibly leave nested mode in order to be able to reset the VCPU later on. */ @@ -700,6 +754,8 @@ void svm_leave_nested(struct vcpu_svm *svm) copy_vmcb_control_area(&vmcb->control, &hsave->control); nested_svm_uninit_mmu_context(&svm->vcpu); } + + kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, &svm->vcpu); } static int nested_svm_exit_handled_msr(struct vcpu_svm *svm) @@ -707,7 +763,7 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm *svm) u32 offset, msr, value; int write, mask; - if (!(svm->nested.ctl.intercept & (1ULL << INTERCEPT_MSR_PROT))) + if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))) return NESTED_EXIT_HOST; msr = svm->vcpu.arch.regs[VCPU_REGS_RCX]; @@ -734,7 +790,7 @@ static int nested_svm_intercept_ioio(struct vcpu_svm *svm) u8 start_bit; u64 gpa; - if (!(svm->nested.ctl.intercept & (1ULL << INTERCEPT_IOIO_PROT))) + if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_IOIO_PROT))) return NESTED_EXIT_HOST; port = svm->vmcb->control.exit_info_1 >> 16; @@ -765,14 +821,12 @@ static int nested_svm_intercept(struct vcpu_svm *svm) vmexit = nested_svm_intercept_ioio(svm); break; case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: { - u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0); - if (svm->nested.ctl.intercept_cr & bit) + if (vmcb_is_intercept(&svm->nested.ctl, exit_code)) vmexit = NESTED_EXIT_DONE; break; } case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: { - u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0); - if (svm->nested.ctl.intercept_dr & bit) + if (vmcb_is_intercept(&svm->nested.ctl, exit_code)) vmexit = NESTED_EXIT_DONE; break; } @@ -790,8 +844,7 @@ static int nested_svm_intercept(struct vcpu_svm *svm) break; } default: { - u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR); - if (svm->nested.ctl.intercept & exit_bits) + if (vmcb_is_intercept(&svm->nested.ctl, exit_code)) vmexit = NESTED_EXIT_DONE; } } @@ -831,7 +884,7 @@ static bool nested_exit_on_exception(struct vcpu_svm *svm) { unsigned int nr = svm->vcpu.arch.exception.nr; - return (svm->nested.ctl.intercept_exceptions & (1 << nr)); + return (svm->nested.ctl.intercepts[INTERCEPT_EXCEPTION] & BIT(nr)); } static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm) @@ -899,7 +952,7 @@ static void nested_svm_intr(struct vcpu_svm *svm) static inline bool nested_exit_on_init(struct vcpu_svm *svm) { - return (svm->nested.ctl.intercept & (1ULL << INTERCEPT_INIT)); + return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_INIT); } static void nested_svm_init(struct vcpu_svm *svm) @@ -980,7 +1033,8 @@ int nested_svm_exit_special(struct vcpu_svm *svm) case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: { u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); - if (get_host_vmcb(svm)->control.intercept_exceptions & excp_bits) + if (get_host_vmcb(svm)->control.intercepts[INTERCEPT_EXCEPTION] & + excp_bits) return NESTED_EXIT_HOST; else if (exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR && svm->vcpu.arch.apf.host_apf_flags) @@ -1018,7 +1072,7 @@ static int svm_get_nested_state(struct kvm_vcpu *vcpu, /* First fill in the header and copy it out. */ if (is_guest_mode(vcpu)) { - kvm_state.hdr.svm.vmcb_pa = svm->nested.vmcb; + kvm_state.hdr.svm.vmcb_pa = svm->nested.vmcb12_gpa; kvm_state.size += KVM_STATE_NESTED_SVM_VMCB_SIZE; kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; @@ -1060,10 +1114,14 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, struct vmcb *hsave = svm->nested.hsave; struct vmcb __user *user_vmcb = (struct vmcb __user *) &user_kvm_nested_state->data.svm[0]; - struct vmcb_control_area ctl; - struct vmcb_save_area save; + struct vmcb_control_area *ctl; + struct vmcb_save_area *save; + int ret; u32 cr0; + BUILD_BUG_ON(sizeof(struct vmcb_control_area) + sizeof(struct vmcb_save_area) > + KVM_STATE_NESTED_SVM_VMCB_SIZE); + if (kvm_state->format != KVM_STATE_NESTED_FORMAT_SVM) return -EINVAL; @@ -1088,20 +1146,30 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) { svm_leave_nested(svm); - goto out_set_gif; + svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET)); + return 0; } if (!page_address_valid(vcpu, kvm_state->hdr.svm.vmcb_pa)) return -EINVAL; if (kvm_state->size < sizeof(*kvm_state) + KVM_STATE_NESTED_SVM_VMCB_SIZE) return -EINVAL; - if (copy_from_user(&ctl, &user_vmcb->control, sizeof(ctl))) - return -EFAULT; - if (copy_from_user(&save, &user_vmcb->save, sizeof(save))) - return -EFAULT; - if (!nested_vmcb_check_controls(&ctl)) - return -EINVAL; + ret = -ENOMEM; + ctl = kzalloc(sizeof(*ctl), GFP_KERNEL); + save = kzalloc(sizeof(*save), GFP_KERNEL); + if (!ctl || !save) + goto out_free; + + ret = -EFAULT; + if (copy_from_user(ctl, &user_vmcb->control, sizeof(*ctl))) + goto out_free; + if (copy_from_user(save, &user_vmcb->save, sizeof(*save))) + goto out_free; + + ret = -EINVAL; + if (!nested_vmcb_check_controls(ctl)) + goto out_free; /* * Processor state contains L2 state. Check that it is @@ -1109,15 +1177,15 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, */ cr0 = kvm_read_cr0(vcpu); if (((cr0 & X86_CR0_CD) == 0) && (cr0 & X86_CR0_NW)) - return -EINVAL; + goto out_free; /* * Validate host state saved from before VMRUN (see * nested_svm_check_permissions). * TODO: validate reserved bits for all saved state. */ - if (!(save.cr0 & X86_CR0_PG)) - return -EINVAL; + if (!(save->cr0 & X86_CR0_PG)) + goto out_free; /* * All checks done, we can enter guest mode. L1 control fields @@ -1126,19 +1194,24 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, * contains saved L1 state. */ copy_vmcb_control_area(&hsave->control, &svm->vmcb->control); - hsave->save = save; + hsave->save = *save; - svm->nested.vmcb = kvm_state->hdr.svm.vmcb_pa; - load_nested_vmcb_control(svm, &ctl); + svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa; + load_nested_vmcb_control(svm, ctl); nested_prepare_vmcb_control(svm); -out_set_gif: - svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET)); - return 0; + kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); + ret = 0; +out_free: + kfree(save); + kfree(ctl); + + return ret; } struct kvm_x86_nested_ops svm_nested_ops = { .check_events = svm_check_nested_events, + .get_nested_state_pages = svm_get_nested_state_pages, .get_state = svm_get_nested_state, .set_state = svm_set_nested_state, }; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 402dc4234e39..c0b14106258a 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -384,7 +384,8 @@ static void sev_clflush_pages(struct page *pages[], unsigned long npages) uint8_t *page_virtual; unsigned long i; - if (npages == 0 || pages == NULL) + if (this_cpu_has(X86_FEATURE_SME_COHERENT) || npages == 0 || + pages == NULL) return; for (i = 0; i < npages; i++) { @@ -446,10 +447,8 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) } /* - * The LAUNCH_UPDATE command will perform in-place encryption of the - * memory content (i.e it will write the same memory region with C=1). - * It's possible that the cache may contain the data with C=0, i.e., - * unencrypted so invalidate it first. + * Flush (on non-coherent CPUs) before LAUNCH_UPDATE encrypts pages in + * place; the cache may contain the data that was written unencrypted. */ sev_clflush_pages(inpages, npages); @@ -805,10 +804,9 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec) } /* - * The DBG_{DE,EN}CRYPT commands will perform {dec,en}cryption of the - * memory content (i.e it will write the same memory region with C=1). - * It's possible that the cache may contain the data with C=0, i.e., - * unencrypted so invalidate it first. + * Flush (on non-coherent CPUs) before DBG_{DE,EN}CRYPT read or modify + * the pages; flush the destination too so that future accesses do not + * see stale data. */ sev_clflush_pages(src_p, 1); sev_clflush_pages(dst_p, 1); @@ -856,7 +854,7 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) struct kvm_sev_launch_secret params; struct page **pages; void *blob, *hdr; - unsigned long n; + unsigned long n, i; int ret, offset; if (!sev_guest(kvm)) @@ -870,6 +868,12 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) return PTR_ERR(pages); /* + * Flush (on non-coherent CPUs) before LAUNCH_SECRET encrypts pages in + * place; the cache may contain the data that was written unencrypted. + */ + sev_clflush_pages(pages, n); + + /* * The secret must be copied into contiguous memory region, lets verify * that userspace memory pages are contiguous before we issue command. */ @@ -914,6 +918,11 @@ e_free_blob: e_free: kfree(data); e_unpin_memory: + /* content of memory is updated, mark pages dirty */ + for (i = 0; i < n; i++) { + set_page_dirty_lock(pages[i]); + mark_page_accessed(pages[i]); + } sev_unpin_memory(kvm, pages, n); return ret; } @@ -1106,6 +1115,7 @@ void sev_vm_destroy(struct kvm *kvm) list_for_each_safe(pos, q, head) { __unregister_enc_region_locked(kvm, list_entry(pos, struct enc_region, list)); + cond_resched(); } } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 0194336b64a4..2f32fd09e259 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -19,7 +19,7 @@ #include <linux/trace_events.h> #include <linux/slab.h> #include <linux/hashtable.h> -#include <linux/frame.h> +#include <linux/objtool.h> #include <linux/psp-sev.h> #include <linux/file.h> #include <linux/pagemap.h> @@ -91,7 +91,7 @@ static DEFINE_PER_CPU(u64, current_tsc_ratio); static const struct svm_direct_access_msrs { u32 index; /* Index of the MSR */ bool always; /* True if intercept is always on */ -} direct_access_msrs[] = { +} direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = { { .index = MSR_STAR, .always = true }, { .index = MSR_IA32_SYSENTER_CS, .always = true }, #ifdef CONFIG_X86_64 @@ -263,9 +263,10 @@ static int get_max_npt_level(void) #endif } -void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) +int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) { struct vcpu_svm *svm = to_svm(vcpu); + u64 old_efer = vcpu->arch.efer; vcpu->arch.efer = efer; if (!npt_enabled) { @@ -276,13 +277,32 @@ void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) efer &= ~EFER_LME; } - if (!(efer & EFER_SVME)) { - svm_leave_nested(svm); - svm_set_gif(svm, true); + if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) { + if (!(efer & EFER_SVME)) { + svm_leave_nested(svm); + svm_set_gif(svm, true); + + /* + * Free the nested guest state, unless we are in SMM. + * In this case we will return to the nested guest + * as soon as we leave SMM. + */ + if (!is_smm(&svm->vcpu)) + svm_free_nested(svm); + + } else { + int ret = svm_allocate_nested(svm); + + if (ret) { + vcpu->arch.efer = old_efer; + return ret; + } + } } svm->vmcb->save.efer = efer | EFER_SVME; vmcb_mark_dirty(svm->vmcb, VMCB_CR); + return 0; } static int is_external_interrupt(u32 info) @@ -553,18 +573,44 @@ free_cpu_data: } -static bool valid_msr_intercept(u32 index) +static int direct_access_msr_slot(u32 msr) { - int i; + u32 i; for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) - if (direct_access_msrs[i].index == index) - return true; + if (direct_access_msrs[i].index == msr) + return i; - return false; + return -ENOENT; +} + +static void set_shadow_msr_intercept(struct kvm_vcpu *vcpu, u32 msr, int read, + int write) +{ + struct vcpu_svm *svm = to_svm(vcpu); + int slot = direct_access_msr_slot(msr); + + if (slot == -ENOENT) + return; + + /* Set the shadow bitmaps to the desired intercept states */ + if (read) + set_bit(slot, svm->shadow_msr_intercept.read); + else + clear_bit(slot, svm->shadow_msr_intercept.read); + + if (write) + set_bit(slot, svm->shadow_msr_intercept.write); + else + clear_bit(slot, svm->shadow_msr_intercept.write); +} + +static bool valid_msr_intercept(u32 index) +{ + return direct_access_msr_slot(index) != -ENOENT; } -static bool msr_write_intercepted(struct kvm_vcpu *vcpu, unsigned msr) +static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr) { u8 bit_write; unsigned long tmp; @@ -583,8 +629,8 @@ static bool msr_write_intercepted(struct kvm_vcpu *vcpu, unsigned msr) return !!test_bit(bit_write, &tmp); } -static void set_msr_interception(u32 *msrpm, unsigned msr, - int read, int write) +static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm, + u32 msr, int read, int write) { u8 bit_read, bit_write; unsigned long tmp; @@ -596,6 +642,13 @@ static void set_msr_interception(u32 *msrpm, unsigned msr, */ WARN_ON(!valid_msr_intercept(msr)); + /* Enforce non allowed MSRs to trap */ + if (read && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) + read = 0; + + if (write && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) + write = 0; + offset = svm_msrpm_offset(msr); bit_read = 2 * (msr & 0x0f); bit_write = 2 * (msr & 0x0f) + 1; @@ -609,17 +662,60 @@ static void set_msr_interception(u32 *msrpm, unsigned msr, msrpm[offset] = tmp; } -static void svm_vcpu_init_msrpm(u32 *msrpm) +static void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr, + int read, int write) { - int i; + set_shadow_msr_intercept(vcpu, msr, read, write); + set_msr_interception_bitmap(vcpu, msrpm, msr, read, write); +} + +u32 *svm_vcpu_alloc_msrpm(void) +{ + struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER); + u32 *msrpm; + if (!pages) + return NULL; + + msrpm = page_address(pages); memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER)); + return msrpm; +} + +void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm) +{ + int i; + for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) { if (!direct_access_msrs[i].always) continue; + set_msr_interception(vcpu, msrpm, direct_access_msrs[i].index, 1, 1); + } +} + + +void svm_vcpu_free_msrpm(u32 *msrpm) +{ + __free_pages(virt_to_page(msrpm), MSRPM_ALLOC_ORDER); +} + +static void svm_msr_filter_changed(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + u32 i; + + /* + * Set intercept permissions for all direct access MSRs again. They + * will automatically get filtered through the MSR filter, so we are + * back in sync after this. + */ + for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) { + u32 msr = direct_access_msrs[i].index; + u32 read = test_bit(i, svm->shadow_msr_intercept.read); + u32 write = test_bit(i, svm->shadow_msr_intercept.write); - set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1); + set_msr_interception_bitmap(vcpu, svm->msrpm, msr, read, write); } } @@ -666,26 +762,26 @@ static void init_msrpm_offsets(void) } } -static void svm_enable_lbrv(struct vcpu_svm *svm) +static void svm_enable_lbrv(struct kvm_vcpu *vcpu) { - u32 *msrpm = svm->msrpm; + struct vcpu_svm *svm = to_svm(vcpu); svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK; - set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1); - set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1); - set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); - set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 1, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1); } -static void svm_disable_lbrv(struct vcpu_svm *svm) +static void svm_disable_lbrv(struct kvm_vcpu *vcpu) { - u32 *msrpm = svm->msrpm; + struct vcpu_svm *svm = to_svm(vcpu); svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK; - set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0); - set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0); - set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0); - set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0); } void disable_nmi_singlestep(struct vcpu_svm *svm) @@ -813,6 +909,9 @@ static __init void svm_set_cpu_caps(void) if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) || boot_cpu_has(X86_FEATURE_AMD_SSBD)) kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD); + + /* Enable INVPCID feature */ + kvm_cpu_cap_check_and_set(X86_FEATURE_INVPCID); } static __init int svm_hardware_setup(void) @@ -985,6 +1084,21 @@ static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) return svm->vmcb->control.tsc_offset; } +static void svm_check_invpcid(struct vcpu_svm *svm) +{ + /* + * Intercept INVPCID instruction only if shadow page table is + * enabled. Interception is not required with nested page table + * enabled. + */ + if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) { + if (!npt_enabled) + svm_set_intercept(svm, INTERCEPT_INVPCID); + else + svm_clr_intercept(svm, INTERCEPT_INVPCID); + } +} + static void init_vmcb(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; @@ -992,14 +1106,14 @@ static void init_vmcb(struct vcpu_svm *svm) svm->vcpu.arch.hflags = 0; - set_cr_intercept(svm, INTERCEPT_CR0_READ); - set_cr_intercept(svm, INTERCEPT_CR3_READ); - set_cr_intercept(svm, INTERCEPT_CR4_READ); - set_cr_intercept(svm, INTERCEPT_CR0_WRITE); - set_cr_intercept(svm, INTERCEPT_CR3_WRITE); - set_cr_intercept(svm, INTERCEPT_CR4_WRITE); + svm_set_intercept(svm, INTERCEPT_CR0_READ); + svm_set_intercept(svm, INTERCEPT_CR3_READ); + svm_set_intercept(svm, INTERCEPT_CR4_READ); + svm_set_intercept(svm, INTERCEPT_CR0_WRITE); + svm_set_intercept(svm, INTERCEPT_CR3_WRITE); + svm_set_intercept(svm, INTERCEPT_CR4_WRITE); if (!kvm_vcpu_apicv_active(&svm->vcpu)) - set_cr_intercept(svm, INTERCEPT_CR8_WRITE); + svm_set_intercept(svm, INTERCEPT_CR8_WRITE); set_dr_intercepts(svm); @@ -1094,15 +1208,15 @@ static void init_vmcb(struct vcpu_svm *svm) control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE; svm_clr_intercept(svm, INTERCEPT_INVLPG); clr_exception_intercept(svm, PF_VECTOR); - clr_cr_intercept(svm, INTERCEPT_CR3_READ); - clr_cr_intercept(svm, INTERCEPT_CR3_WRITE); + svm_clr_intercept(svm, INTERCEPT_CR3_READ); + svm_clr_intercept(svm, INTERCEPT_CR3_WRITE); save->g_pat = svm->vcpu.arch.pat; save->cr3 = 0; save->cr4 = 0; } svm->asid_generation = 0; - svm->nested.vmcb = 0; + svm->nested.vmcb12_gpa = 0; svm->vcpu.arch.hflags = 0; if (!kvm_pause_in_guest(svm->vcpu.kvm)) { @@ -1114,6 +1228,8 @@ static void init_vmcb(struct vcpu_svm *svm) svm_clr_intercept(svm, INTERCEPT_PAUSE); } + svm_check_invpcid(svm); + if (kvm_vcpu_apicv_active(&svm->vcpu)) avic_init_vmcb(svm); @@ -1171,35 +1287,20 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) static int svm_create_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm; - struct page *page; - struct page *msrpm_pages; - struct page *hsave_page; - struct page *nested_msrpm_pages; + struct page *vmcb_page; int err; BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0); svm = to_svm(vcpu); err = -ENOMEM; - page = alloc_page(GFP_KERNEL_ACCOUNT); - if (!page) + vmcb_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!vmcb_page) goto out; - msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER); - if (!msrpm_pages) - goto free_page1; - - nested_msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER); - if (!nested_msrpm_pages) - goto free_page2; - - hsave_page = alloc_page(GFP_KERNEL_ACCOUNT); - if (!hsave_page) - goto free_page3; - err = avic_init_vcpu(svm); if (err) - goto free_page4; + goto error_free_vmcb_page; /* We initialize this flag to true to make sure that the is_running * bit would be set the first time the vcpu is loaded. @@ -1207,18 +1308,14 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) if (irqchip_in_kernel(vcpu->kvm) && kvm_apicv_activated(vcpu->kvm)) svm->avic_is_running = true; - svm->nested.hsave = page_address(hsave_page); - clear_page(svm->nested.hsave); - - svm->msrpm = page_address(msrpm_pages); - svm_vcpu_init_msrpm(svm->msrpm); + svm->msrpm = svm_vcpu_alloc_msrpm(); + if (!svm->msrpm) + goto error_free_vmcb_page; - svm->nested.msrpm = page_address(nested_msrpm_pages); - svm_vcpu_init_msrpm(svm->nested.msrpm); + svm_vcpu_init_msrpm(vcpu, svm->msrpm); - svm->vmcb = page_address(page); - clear_page(svm->vmcb); - svm->vmcb_pa = __sme_set(page_to_pfn(page) << PAGE_SHIFT); + svm->vmcb = page_address(vmcb_page); + svm->vmcb_pa = __sme_set(page_to_pfn(vmcb_page) << PAGE_SHIFT); svm->asid_generation = 0; init_vmcb(svm); @@ -1227,14 +1324,8 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) return 0; -free_page4: - __free_page(hsave_page); -free_page3: - __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); -free_page2: - __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER); -free_page1: - __free_page(page); +error_free_vmcb_page: + __free_page(vmcb_page); out: return err; } @@ -1258,10 +1349,10 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) */ svm_clear_current_vmcb(svm->vmcb); + svm_free_nested(svm); + __free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT)); __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); - __free_page(virt_to_page(svm->nested.hsave)); - __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); } static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) @@ -1549,11 +1640,11 @@ static void update_cr0_intercept(struct vcpu_svm *svm) vmcb_mark_dirty(svm->vmcb, VMCB_CR); if (gcr0 == *hcr0) { - clr_cr_intercept(svm, INTERCEPT_CR0_READ); - clr_cr_intercept(svm, INTERCEPT_CR0_WRITE); + svm_clr_intercept(svm, INTERCEPT_CR0_READ); + svm_clr_intercept(svm, INTERCEPT_CR0_WRITE); } else { - set_cr_intercept(svm, INTERCEPT_CR0_READ); - set_cr_intercept(svm, INTERCEPT_CR0_WRITE); + svm_set_intercept(svm, INTERCEPT_CR0_READ); + svm_set_intercept(svm, INTERCEPT_CR0_WRITE); } } @@ -2183,6 +2274,12 @@ static int iret_interception(struct vcpu_svm *svm) return 1; } +static int invd_interception(struct vcpu_svm *svm) +{ + /* Treat an INVD instruction as a NOP and just skip it. */ + return kvm_skip_emulated_instruction(&svm->vcpu); +} + static int invlpg_interception(struct vcpu_svm *svm) { if (!static_cpu_has(X86_FEATURE_DECODEASSISTS)) @@ -2218,12 +2315,9 @@ static bool check_selective_cr0_intercepted(struct vcpu_svm *svm, { unsigned long cr0 = svm->vcpu.arch.cr0; bool ret = false; - u64 intercept; - - intercept = svm->nested.ctl.intercept; if (!is_guest_mode(&svm->vcpu) || - (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0)))) + (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0)))) return false; cr0 &= ~SVM_CR0_SELECTIVE_MASK; @@ -2261,6 +2355,7 @@ static int cr_interception(struct vcpu_svm *svm) if (cr >= 16) { /* mov to cr */ cr -= 16; val = kvm_register_read(&svm->vcpu, reg); + trace_kvm_cr_write(cr, val); switch (cr) { case 0: if (!check_selective_cr0_intercepted(svm, val)) @@ -2306,6 +2401,7 @@ static int cr_interception(struct vcpu_svm *svm) return 1; } kvm_register_write(&svm->vcpu, reg, val); + trace_kvm_cr_read(cr, val); } return kvm_complete_insn_gp(&svm->vcpu, err); } @@ -2556,7 +2652,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) * We update the L1 MSR bit as well since it will end up * touching the MSR anyway now. */ - set_msr_interception(svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1); break; case MSR_IA32_PRED_CMD: if (!msr->host_initiated && @@ -2571,7 +2667,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) break; wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB); - set_msr_interception(svm->msrpm, MSR_IA32_PRED_CMD, 0, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0, 1); break; case MSR_AMD64_VIRT_SPEC_CTRL: if (!msr->host_initiated && @@ -2635,9 +2731,9 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) svm->vmcb->save.dbgctl = data; vmcb_mark_dirty(svm->vmcb, VMCB_LBR); if (data & (1ULL<<0)) - svm_enable_lbrv(svm); + svm_enable_lbrv(vcpu); else - svm_disable_lbrv(svm); + svm_disable_lbrv(vcpu); break; case MSR_VM_HSAVE_PA: svm->nested.hsave_msr = data; @@ -2733,6 +2829,33 @@ static int mwait_interception(struct vcpu_svm *svm) return nop_interception(svm); } +static int invpcid_interception(struct vcpu_svm *svm) +{ + struct kvm_vcpu *vcpu = &svm->vcpu; + unsigned long type; + gva_t gva; + + if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + /* + * For an INVPCID intercept: + * EXITINFO1 provides the linear address of the memory operand. + * EXITINFO2 provides the contents of the register operand. + */ + type = svm->vmcb->control.exit_info_2; + gva = svm->vmcb->control.exit_info_1; + + if (type > 3) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + return kvm_handle_invpcid(vcpu, type, gva); +} + static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_READ_CR0] = cr_interception, [SVM_EXIT_READ_CR3] = cr_interception, @@ -2774,7 +2897,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_RDPMC] = rdpmc_interception, [SVM_EXIT_CPUID] = cpuid_interception, [SVM_EXIT_IRET] = iret_interception, - [SVM_EXIT_INVD] = emulate_on_interception, + [SVM_EXIT_INVD] = invd_interception, [SVM_EXIT_PAUSE] = pause_interception, [SVM_EXIT_HLT] = halt_interception, [SVM_EXIT_INVLPG] = invlpg_interception, @@ -2795,6 +2918,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_MWAIT] = mwait_interception, [SVM_EXIT_XSETBV] = xsetbv_interception, [SVM_EXIT_RDPRU] = rdpru_interception, + [SVM_EXIT_INVPCID] = invpcid_interception, [SVM_EXIT_NPF] = npf_interception, [SVM_EXIT_RSM] = rsm_interception, [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception, @@ -2813,12 +2937,14 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) } pr_err("VMCB Control Area:\n"); - pr_err("%-20s%04x\n", "cr_read:", control->intercept_cr & 0xffff); - pr_err("%-20s%04x\n", "cr_write:", control->intercept_cr >> 16); - pr_err("%-20s%04x\n", "dr_read:", control->intercept_dr & 0xffff); - pr_err("%-20s%04x\n", "dr_write:", control->intercept_dr >> 16); - pr_err("%-20s%08x\n", "exceptions:", control->intercept_exceptions); - pr_err("%-20s%016llx\n", "intercepts:", control->intercept); + pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff); + pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16); + pr_err("%-20s%04x\n", "dr_read:", control->intercepts[INTERCEPT_DR] & 0xffff); + pr_err("%-20s%04x\n", "dr_write:", control->intercepts[INTERCEPT_DR] >> 16); + pr_err("%-20s%08x\n", "exceptions:", control->intercepts[INTERCEPT_EXCEPTION]); + pr_err("%-20s%08x %08x\n", "intercepts:", + control->intercepts[INTERCEPT_WORD3], + control->intercepts[INTERCEPT_WORD4]); pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count); pr_err("%-20s%d\n", "pause filter threshold:", control->pause_filter_thresh); @@ -2917,12 +3043,19 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) "excp_to:", save->last_excp_to); } -static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) +static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2, + u32 *intr_info, u32 *error_code) { struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control; *info1 = control->exit_info_1; *info2 = control->exit_info_2; + *intr_info = control->exit_int_info; + if ((*intr_info & SVM_EXITINTINFO_VALID) && + (*intr_info & SVM_EXITINTINFO_VALID_ERR)) + *error_code = control->exit_int_info_err; + else + *error_code = 0; } static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) @@ -2933,22 +3066,15 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM); - if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE)) + if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE)) vcpu->arch.cr0 = svm->vmcb->save.cr0; if (npt_enabled) vcpu->arch.cr3 = svm->vmcb->save.cr3; - svm_complete_interrupts(svm); - if (is_guest_mode(vcpu)) { int vmexit; - trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code, - svm->vmcb->control.exit_info_1, - svm->vmcb->control.exit_info_2, - svm->vmcb->control.exit_int_info, - svm->vmcb->control.exit_int_info_err, - KVM_ISA_SVM); + trace_kvm_nested_vmexit(exit_code, vcpu, KVM_ISA_SVM); vmexit = nested_svm_exit_special(svm); @@ -3058,13 +3184,13 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) if (nested_svm_virtualize_tpr(vcpu)) return; - clr_cr_intercept(svm, INTERCEPT_CR8_WRITE); + svm_clr_intercept(svm, INTERCEPT_CR8_WRITE); if (irr == -1) return; if (tpr >= irr) - set_cr_intercept(svm, INTERCEPT_CR8_WRITE); + svm_set_intercept(svm, INTERCEPT_CR8_WRITE); } bool svm_nmi_blocked(struct kvm_vcpu *vcpu) @@ -3252,7 +3378,7 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu) if (nested_svm_virtualize_tpr(vcpu)) return; - if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) { + if (!svm_is_intercept(svm, INTERCEPT_CR8_WRITE)) { int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK; kvm_set_cr8(vcpu, cr8); } @@ -3349,8 +3475,7 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu) static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { - if (!is_guest_mode(vcpu) && - to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && + if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && to_svm(vcpu)->vmcb->control.exit_info_1) return handle_fastpath_set_msr_irqoff(vcpu); @@ -3415,7 +3540,6 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) { - fastpath_t exit_fastpath; struct vcpu_svm *svm = to_svm(vcpu); svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; @@ -3456,9 +3580,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) clgi(); kvm_load_guest_xsave_state(vcpu); - if (lapic_in_kernel(vcpu) && - vcpu->arch.apic->lapic_timer.timer_advance_ns) - kvm_wait_lapic_expire(vcpu); + kvm_wait_lapic_expire(vcpu); /* * If this vCPU has touched SPEC_CTRL, restore the guest's value if @@ -3504,7 +3626,6 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) stgi(); /* Any pending NMI will happen here */ - exit_fastpath = svm_exit_handlers_fastpath(vcpu); if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) kvm_after_interrupt(&svm->vcpu); @@ -3518,6 +3639,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) } svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; + vmcb_mark_all_clean(svm->vmcb); /* if exit due to PF check for async PF */ if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) @@ -3537,8 +3659,12 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) SVM_EXIT_EXCP_BASE + MC_VECTOR)) svm_handle_mce(svm); - vmcb_mark_all_clean(svm->vmcb); - return exit_fastpath; + svm_complete_interrupts(svm); + + if (is_guest_mode(vcpu)) + return EXIT_FASTPATH_NONE; + + return svm_exit_handlers_fastpath(vcpu); } static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root, @@ -3624,6 +3750,9 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) && guest_cpuid_has(&svm->vcpu, X86_FEATURE_NRIPS); + /* Check again if INVPCID interception if required */ + svm_check_invpcid(svm); + if (!kvm_vcpu_apicv_active(vcpu)) return; @@ -3738,7 +3867,6 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, break; case SVM_EXIT_WRITE_CR0: { unsigned long cr0, val; - u64 intercept; if (info->intercept == x86_intercept_cr_write) icpt_info.exit_code += info->modrm_reg; @@ -3747,9 +3875,8 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, info->intercept == x86_intercept_clts) break; - intercept = svm->nested.ctl.intercept; - - if (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0))) + if (!(vmcb_is_intercept(&svm->nested.ctl, + INTERCEPT_SELECTIVE_CR0))) break; cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK; @@ -3884,7 +4011,7 @@ static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate) /* FED8h - SVM Guest */ put_smstate(u64, smstate, 0x7ed8, 1); /* FEE0h - SVM Guest VMCB Physical Address */ - put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb); + put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb12_gpa); svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; @@ -3900,21 +4027,31 @@ static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate) static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) { struct vcpu_svm *svm = to_svm(vcpu); - struct vmcb *nested_vmcb; struct kvm_host_map map; - u64 guest; - u64 vmcb; int ret = 0; - guest = GET_SMSTATE(u64, smstate, 0x7ed8); - vmcb = GET_SMSTATE(u64, smstate, 0x7ee0); + if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) { + u64 saved_efer = GET_SMSTATE(u64, smstate, 0x7ed0); + u64 guest = GET_SMSTATE(u64, smstate, 0x7ed8); + u64 vmcb12_gpa = GET_SMSTATE(u64, smstate, 0x7ee0); - if (guest) { - if (kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb), &map) == -EINVAL) - return 1; - nested_vmcb = map.hva; - ret = enter_svm_guest_mode(svm, vmcb, nested_vmcb); - kvm_vcpu_unmap(&svm->vcpu, &map, true); + if (guest) { + if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM)) + return 1; + + if (!(saved_efer & EFER_SVME)) + return 1; + + if (kvm_vcpu_map(&svm->vcpu, + gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL) + return 1; + + if (svm_allocate_nested(svm)) + return 1; + + ret = enter_svm_guest_mode(svm, vmcb12_gpa, map.hva); + kvm_vcpu_unmap(&svm->vcpu, &map, true); + } } return ret; @@ -3933,19 +4070,10 @@ static void enable_smi_window(struct kvm_vcpu *vcpu) } } -static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu) +static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len) { - unsigned long cr4 = kvm_read_cr4(vcpu); - bool smep = cr4 & X86_CR4_SMEP; - bool smap = cr4 & X86_CR4_SMAP; - bool is_user = svm_get_cpl(vcpu) == 3; - - /* - * If RIP is invalid, go ahead with emulation which will cause an - * internal error exit. - */ - if (!kvm_vcpu_gfn_to_memslot(vcpu, kvm_rip_read(vcpu) >> PAGE_SHIFT)) - return true; + bool smep, smap, is_user; + unsigned long cr4; /* * Detect and workaround Errata 1096 Fam_17h_00_0Fh. @@ -3987,6 +4115,20 @@ static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu) * instruction pointer so we will not able to workaround it. Lets * print the error and request to kill the guest. */ + if (likely(!insn || insn_len)) + return true; + + /* + * If RIP is invalid, go ahead with emulation which will cause an + * internal error exit. + */ + if (!kvm_vcpu_gfn_to_memslot(vcpu, kvm_rip_read(vcpu) >> PAGE_SHIFT)) + return true; + + cr4 = kvm_read_cr4(vcpu); + smep = cr4 & X86_CR4_SMEP; + smap = cr4 & X86_CR4_SMAP; + is_user = svm_get_cpl(vcpu) == 3; if (smap && (!smep || is_user)) { if (!sev_guest(vcpu->kvm)) return true; @@ -4010,7 +4152,7 @@ static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu) * if an INIT signal is pending. */ return !gif_set(svm) || - (svm->vmcb->control.intercept & (1ULL << INTERCEPT_INIT)); + (vmcb_is_intercept(&svm->vmcb->control, INTERCEPT_INIT)); } static void svm_vm_destroy(struct kvm *kvm) @@ -4148,9 +4290,11 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .mem_enc_reg_region = svm_register_enc_region, .mem_enc_unreg_region = svm_unregister_enc_region, - .need_emulation_on_page_fault = svm_need_emulation_on_page_fault, + .can_emulate_instruction = svm_can_emulate_instruction, .apic_init_signal_blocked = svm_apic_init_signal_blocked, + + .msr_filter_changed = svm_msr_filter_changed, }; static struct kvm_x86_init_ops svm_init_ops __initdata = { @@ -4164,6 +4308,8 @@ static struct kvm_x86_init_ops svm_init_ops __initdata = { static int __init svm_init(void) { + __unused_size_checks(); + return kvm_init(&svm_init_ops, sizeof(struct vcpu_svm), __alignof__(struct vcpu_svm), THIS_MODULE); } diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index a798e1731709..1d853fe4c778 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -31,6 +31,7 @@ static const u32 host_save_user_msrs[] = { #define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs) +#define MAX_DIRECT_ACCESS_MSRS 15 #define MSRPM_OFFSETS 16 extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; extern bool npt_enabled; @@ -85,8 +86,7 @@ struct svm_nested_state { struct vmcb *hsave; u64 hsave_msr; u64 vm_cr_msr; - u64 vmcb; - u32 host_intercept_exceptions; + u64 vmcb12_gpa; /* These are the merged vectors */ u32 *msrpm; @@ -97,6 +97,8 @@ struct svm_nested_state { /* cache for control fields of the guest */ struct vmcb_control_area ctl; + + bool initialized; }; struct vcpu_svm { @@ -158,6 +160,12 @@ struct vcpu_svm { */ struct list_head ir_list; spinlock_t ir_list_lock; + + /* Save desired MSR intercept (read: pass-through) state */ + struct { + DECLARE_BITMAP(read, MAX_DIRECT_ACCESS_MSRS); + DECLARE_BITMAP(write, MAX_DIRECT_ACCESS_MSRS); + } shadow_msr_intercept; }; struct svm_cpu_data { @@ -214,51 +222,44 @@ static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm) return svm->vmcb; } -static inline void set_cr_intercept(struct vcpu_svm *svm, int bit) +static inline void vmcb_set_intercept(struct vmcb_control_area *control, u32 bit) { - struct vmcb *vmcb = get_host_vmcb(svm); - - vmcb->control.intercept_cr |= (1U << bit); - - recalc_intercepts(svm); + WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT); + __set_bit(bit, (unsigned long *)&control->intercepts); } -static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit) +static inline void vmcb_clr_intercept(struct vmcb_control_area *control, u32 bit) { - struct vmcb *vmcb = get_host_vmcb(svm); - - vmcb->control.intercept_cr &= ~(1U << bit); - - recalc_intercepts(svm); + WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT); + __clear_bit(bit, (unsigned long *)&control->intercepts); } -static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit) +static inline bool vmcb_is_intercept(struct vmcb_control_area *control, u32 bit) { - struct vmcb *vmcb = get_host_vmcb(svm); - - return vmcb->control.intercept_cr & (1U << bit); + WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT); + return test_bit(bit, (unsigned long *)&control->intercepts); } static inline void set_dr_intercepts(struct vcpu_svm *svm) { struct vmcb *vmcb = get_host_vmcb(svm); - vmcb->control.intercept_dr = (1 << INTERCEPT_DR0_READ) - | (1 << INTERCEPT_DR1_READ) - | (1 << INTERCEPT_DR2_READ) - | (1 << INTERCEPT_DR3_READ) - | (1 << INTERCEPT_DR4_READ) - | (1 << INTERCEPT_DR5_READ) - | (1 << INTERCEPT_DR6_READ) - | (1 << INTERCEPT_DR7_READ) - | (1 << INTERCEPT_DR0_WRITE) - | (1 << INTERCEPT_DR1_WRITE) - | (1 << INTERCEPT_DR2_WRITE) - | (1 << INTERCEPT_DR3_WRITE) - | (1 << INTERCEPT_DR4_WRITE) - | (1 << INTERCEPT_DR5_WRITE) - | (1 << INTERCEPT_DR6_WRITE) - | (1 << INTERCEPT_DR7_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE); recalc_intercepts(svm); } @@ -267,25 +268,27 @@ static inline void clr_dr_intercepts(struct vcpu_svm *svm) { struct vmcb *vmcb = get_host_vmcb(svm); - vmcb->control.intercept_dr = 0; + vmcb->control.intercepts[INTERCEPT_DR] = 0; recalc_intercepts(svm); } -static inline void set_exception_intercept(struct vcpu_svm *svm, int bit) +static inline void set_exception_intercept(struct vcpu_svm *svm, u32 bit) { struct vmcb *vmcb = get_host_vmcb(svm); - vmcb->control.intercept_exceptions |= (1U << bit); + WARN_ON_ONCE(bit >= 32); + vmcb_set_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit); recalc_intercepts(svm); } -static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit) +static inline void clr_exception_intercept(struct vcpu_svm *svm, u32 bit) { struct vmcb *vmcb = get_host_vmcb(svm); - vmcb->control.intercept_exceptions &= ~(1U << bit); + WARN_ON_ONCE(bit >= 32); + vmcb_clr_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit); recalc_intercepts(svm); } @@ -294,7 +297,7 @@ static inline void svm_set_intercept(struct vcpu_svm *svm, int bit) { struct vmcb *vmcb = get_host_vmcb(svm); - vmcb->control.intercept |= (1ULL << bit); + vmcb_set_intercept(&vmcb->control, bit); recalc_intercepts(svm); } @@ -303,14 +306,14 @@ static inline void svm_clr_intercept(struct vcpu_svm *svm, int bit) { struct vmcb *vmcb = get_host_vmcb(svm); - vmcb->control.intercept &= ~(1ULL << bit); + vmcb_clr_intercept(&vmcb->control, bit); recalc_intercepts(svm); } static inline bool svm_is_intercept(struct vcpu_svm *svm, int bit) { - return (svm->vmcb->control.intercept & (1ULL << bit)) != 0; + return vmcb_is_intercept(&svm->vmcb->control, bit); } static inline bool vgif_enabled(struct vcpu_svm *svm) @@ -345,11 +348,15 @@ static inline bool gif_set(struct vcpu_svm *svm) /* svm.c */ #define MSR_CR3_LEGACY_RESERVED_MASK 0xfe7U #define MSR_CR3_LEGACY_PAE_RESERVED_MASK 0x7U -#define MSR_CR3_LONG_RESERVED_MASK 0xfff0000000000fe7U +#define MSR_CR3_LONG_MBZ_MASK 0xfff0000000000000U #define MSR_INVALID 0xffffffffU u32 svm_msrpm_offset(u32 msr); -void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer); +u32 *svm_vcpu_alloc_msrpm(void); +void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm); +void svm_vcpu_free_msrpm(u32 *msrpm); + +int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer); void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); void svm_flush_tlb(struct kvm_vcpu *vcpu); @@ -374,22 +381,24 @@ static inline bool nested_svm_virtualize_tpr(struct kvm_vcpu *vcpu) static inline bool nested_exit_on_smi(struct vcpu_svm *svm) { - return (svm->nested.ctl.intercept & (1ULL << INTERCEPT_SMI)); + return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_SMI); } static inline bool nested_exit_on_intr(struct vcpu_svm *svm) { - return (svm->nested.ctl.intercept & (1ULL << INTERCEPT_INTR)); + return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_INTR); } static inline bool nested_exit_on_nmi(struct vcpu_svm *svm) { - return (svm->nested.ctl.intercept & (1ULL << INTERCEPT_NMI)); + return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_NMI); } int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, struct vmcb *nested_vmcb); void svm_leave_nested(struct vcpu_svm *svm); +void svm_free_nested(struct vcpu_svm *svm); +int svm_allocate_nested(struct vcpu_svm *svm); int nested_svm_vmrun(struct vcpu_svm *svm); void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb); int nested_svm_vmexit(struct vcpu_svm *svm); diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index b66432b015d2..aef960f90f26 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -15,18 +15,20 @@ * Tracepoint for guest mode entry. */ TRACE_EVENT(kvm_entry, - TP_PROTO(unsigned int vcpu_id), - TP_ARGS(vcpu_id), + TP_PROTO(struct kvm_vcpu *vcpu), + TP_ARGS(vcpu), TP_STRUCT__entry( __field( unsigned int, vcpu_id ) + __field( unsigned long, rip ) ), TP_fast_assign( - __entry->vcpu_id = vcpu_id; + __entry->vcpu_id = vcpu->vcpu_id; + __entry->rip = kvm_rip_read(vcpu); ), - TP_printk("vcpu %u", __entry->vcpu_id) + TP_printk("vcpu %u, rip 0x%lx", __entry->vcpu_id, __entry->rip) ); /* @@ -233,36 +235,45 @@ TRACE_EVENT(kvm_apic, (isa == KVM_ISA_VMX) ? \ __print_flags(exit_reason & ~0xffff, " ", VMX_EXIT_REASON_FLAGS) : "" +#define TRACE_EVENT_KVM_EXIT(name) \ +TRACE_EVENT(name, \ + TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu, u32 isa), \ + TP_ARGS(exit_reason, vcpu, isa), \ + \ + TP_STRUCT__entry( \ + __field( unsigned int, exit_reason ) \ + __field( unsigned long, guest_rip ) \ + __field( u32, isa ) \ + __field( u64, info1 ) \ + __field( u64, info2 ) \ + __field( u32, intr_info ) \ + __field( u32, error_code ) \ + __field( unsigned int, vcpu_id ) \ + ), \ + \ + TP_fast_assign( \ + __entry->exit_reason = exit_reason; \ + __entry->guest_rip = kvm_rip_read(vcpu); \ + __entry->isa = isa; \ + __entry->vcpu_id = vcpu->vcpu_id; \ + kvm_x86_ops.get_exit_info(vcpu, &__entry->info1, \ + &__entry->info2, \ + &__entry->intr_info, \ + &__entry->error_code); \ + ), \ + \ + TP_printk("vcpu %u reason %s%s%s rip 0x%lx info1 0x%016llx " \ + "info2 0x%016llx intr_info 0x%08x error_code 0x%08x", \ + __entry->vcpu_id, \ + kvm_print_exit_reason(__entry->exit_reason, __entry->isa), \ + __entry->guest_rip, __entry->info1, __entry->info2, \ + __entry->intr_info, __entry->error_code) \ +) + /* * Tracepoint for kvm guest exit: */ -TRACE_EVENT(kvm_exit, - TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu, u32 isa), - TP_ARGS(exit_reason, vcpu, isa), - - TP_STRUCT__entry( - __field( unsigned int, exit_reason ) - __field( unsigned long, guest_rip ) - __field( u32, isa ) - __field( u64, info1 ) - __field( u64, info2 ) - __field( unsigned int, vcpu_id ) - ), - - TP_fast_assign( - __entry->exit_reason = exit_reason; - __entry->guest_rip = kvm_rip_read(vcpu); - __entry->isa = isa; - __entry->vcpu_id = vcpu->vcpu_id; - kvm_x86_ops.get_exit_info(vcpu, &__entry->info1, - &__entry->info2); - ), - - TP_printk("vcpu %u reason %s%s%s rip 0x%lx info %llx %llx", - __entry->vcpu_id, - kvm_print_exit_reason(__entry->exit_reason, __entry->isa), - __entry->guest_rip, __entry->info1, __entry->info2) -); +TRACE_EVENT_KVM_EXIT(kvm_exit); /* * Tracepoint for kvm interrupt injection: @@ -544,63 +555,38 @@ TRACE_EVENT(kvm_nested_vmrun, ); TRACE_EVENT(kvm_nested_intercepts, - TP_PROTO(__u16 cr_read, __u16 cr_write, __u32 exceptions, __u64 intercept), - TP_ARGS(cr_read, cr_write, exceptions, intercept), + TP_PROTO(__u16 cr_read, __u16 cr_write, __u32 exceptions, + __u32 intercept1, __u32 intercept2, __u32 intercept3), + TP_ARGS(cr_read, cr_write, exceptions, intercept1, + intercept2, intercept3), TP_STRUCT__entry( __field( __u16, cr_read ) __field( __u16, cr_write ) __field( __u32, exceptions ) - __field( __u64, intercept ) + __field( __u32, intercept1 ) + __field( __u32, intercept2 ) + __field( __u32, intercept3 ) ), TP_fast_assign( __entry->cr_read = cr_read; __entry->cr_write = cr_write; __entry->exceptions = exceptions; - __entry->intercept = intercept; + __entry->intercept1 = intercept1; + __entry->intercept2 = intercept2; + __entry->intercept3 = intercept3; ), - TP_printk("cr_read: %04x cr_write: %04x excp: %08x intercept: %016llx", - __entry->cr_read, __entry->cr_write, __entry->exceptions, - __entry->intercept) + TP_printk("cr_read: %04x cr_write: %04x excp: %08x " + "intercepts: %08x %08x %08x", + __entry->cr_read, __entry->cr_write, __entry->exceptions, + __entry->intercept1, __entry->intercept2, __entry->intercept3) ); /* * Tracepoint for #VMEXIT while nested */ -TRACE_EVENT(kvm_nested_vmexit, - TP_PROTO(__u64 rip, __u32 exit_code, - __u64 exit_info1, __u64 exit_info2, - __u32 exit_int_info, __u32 exit_int_info_err, __u32 isa), - TP_ARGS(rip, exit_code, exit_info1, exit_info2, - exit_int_info, exit_int_info_err, isa), - - TP_STRUCT__entry( - __field( __u64, rip ) - __field( __u32, exit_code ) - __field( __u64, exit_info1 ) - __field( __u64, exit_info2 ) - __field( __u32, exit_int_info ) - __field( __u32, exit_int_info_err ) - __field( __u32, isa ) - ), - - TP_fast_assign( - __entry->rip = rip; - __entry->exit_code = exit_code; - __entry->exit_info1 = exit_info1; - __entry->exit_info2 = exit_info2; - __entry->exit_int_info = exit_int_info; - __entry->exit_int_info_err = exit_int_info_err; - __entry->isa = isa; - ), - TP_printk("rip: 0x%016llx reason: %s%s%s ext_inf1: 0x%016llx " - "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x", - __entry->rip, - kvm_print_exit_reason(__entry->exit_code, __entry->isa), - __entry->exit_info1, __entry->exit_info2, - __entry->exit_int_info, __entry->exit_int_info_err) -); +TRACE_EVENT_KVM_EXIT(kvm_nested_vmexit); /* * Tracepoint for #VMEXIT reinjected to the guest diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 4bbd8b448d22..3a1861403d73 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -151,7 +151,7 @@ static inline bool vmx_umip_emulated(void) static inline bool cpu_has_vmx_rdtscp(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_RDTSCP; + SECONDARY_EXEC_ENABLE_RDTSCP; } static inline bool cpu_has_vmx_virtualize_x2apic_mode(void) @@ -196,7 +196,7 @@ static inline bool cpu_has_vmx_ple(void) SECONDARY_EXEC_PAUSE_LOOP_EXITING; } -static inline bool vmx_rdrand_supported(void) +static inline bool cpu_has_vmx_rdrand(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_RDRAND_EXITING; @@ -233,7 +233,7 @@ static inline bool cpu_has_vmx_encls_vmexit(void) SECONDARY_EXEC_ENCLS_EXITING; } -static inline bool vmx_rdseed_supported(void) +static inline bool cpu_has_vmx_rdseed(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_RDSEED_EXITING; @@ -244,13 +244,13 @@ static inline bool cpu_has_vmx_pml(void) return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML; } -static inline bool vmx_xsaves_supported(void) +static inline bool cpu_has_vmx_xsaves(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_XSAVES; } -static inline bool vmx_waitpkg_supported(void) +static inline bool cpu_has_vmx_waitpkg(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 23b58c28a1c9..89af692deb7e 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 -#include <linux/frame.h> +#include <linux/objtool.h> #include <linux/percpu.h> #include <asm/debugreg.h> @@ -233,6 +233,44 @@ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) vmx->nested.hv_evmcs = NULL; } +static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx, + struct loaded_vmcs *prev) +{ + struct vmcs_host_state *dest, *src; + + if (unlikely(!vmx->guest_state_loaded)) + return; + + src = &prev->host_state; + dest = &vmx->loaded_vmcs->host_state; + + vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base); + dest->ldt_sel = src->ldt_sel; +#ifdef CONFIG_X86_64 + dest->ds_sel = src->ds_sel; + dest->es_sel = src->es_sel; +#endif +} + +static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct loaded_vmcs *prev; + int cpu; + + if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs)) + return; + + cpu = get_cpu(); + prev = vmx->loaded_vmcs; + vmx->loaded_vmcs = vmcs; + vmx_vcpu_load_vmcs(vcpu, cpu, prev); + vmx_sync_vmcs_host_state(vmx, prev); + put_cpu(); + + vmx_register_cache_reset(vcpu); +} + /* * Free whatever needs to be freed from vmx->nested when L1 goes down, or * just stops using VMX. @@ -241,10 +279,13 @@ static void free_nested(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01)) + vmx_switch_vmcs(vcpu, &vmx->vmcs01); + if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon) return; - kvm_clear_request(KVM_REQ_GET_VMCS12_PAGES, vcpu); + kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); vmx->nested.vmxon = false; vmx->nested.smm.vmxon = false; @@ -277,44 +318,6 @@ static void free_nested(struct kvm_vcpu *vcpu) free_loaded_vmcs(&vmx->nested.vmcs02); } -static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx, - struct loaded_vmcs *prev) -{ - struct vmcs_host_state *dest, *src; - - if (unlikely(!vmx->guest_state_loaded)) - return; - - src = &prev->host_state; - dest = &vmx->loaded_vmcs->host_state; - - vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base); - dest->ldt_sel = src->ldt_sel; -#ifdef CONFIG_X86_64 - dest->ds_sel = src->ds_sel; - dest->es_sel = src->es_sel; -#endif -} - -static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct loaded_vmcs *prev; - int cpu; - - if (vmx->loaded_vmcs == vmcs) - return; - - cpu = get_cpu(); - prev = vmx->loaded_vmcs; - vmx->loaded_vmcs = vmcs; - vmx_vcpu_load_vmcs(vcpu, cpu, prev); - vmx_sync_vmcs_host_state(vmx, prev); - put_cpu(); - - vmx_register_cache_reset(vcpu); -} - /* * Ensure that the current vmcs of the logical processor is the * vmcs01 of the vcpu before calling free_nested(). @@ -323,8 +326,6 @@ void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu) { vcpu_load(vcpu); vmx_leave_nested(vcpu); - vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01); - free_nested(vcpu); vcpu_put(vcpu); } @@ -938,11 +939,11 @@ static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu, * VM-exit in L0, use the more accurate value. */ if (msr_index == MSR_IA32_TSC) { - int index = vmx_find_msr_index(&vmx->msr_autostore.guest, - MSR_IA32_TSC); + int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest, + MSR_IA32_TSC); - if (index >= 0) { - u64 val = vmx->msr_autostore.guest.val[index].value; + if (i >= 0) { + u64 val = vmx->msr_autostore.guest.val[i].value; *data = kvm_read_l1_tsc(vcpu, val); return true; @@ -1031,16 +1032,16 @@ static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmx_msrs *autostore = &vmx->msr_autostore.guest; bool in_vmcs12_store_list; - int msr_autostore_index; + int msr_autostore_slot; bool in_autostore_list; int last; - msr_autostore_index = vmx_find_msr_index(autostore, msr_index); - in_autostore_list = msr_autostore_index >= 0; + msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index); + in_autostore_list = msr_autostore_slot >= 0; in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index); if (in_vmcs12_store_list && !in_autostore_list) { - if (autostore->nr == NR_LOADSTORE_MSRS) { + if (autostore->nr == MAX_NR_LOADSTORE_MSRS) { /* * Emulated VMEntry does not fail here. Instead a less * accurate value will be returned by @@ -1057,7 +1058,7 @@ static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu, autostore->val[last].index = msr_index; } else if (!in_vmcs12_store_list && in_autostore_list) { last = --autostore->nr; - autostore->val[msr_autostore_index] = autostore->val[last]; + autostore->val[msr_autostore_slot] = autostore->val[last]; } } @@ -2286,7 +2287,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) /* Take the following fields only from vmcs12 */ exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_ENABLE_INVPCID | - SECONDARY_EXEC_RDTSCP | + SECONDARY_EXEC_ENABLE_RDTSCP | SECONDARY_EXEC_XSAVES | SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | @@ -2314,6 +2315,9 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) vmcs_write16(GUEST_INTR_STATUS, vmcs12->guest_intr_status); + if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) + exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; + secondary_exec_controls_set(vmx, exec_control); } @@ -2408,6 +2412,8 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); + + vmx->segment_cache.bitmask = 0; } if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & @@ -2571,7 +2577,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, * which means L1 attempted VMEntry to L2 with invalid state. * Fail the VMEntry. */ - if (vmx->emulation_required) { + if (CC(!vmx_guest_state_valid(vcpu))) { *entry_failure_code = ENTRY_FAIL_DEFAULT; return -EINVAL; } @@ -3344,8 +3350,10 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, prepare_vmcs02_early(vmx, vmcs12); if (from_vmentry) { - if (unlikely(!nested_get_vmcs12_pages(vcpu))) + if (unlikely(!nested_get_vmcs12_pages(vcpu))) { + vmx_switch_vmcs(vcpu, &vmx->vmcs01); return NVMX_VMENTRY_KVM_INTERNAL_ERROR; + } if (nested_vmx_check_vmentry_hw(vcpu)) { vmx_switch_vmcs(vcpu, &vmx->vmcs01); @@ -3387,7 +3395,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, * to nested_get_vmcs12_pages before the next VM-entry. The MSRs * have already been set at vmentry time and should not be reset. */ - kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu); + kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); } /* @@ -3468,11 +3476,11 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) if (evmptrld_status == EVMPTRLD_ERROR) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; - } else if (evmptrld_status == EVMPTRLD_VMFAIL) { + } else if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) { return nested_vmx_failInvalid(vcpu); } - if (!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull) + if (CC(!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull)) return nested_vmx_failInvalid(vcpu); vmcs12 = get_vmcs12(vcpu); @@ -3483,7 +3491,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) * rather than RFLAGS.ZF, and no error number is stored to the * VM-instruction error field. */ - if (vmcs12->hdr.shadow_vmcs) + if (CC(vmcs12->hdr.shadow_vmcs)) return nested_vmx_failInvalid(vcpu); if (vmx->nested.hv_evmcs) { @@ -3504,10 +3512,10 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) * for misconfigurations which will anyway be caught by the processor * when using the merged vmcs02. */ - if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS) + if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)) return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); - if (vmcs12->launch_state == launch) + if (CC(vmcs12->launch_state == launch)) return nested_vmx_fail(vcpu, launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS : VMXERR_VMRESUME_NONLAUNCHED_VMCS); @@ -3528,6 +3536,14 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) if (unlikely(status != NVMX_VMENTRY_SUCCESS)) goto vmentry_failed; + /* Emulate processing of posted interrupts on VM-Enter. */ + if (nested_cpu_has_posted_intr(vmcs12) && + kvm_apic_has_interrupt(vcpu) == vmx->nested.posted_intr_nv) { + vmx->nested.pi_pending = true; + kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_apic_clear_irr(vcpu, vmx->nested.posted_intr_nv); + } + /* Hide L1D cache contents from the nested guest. */ vmx->vcpu.arch.l1tf_flush_l1d = true; @@ -4257,7 +4273,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) { - struct shared_msr_entry *efer_msr; + struct vmx_uret_msr *efer_msr; unsigned int i; if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER) @@ -4271,7 +4287,7 @@ static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) return vmx->msr_autoload.guest.val[i].value; } - efer_msr = find_msr_entry(vmx, MSR_EFER); + efer_msr = vmx_find_uret_msr(vmx, MSR_EFER); if (efer_msr) return efer_msr->data; @@ -4404,6 +4420,14 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) kvm_vcpu_flush_tlb_current(vcpu); + /* + * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between + * now and the new vmentry. Ensure that the VMCS02 PDPTR fields are + * up-to-date before switching to L1. + */ + if (enable_ept && is_pae_paging(vcpu)) + vmx_ept_load_pdptrs(vcpu); + leave_guest_mode(vcpu); if (nested_cpu_has_preemption_timer(vmcs12)) @@ -4668,7 +4692,7 @@ void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu) vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; vmx->nested.msrs.exit_ctls_high &= - ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; + ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; } } @@ -4688,7 +4712,7 @@ static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer, r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e); if (r != X86EMUL_CONTINUE) { - *ret = vmx_handle_memory_failure(vcpu, r, &e); + *ret = kvm_handle_memory_failure(vcpu, r, &e); return -EINVAL; } @@ -4752,7 +4776,7 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) if (vmx_pt_mode_is_host_guest()) { vmx->pt_desc.guest.ctl = 0; - pt_update_intercept_for_msr(vmx); + pt_update_intercept_for_msr(vcpu); } return 0; @@ -4995,7 +5019,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) /* _system ok, nested_vmx_check_permission has verified cpl=0 */ r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e); if (r != X86EMUL_CONTINUE) - return vmx_handle_memory_failure(vcpu, r, &e); + return kvm_handle_memory_failure(vcpu, r, &e); } return nested_vmx_succeed(vcpu); @@ -5068,7 +5092,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) return 1; r = kvm_read_guest_virt(vcpu, gva, &value, len, &e); if (r != X86EMUL_CONTINUE) - return vmx_handle_memory_failure(vcpu, r, &e); + return kvm_handle_memory_failure(vcpu, r, &e); } field = kvm_register_readl(vcpu, (((instr_info) >> 28) & 0xf)); @@ -5230,7 +5254,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) r = kvm_write_guest_virt_system(vcpu, gva, (void *)¤t_vmptr, sizeof(gpa_t), &e); if (r != X86EMUL_CONTINUE) - return vmx_handle_memory_failure(vcpu, r, &e); + return kvm_handle_memory_failure(vcpu, r, &e); return nested_vmx_succeed(vcpu); } @@ -5283,7 +5307,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) return 1; r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); if (r != X86EMUL_CONTINUE) - return vmx_handle_memory_failure(vcpu, r, &e); + return kvm_handle_memory_failure(vcpu, r, &e); /* * Nested EPT roots are always held through guest_mmu, @@ -5365,7 +5389,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) return 1; r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); if (r != X86EMUL_CONTINUE) - return vmx_handle_memory_failure(vcpu, r, &e); + return kvm_handle_memory_failure(vcpu, r, &e); if (operand.vpid >> 16) return nested_vmx_fail(vcpu, @@ -5910,13 +5934,7 @@ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) goto reflect_vmexit; } - exit_intr_info = vmx_get_intr_info(vcpu); - exit_qual = vmx_get_exit_qual(vcpu); - - trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, exit_qual, - vmx->idt_vectoring_info, exit_intr_info, - vmcs_read32(VM_EXIT_INTR_ERROR_CODE), - KVM_ISA_VMX); + trace_kvm_nested_vmexit(exit_reason, vcpu, KVM_ISA_VMX); /* If L0 (KVM) wants the exit, it trumps L1's desires. */ if (nested_vmx_l0_wants_exit(vcpu, exit_reason)) @@ -5932,14 +5950,14 @@ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) * need to be synthesized by querying the in-kernel LAPIC, but external * interrupts are never reflected to L1 so it's a non-issue. */ - if ((exit_intr_info & - (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) == - (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) { + exit_intr_info = vmx_get_intr_info(vcpu); + if (is_exception_with_error_code(exit_intr_info)) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); vmcs12->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); } + exit_qual = vmx_get_exit_qual(vcpu); reflect_vmexit: nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info, exit_qual); @@ -6174,7 +6192,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, * restored yet. EVMCS will be mapped from * nested_get_vmcs12_pages(). */ - kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu); + kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); } else { return -EINVAL; } @@ -6310,7 +6328,8 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) #ifdef CONFIG_X86_64 VM_EXIT_HOST_ADDR_SPACE_SIZE | #endif - VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; + VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT | + VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; msrs->exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | @@ -6329,7 +6348,8 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) #ifdef CONFIG_X86_64 VM_ENTRY_IA32E_MODE | #endif - VM_ENTRY_LOAD_IA32_PAT; + VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS | + VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; msrs->entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER); @@ -6383,7 +6403,7 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) msrs->secondary_ctls_low = 0; msrs->secondary_ctls_high &= SECONDARY_EXEC_DESC | - SECONDARY_EXEC_RDTSCP | + SECONDARY_EXEC_ENABLE_RDTSCP | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | SECONDARY_EXEC_WBINVD_EXITING | SECONDARY_EXEC_APIC_REGISTER_VIRT | @@ -6553,7 +6573,7 @@ struct kvm_x86_nested_ops vmx_nested_ops = { .hv_timer_pending = nested_vmx_preemption_timer_pending, .get_state = vmx_get_nested_state, .set_state = vmx_set_nested_state, - .get_vmcs12_pages = nested_get_vmcs12_pages, + .get_nested_state_pages = nested_get_vmcs12_pages, .write_log_dirty = nested_vmx_write_pml_buffer, .enable_evmcs = nested_enable_evmcs, .get_evmcs_version = nested_get_evmcs_version, diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c new file mode 100644 index 000000000000..f02962dcc72c --- /dev/null +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -0,0 +1,332 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/kvm_host.h> + +#include <asm/irq_remapping.h> +#include <asm/cpu.h> + +#include "lapic.h" +#include "posted_intr.h" +#include "trace.h" +#include "vmx.h" + +/* + * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we + * can find which vCPU should be waken up. + */ +static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); +static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock); + +static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) +{ + return &(to_vmx(vcpu)->pi_desc); +} + +void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + struct pi_desc old, new; + unsigned int dest; + + /* + * In case of hot-plug or hot-unplug, we may have to undo + * vmx_vcpu_pi_put even if there is no assigned device. And we + * always keep PI.NDST up to date for simplicity: it makes the + * code easier, and CPU migration is not a fast path. + */ + if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu) + return; + + /* + * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change + * PI.NDST: pi_post_block is the one expected to change PID.NDST and the + * wakeup handler expects the vCPU to be on the blocked_vcpu_list that + * matches PI.NDST. Otherwise, a vcpu may not be able to be woken up + * correctly. + */ + if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) { + pi_clear_sn(pi_desc); + goto after_clear_sn; + } + + /* The full case. */ + do { + old.control = new.control = pi_desc->control; + + dest = cpu_physical_id(cpu); + + if (x2apic_enabled()) + new.ndst = dest; + else + new.ndst = (dest << 8) & 0xFF00; + + new.sn = 0; + } while (cmpxchg64(&pi_desc->control, old.control, + new.control) != old.control); + +after_clear_sn: + + /* + * Clear SN before reading the bitmap. The VT-d firmware + * writes the bitmap and reads SN atomically (5.2.3 in the + * spec), so it doesn't really have a memory barrier that + * pairs with this, but we cannot do that and we need one. + */ + smp_mb__after_atomic(); + + if (!pi_is_pir_empty(pi_desc)) + pi_set_on(pi_desc); +} + +void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + + if (!kvm_arch_has_assigned_device(vcpu->kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP) || + !kvm_vcpu_apicv_active(vcpu)) + return; + + /* Set SN when the vCPU is preempted */ + if (vcpu->preempted) + pi_set_sn(pi_desc); +} + +static void __pi_post_block(struct kvm_vcpu *vcpu) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + struct pi_desc old, new; + unsigned int dest; + + do { + old.control = new.control = pi_desc->control; + WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR, + "Wakeup handler not enabled while the VCPU is blocked\n"); + + dest = cpu_physical_id(vcpu->cpu); + + if (x2apic_enabled()) + new.ndst = dest; + else + new.ndst = (dest << 8) & 0xFF00; + + /* set 'NV' to 'notification vector' */ + new.nv = POSTED_INTR_VECTOR; + } while (cmpxchg64(&pi_desc->control, old.control, + new.control) != old.control); + + if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) { + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + list_del(&vcpu->blocked_vcpu_list); + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + vcpu->pre_pcpu = -1; + } +} + +/* + * This routine does the following things for vCPU which is going + * to be blocked if VT-d PI is enabled. + * - Store the vCPU to the wakeup list, so when interrupts happen + * we can find the right vCPU to wake up. + * - Change the Posted-interrupt descriptor as below: + * 'NDST' <-- vcpu->pre_pcpu + * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR + * - If 'ON' is set during this process, which means at least one + * interrupt is posted for this vCPU, we cannot block it, in + * this case, return 1, otherwise, return 0. + * + */ +int pi_pre_block(struct kvm_vcpu *vcpu) +{ + unsigned int dest; + struct pi_desc old, new; + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + + if (!kvm_arch_has_assigned_device(vcpu->kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP) || + !kvm_vcpu_apicv_active(vcpu)) + return 0; + + WARN_ON(irqs_disabled()); + local_irq_disable(); + if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) { + vcpu->pre_pcpu = vcpu->cpu; + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + list_add_tail(&vcpu->blocked_vcpu_list, + &per_cpu(blocked_vcpu_on_cpu, + vcpu->pre_pcpu)); + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + } + + do { + old.control = new.control = pi_desc->control; + + WARN((pi_desc->sn == 1), + "Warning: SN field of posted-interrupts " + "is set before blocking\n"); + + /* + * Since vCPU can be preempted during this process, + * vcpu->cpu could be different with pre_pcpu, we + * need to set pre_pcpu as the destination of wakeup + * notification event, then we can find the right vCPU + * to wakeup in wakeup handler if interrupts happen + * when the vCPU is in blocked state. + */ + dest = cpu_physical_id(vcpu->pre_pcpu); + + if (x2apic_enabled()) + new.ndst = dest; + else + new.ndst = (dest << 8) & 0xFF00; + + /* set 'NV' to 'wakeup vector' */ + new.nv = POSTED_INTR_WAKEUP_VECTOR; + } while (cmpxchg64(&pi_desc->control, old.control, + new.control) != old.control); + + /* We should not block the vCPU if an interrupt is posted for it. */ + if (pi_test_on(pi_desc) == 1) + __pi_post_block(vcpu); + + local_irq_enable(); + return (vcpu->pre_pcpu == -1); +} + +void pi_post_block(struct kvm_vcpu *vcpu) +{ + if (vcpu->pre_pcpu == -1) + return; + + WARN_ON(irqs_disabled()); + local_irq_disable(); + __pi_post_block(vcpu); + local_irq_enable(); +} + +/* + * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR. + */ +void pi_wakeup_handler(void) +{ + struct kvm_vcpu *vcpu; + int cpu = smp_processor_id(); + + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); + list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu), + blocked_vcpu_list) { + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + + if (pi_test_on(pi_desc) == 1) + kvm_vcpu_kick(vcpu); + } + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); +} + +void __init pi_init_cpu(int cpu) +{ + INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu)); + spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); +} + +bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + + return pi_test_on(pi_desc) || + (pi_test_sn(pi_desc) && !pi_is_pir_empty(pi_desc)); +} + + +/* + * pi_update_irte - set IRTE for Posted-Interrupts + * + * @kvm: kvm + * @host_irq: host irq of the interrupt + * @guest_irq: gsi of the interrupt + * @set: set or unset PI + * returns 0 on success, < 0 on failure + */ +int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, + bool set) +{ + struct kvm_kernel_irq_routing_entry *e; + struct kvm_irq_routing_table *irq_rt; + struct kvm_lapic_irq irq; + struct kvm_vcpu *vcpu; + struct vcpu_data vcpu_info; + int idx, ret = 0; + + if (!kvm_arch_has_assigned_device(kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP) || + !kvm_vcpu_apicv_active(kvm->vcpus[0])) + return 0; + + idx = srcu_read_lock(&kvm->irq_srcu); + irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); + if (guest_irq >= irq_rt->nr_rt_entries || + hlist_empty(&irq_rt->map[guest_irq])) { + pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n", + guest_irq, irq_rt->nr_rt_entries); + goto out; + } + + hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { + if (e->type != KVM_IRQ_ROUTING_MSI) + continue; + /* + * VT-d PI cannot support posting multicast/broadcast + * interrupts to a vCPU, we still use interrupt remapping + * for these kind of interrupts. + * + * For lowest-priority interrupts, we only support + * those with single CPU as the destination, e.g. user + * configures the interrupts via /proc/irq or uses + * irqbalance to make the interrupts single-CPU. + * + * We will support full lowest-priority interrupt later. + * + * In addition, we can only inject generic interrupts using + * the PI mechanism, refuse to route others through it. + */ + + kvm_set_msi_irq(kvm, e, &irq); + if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) || + !kvm_irq_is_postable(&irq)) { + /* + * Make sure the IRTE is in remapped mode if + * we don't handle it in posted mode. + */ + ret = irq_set_vcpu_affinity(host_irq, NULL); + if (ret < 0) { + printk(KERN_INFO + "failed to back to remapped mode, irq: %u\n", + host_irq); + goto out; + } + + continue; + } + + vcpu_info.pi_desc_addr = __pa(&to_vmx(vcpu)->pi_desc); + vcpu_info.vector = irq.vector; + + trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi, + vcpu_info.vector, vcpu_info.pi_desc_addr, set); + + if (set) + ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); + else + ret = irq_set_vcpu_affinity(host_irq, NULL); + + if (ret < 0) { + printk(KERN_INFO "%s: failed to update PI IRTE\n", + __func__); + goto out; + } + } + + ret = 0; +out: + srcu_read_unlock(&kvm->irq_srcu, idx); + return ret; +} diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h new file mode 100644 index 000000000000..0bdc41391c5b --- /dev/null +++ b/arch/x86/kvm/vmx/posted_intr.h @@ -0,0 +1,99 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_VMX_POSTED_INTR_H +#define __KVM_X86_VMX_POSTED_INTR_H + +#define POSTED_INTR_ON 0 +#define POSTED_INTR_SN 1 + +/* Posted-Interrupt Descriptor */ +struct pi_desc { + u32 pir[8]; /* Posted interrupt requested */ + union { + struct { + /* bit 256 - Outstanding Notification */ + u16 on : 1, + /* bit 257 - Suppress Notification */ + sn : 1, + /* bit 271:258 - Reserved */ + rsvd_1 : 14; + /* bit 279:272 - Notification Vector */ + u8 nv; + /* bit 287:280 - Reserved */ + u8 rsvd_2; + /* bit 319:288 - Notification Destination */ + u32 ndst; + }; + u64 control; + }; + u32 rsvd[6]; +} __aligned(64); + +static inline bool pi_test_and_set_on(struct pi_desc *pi_desc) +{ + return test_and_set_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + +static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc) +{ + return test_and_clear_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + +static inline int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) +{ + return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); +} + +static inline bool pi_is_pir_empty(struct pi_desc *pi_desc) +{ + return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS); +} + +static inline void pi_set_sn(struct pi_desc *pi_desc) +{ + set_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + +static inline void pi_set_on(struct pi_desc *pi_desc) +{ + set_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + +static inline void pi_clear_on(struct pi_desc *pi_desc) +{ + clear_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + +static inline void pi_clear_sn(struct pi_desc *pi_desc) +{ + clear_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + +static inline int pi_test_on(struct pi_desc *pi_desc) +{ + return test_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + +static inline int pi_test_sn(struct pi_desc *pi_desc) +{ + return test_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + +void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu); +void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu); +int pi_pre_block(struct kvm_vcpu *vcpu); +void pi_post_block(struct kvm_vcpu *vcpu); +void pi_wakeup_handler(void); +void __init pi_init_cpu(int cpu); +bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu); +int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, + bool set); + +#endif /* __KVM_X86_VMX_POSTED_INTR_H */ diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index 7a3675fddec2..1472c6c376f7 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h @@ -138,6 +138,13 @@ static inline bool is_external_intr(u32 intr_info) return is_intr_type(intr_info, INTR_TYPE_EXT_INTR); } +static inline bool is_exception_with_error_code(u32 intr_info) +{ + const u32 mask = INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK; + + return (intr_info & mask) == mask; +} + enum vmcs_field_width { VMCS_FIELD_WIDTH_U16 = 0, VMCS_FIELD_WIDTH_U64 = 1, diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 799db084a336..90ad7a6246e3 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -4,6 +4,7 @@ #include <asm/bitsperlong.h> #include <asm/kvm_vcpu_regs.h> #include <asm/nospec-branch.h> +#include <asm/segment.h> #define WORD_SIZE (BITS_PER_LONG / 8) @@ -294,3 +295,36 @@ SYM_FUNC_START(vmread_error_trampoline) ret SYM_FUNC_END(vmread_error_trampoline) + +SYM_FUNC_START(vmx_do_interrupt_nmi_irqoff) + /* + * Unconditionally create a stack frame, getting the correct RSP on the + * stack (for x86-64) would take two instructions anyways, and RBP can + * be used to restore RSP to make objtool happy (see below). + */ + push %_ASM_BP + mov %_ASM_SP, %_ASM_BP + +#ifdef CONFIG_X86_64 + /* + * Align RSP to a 16-byte boundary (to emulate CPU behavior) before + * creating the synthetic interrupt stack frame for the IRQ/NMI. + */ + and $-16, %rsp + push $__KERNEL_DS + push %rbp +#endif + pushf + push $__KERNEL_CS + CALL_NOSPEC _ASM_ARG1 + + /* + * "Restore" RSP from RBP, even though IRET has already unwound RSP to + * the correct value. objtool doesn't know the callee will IRET and, + * without the explicit restore, thinks the stack is getting walloped. + * Using an unwind hint is problematic due to x86-64's dynamic alignment. + */ + mov %_ASM_BP, %_ASM_SP + pop %_ASM_BP + ret +SYM_FUNC_END(vmx_do_interrupt_nmi_irqoff) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 819c185adf09..d14c94d0aff1 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -13,7 +13,6 @@ * Yaniv Kamay <yaniv@qumranet.com> */ -#include <linux/frame.h> #include <linux/highmem.h> #include <linux/hrtimer.h> #include <linux/kernel.h> @@ -22,6 +21,7 @@ #include <linux/moduleparam.h> #include <linux/mod_devicetable.h> #include <linux/mm.h> +#include <linux/objtool.h> #include <linux/sched.h> #include <linux/sched/smt.h> #include <linux/slab.h> @@ -56,7 +56,6 @@ #include "lapic.h" #include "mmu.h" #include "nested.h" -#include "ops.h" #include "pmu.h" #include "trace.h" #include "vmcs.h" @@ -129,6 +128,9 @@ static bool __read_mostly enable_preemption_timer = 1; module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); #endif +extern bool __read_mostly allow_smaller_maxphyaddr; +module_param(allow_smaller_maxphyaddr, bool, S_IRUGO); + #define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD) #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE #define KVM_VM_CR0_ALWAYS_ON \ @@ -146,8 +148,25 @@ module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \ RTIT_STATUS_BYTECNT)) -#define MSR_IA32_RTIT_OUTPUT_BASE_MASK \ - (~((1UL << cpuid_query_maxphyaddr(vcpu)) - 1) | 0x7f) +/* + * List of MSRs that can be directly passed to the guest. + * In addition to these x2apic and PT MSRs are handled specially. + */ +static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = { + MSR_IA32_SPEC_CTRL, + MSR_IA32_PRED_CMD, + MSR_IA32_TSC, + MSR_FS_BASE, + MSR_GS_BASE, + MSR_KERNEL_GS_BASE, + MSR_IA32_SYSENTER_CS, + MSR_IA32_SYSENTER_ESP, + MSR_IA32_SYSENTER_EIP, + MSR_CORE_C1_RES, + MSR_CORE_C3_RESIDENCY, + MSR_CORE_C6_RESIDENCY, + MSR_CORE_C7_RESIDENCY, +}; /* * These 2 parameters are used to config the controls for Pause-Loop Exiting: @@ -341,9 +360,8 @@ static const struct kernel_param_ops vmentry_l1d_flush_ops = { }; module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644); -static bool guest_state_valid(struct kvm_vcpu *vcpu); static u32 vmx_segment_access_rights(struct kvm_segment *var); -static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, +static __always_inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type); void vmx_vmexit(void); @@ -398,13 +416,6 @@ DEFINE_PER_CPU(struct vmcs *, current_vmcs); */ static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); -/* - * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we - * can find which vCPU should be waken up. - */ -static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); -static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock); - static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); static DEFINE_SPINLOCK(vmx_vpid_lock); @@ -447,9 +458,9 @@ static unsigned long host_idt_base; * will emulate SYSCALL in legacy mode if the vendor string in guest * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To * support this emulation, IA32_STAR must always be included in - * vmx_msr_index[], even in i386 builds. + * vmx_uret_msrs_list[], even in i386 builds. */ -const u32 vmx_msr_index[] = { +static const u32 vmx_uret_msrs_list[] = { #ifdef CONFIG_X86_64 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, #endif @@ -623,36 +634,71 @@ static inline bool report_flexpriority(void) return flexpriority_enabled; } -static inline int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) +static int possible_passthrough_msr_slot(u32 msr) +{ + u32 i; + + for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) + if (vmx_possible_passthrough_msrs[i] == msr) + return i; + + return -ENOENT; +} + +static bool is_valid_passthrough_msr(u32 msr) +{ + bool r; + + switch (msr) { + case 0x800 ... 0x8ff: + /* x2APIC MSRs. These are handled in vmx_update_msr_bitmap_x2apic() */ + return true; + case MSR_IA32_RTIT_STATUS: + case MSR_IA32_RTIT_OUTPUT_BASE: + case MSR_IA32_RTIT_OUTPUT_MASK: + case MSR_IA32_RTIT_CR3_MATCH: + case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: + /* PT MSRs. These are handled in pt_update_intercept_for_msr() */ + return true; + } + + r = possible_passthrough_msr_slot(msr) != -ENOENT; + + WARN(!r, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr); + + return r; +} + +static inline int __vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr) { int i; - for (i = 0; i < vmx->nmsrs; ++i) - if (vmx_msr_index[vmx->guest_msrs[i].index] == msr) + for (i = 0; i < vmx->nr_uret_msrs; ++i) + if (vmx_uret_msrs_list[vmx->guest_uret_msrs[i].slot] == msr) return i; return -1; } -struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) +struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr) { int i; - i = __find_msr_index(vmx, msr); + i = __vmx_find_uret_msr(vmx, msr); if (i >= 0) - return &vmx->guest_msrs[i]; + return &vmx->guest_uret_msrs[i]; return NULL; } -static int vmx_set_guest_msr(struct vcpu_vmx *vmx, struct shared_msr_entry *msr, u64 data) +static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx, + struct vmx_uret_msr *msr, u64 data) { int ret = 0; u64 old_msr_data = msr->data; msr->data = data; - if (msr - vmx->guest_msrs < vmx->save_nmsrs) { + if (msr - vmx->guest_uret_msrs < vmx->nr_active_uret_msrs) { preempt_disable(); - ret = kvm_set_shared_msr(msr->index, msr->data, - msr->mask); + ret = kvm_set_user_return_msr(msr->slot, msr->data, msr->mask); preempt_enable(); if (ret) msr->data = old_msr_data; @@ -791,6 +837,18 @@ void update_exception_bitmap(struct kvm_vcpu *vcpu) */ if (is_guest_mode(vcpu)) eb |= get_vmcs12(vcpu)->exception_bitmap; + else { + /* + * If EPT is enabled, #PF is only trapped if MAXPHYADDR is mismatched + * between guest and host. In that case we only care about present + * faults. For vmcs02, however, PFEC_MASK and PFEC_MATCH are set in + * prepare_vmcs02_rare. + */ + bool selective_pf_trap = enable_ept && (eb & (1u << PF_VECTOR)); + int mask = selective_pf_trap ? PFERR_PRESENT_MASK : 0; + vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, mask); + vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, mask); + } vmcs_write32(EXCEPTION_BITMAP, eb); } @@ -825,7 +883,7 @@ static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, vm_exit_controls_clearbit(vmx, exit); } -int vmx_find_msr_index(struct vmx_msrs *m, u32 msr) +int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr) { unsigned int i; @@ -859,7 +917,7 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) } break; } - i = vmx_find_msr_index(&m->guest, msr); + i = vmx_find_loadstore_msr_slot(&m->guest, msr); if (i < 0) goto skip_guest; --m->guest.nr; @@ -867,7 +925,7 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); skip_guest: - i = vmx_find_msr_index(&m->host, msr); + i = vmx_find_loadstore_msr_slot(&m->host, msr); if (i < 0) return; @@ -926,12 +984,12 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, wrmsrl(MSR_IA32_PEBS_ENABLE, 0); } - i = vmx_find_msr_index(&m->guest, msr); + i = vmx_find_loadstore_msr_slot(&m->guest, msr); if (!entry_only) - j = vmx_find_msr_index(&m->host, msr); + j = vmx_find_loadstore_msr_slot(&m->host, msr); - if ((i < 0 && m->guest.nr == NR_LOADSTORE_MSRS) || - (j < 0 && m->host.nr == NR_LOADSTORE_MSRS)) { + if ((i < 0 && m->guest.nr == MAX_NR_LOADSTORE_MSRS) || + (j < 0 && m->host.nr == MAX_NR_LOADSTORE_MSRS)) { printk_once(KERN_WARNING "Not enough msr switch entries. " "Can't add msr %x\n", msr); return; @@ -954,10 +1012,11 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, m->host.val[j].value = host_val; } -static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) +static bool update_transition_efer(struct vcpu_vmx *vmx) { u64 guest_efer = vmx->vcpu.arch.efer; u64 ignore_bits = 0; + int i; /* Shadow paging assumes NX to be available. */ if (!enable_ept) @@ -989,17 +1048,21 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) else clear_atomic_switch_msr(vmx, MSR_EFER); return false; - } else { - clear_atomic_switch_msr(vmx, MSR_EFER); + } + + i = __vmx_find_uret_msr(vmx, MSR_EFER); + if (i < 0) + return false; - guest_efer &= ~ignore_bits; - guest_efer |= host_efer & ignore_bits; + clear_atomic_switch_msr(vmx, MSR_EFER); - vmx->guest_msrs[efer_offset].data = guest_efer; - vmx->guest_msrs[efer_offset].mask = ~ignore_bits; + guest_efer &= ~ignore_bits; + guest_efer |= host_efer & ignore_bits; - return true; - } + vmx->guest_uret_msrs[i].data = guest_efer; + vmx->guest_uret_msrs[i].mask = ~ignore_bits; + + return true; } #ifdef CONFIG_X86_32 @@ -1037,6 +1100,12 @@ static inline bool pt_can_write_msr(struct vcpu_vmx *vmx) !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN); } +static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base) +{ + /* The base must be 128-byte aligned and a legal physical address. */ + return !kvm_vcpu_is_illegal_gpa(vcpu, base) && !(base & 0x7f); +} + static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range) { u32 i; @@ -1141,12 +1210,12 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) * when guest state is loaded. This happens when guest transitions * to/from long-mode by setting MSR_EFER.LMA. */ - if (!vmx->guest_msrs_ready) { - vmx->guest_msrs_ready = true; - for (i = 0; i < vmx->save_nmsrs; ++i) - kvm_set_shared_msr(vmx->guest_msrs[i].index, - vmx->guest_msrs[i].data, - vmx->guest_msrs[i].mask); + if (!vmx->guest_uret_msrs_loaded) { + vmx->guest_uret_msrs_loaded = true; + for (i = 0; i < vmx->nr_active_uret_msrs; ++i) + kvm_set_user_return_msr(vmx->guest_uret_msrs[i].slot, + vmx->guest_uret_msrs[i].data, + vmx->guest_uret_msrs[i].mask); } @@ -1230,7 +1299,7 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) #endif load_fixmap_gdt(raw_smp_processor_id()); vmx->guest_state_loaded = false; - vmx->guest_msrs_ready = false; + vmx->guest_uret_msrs_loaded = false; } #ifdef CONFIG_X86_64 @@ -1253,62 +1322,6 @@ static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data) } #endif -static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) -{ - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - struct pi_desc old, new; - unsigned int dest; - - /* - * In case of hot-plug or hot-unplug, we may have to undo - * vmx_vcpu_pi_put even if there is no assigned device. And we - * always keep PI.NDST up to date for simplicity: it makes the - * code easier, and CPU migration is not a fast path. - */ - if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu) - return; - - /* - * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change - * PI.NDST: pi_post_block is the one expected to change PID.NDST and the - * wakeup handler expects the vCPU to be on the blocked_vcpu_list that - * matches PI.NDST. Otherwise, a vcpu may not be able to be woken up - * correctly. - */ - if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) { - pi_clear_sn(pi_desc); - goto after_clear_sn; - } - - /* The full case. */ - do { - old.control = new.control = pi_desc->control; - - dest = cpu_physical_id(cpu); - - if (x2apic_enabled()) - new.ndst = dest; - else - new.ndst = (dest << 8) & 0xFF00; - - new.sn = 0; - } while (cmpxchg64(&pi_desc->control, old.control, - new.control) != old.control); - -after_clear_sn: - - /* - * Clear SN before reading the bitmap. The VT-d firmware - * writes the bitmap and reads SN atomically (5.2.3 in the - * spec), so it doesn't really have a memory barrier that - * pairs with this, but we cannot do that and we need one. - */ - smp_mb__after_atomic(); - - if (!pi_is_pir_empty(pi_desc)) - pi_set_on(pi_desc); -} - void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, struct loaded_vmcs *buddy) { @@ -1392,20 +1405,6 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) vmx->host_debugctlmsr = get_debugctlmsr(); } -static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) -{ - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - - if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(vcpu)) - return; - - /* Set SN when the vCPU is preempted */ - if (vcpu->preempted) - pi_set_sn(pi_desc); -} - static void vmx_vcpu_put(struct kvm_vcpu *vcpu) { vmx_vcpu_pi_put(vcpu); @@ -1415,7 +1414,7 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu) static bool emulation_required(struct kvm_vcpu *vcpu) { - return emulate_invalid_guest_state && !guest_state_valid(vcpu); + return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu); } unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) @@ -1441,7 +1440,7 @@ void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long old_rflags; - if (enable_unrestricted_guest) { + if (is_unrestricted_guest(vcpu)) { kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS); vmx->rflags = rflags; vmcs_writel(GUEST_RFLAGS, rflags); @@ -1561,6 +1560,11 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) return 0; } +static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len) +{ + return true; +} + static int skip_emulated_instruction(struct kvm_vcpu *vcpu) { unsigned long rip, orig_rip; @@ -1599,33 +1603,6 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu) } /* - * Handles kvm_read/write_guest_virt*() result and either injects #PF or returns - * KVM_EXIT_INTERNAL_ERROR for cases not currently handled by KVM. Return value - * indicates whether exit to userspace is needed. - */ -int vmx_handle_memory_failure(struct kvm_vcpu *vcpu, int r, - struct x86_exception *e) -{ - if (r == X86EMUL_PROPAGATE_FAULT) { - kvm_inject_emulated_page_fault(vcpu, e); - return 1; - } - - /* - * In case kvm_read/write_guest_virt*() failed with X86EMUL_IO_NEEDED - * while handling a VMX instruction KVM could've handled the request - * correctly by exiting to userspace and performing I/O but there - * doesn't seem to be a real use-case behind such requests, just return - * KVM_EXIT_INTERNAL_ERROR for now. - */ - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; - - return 0; -} - -/* * Recognizes a pending MTF VM-exit and records the nested state for later * delivery. */ @@ -1708,16 +1685,19 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu) vmx_clear_hlt(vcpu); } -/* - * Swap MSR entry in host/guest MSR entry array. - */ -static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) +static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr) { - struct shared_msr_entry tmp; + struct vmx_uret_msr tmp; + int from, to; + + from = __vmx_find_uret_msr(vmx, msr); + if (from < 0) + return; + to = vmx->nr_active_uret_msrs++; - tmp = vmx->guest_msrs[to]; - vmx->guest_msrs[to] = vmx->guest_msrs[from]; - vmx->guest_msrs[from] = tmp; + tmp = vmx->guest_uret_msrs[to]; + vmx->guest_uret_msrs[to] = vmx->guest_uret_msrs[from]; + vmx->guest_uret_msrs[from] = tmp; } /* @@ -1727,38 +1707,26 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) */ static void setup_msrs(struct vcpu_vmx *vmx) { - int save_nmsrs, index; - - save_nmsrs = 0; + vmx->guest_uret_msrs_loaded = false; + vmx->nr_active_uret_msrs = 0; #ifdef CONFIG_X86_64 /* * The SYSCALL MSRs are only needed on long mode guests, and only * when EFER.SCE is set. */ if (is_long_mode(&vmx->vcpu) && (vmx->vcpu.arch.efer & EFER_SCE)) { - index = __find_msr_index(vmx, MSR_STAR); - if (index >= 0) - move_msr_up(vmx, index, save_nmsrs++); - index = __find_msr_index(vmx, MSR_LSTAR); - if (index >= 0) - move_msr_up(vmx, index, save_nmsrs++); - index = __find_msr_index(vmx, MSR_SYSCALL_MASK); - if (index >= 0) - move_msr_up(vmx, index, save_nmsrs++); + vmx_setup_uret_msr(vmx, MSR_STAR); + vmx_setup_uret_msr(vmx, MSR_LSTAR); + vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK); } #endif - index = __find_msr_index(vmx, MSR_EFER); - if (index >= 0 && update_transition_efer(vmx, index)) - move_msr_up(vmx, index, save_nmsrs++); - index = __find_msr_index(vmx, MSR_TSC_AUX); - if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP)) - move_msr_up(vmx, index, save_nmsrs++); - index = __find_msr_index(vmx, MSR_IA32_TSX_CTRL); - if (index >= 0) - move_msr_up(vmx, index, save_nmsrs++); - - vmx->save_nmsrs = save_nmsrs; - vmx->guest_msrs_ready = false; + if (update_transition_efer(vmx)) + vmx_setup_uret_msr(vmx, MSR_EFER); + + if (guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP)) + vmx_setup_uret_msr(vmx, MSR_TSC_AUX); + + vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL); if (cpu_has_vmx_msr_bitmap()) vmx_update_msr_bitmap(&vmx->vcpu); @@ -1828,7 +1796,7 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr) static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct shared_msr_entry *msr; + struct vmx_uret_msr *msr; u32 index; switch (msr_info->index) { @@ -1849,7 +1817,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!msr_info->host_initiated && !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR)) return 1; - goto find_shared_msr; + goto find_uret_msr; case MSR_IA32_UMWAIT_CONTROL: if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx)) return 1; @@ -1956,10 +1924,10 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!msr_info->host_initiated && !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) return 1; - goto find_shared_msr; + goto find_uret_msr; default: - find_shared_msr: - msr = find_msr_entry(vmx, msr_info->index); + find_uret_msr: + msr = vmx_find_uret_msr(vmx, msr_info->index); if (msr) { msr_info->data = msr->data; break; @@ -1988,7 +1956,7 @@ static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu, static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct shared_msr_entry *msr; + struct vmx_uret_msr *msr; int ret = 0; u32 msr_index = msr_info->index; u64 data = msr_info->data; @@ -2082,7 +2050,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * in the merging. We update the vmcs01 here for L1 as well * since it will end up touching the MSR anyway now. */ - vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW); break; @@ -2092,7 +2060,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR)) return 1; - goto find_shared_msr; + goto find_uret_msr; case MSR_IA32_PRED_CMD: if (!msr_info->host_initiated && !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) @@ -2118,8 +2086,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * vmcs02.msr_bitmap here since it gets completely overwritten * in the merging. */ - vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD, - MSR_TYPE_W); + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W); break; case MSR_IA32_CR_PAT: if (!kvm_pat_valid(data)) @@ -2169,7 +2136,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vmcs_write64(GUEST_IA32_RTIT_CTL, data); vmx->pt_desc.guest.ctl = data; - pt_update_intercept_for_msr(vmx); + pt_update_intercept_for_msr(vcpu); break; case MSR_IA32_RTIT_STATUS: if (!pt_can_write_msr(vmx)) @@ -2194,7 +2161,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) !intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_single_range_output)) return 1; - if (data & MSR_IA32_RTIT_OUTPUT_BASE_MASK) + if (!pt_output_base_valid(vcpu, data)) return 1; vmx->pt_desc.guest.output_base = data; break; @@ -2229,13 +2196,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) /* Check reserved bit, higher 32 bits should be zero */ if ((data >> 32) != 0) return 1; - goto find_shared_msr; + goto find_uret_msr; default: - find_shared_msr: - msr = find_msr_entry(vmx, msr_index); + find_uret_msr: + msr = vmx_find_uret_msr(vmx, msr_index); if (msr) - ret = vmx_set_guest_msr(vmx, msr, data); + ret = vmx_set_guest_uret_msr(vmx, msr, data); else ret = kvm_set_msr_common(vcpu, msr_info); } @@ -2267,7 +2234,8 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits; break; case VCPU_EXREG_CR3: - if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu))) + if (is_unrestricted_guest(vcpu) || + (enable_ept && is_paging(vcpu))) vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); break; case VCPU_EXREG_CR4: @@ -2448,7 +2416,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, SECONDARY_EXEC_UNRESTRICTED_GUEST | SECONDARY_EXEC_PAUSE_LOOP_EXITING | SECONDARY_EXEC_DESC | - SECONDARY_EXEC_RDTSCP | + SECONDARY_EXEC_ENABLE_RDTSCP | SECONDARY_EXEC_ENABLE_INVPCID | SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | @@ -2862,13 +2830,14 @@ static void enter_rmode(struct kvm_vcpu *vcpu) kvm_mmu_reset_context(vcpu); } -void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) +int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); + struct vmx_uret_msr *msr = vmx_find_uret_msr(vmx, MSR_EFER); + /* Nothing to do if hardware doesn't support EFER. */ if (!msr) - return; + return 0; vcpu->arch.efer = efer; if (efer & EFER_LMA) { @@ -2880,6 +2849,7 @@ void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) msr->data = efer & ~EFER_LME; } setup_msrs(vmx); + return 0; } #ifdef CONFIG_X86_64 @@ -2971,7 +2941,7 @@ static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu) vpid_sync_context(to_vmx(vcpu)->vpid); } -static void ept_load_pdptrs(struct kvm_vcpu *vcpu) +void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.walk_mmu; @@ -3033,7 +3003,7 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) unsigned long hw_cr0; hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF); - if (enable_unrestricted_guest) + if (is_unrestricted_guest(vcpu)) hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; else { hw_cr0 |= KVM_VM_CR0_ALWAYS_ON; @@ -3054,7 +3024,7 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) } #endif - if (enable_ept && !enable_unrestricted_guest) + if (enable_ept && !is_unrestricted_guest(vcpu)) ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); vmcs_writel(CR0_READ_SHADOW, cr0); @@ -3114,7 +3084,7 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd, guest_cr3 = vcpu->arch.cr3; else /* vmcs01.GUEST_CR3 is already up-to-date. */ update_guest_cr3 = false; - ept_load_pdptrs(vcpu); + vmx_ept_load_pdptrs(vcpu); } else { guest_cr3 = pgd; } @@ -3134,7 +3104,7 @@ int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) unsigned long hw_cr4; hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE); - if (enable_unrestricted_guest) + if (is_unrestricted_guest(vcpu)) hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST; else if (vmx->rmode.vm86_active) hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON; @@ -3169,7 +3139,7 @@ int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) vcpu->arch.cr4 = cr4; kvm_register_mark_available(vcpu, VCPU_EXREG_CR4); - if (!enable_unrestricted_guest) { + if (!is_unrestricted_guest(vcpu)) { if (enable_ept) { if (!is_paging(vcpu)) { hw_cr4 &= ~X86_CR4_PAE; @@ -3309,7 +3279,7 @@ void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) * tree. Newer qemu binaries with that qemu fix would not need this * kvm hack. */ - if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR)) + if (is_unrestricted_guest(vcpu) && (seg != VCPU_SREG_LDTR)) var->type |= 0x1; /* Accessed */ vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var)); @@ -3498,11 +3468,8 @@ static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) * not. * We assume that registers are always usable */ -static bool guest_state_valid(struct kvm_vcpu *vcpu) +bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu) { - if (enable_unrestricted_guest) - return true; - /* real mode guest state checks */ if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) { if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) @@ -3688,11 +3655,52 @@ void free_vpid(int vpid) spin_unlock(&vmx_vpid_lock); } -static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, - u32 msr, int type) +static void vmx_clear_msr_bitmap_read(ulong *msr_bitmap, u32 msr) +{ + int f = sizeof(unsigned long); + + if (msr <= 0x1fff) + __clear_bit(msr, msr_bitmap + 0x000 / f); + else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) + __clear_bit(msr & 0x1fff, msr_bitmap + 0x400 / f); +} + +static void vmx_clear_msr_bitmap_write(ulong *msr_bitmap, u32 msr) +{ + int f = sizeof(unsigned long); + + if (msr <= 0x1fff) + __clear_bit(msr, msr_bitmap + 0x800 / f); + else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) + __clear_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f); +} + +static void vmx_set_msr_bitmap_read(ulong *msr_bitmap, u32 msr) +{ + int f = sizeof(unsigned long); + + if (msr <= 0x1fff) + __set_bit(msr, msr_bitmap + 0x000 / f); + else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) + __set_bit(msr & 0x1fff, msr_bitmap + 0x400 / f); +} + +static void vmx_set_msr_bitmap_write(ulong *msr_bitmap, u32 msr) { int f = sizeof(unsigned long); + if (msr <= 0x1fff) + __set_bit(msr, msr_bitmap + 0x800 / f); + else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) + __set_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f); +} + +static __always_inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, + u32 msr, int type) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; + if (!cpu_has_vmx_msr_bitmap()) return; @@ -3700,36 +3708,44 @@ static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bit evmcs_touch_msr_bitmap(); /* - * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals - * have the write-low and read-high bitmap offsets the wrong way round. - * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. - */ - if (msr <= 0x1fff) { - if (type & MSR_TYPE_R) - /* read-low */ - __clear_bit(msr, msr_bitmap + 0x000 / f); + * Mark the desired intercept state in shadow bitmap, this is needed + * for resync when the MSR filters change. + */ + if (is_valid_passthrough_msr(msr)) { + int idx = possible_passthrough_msr_slot(msr); + + if (idx != -ENOENT) { + if (type & MSR_TYPE_R) + clear_bit(idx, vmx->shadow_msr_intercept.read); + if (type & MSR_TYPE_W) + clear_bit(idx, vmx->shadow_msr_intercept.write); + } + } - if (type & MSR_TYPE_W) - /* write-low */ - __clear_bit(msr, msr_bitmap + 0x800 / f); + if ((type & MSR_TYPE_R) && + !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) { + vmx_set_msr_bitmap_read(msr_bitmap, msr); + type &= ~MSR_TYPE_R; + } - } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { - msr &= 0x1fff; - if (type & MSR_TYPE_R) - /* read-high */ - __clear_bit(msr, msr_bitmap + 0x400 / f); + if ((type & MSR_TYPE_W) && + !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) { + vmx_set_msr_bitmap_write(msr_bitmap, msr); + type &= ~MSR_TYPE_W; + } - if (type & MSR_TYPE_W) - /* write-high */ - __clear_bit(msr, msr_bitmap + 0xc00 / f); + if (type & MSR_TYPE_R) + vmx_clear_msr_bitmap_read(msr_bitmap, msr); - } + if (type & MSR_TYPE_W) + vmx_clear_msr_bitmap_write(msr_bitmap, msr); } -static __always_inline void vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, +static __always_inline void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) { - int f = sizeof(unsigned long); + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; if (!cpu_has_vmx_msr_bitmap()) return; @@ -3738,39 +3754,34 @@ static __always_inline void vmx_enable_intercept_for_msr(unsigned long *msr_bitm evmcs_touch_msr_bitmap(); /* - * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals - * have the write-low and read-high bitmap offsets the wrong way round. - * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. - */ - if (msr <= 0x1fff) { - if (type & MSR_TYPE_R) - /* read-low */ - __set_bit(msr, msr_bitmap + 0x000 / f); - - if (type & MSR_TYPE_W) - /* write-low */ - __set_bit(msr, msr_bitmap + 0x800 / f); - - } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { - msr &= 0x1fff; - if (type & MSR_TYPE_R) - /* read-high */ - __set_bit(msr, msr_bitmap + 0x400 / f); + * Mark the desired intercept state in shadow bitmap, this is needed + * for resync when the MSR filter changes. + */ + if (is_valid_passthrough_msr(msr)) { + int idx = possible_passthrough_msr_slot(msr); + + if (idx != -ENOENT) { + if (type & MSR_TYPE_R) + set_bit(idx, vmx->shadow_msr_intercept.read); + if (type & MSR_TYPE_W) + set_bit(idx, vmx->shadow_msr_intercept.write); + } + } - if (type & MSR_TYPE_W) - /* write-high */ - __set_bit(msr, msr_bitmap + 0xc00 / f); + if (type & MSR_TYPE_R) + vmx_set_msr_bitmap_read(msr_bitmap, msr); - } + if (type & MSR_TYPE_W) + vmx_set_msr_bitmap_write(msr_bitmap, msr); } -static __always_inline void vmx_set_intercept_for_msr(unsigned long *msr_bitmap, - u32 msr, int type, bool value) +static __always_inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, + u32 msr, int type, bool value) { if (value) - vmx_enable_intercept_for_msr(msr_bitmap, msr, type); + vmx_enable_intercept_for_msr(vcpu, msr, type); else - vmx_disable_intercept_for_msr(msr_bitmap, msr, type); + vmx_disable_intercept_for_msr(vcpu, msr, type); } static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu) @@ -3788,35 +3799,47 @@ static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu) return mode; } -static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap, - u8 mode) +static void vmx_reset_x2apic_msrs(struct kvm_vcpu *vcpu, u8 mode) { + unsigned long *msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap; + unsigned long read_intercept; int msr; + read_intercept = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0; + for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { - unsigned word = msr / BITS_PER_LONG; - msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0; - msr_bitmap[word + (0x800 / sizeof(long))] = ~0; + unsigned int read_idx = msr / BITS_PER_LONG; + unsigned int write_idx = read_idx + (0x800 / sizeof(long)); + + msr_bitmap[read_idx] = read_intercept; + msr_bitmap[write_idx] = ~0ul; } +} - if (mode & MSR_BITMAP_MODE_X2APIC) { - /* - * TPR reads and writes can be virtualized even if virtual interrupt - * delivery is not in use. - */ - vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW); - if (mode & MSR_BITMAP_MODE_X2APIC_APICV) { - vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R); - vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); - vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); - } +static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu, u8 mode) +{ + if (!cpu_has_vmx_msr_bitmap()) + return; + + vmx_reset_x2apic_msrs(vcpu, mode); + + /* + * TPR reads and writes can be virtualized even if virtual interrupt + * delivery is not in use. + */ + vmx_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW, + !(mode & MSR_BITMAP_MODE_X2APIC)); + + if (mode & MSR_BITMAP_MODE_X2APIC_APICV) { + vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); + vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); } } void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; u8 mode = vmx_msr_bitmap_mode(vcpu); u8 changed = mode ^ vmx->msr_bitmap_mode; @@ -3824,30 +3847,24 @@ void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu) return; if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV)) - vmx_update_msr_bitmap_x2apic(msr_bitmap, mode); + vmx_update_msr_bitmap_x2apic(vcpu, mode); vmx->msr_bitmap_mode = mode; } -void pt_update_intercept_for_msr(struct vcpu_vmx *vmx) +void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu) { - unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; + struct vcpu_vmx *vmx = to_vmx(vcpu); bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN); u32 i; - vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_STATUS, - MSR_TYPE_RW, flag); - vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_BASE, - MSR_TYPE_RW, flag); - vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_MASK, - MSR_TYPE_RW, flag); - vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_CR3_MATCH, - MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_STATUS, MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_BASE, MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_MASK, MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_CR3_MATCH, MSR_TYPE_RW, flag); for (i = 0; i < vmx->pt_desc.addr_range; i++) { - vmx_set_intercept_for_msr(msr_bitmap, - MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag); - vmx_set_intercept_for_msr(msr_bitmap, - MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag); } } @@ -3871,6 +3888,29 @@ static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) return ((rvi & 0xf0) > (vppr & 0xf0)); } +static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 i; + + /* + * Set intercept permissions for all potentially passed through MSRs + * again. They will automatically get filtered through the MSR filter, + * so we are back in sync after this. + */ + for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) { + u32 msr = vmx_possible_passthrough_msrs[i]; + bool read = test_bit(i, vmx->shadow_msr_intercept.read); + bool write = test_bit(i, vmx->shadow_msr_intercept.write); + + vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_R, read); + vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_W, write); + } + + pt_update_intercept_for_msr(vcpu); + vmx_update_msr_bitmap_x2apic(vcpu, vmx_msr_bitmap_mode(vcpu)); +} + static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, bool nested) { @@ -4028,13 +4068,16 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx) void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) { - vmx->vcpu.arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS; + struct kvm_vcpu *vcpu = &vmx->vcpu; + + vcpu->arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS & + ~vcpu->arch.cr4_guest_rsvd_bits; if (!enable_ept) - vmx->vcpu.arch.cr4_guest_owned_bits &= ~X86_CR4_PGE; + vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PGE; if (is_guest_mode(&vmx->vcpu)) - vmx->vcpu.arch.cr4_guest_owned_bits &= - ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask; - vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); + vcpu->arch.cr4_guest_owned_bits &= + ~get_vmcs12(vcpu)->cr4_guest_host_mask; + vmcs_writel(CR4_GUEST_HOST_MASK, ~vcpu->arch.cr4_guest_owned_bits); } u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) @@ -4099,6 +4142,61 @@ u32 vmx_exec_control(struct vcpu_vmx *vmx) return exec_control; } +/* + * Adjust a single secondary execution control bit to intercept/allow an + * instruction in the guest. This is usually done based on whether or not a + * feature has been exposed to the guest in order to correctly emulate faults. + */ +static inline void +vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control, + u32 control, bool enabled, bool exiting) +{ + /* + * If the control is for an opt-in feature, clear the control if the + * feature is not exposed to the guest, i.e. not enabled. If the + * control is opt-out, i.e. an exiting control, clear the control if + * the feature _is_ exposed to the guest, i.e. exiting/interception is + * disabled for the associated instruction. Note, the caller is + * responsible presetting exec_control to set all supported bits. + */ + if (enabled == exiting) + *exec_control &= ~control; + + /* + * Update the nested MSR settings so that a nested VMM can/can't set + * controls for features that are/aren't exposed to the guest. + */ + if (nested) { + if (enabled) + vmx->nested.msrs.secondary_ctls_high |= control; + else + vmx->nested.msrs.secondary_ctls_high &= ~control; + } +} + +/* + * Wrapper macro for the common case of adjusting a secondary execution control + * based on a single guest CPUID bit, with a dedicated feature bit. This also + * verifies that the control is actually supported by KVM and hardware. + */ +#define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \ +({ \ + bool __enabled; \ + \ + if (cpu_has_vmx_##name()) { \ + __enabled = guest_cpuid_has(&(vmx)->vcpu, \ + X86_FEATURE_##feat_name); \ + vmx_adjust_secondary_exec_control(vmx, exec_control, \ + SECONDARY_EXEC_##ctrl_name, __enabled, exiting); \ + } \ +}) + +/* More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls. */ +#define vmx_adjust_sec_exec_feature(vmx, exec_control, lname, uname) \ + vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, ENABLE_##uname, false) + +#define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \ + vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true) static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) { @@ -4139,7 +4237,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) if (!enable_pml) exec_control &= ~SECONDARY_EXEC_ENABLE_PML; - if (vmx_xsaves_supported()) { + if (cpu_has_vmx_xsaves()) { /* Exposing XSAVES only when XSAVE is exposed */ bool xsaves_enabled = boot_cpu_has(X86_FEATURE_XSAVE) && @@ -4148,101 +4246,29 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) vcpu->arch.xsaves_enabled = xsaves_enabled; - if (!xsaves_enabled) - exec_control &= ~SECONDARY_EXEC_XSAVES; - - if (nested) { - if (xsaves_enabled) - vmx->nested.msrs.secondary_ctls_high |= - SECONDARY_EXEC_XSAVES; - else - vmx->nested.msrs.secondary_ctls_high &= - ~SECONDARY_EXEC_XSAVES; - } + vmx_adjust_secondary_exec_control(vmx, &exec_control, + SECONDARY_EXEC_XSAVES, + xsaves_enabled, false); } - if (cpu_has_vmx_rdtscp()) { - bool rdtscp_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP); - if (!rdtscp_enabled) - exec_control &= ~SECONDARY_EXEC_RDTSCP; + vmx_adjust_sec_exec_feature(vmx, &exec_control, rdtscp, RDTSCP); - if (nested) { - if (rdtscp_enabled) - vmx->nested.msrs.secondary_ctls_high |= - SECONDARY_EXEC_RDTSCP; - else - vmx->nested.msrs.secondary_ctls_high &= - ~SECONDARY_EXEC_RDTSCP; - } - } - - if (cpu_has_vmx_invpcid()) { - /* Exposing INVPCID only when PCID is exposed */ - bool invpcid_enabled = - guest_cpuid_has(vcpu, X86_FEATURE_INVPCID) && - guest_cpuid_has(vcpu, X86_FEATURE_PCID); - - if (!invpcid_enabled) { - exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; - guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID); - } - - if (nested) { - if (invpcid_enabled) - vmx->nested.msrs.secondary_ctls_high |= - SECONDARY_EXEC_ENABLE_INVPCID; - else - vmx->nested.msrs.secondary_ctls_high &= - ~SECONDARY_EXEC_ENABLE_INVPCID; - } - } - - if (vmx_rdrand_supported()) { - bool rdrand_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDRAND); - if (rdrand_enabled) - exec_control &= ~SECONDARY_EXEC_RDRAND_EXITING; - - if (nested) { - if (rdrand_enabled) - vmx->nested.msrs.secondary_ctls_high |= - SECONDARY_EXEC_RDRAND_EXITING; - else - vmx->nested.msrs.secondary_ctls_high &= - ~SECONDARY_EXEC_RDRAND_EXITING; - } - } - - if (vmx_rdseed_supported()) { - bool rdseed_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDSEED); - if (rdseed_enabled) - exec_control &= ~SECONDARY_EXEC_RDSEED_EXITING; - - if (nested) { - if (rdseed_enabled) - vmx->nested.msrs.secondary_ctls_high |= - SECONDARY_EXEC_RDSEED_EXITING; - else - vmx->nested.msrs.secondary_ctls_high &= - ~SECONDARY_EXEC_RDSEED_EXITING; - } - } + /* + * Expose INVPCID if and only if PCID is also exposed to the guest. + * INVPCID takes a #UD when it's disabled in the VMCS, but a #GP or #PF + * if CR4.PCIDE=0. Enumerating CPUID.INVPCID=1 would lead to incorrect + * behavior from the guest perspective (it would expect #GP or #PF). + */ + if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID)) + guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID); + vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID); - if (vmx_waitpkg_supported()) { - bool waitpkg_enabled = - guest_cpuid_has(vcpu, X86_FEATURE_WAITPKG); - if (!waitpkg_enabled) - exec_control &= ~SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; + vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND); + vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdseed, RDSEED); - if (nested) { - if (waitpkg_enabled) - vmx->nested.msrs.secondary_ctls_high |= - SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; - else - vmx->nested.msrs.secondary_ctls_high &= - ~SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; - } - } + vmx_adjust_sec_exec_control(vmx, &exec_control, waitpkg, WAITPKG, + ENABLE_USR_WAIT_PAUSE, false); vmx->secondary_exec_control = exec_control; } @@ -4335,7 +4361,7 @@ static void init_vmcs(struct vcpu_vmx *vmx) if (vmx->vpid != 0) vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); - if (vmx_xsaves_supported()) + if (cpu_has_vmx_xsaves()) vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP); if (enable_pml) { @@ -4352,16 +4378,6 @@ static void init_vmcs(struct vcpu_vmx *vmx) vmx->pt_desc.guest.output_mask = 0x7F; vmcs_write64(GUEST_IA32_RTIT_CTL, 0); } - - /* - * If EPT is enabled, #PF is only trapped if MAXPHYADDR is mismatched - * between guest and host. In that case we only care about present - * faults. - */ - if (enable_ept) { - vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, PFERR_PRESENT_MASK); - vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, PFERR_PRESENT_MASK); - } } static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) @@ -4803,6 +4819,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) * EPT will cause page fault only if we need to * detect illegal GPAs. */ + WARN_ON_ONCE(!allow_smaller_maxphyaddr); kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code); return 1; } else @@ -5148,7 +5165,8 @@ static int handle_vmcall(struct kvm_vcpu *vcpu) static int handle_invd(struct kvm_vcpu *vcpu) { - return kvm_emulate_instruction(vcpu, 0); + /* Treat an INVD instruction as a NOP and just skip it. */ + return kvm_skip_emulated_instruction(vcpu); } static int handle_invlpg(struct kvm_vcpu *vcpu) @@ -5331,7 +5349,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) * would also use advanced VM-exit information for EPT violations to * reconstruct the page fault error code. */ - if (unlikely(kvm_mmu_is_illegal_gpa(vcpu, gpa))) + if (unlikely(allow_smaller_maxphyaddr && kvm_vcpu_is_illegal_gpa(vcpu, gpa))) return kvm_emulate_instruction(vcpu, 0); return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); @@ -5442,25 +5460,6 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu) } } -/* - * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR. - */ -static void wakeup_handler(void) -{ - struct kvm_vcpu *vcpu; - int cpu = smp_processor_id(); - - spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); - list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu), - blocked_vcpu_list) { - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - - if (pi_test_on(pi_desc) == 1) - kvm_vcpu_kick(vcpu); - } - spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); -} - static void vmx_enable_tdp(void) { kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK, @@ -5524,16 +5523,11 @@ static int handle_invpcid(struct kvm_vcpu *vcpu) { u32 vmx_instruction_info; unsigned long type; - bool pcid_enabled; gva_t gva; - struct x86_exception e; - unsigned i; - unsigned long roots_to_free = 0; struct { u64 pcid; u64 gla; } operand; - int r; if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) { kvm_queue_exception(vcpu, UD_VECTOR); @@ -5556,68 +5550,7 @@ static int handle_invpcid(struct kvm_vcpu *vcpu) sizeof(operand), &gva)) return 1; - r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); - if (r != X86EMUL_CONTINUE) - return vmx_handle_memory_failure(vcpu, r, &e); - - if (operand.pcid >> 12 != 0) { - kvm_inject_gp(vcpu, 0); - return 1; - } - - pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE); - - switch (type) { - case INVPCID_TYPE_INDIV_ADDR: - if ((!pcid_enabled && (operand.pcid != 0)) || - is_noncanonical_address(operand.gla, vcpu)) { - kvm_inject_gp(vcpu, 0); - return 1; - } - kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid); - return kvm_skip_emulated_instruction(vcpu); - - case INVPCID_TYPE_SINGLE_CTXT: - if (!pcid_enabled && (operand.pcid != 0)) { - kvm_inject_gp(vcpu, 0); - return 1; - } - - if (kvm_get_active_pcid(vcpu) == operand.pcid) { - kvm_mmu_sync_roots(vcpu); - kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); - } - - for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) - if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].pgd) - == operand.pcid) - roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); - - kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free); - /* - * If neither the current cr3 nor any of the prev_roots use the - * given PCID, then nothing needs to be done here because a - * resync will happen anyway before switching to any other CR3. - */ - - return kvm_skip_emulated_instruction(vcpu); - - case INVPCID_TYPE_ALL_NON_GLOBAL: - /* - * Currently, KVM doesn't mark global entries in the shadow - * page tables, so a non-global flush just degenerates to a - * global flush. If needed, we could optimize this later by - * keeping track of global entries in shadow page tables. - */ - - fallthrough; - case INVPCID_TYPE_ALL_INCL_GLOBAL: - kvm_mmu_unload(vcpu); - return kvm_skip_emulated_instruction(vcpu); - - default: - BUG(); /* We have already checked above that type <= 3 */ - } + return kvm_handle_invpcid(vcpu, type, gva); } static int handle_pml_full(struct kvm_vcpu *vcpu) @@ -5746,10 +5679,24 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { static const int kvm_vmx_max_exit_handlers = ARRAY_SIZE(kvm_vmx_exit_handlers); -static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) +static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2, + u32 *intr_info, u32 *error_code) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + *info1 = vmx_get_exit_qual(vcpu); - *info2 = vmx_get_intr_info(vcpu); + if (!(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { + *info2 = vmx->idt_vectoring_info; + *intr_info = vmx_get_intr_info(vcpu); + if (is_exception_with_error_code(*intr_info)) + *error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); + else + *error_code = 0; + } else { + *info2 = 0; + *intr_info = 0; + *error_code = 0; + } } static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx) @@ -6054,6 +6001,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) (exit_reason != EXIT_REASON_EXCEPTION_NMI && exit_reason != EXIT_REASON_EPT_VIOLATION && exit_reason != EXIT_REASON_PML_FULL && + exit_reason != EXIT_REASON_APIC_ACCESS && exit_reason != EXIT_REASON_TASK_SWITCH)) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; @@ -6382,14 +6330,6 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) return max_irr; } -static bool vmx_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu) -{ - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - - return pi_test_on(pi_desc) || - (pi_test_sn(pi_desc) && !pi_is_pir_empty(pi_desc)); -} - static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) { if (!kvm_vcpu_apicv_active(vcpu)) @@ -6409,70 +6349,43 @@ static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu) memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir)); } +void vmx_do_interrupt_nmi_irqoff(unsigned long entry); + +static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, u32 intr_info) +{ + unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK; + gate_desc *desc = (gate_desc *)host_idt_base + vector; + + kvm_before_interrupt(vcpu); + vmx_do_interrupt_nmi_irqoff(gate_offset(desc)); + kvm_after_interrupt(vcpu); +} + static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx) { u32 intr_info = vmx_get_intr_info(&vmx->vcpu); /* if exit due to PF check for async PF */ - if (is_page_fault(intr_info)) { + if (is_page_fault(intr_info)) vmx->vcpu.arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags(); /* Handle machine checks before interrupts are enabled */ - } else if (is_machine_check(intr_info)) { + else if (is_machine_check(intr_info)) kvm_machine_check(); /* We need to handle NMIs before interrupts are enabled */ - } else if (is_nmi(intr_info)) { - kvm_before_interrupt(&vmx->vcpu); - asm("int $2"); - kvm_after_interrupt(&vmx->vcpu); - } + else if (is_nmi(intr_info)) + handle_interrupt_nmi_irqoff(&vmx->vcpu, intr_info); } static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) { - unsigned int vector; - unsigned long entry; -#ifdef CONFIG_X86_64 - unsigned long tmp; -#endif - gate_desc *desc; u32 intr_info = vmx_get_intr_info(vcpu); if (WARN_ONCE(!is_external_intr(intr_info), "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info)) return; - vector = intr_info & INTR_INFO_VECTOR_MASK; - desc = (gate_desc *)host_idt_base + vector; - entry = gate_offset(desc); - - kvm_before_interrupt(vcpu); - - asm volatile( -#ifdef CONFIG_X86_64 - "mov %%rsp, %[sp]\n\t" - "and $-16, %%rsp\n\t" - "push %[ss]\n\t" - "push %[sp]\n\t" -#endif - "pushf\n\t" - "push %[cs]\n\t" - CALL_NOSPEC - : -#ifdef CONFIG_X86_64 - [sp]"=&r"(tmp), -#endif - ASM_CALL_CONSTRAINT - : - [thunk_target]"r"(entry), -#ifdef CONFIG_X86_64 - [ss]"i"(__KERNEL_DS), -#endif - [cs]"i"(__KERNEL_CS) - ); - - kvm_after_interrupt(vcpu); + handle_interrupt_nmi_irqoff(vcpu, intr_info); } -STACK_FRAME_NON_STANDARD(handle_external_interrupt_irqoff); static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) { @@ -6799,9 +6712,7 @@ reenter_guest: if (enable_preemption_timer) vmx_update_hv_timer(vcpu); - if (lapic_in_kernel(vcpu) && - vcpu->arch.apic->lapic_timer.timer_advance_ns) - kvm_wait_lapic_expire(vcpu); + kvm_wait_lapic_expire(vcpu); /* * If this vCPU has touched SPEC_CTRL, restore the guest's value if @@ -6945,20 +6856,20 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) goto free_vpid; } - BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) != NR_SHARED_MSRS); + BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS); - for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) { - u32 index = vmx_msr_index[i]; + for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) { + u32 index = vmx_uret_msrs_list[i]; u32 data_low, data_high; - int j = vmx->nmsrs; + int j = vmx->nr_uret_msrs; if (rdmsr_safe(index, &data_low, &data_high) < 0) continue; if (wrmsr_safe(index, data_low, data_high) < 0) continue; - vmx->guest_msrs[j].index = i; - vmx->guest_msrs[j].data = 0; + vmx->guest_uret_msrs[j].slot = i; + vmx->guest_uret_msrs[j].data = 0; switch (index) { case MSR_IA32_TSX_CTRL: /* @@ -6966,32 +6877,36 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) * let's avoid changing CPUID bits under the host * kernel's feet. */ - vmx->guest_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR; + vmx->guest_uret_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR; break; default: - vmx->guest_msrs[j].mask = -1ull; + vmx->guest_uret_msrs[j].mask = -1ull; break; } - ++vmx->nmsrs; + ++vmx->nr_uret_msrs; } err = alloc_loaded_vmcs(&vmx->vmcs01); if (err < 0) goto free_pml; + /* The MSR bitmap starts with all ones */ + bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS); + bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS); + msr_bitmap = vmx->vmcs01.msr_bitmap; - vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_TSC, MSR_TYPE_R); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R); + vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); if (kvm_cstate_in_guest(vcpu->kvm)) { - vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C1_RES, MSR_TYPE_R); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R); + vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R); + vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R); + vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R); + vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R); } vmx->msr_bitmap_mode = 0; @@ -7015,8 +6930,7 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) } if (nested) - nested_vmx_setup_ctls_msrs(&vmx->nested.msrs, - vmx_capability.ept); + memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs)); else memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs)); @@ -7336,13 +7250,18 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) update_intel_pt_cfg(vcpu); if (boot_cpu_has(X86_FEATURE_RTM)) { - struct shared_msr_entry *msr; - msr = find_msr_entry(vmx, MSR_IA32_TSX_CTRL); + struct vmx_uret_msr *msr; + msr = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL); if (msr) { bool enabled = guest_cpuid_has(vcpu, X86_FEATURE_RTM); - vmx_set_guest_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE); + vmx_set_guest_uret_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE); } } + + set_cr4_guest_host_mask(vmx); + + /* Refresh #PF interception to account for MAXPHYADDR changes. */ + update_exception_bitmap(vcpu); } static __init void vmx_set_cpu_caps(void) @@ -7366,14 +7285,14 @@ static __init void vmx_set_cpu_caps(void) /* CPUID 0xD.1 */ supported_xss = 0; - if (!vmx_xsaves_supported()) + if (!cpu_has_vmx_xsaves()) kvm_cpu_cap_clear(X86_FEATURE_XSAVES); /* CPUID 0x80000001 */ if (!cpu_has_vmx_rdtscp()) kvm_cpu_cap_clear(X86_FEATURE_RDTSCP); - if (vmx_waitpkg_supported()) + if (cpu_has_vmx_waitpkg()) kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG); } @@ -7429,7 +7348,7 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu, * Because it is marked as EmulateOnUD, we need to intercept it here. */ case x86_intercept_rdtscp: - if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) { + if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) { exception->vector = UD_VECTOR; exception->error_code_valid = false; return X86EMUL_PROPAGATE_FAULT; @@ -7561,107 +7480,6 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask); } -static void __pi_post_block(struct kvm_vcpu *vcpu) -{ - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - struct pi_desc old, new; - unsigned int dest; - - do { - old.control = new.control = pi_desc->control; - WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR, - "Wakeup handler not enabled while the VCPU is blocked\n"); - - dest = cpu_physical_id(vcpu->cpu); - - if (x2apic_enabled()) - new.ndst = dest; - else - new.ndst = (dest << 8) & 0xFF00; - - /* set 'NV' to 'notification vector' */ - new.nv = POSTED_INTR_VECTOR; - } while (cmpxchg64(&pi_desc->control, old.control, - new.control) != old.control); - - if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) { - spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); - list_del(&vcpu->blocked_vcpu_list); - spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); - vcpu->pre_pcpu = -1; - } -} - -/* - * This routine does the following things for vCPU which is going - * to be blocked if VT-d PI is enabled. - * - Store the vCPU to the wakeup list, so when interrupts happen - * we can find the right vCPU to wake up. - * - Change the Posted-interrupt descriptor as below: - * 'NDST' <-- vcpu->pre_pcpu - * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR - * - If 'ON' is set during this process, which means at least one - * interrupt is posted for this vCPU, we cannot block it, in - * this case, return 1, otherwise, return 0. - * - */ -static int pi_pre_block(struct kvm_vcpu *vcpu) -{ - unsigned int dest; - struct pi_desc old, new; - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - - if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(vcpu)) - return 0; - - WARN_ON(irqs_disabled()); - local_irq_disable(); - if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) { - vcpu->pre_pcpu = vcpu->cpu; - spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); - list_add_tail(&vcpu->blocked_vcpu_list, - &per_cpu(blocked_vcpu_on_cpu, - vcpu->pre_pcpu)); - spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); - } - - do { - old.control = new.control = pi_desc->control; - - WARN((pi_desc->sn == 1), - "Warning: SN field of posted-interrupts " - "is set before blocking\n"); - - /* - * Since vCPU can be preempted during this process, - * vcpu->cpu could be different with pre_pcpu, we - * need to set pre_pcpu as the destination of wakeup - * notification event, then we can find the right vCPU - * to wakeup in wakeup handler if interrupts happen - * when the vCPU is in blocked state. - */ - dest = cpu_physical_id(vcpu->pre_pcpu); - - if (x2apic_enabled()) - new.ndst = dest; - else - new.ndst = (dest << 8) & 0xFF00; - - /* set 'NV' to 'wakeup vector' */ - new.nv = POSTED_INTR_WAKEUP_VECTOR; - } while (cmpxchg64(&pi_desc->control, old.control, - new.control) != old.control); - - /* We should not block the vCPU if an interrupt is posted for it. */ - if (pi_test_on(pi_desc) == 1) - __pi_post_block(vcpu); - - local_irq_enable(); - return (vcpu->pre_pcpu == -1); -} - static int vmx_pre_block(struct kvm_vcpu *vcpu) { if (pi_pre_block(vcpu)) @@ -7673,17 +7491,6 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu) return 0; } -static void pi_post_block(struct kvm_vcpu *vcpu) -{ - if (vcpu->pre_pcpu == -1) - return; - - WARN_ON(irqs_disabled()); - local_irq_disable(); - __pi_post_block(vcpu); - local_irq_enable(); -} - static void vmx_post_block(struct kvm_vcpu *vcpu) { if (kvm_x86_ops.set_hv_timer) @@ -7692,100 +7499,6 @@ static void vmx_post_block(struct kvm_vcpu *vcpu) pi_post_block(vcpu); } -/* - * vmx_update_pi_irte - set IRTE for Posted-Interrupts - * - * @kvm: kvm - * @host_irq: host irq of the interrupt - * @guest_irq: gsi of the interrupt - * @set: set or unset PI - * returns 0 on success, < 0 on failure - */ -static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set) -{ - struct kvm_kernel_irq_routing_entry *e; - struct kvm_irq_routing_table *irq_rt; - struct kvm_lapic_irq irq; - struct kvm_vcpu *vcpu; - struct vcpu_data vcpu_info; - int idx, ret = 0; - - if (!kvm_arch_has_assigned_device(kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(kvm->vcpus[0])) - return 0; - - idx = srcu_read_lock(&kvm->irq_srcu); - irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); - if (guest_irq >= irq_rt->nr_rt_entries || - hlist_empty(&irq_rt->map[guest_irq])) { - pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n", - guest_irq, irq_rt->nr_rt_entries); - goto out; - } - - hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { - if (e->type != KVM_IRQ_ROUTING_MSI) - continue; - /* - * VT-d PI cannot support posting multicast/broadcast - * interrupts to a vCPU, we still use interrupt remapping - * for these kind of interrupts. - * - * For lowest-priority interrupts, we only support - * those with single CPU as the destination, e.g. user - * configures the interrupts via /proc/irq or uses - * irqbalance to make the interrupts single-CPU. - * - * We will support full lowest-priority interrupt later. - * - * In addition, we can only inject generic interrupts using - * the PI mechanism, refuse to route others through it. - */ - - kvm_set_msi_irq(kvm, e, &irq); - if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) || - !kvm_irq_is_postable(&irq)) { - /* - * Make sure the IRTE is in remapped mode if - * we don't handle it in posted mode. - */ - ret = irq_set_vcpu_affinity(host_irq, NULL); - if (ret < 0) { - printk(KERN_INFO - "failed to back to remapped mode, irq: %u\n", - host_irq); - goto out; - } - - continue; - } - - vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)); - vcpu_info.vector = irq.vector; - - trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi, - vcpu_info.vector, vcpu_info.pi_desc_addr, set); - - if (set) - ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); - else - ret = irq_set_vcpu_affinity(host_irq, NULL); - - if (ret < 0) { - printk(KERN_INFO "%s: failed to update PI IRTE\n", - __func__); - goto out; - } - } - - ret = 0; -out: - srcu_read_unlock(&kvm->irq_srcu, idx); - return ret; -} - static void vmx_setup_mce(struct kvm_vcpu *vcpu) { if (vcpu->arch.mcg_cap & MCG_LMCE_P) @@ -7843,11 +7556,6 @@ static void enable_smi_window(struct kvm_vcpu *vcpu) /* RSM will cause a vmexit anyway. */ } -static bool vmx_need_emulation_on_page_fault(struct kvm_vcpu *vcpu) -{ - return false; -} - static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu) { return to_vmx(vcpu)->nested.vmxon; @@ -7954,7 +7662,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt, .sync_pir_to_irr = vmx_sync_pir_to_irr, .deliver_posted_interrupt = vmx_deliver_posted_interrupt, - .dy_apicv_has_pending_interrupt = vmx_dy_apicv_has_pending_interrupt, + .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt, .set_tss_addr = vmx_set_tss_addr, .set_identity_map_addr = vmx_set_identity_map_addr, @@ -7988,7 +7696,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .pmu_ops = &intel_pmu_ops, .nested_ops = &vmx_nested_ops, - .update_pi_irte = vmx_update_pi_irte, + .update_pi_irte = pi_update_irte, #ifdef CONFIG_X86_64 .set_hv_timer = vmx_set_hv_timer, @@ -8002,9 +7710,11 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .pre_leave_smm = vmx_pre_leave_smm, .enable_smi_window = enable_smi_window, - .need_emulation_on_page_fault = vmx_need_emulation_on_page_fault, + .can_emulate_instruction = vmx_can_emulate_instruction, .apic_init_signal_blocked = vmx_apic_init_signal_blocked, .migrate_timers = vmx_migrate_timers, + + .msr_filter_changed = vmx_msr_filter_changed, }; static __init int hardware_setup(void) @@ -8016,8 +7726,8 @@ static __init int hardware_setup(void) store_idt(&dt); host_idt_base = dt.address; - for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) - kvm_define_shared_msr(i, vmx_msr_index[i]); + for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) + kvm_define_user_return_msr(i, vmx_uret_msrs_list[i]); if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0) return -EIO; @@ -8154,7 +7864,7 @@ static __init int hardware_setup(void) vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit; } - kvm_set_posted_intr_wakeup_handler(wakeup_handler); + kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler); kvm_mce_cap_supported |= MCG_LMCE_P; @@ -8293,8 +8003,8 @@ static int __init vmx_init(void) for_each_possible_cpu(cpu) { INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); - INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu)); - spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); + + pi_init_cpu(cpu); } #ifdef CONFIG_KEXEC_CORE @@ -8304,11 +8014,12 @@ static int __init vmx_init(void) vmx_check_vmcs12_offsets(); /* - * Intel processors don't have problems with - * GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable - * it for VMX by default + * Shadow paging doesn't have a (further) performance penalty + * from GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable it + * by default */ - allow_smaller_maxphyaddr = true; + if (!enable_ept) + allow_smaller_maxphyaddr = true; return 0; } diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 26175a4759fa..f6f66e5c6510 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -9,8 +9,9 @@ #include "capabilities.h" #include "kvm_cache_regs.h" -#include "ops.h" +#include "posted_intr.h" #include "vmcs.h" +#include "vmx_ops.h" #include "cpuid.h" extern const u32 vmx_msr_index[]; @@ -22,20 +23,20 @@ extern const u32 vmx_msr_index[]; #define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4)) #ifdef CONFIG_X86_64 -#define NR_SHARED_MSRS 7 +#define MAX_NR_USER_RETURN_MSRS 7 #else -#define NR_SHARED_MSRS 4 +#define MAX_NR_USER_RETURN_MSRS 4 #endif -#define NR_LOADSTORE_MSRS 8 +#define MAX_NR_LOADSTORE_MSRS 8 struct vmx_msrs { unsigned int nr; - struct vmx_msr_entry val[NR_LOADSTORE_MSRS]; + struct vmx_msr_entry val[MAX_NR_LOADSTORE_MSRS]; }; -struct shared_msr_entry { - unsigned index; +struct vmx_uret_msr { + unsigned int slot; /* The MSR's slot in kvm_user_return_msrs. */ u64 data; u64 mask; }; @@ -49,29 +50,6 @@ enum segment_cache_field { SEG_FIELD_NR = 4 }; -/* Posted-Interrupt Descriptor */ -struct pi_desc { - u32 pir[8]; /* Posted interrupt requested */ - union { - struct { - /* bit 256 - Outstanding Notification */ - u16 on : 1, - /* bit 257 - Suppress Notification */ - sn : 1, - /* bit 271:258 - Reserved */ - rsvd_1 : 14; - /* bit 279:272 - Notification Vector */ - u8 nv; - /* bit 287:280 - Reserved */ - u8 rsvd_2; - /* bit 319:288 - Notification Destination */ - u32 ndst; - }; - u64 control; - }; - u32 rsvd[6]; -} __aligned(64); - #define RTIT_ADDR_RANGE 4 struct pt_ctx { @@ -218,10 +196,10 @@ struct vcpu_vmx { u32 idt_vectoring_info; ulong rflags; - struct shared_msr_entry guest_msrs[NR_SHARED_MSRS]; - int nmsrs; - int save_nmsrs; - bool guest_msrs_ready; + struct vmx_uret_msr guest_uret_msrs[MAX_NR_USER_RETURN_MSRS]; + int nr_uret_msrs; + int nr_active_uret_msrs; + bool guest_uret_msrs_loaded; #ifdef CONFIG_X86_64 u64 msr_host_kernel_gs_base; u64 msr_guest_kernel_gs_base; @@ -301,6 +279,13 @@ struct vcpu_vmx { u64 ept_pointer; struct pt_desc pt_desc; + + /* Save desired MSR intercept (read: pass-through) state */ +#define MAX_POSSIBLE_PASSTHROUGH_MSRS 13 + struct { + DECLARE_BITMAP(read, MAX_POSSIBLE_PASSTHROUGH_MSRS); + DECLARE_BITMAP(write, MAX_POSSIBLE_PASSTHROUGH_MSRS); + } shadow_msr_intercept; }; enum ept_pointers_status { @@ -334,7 +319,7 @@ unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu); void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu); void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask); -void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer); +int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer); void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); void set_cr4_guest_host_mask(struct vcpu_vmx *vmx); @@ -343,6 +328,7 @@ void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa, int root_level); + void update_exception_bitmap(struct kvm_vcpu *vcpu); void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu); bool vmx_nmi_blocked(struct kvm_vcpu *vcpu); @@ -350,73 +336,11 @@ bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu); bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu); void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked); void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu); -struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr); -void pt_update_intercept_for_msr(struct vcpu_vmx *vmx); +struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr); +void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu); void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp); -int vmx_find_msr_index(struct vmx_msrs *m, u32 msr); -int vmx_handle_memory_failure(struct kvm_vcpu *vcpu, int r, - struct x86_exception *e); - -#define POSTED_INTR_ON 0 -#define POSTED_INTR_SN 1 - -static inline bool pi_test_and_set_on(struct pi_desc *pi_desc) -{ - return test_and_set_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc) -{ - return test_and_clear_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static inline int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) -{ - return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); -} - -static inline bool pi_is_pir_empty(struct pi_desc *pi_desc) -{ - return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS); -} - -static inline void pi_set_sn(struct pi_desc *pi_desc) -{ - set_bit(POSTED_INTR_SN, - (unsigned long *)&pi_desc->control); -} - -static inline void pi_set_on(struct pi_desc *pi_desc) -{ - set_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static inline void pi_clear_on(struct pi_desc *pi_desc) -{ - clear_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static inline void pi_clear_sn(struct pi_desc *pi_desc) -{ - clear_bit(POSTED_INTR_SN, - (unsigned long *)&pi_desc->control); -} - -static inline int pi_test_on(struct pi_desc *pi_desc) -{ - return test_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static inline int pi_test_sn(struct pi_desc *pi_desc) -{ - return test_bit(POSTED_INTR_SN, - (unsigned long *)&pi_desc->control); -} +int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr); +void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu); static inline u8 vmx_get_rvi(void) { @@ -498,11 +422,6 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) return container_of(vcpu, struct vcpu_vmx, vcpu); } -static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) -{ - return &(to_vmx(vcpu)->pi_desc); -} - static inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -551,7 +470,23 @@ static inline bool vmx_has_waitpkg(struct vcpu_vmx *vmx) static inline bool vmx_need_pf_intercept(struct kvm_vcpu *vcpu) { - return !enable_ept || cpuid_maxphyaddr(vcpu) < boot_cpu_data.x86_phys_bits; + if (!enable_ept) + return true; + + return allow_smaller_maxphyaddr && cpuid_maxphyaddr(vcpu) < boot_cpu_data.x86_phys_bits; +} + +static inline bool is_unrestricted_guest(struct kvm_vcpu *vcpu) +{ + return enable_unrestricted_guest && (!is_guest_mode(vcpu) || + (secondary_exec_controls_get(to_vmx(vcpu)) & + SECONDARY_EXEC_UNRESTRICTED_GUEST)); +} + +bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu); +static inline bool vmx_guest_state_valid(struct kvm_vcpu *vcpu) +{ + return is_unrestricted_guest(vcpu) || __vmx_guest_state_valid(vcpu); } void dump_vmcs(void); diff --git a/arch/x86/kvm/vmx/ops.h b/arch/x86/kvm/vmx/vmx_ops.h index 692b0c31c9c8..692b0c31c9c8 100644 --- a/arch/x86/kvm/vmx/ops.h +++ b/arch/x86/kvm/vmx/vmx_ops.h diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 75270229a8bf..397f599b20e5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -71,6 +71,7 @@ #include <asm/irq_remapping.h> #include <asm/mshyperv.h> #include <asm/hypervisor.h> +#include <asm/tlbflush.h> #include <asm/intel_pt.h> #include <asm/emulate_prefix.h> #include <clocksource/hyperv_timer.h> @@ -161,24 +162,29 @@ module_param(force_emulation_prefix, bool, S_IRUGO); int __read_mostly pi_inject_timer = -1; module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR); -#define KVM_NR_SHARED_MSRS 16 +/* + * Restoring the host value for MSRs that are only consumed when running in + * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU + * returns to userspace, i.e. the kernel can run with the guest's value. + */ +#define KVM_MAX_NR_USER_RETURN_MSRS 16 -struct kvm_shared_msrs_global { +struct kvm_user_return_msrs_global { int nr; - u32 msrs[KVM_NR_SHARED_MSRS]; + u32 msrs[KVM_MAX_NR_USER_RETURN_MSRS]; }; -struct kvm_shared_msrs { +struct kvm_user_return_msrs { struct user_return_notifier urn; bool registered; - struct kvm_shared_msr_values { + struct kvm_user_return_msr_values { u64 host; u64 curr; - } values[KVM_NR_SHARED_MSRS]; + } values[KVM_MAX_NR_USER_RETURN_MSRS]; }; -static struct kvm_shared_msrs_global __read_mostly shared_msrs_global; -static struct kvm_shared_msrs __percpu *shared_msrs; +static struct kvm_user_return_msrs_global __read_mostly user_return_msrs_global; +static struct kvm_user_return_msrs __percpu *user_return_msrs; #define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \ @@ -188,7 +194,7 @@ static struct kvm_shared_msrs __percpu *shared_msrs; u64 __read_mostly host_efer; EXPORT_SYMBOL_GPL(host_efer); -bool __read_mostly allow_smaller_maxphyaddr; +bool __read_mostly allow_smaller_maxphyaddr = 0; EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr); static u64 __read_mostly host_xss; @@ -266,7 +272,7 @@ static int kvm_msr_ignored_check(struct kvm_vcpu *vcpu, u32 msr, } else { vcpu_debug_ratelimited(vcpu, "unhandled %s: 0x%x data 0x%llx\n", op, msr, data); - return 1; + return -ENOENT; } } @@ -293,9 +299,9 @@ static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) static void kvm_on_user_return(struct user_return_notifier *urn) { unsigned slot; - struct kvm_shared_msrs *locals - = container_of(urn, struct kvm_shared_msrs, urn); - struct kvm_shared_msr_values *values; + struct kvm_user_return_msrs *msrs + = container_of(urn, struct kvm_user_return_msrs, urn); + struct kvm_user_return_msr_values *values; unsigned long flags; /* @@ -303,73 +309,73 @@ static void kvm_on_user_return(struct user_return_notifier *urn) * interrupted and executed through kvm_arch_hardware_disable() */ local_irq_save(flags); - if (locals->registered) { - locals->registered = false; + if (msrs->registered) { + msrs->registered = false; user_return_notifier_unregister(urn); } local_irq_restore(flags); - for (slot = 0; slot < shared_msrs_global.nr; ++slot) { - values = &locals->values[slot]; + for (slot = 0; slot < user_return_msrs_global.nr; ++slot) { + values = &msrs->values[slot]; if (values->host != values->curr) { - wrmsrl(shared_msrs_global.msrs[slot], values->host); + wrmsrl(user_return_msrs_global.msrs[slot], values->host); values->curr = values->host; } } } -void kvm_define_shared_msr(unsigned slot, u32 msr) +void kvm_define_user_return_msr(unsigned slot, u32 msr) { - BUG_ON(slot >= KVM_NR_SHARED_MSRS); - shared_msrs_global.msrs[slot] = msr; - if (slot >= shared_msrs_global.nr) - shared_msrs_global.nr = slot + 1; + BUG_ON(slot >= KVM_MAX_NR_USER_RETURN_MSRS); + user_return_msrs_global.msrs[slot] = msr; + if (slot >= user_return_msrs_global.nr) + user_return_msrs_global.nr = slot + 1; } -EXPORT_SYMBOL_GPL(kvm_define_shared_msr); +EXPORT_SYMBOL_GPL(kvm_define_user_return_msr); -static void kvm_shared_msr_cpu_online(void) +static void kvm_user_return_msr_cpu_online(void) { unsigned int cpu = smp_processor_id(); - struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu); + struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu); u64 value; int i; - for (i = 0; i < shared_msrs_global.nr; ++i) { - rdmsrl_safe(shared_msrs_global.msrs[i], &value); - smsr->values[i].host = value; - smsr->values[i].curr = value; + for (i = 0; i < user_return_msrs_global.nr; ++i) { + rdmsrl_safe(user_return_msrs_global.msrs[i], &value); + msrs->values[i].host = value; + msrs->values[i].curr = value; } } -int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) +int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask) { unsigned int cpu = smp_processor_id(); - struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu); + struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu); int err; - value = (value & mask) | (smsr->values[slot].host & ~mask); - if (value == smsr->values[slot].curr) + value = (value & mask) | (msrs->values[slot].host & ~mask); + if (value == msrs->values[slot].curr) return 0; - err = wrmsrl_safe(shared_msrs_global.msrs[slot], value); + err = wrmsrl_safe(user_return_msrs_global.msrs[slot], value); if (err) return 1; - smsr->values[slot].curr = value; - if (!smsr->registered) { - smsr->urn.on_user_return = kvm_on_user_return; - user_return_notifier_register(&smsr->urn); - smsr->registered = true; + msrs->values[slot].curr = value; + if (!msrs->registered) { + msrs->urn.on_user_return = kvm_on_user_return; + user_return_notifier_register(&msrs->urn); + msrs->registered = true; } return 0; } -EXPORT_SYMBOL_GPL(kvm_set_shared_msr); +EXPORT_SYMBOL_GPL(kvm_set_user_return_msr); static void drop_user_return_notifiers(void) { unsigned int cpu = smp_processor_id(); - struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu); + struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu); - if (smsr->registered) - kvm_on_user_return(&smsr->urn); + if (msrs->registered) + kvm_on_user_return(&msrs->urn); } u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) @@ -976,6 +982,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) unsigned long old_cr4 = kvm_read_cr4(vcpu); unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_SMEP; + unsigned long mmu_role_bits = pdptr_bits | X86_CR4_SMAP | X86_CR4_PKE; if (kvm_valid_cr4(vcpu, cr4)) return 1; @@ -1003,7 +1010,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (kvm_x86_ops.set_cr4(vcpu, cr4)) return 1; - if (((cr4 ^ old_cr4) & pdptr_bits) || + if (((cr4 ^ old_cr4) & mmu_role_bits) || (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE))) kvm_mmu_reset_context(vcpu); @@ -1451,6 +1458,7 @@ static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { u64 old_efer = vcpu->arch.efer; u64 efer = msr_info->data; + int r; if (efer & efer_reserved_bits) return 1; @@ -1467,7 +1475,11 @@ static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info) efer &= ~EFER_LMA; efer |= vcpu->arch.efer & EFER_LMA; - kvm_x86_ops.set_efer(vcpu, efer); + r = kvm_x86_ops.set_efer(vcpu, efer); + if (r) { + WARN_ON(r > 0); + return r; + } /* Update reserved bits */ if ((efer ^ old_efer) & EFER_NX) @@ -1482,6 +1494,40 @@ void kvm_enable_efer_bits(u64 mask) } EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); +bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type) +{ + struct kvm *kvm = vcpu->kvm; + struct msr_bitmap_range *ranges = kvm->arch.msr_filter.ranges; + u32 count = kvm->arch.msr_filter.count; + u32 i; + bool r = kvm->arch.msr_filter.default_allow; + int idx; + + /* MSR filtering not set up or x2APIC enabled, allow everything */ + if (!count || (index >= 0x800 && index <= 0x8ff)) + return true; + + /* Prevent collision with set_msr_filter */ + idx = srcu_read_lock(&kvm->srcu); + + for (i = 0; i < count; i++) { + u32 start = ranges[i].base; + u32 end = start + ranges[i].nmsrs; + u32 flags = ranges[i].flags; + unsigned long *bitmap = ranges[i].bitmap; + + if ((index >= start) && (index < end) && (flags & type)) { + r = !!test_bit(index - start, bitmap); + break; + } + } + + srcu_read_unlock(&kvm->srcu, idx); + + return r; +} +EXPORT_SYMBOL_GPL(kvm_msr_allowed); + /* * Write @data into the MSR specified by @index. Select MSR specific fault * checks are bypassed if @host_initiated is %true. @@ -1493,6 +1539,9 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, { struct msr_data msr; + if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE)) + return -EPERM; + switch (index) { case MSR_FS_BASE: case MSR_GS_BASE: @@ -1549,6 +1598,9 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, struct msr_data msr; int ret; + if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ)) + return -EPERM; + msr.index = index; msr.host_initiated = host_initiated; @@ -1584,12 +1636,91 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data) } EXPORT_SYMBOL_GPL(kvm_set_msr); +static int complete_emulated_msr(struct kvm_vcpu *vcpu, bool is_read) +{ + if (vcpu->run->msr.error) { + kvm_inject_gp(vcpu, 0); + return 1; + } else if (is_read) { + kvm_rax_write(vcpu, (u32)vcpu->run->msr.data); + kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32); + } + + return kvm_skip_emulated_instruction(vcpu); +} + +static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu) +{ + return complete_emulated_msr(vcpu, true); +} + +static int complete_emulated_wrmsr(struct kvm_vcpu *vcpu) +{ + return complete_emulated_msr(vcpu, false); +} + +static u64 kvm_msr_reason(int r) +{ + switch (r) { + case -ENOENT: + return KVM_MSR_EXIT_REASON_UNKNOWN; + case -EPERM: + return KVM_MSR_EXIT_REASON_FILTER; + default: + return KVM_MSR_EXIT_REASON_INVAL; + } +} + +static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index, + u32 exit_reason, u64 data, + int (*completion)(struct kvm_vcpu *vcpu), + int r) +{ + u64 msr_reason = kvm_msr_reason(r); + + /* Check if the user wanted to know about this MSR fault */ + if (!(vcpu->kvm->arch.user_space_msr_mask & msr_reason)) + return 0; + + vcpu->run->exit_reason = exit_reason; + vcpu->run->msr.error = 0; + memset(vcpu->run->msr.pad, 0, sizeof(vcpu->run->msr.pad)); + vcpu->run->msr.reason = msr_reason; + vcpu->run->msr.index = index; + vcpu->run->msr.data = data; + vcpu->arch.complete_userspace_io = completion; + + return 1; +} + +static int kvm_get_msr_user_space(struct kvm_vcpu *vcpu, u32 index, int r) +{ + return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_RDMSR, 0, + complete_emulated_rdmsr, r); +} + +static int kvm_set_msr_user_space(struct kvm_vcpu *vcpu, u32 index, u64 data, int r) +{ + return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_WRMSR, data, + complete_emulated_wrmsr, r); +} + int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu) { u32 ecx = kvm_rcx_read(vcpu); u64 data; + int r; + + r = kvm_get_msr(vcpu, ecx, &data); - if (kvm_get_msr(vcpu, ecx, &data)) { + /* MSR read failed? See if we should ask user space */ + if (r && kvm_get_msr_user_space(vcpu, ecx, r)) { + /* Bounce to user space */ + return 0; + } + + /* MSR read failed? Inject a #GP */ + if (r) { trace_kvm_msr_read_ex(ecx); kvm_inject_gp(vcpu, 0); return 1; @@ -1607,8 +1738,21 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) { u32 ecx = kvm_rcx_read(vcpu); u64 data = kvm_read_edx_eax(vcpu); + int r; - if (kvm_set_msr(vcpu, ecx, data)) { + r = kvm_set_msr(vcpu, ecx, data); + + /* MSR write failed? See if we should ask user space */ + if (r && kvm_set_msr_user_space(vcpu, ecx, data, r)) + /* Bounce to user space */ + return 0; + + /* Signal all other negative errors to userspace */ + if (r < 0) + return r; + + /* MSR write failed? Inject a #GP */ + if (r > 0) { trace_kvm_msr_write_ex(ecx, data); kvm_inject_gp(vcpu, 0); return 1; @@ -1774,12 +1918,6 @@ static s64 get_kvmclock_base_ns(void) } #endif -void kvm_set_pending_timer(struct kvm_vcpu *vcpu) -{ - kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); - kvm_vcpu_kick(vcpu); -} - static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) { int version; @@ -1787,6 +1925,8 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) struct pvclock_wall_clock wc; u64 wall_nsec; + kvm->arch.wall_clock = wall_clock; + if (!wall_clock) return; @@ -1819,6 +1959,34 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); } +static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time, + bool old_msr, bool host_initiated) +{ + struct kvm_arch *ka = &vcpu->kvm->arch; + + if (vcpu->vcpu_id == 0 && !host_initiated) { + if (ka->boot_vcpu_runs_old_kvmclock && old_msr) + kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); + + ka->boot_vcpu_runs_old_kvmclock = old_msr; + } + + vcpu->arch.time = system_time; + kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); + + /* we verify if the enable bit is set... */ + vcpu->arch.pv_time_enabled = false; + if (!(system_time & 1)) + return; + + if (!kvm_gfn_to_hva_cache_init(vcpu->kvm, + &vcpu->arch.pv_time, system_time & ~1ULL, + sizeof(struct pvclock_vcpu_time_info))) + vcpu->arch.pv_time_enabled = true; + + return; +} + static uint32_t div_frac(uint32_t dividend, uint32_t divisor) { do_shl32_div32(dividend, divisor); @@ -1978,12 +2146,6 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) #endif } -static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset) -{ - u64 curr_offset = vcpu->arch.l1_tsc_offset; - vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset; -} - /* * Multiply tsc by a fixed point number represented by ratio. * @@ -2045,14 +2207,13 @@ static inline bool kvm_check_tsc_unstable(void) return check_tsc_unstable(); } -void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) +static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data) { struct kvm *kvm = vcpu->kvm; u64 offset, ns, elapsed; unsigned long flags; bool matched; bool already_matched; - u64 data = msr->data; bool synchronizing = false; raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); @@ -2061,7 +2222,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) elapsed = ns - kvm->arch.last_tsc_nsec; if (vcpu->arch.virtual_tsc_khz) { - if (data == 0 && msr->host_initiated) { + if (data == 0) { /* * detection of vcpu initialization -- need to sync * with other vCPUs. This particularly helps to keep @@ -2131,9 +2292,6 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec; vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write; - if (!msr->host_initiated && guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST)) - update_ia32_tsc_adjust_msr(vcpu, offset); - kvm_vcpu_write_tsc_offset(vcpu, offset); raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); @@ -2148,8 +2306,6 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) spin_unlock(&kvm->arch.pvclock_gtod_sync_lock); } -EXPORT_SYMBOL_GPL(kvm_write_tsc); - static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment) { @@ -2695,24 +2851,19 @@ static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data) u32 page_num = data & ~PAGE_MASK; u64 page_addr = data & PAGE_MASK; u8 *page; - int r; - r = -E2BIG; if (page_num >= blob_size) - goto out; - r = -ENOMEM; + return 1; + page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE); - if (IS_ERR(page)) { - r = PTR_ERR(page); - goto out; + if (IS_ERR(page)) + return PTR_ERR(page); + + if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE)) { + kfree(page); + return 1; } - if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE)) - goto out_free; - r = 0; -out_free: - kfree(page); -out: - return r; + return 0; } static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu) @@ -2730,9 +2881,17 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) if (data & 0x30) return 1; - if (!lapic_in_kernel(vcpu)) + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_VMEXIT) && + (data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT)) + return 1; + + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT) && + (data & KVM_ASYNC_PF_DELIVERY_AS_INT)) return 1; + if (!lapic_in_kernel(vcpu)) + return data ? 1 : 0; + vcpu->arch.apf.msr_en_val = data; if (!kvm_pv_async_pf_enabled(vcpu)) { @@ -2807,10 +2966,12 @@ static void record_steal_time(struct kvm_vcpu *vcpu) * Doing a TLB flush here, on the guest's behalf, can avoid * expensive IPIs. */ - trace_kvm_pv_tlb_flush(vcpu->vcpu_id, - st->preempted & KVM_VCPU_FLUSH_TLB); - if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB) - kvm_vcpu_flush_tlb_guest(vcpu); + if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) { + trace_kvm_pv_tlb_flush(vcpu->vcpu_id, + st->preempted & KVM_VCPU_FLUSH_TLB); + if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB) + kvm_vcpu_flush_tlb_guest(vcpu); + } vcpu->arch.st.preempted = 0; @@ -2944,7 +3105,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.msr_ia32_power_ctl = data; break; case MSR_IA32_TSC: - kvm_write_tsc(vcpu, msr_info); + if (msr_info->host_initiated) { + kvm_synchronize_tsc(vcpu, data); + } else { + u64 adj = kvm_compute_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset; + adjust_tsc_offset_guest(vcpu, adj); + vcpu->arch.ia32_tsc_adjust_msr += adj; + } break; case MSR_IA32_XSS: if (!msr_info->host_initiated && @@ -2965,53 +3132,54 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.smi_count = data; break; case MSR_KVM_WALL_CLOCK_NEW: + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) + return 1; + + kvm_write_wall_clock(vcpu->kvm, data); + break; case MSR_KVM_WALL_CLOCK: - vcpu->kvm->arch.wall_clock = data; + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) + return 1; + kvm_write_wall_clock(vcpu->kvm, data); break; case MSR_KVM_SYSTEM_TIME_NEW: - case MSR_KVM_SYSTEM_TIME: { - struct kvm_arch *ka = &vcpu->kvm->arch; - - if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) { - bool tmp = (msr == MSR_KVM_SYSTEM_TIME); - - if (ka->boot_vcpu_runs_old_kvmclock != tmp) - kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); - - ka->boot_vcpu_runs_old_kvmclock = tmp; - } - - vcpu->arch.time = data; - kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); - - /* we verify if the enable bit is set... */ - vcpu->arch.pv_time_enabled = false; - if (!(data & 1)) - break; + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) + return 1; - if (!kvm_gfn_to_hva_cache_init(vcpu->kvm, - &vcpu->arch.pv_time, data & ~1ULL, - sizeof(struct pvclock_vcpu_time_info))) - vcpu->arch.pv_time_enabled = true; + kvm_write_system_time(vcpu, data, false, msr_info->host_initiated); + break; + case MSR_KVM_SYSTEM_TIME: + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) + return 1; + kvm_write_system_time(vcpu, data, true, msr_info->host_initiated); break; - } case MSR_KVM_ASYNC_PF_EN: + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) + return 1; + if (kvm_pv_enable_async_pf(vcpu, data)) return 1; break; case MSR_KVM_ASYNC_PF_INT: + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) + return 1; + if (kvm_pv_enable_async_pf_int(vcpu, data)) return 1; break; case MSR_KVM_ASYNC_PF_ACK: + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) + return 1; if (data & 0x1) { vcpu->arch.apf.pageready_pending = false; kvm_check_async_pf_completion(vcpu); } break; case MSR_KVM_STEAL_TIME: + if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME)) + return 1; if (unlikely(!sched_info_on())) return 1; @@ -3028,11 +3196,17 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_KVM_PV_EOI_EN: + if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI)) + return 1; + if (kvm_lapic_enable_pv_eoi(vcpu, data, sizeof(u8))) return 1; break; case MSR_KVM_POLL_CONTROL: + if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL)) + return 1; + /* only enable bit supported */ if (data & (-1ULL << 1)) return 1; @@ -3221,9 +3395,22 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_POWER_CTL: msr_info->data = vcpu->arch.msr_ia32_power_ctl; break; - case MSR_IA32_TSC: - msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + vcpu->arch.tsc_offset; + case MSR_IA32_TSC: { + /* + * Intel SDM states that MSR_IA32_TSC read adds the TSC offset + * even when not intercepted. AMD manual doesn't explicitly + * state this but appears to behave the same. + * + * On userspace reads and writes, however, we unconditionally + * return L1's TSC value to ensure backwards-compatible + * behavior for migration. + */ + u64 tsc_offset = msr_info->host_initiated ? vcpu->arch.l1_tsc_offset : + vcpu->arch.tsc_offset; + + msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + tsc_offset; break; + } case MSR_MTRRcap: case 0x200 ... 0x2ff: return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data); @@ -3513,6 +3700,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_EXCEPTION_PAYLOAD: case KVM_CAP_SET_GUEST_DEBUG: case KVM_CAP_LAST_CPU: + case KVM_CAP_X86_USER_SPACE_MSR: + case KVM_CAP_X86_MSR_FILTER: + case KVM_CAP_ENFORCE_PV_FEATURE_CPUID: r = 1; break; case KVM_CAP_SYNC_REGS: @@ -4383,6 +4573,11 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, return kvm_x86_ops.enable_direct_tlbflush(vcpu); + case KVM_CAP_ENFORCE_PV_FEATURE_CPUID: + vcpu->arch.pv_cpuid.enforce = cap->args[0]; + + return 0; + default: return -EINVAL; } @@ -5033,6 +5228,10 @@ split_irqchip_unlock: kvm->arch.exception_payload_enabled = cap->args[0]; r = 0; break; + case KVM_CAP_X86_USER_SPACE_MSR: + kvm->arch.user_space_msr_mask = cap->args[0]; + r = 0; + break; default: r = -EINVAL; break; @@ -5040,6 +5239,110 @@ split_irqchip_unlock: return r; } +static void kvm_clear_msr_filter(struct kvm *kvm) +{ + u32 i; + u32 count = kvm->arch.msr_filter.count; + struct msr_bitmap_range ranges[16]; + + mutex_lock(&kvm->lock); + kvm->arch.msr_filter.count = 0; + memcpy(ranges, kvm->arch.msr_filter.ranges, count * sizeof(ranges[0])); + mutex_unlock(&kvm->lock); + synchronize_srcu(&kvm->srcu); + + for (i = 0; i < count; i++) + kfree(ranges[i].bitmap); +} + +static int kvm_add_msr_filter(struct kvm *kvm, struct kvm_msr_filter_range *user_range) +{ + struct msr_bitmap_range *ranges = kvm->arch.msr_filter.ranges; + struct msr_bitmap_range range; + unsigned long *bitmap = NULL; + size_t bitmap_size; + int r; + + if (!user_range->nmsrs) + return 0; + + bitmap_size = BITS_TO_LONGS(user_range->nmsrs) * sizeof(long); + if (!bitmap_size || bitmap_size > KVM_MSR_FILTER_MAX_BITMAP_SIZE) + return -EINVAL; + + bitmap = memdup_user((__user u8*)user_range->bitmap, bitmap_size); + if (IS_ERR(bitmap)) + return PTR_ERR(bitmap); + + range = (struct msr_bitmap_range) { + .flags = user_range->flags, + .base = user_range->base, + .nmsrs = user_range->nmsrs, + .bitmap = bitmap, + }; + + if (range.flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE)) { + r = -EINVAL; + goto err; + } + + if (!range.flags) { + r = -EINVAL; + goto err; + } + + /* Everything ok, add this range identifier to our global pool */ + ranges[kvm->arch.msr_filter.count] = range; + /* Make sure we filled the array before we tell anyone to walk it */ + smp_wmb(); + kvm->arch.msr_filter.count++; + + return 0; +err: + kfree(bitmap); + return r; +} + +static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp) +{ + struct kvm_msr_filter __user *user_msr_filter = argp; + struct kvm_msr_filter filter; + bool default_allow; + int r = 0; + bool empty = true; + u32 i; + + if (copy_from_user(&filter, user_msr_filter, sizeof(filter))) + return -EFAULT; + + for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) + empty &= !filter.ranges[i].nmsrs; + + default_allow = !(filter.flags & KVM_MSR_FILTER_DEFAULT_DENY); + if (empty && !default_allow) + return -EINVAL; + + kvm_clear_msr_filter(kvm); + + kvm->arch.msr_filter.default_allow = default_allow; + + /* + * Protect from concurrent calls to this function that could trigger + * a TOCTOU violation on kvm->arch.msr_filter.count. + */ + mutex_lock(&kvm->lock); + for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) { + r = kvm_add_msr_filter(kvm, &filter.ranges[i]); + if (r) + break; + } + + kvm_make_all_cpus_request(kvm, KVM_REQ_MSR_FILTER_CHANGED); + mutex_unlock(&kvm->lock); + + return r; +} + long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -5346,6 +5649,9 @@ set_pit2_out: case KVM_SET_PMU_EVENT_FILTER: r = kvm_vm_ioctl_set_pmu_event_filter(kvm, argp); break; + case KVM_X86_SET_MSR_FILTER: + r = kvm_vm_ioctl_set_msr_filter(kvm, argp); + break; default: r = -ENOTTY; } @@ -5707,6 +6013,9 @@ int handle_ud(struct kvm_vcpu *vcpu) char sig[5]; /* ud2; .ascii "kvm" */ struct x86_exception e; + if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, NULL, 0))) + return 1; + if (force_emulation_prefix && kvm_read_guest_virt(vcpu, kvm_get_linear_rip(vcpu), sig, sizeof(sig), &e) == 0 && @@ -6362,13 +6671,33 @@ static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector, static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata) { - return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata); + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + int r; + + r = kvm_get_msr(vcpu, msr_index, pdata); + + if (r && kvm_get_msr_user_space(vcpu, msr_index, r)) { + /* Bounce to user space */ + return X86EMUL_IO_NEEDED; + } + + return r; } static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data) { - return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data); + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + int r; + + r = kvm_set_msr(vcpu, msr_index, data); + + if (r && kvm_set_msr_user_space(vcpu, msr_index, data, r)) { + /* Bounce to user space */ + return X86EMUL_IO_NEEDED; + } + + return r; } static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt) @@ -6912,7 +7241,10 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int r; struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; bool writeback = true; - bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable; + bool write_fault_to_spt; + + if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, insn, insn_len))) + return 1; vcpu->arch.l1tf_flush_l1d = true; @@ -6920,6 +7252,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * Clear write_fault_to_shadow_pgtable here to ensure it is * never reused. */ + write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable; vcpu->arch.write_fault_to_shadow_pgtable = false; kvm_clear_exception_queue(vcpu); @@ -7514,9 +7847,9 @@ int kvm_arch_init(void *opaque) goto out_free_x86_fpu_cache; } - shared_msrs = alloc_percpu(struct kvm_shared_msrs); - if (!shared_msrs) { - printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n"); + user_return_msrs = alloc_percpu(struct kvm_user_return_msrs); + if (!user_return_msrs) { + printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n"); goto out_free_x86_emulator_cache; } @@ -7549,7 +7882,7 @@ int kvm_arch_init(void *opaque) return 0; out_free_percpu: - free_percpu(shared_msrs); + free_percpu(user_return_msrs); out_free_x86_emulator_cache: kmem_cache_destroy(x86_emulator_cache); out_free_x86_fpu_cache: @@ -7576,7 +7909,7 @@ void kvm_arch_exit(void) #endif kvm_x86_ops.hardware_enable = NULL; kvm_mmu_module_exit(); - free_percpu(shared_msrs); + free_percpu(user_return_msrs); kmem_cache_destroy(x86_fpu_cache); } @@ -7717,11 +8050,16 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) goto out; } + ret = -KVM_ENOSYS; + switch (nr) { case KVM_HC_VAPIC_POLL_IRQ: ret = 0; break; case KVM_HC_KICK_CPU: + if (!guest_pv_has(vcpu, KVM_FEATURE_PV_UNHALT)) + break; + kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1); kvm_sched_yield(vcpu->kvm, a1); ret = 0; @@ -7732,9 +8070,15 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) break; #endif case KVM_HC_SEND_IPI: + if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SEND_IPI)) + break; + ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit); break; case KVM_HC_SCHED_YIELD: + if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SCHED_YIELD)) + break; + kvm_sched_yield(vcpu->kvm, a0); ret = 0; break; @@ -8365,8 +8709,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) bool req_immediate_exit = false; if (kvm_request_pending(vcpu)) { - if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu)) { - if (unlikely(!kvm_x86_ops.nested_ops->get_vmcs12_pages(vcpu))) { + if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { + if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) { r = 0; goto out; } @@ -8473,6 +8817,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_vcpu_update_apicv(vcpu); if (kvm_check_request(KVM_REQ_APF_READY, vcpu)) kvm_check_async_pf_completion(vcpu); + if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu)) + kvm_x86_ops.msr_filter_changed(vcpu); } if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { @@ -8548,7 +8894,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_x86_ops.request_immediate_exit(vcpu); } - trace_kvm_entry(vcpu->vcpu_id); + trace_kvm_entry(vcpu); fpregs_assert_state_consistent(); if (test_thread_flag(TIF_NEED_FPU_LOAD)) @@ -9562,7 +9908,6 @@ fail_mmu_destroy: void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { - struct msr_data msr; struct kvm *kvm = vcpu->kvm; kvm_hv_vcpu_postcreate(vcpu); @@ -9570,10 +9915,7 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) if (mutex_lock_killable(&vcpu->mutex)) return; vcpu_load(vcpu); - msr.data = 0x0; - msr.index = MSR_IA32_TSC; - msr.host_initiated = true; - kvm_write_tsc(vcpu, &msr); + kvm_synchronize_tsc(vcpu, 0); vcpu_put(vcpu); /* poll control enabled by default */ @@ -9610,6 +9952,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvm_mmu_destroy(vcpu); srcu_read_unlock(&vcpu->kvm->srcu, idx); free_page((unsigned long)vcpu->arch.pio_data); + kvfree(vcpu->arch.cpuid_entries); if (!lapic_in_kernel(vcpu)) static_key_slow_dec(&kvm_no_apic_vcpu); } @@ -9707,7 +10050,7 @@ int kvm_arch_hardware_enable(void) u64 max_tsc = 0; bool stable, backwards_tsc = false; - kvm_shared_msr_cpu_online(); + kvm_user_return_msr_cpu_online(); ret = kvm_x86_ops.hardware_enable(); if (ret != 0) return ret; @@ -10025,6 +10368,8 @@ void kvm_arch_pre_destroy_vm(struct kvm *kvm) void kvm_arch_destroy_vm(struct kvm *kvm) { + u32 i; + if (current->mm == kvm->mm) { /* * Free memory regions allocated on behalf of userspace, @@ -10041,6 +10386,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm) } if (kvm_x86_ops.vm_destroy) kvm_x86_ops.vm_destroy(kvm); + for (i = 0; i < kvm->arch.msr_filter.count; i++) + kfree(kvm->arch.msr_filter.ranges[i].bitmap); kvm_pic_destroy(kvm); kvm_ioapic_destroy(kvm); kvm_free_vcpus(kvm); @@ -10771,6 +11118,111 @@ void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_c } EXPORT_SYMBOL_GPL(kvm_fixup_and_inject_pf_error); +/* + * Handles kvm_read/write_guest_virt*() result and either injects #PF or returns + * KVM_EXIT_INTERNAL_ERROR for cases not currently handled by KVM. Return value + * indicates whether exit to userspace is needed. + */ +int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r, + struct x86_exception *e) +{ + if (r == X86EMUL_PROPAGATE_FAULT) { + kvm_inject_emulated_page_fault(vcpu, e); + return 1; + } + + /* + * In case kvm_read/write_guest_virt*() failed with X86EMUL_IO_NEEDED + * while handling a VMX instruction KVM could've handled the request + * correctly by exiting to userspace and performing I/O but there + * doesn't seem to be a real use-case behind such requests, just return + * KVM_EXIT_INTERNAL_ERROR for now. + */ + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + + return 0; +} +EXPORT_SYMBOL_GPL(kvm_handle_memory_failure); + +int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva) +{ + bool pcid_enabled; + struct x86_exception e; + unsigned i; + unsigned long roots_to_free = 0; + struct { + u64 pcid; + u64 gla; + } operand; + int r; + + r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); + if (r != X86EMUL_CONTINUE) + return kvm_handle_memory_failure(vcpu, r, &e); + + if (operand.pcid >> 12 != 0) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE); + + switch (type) { + case INVPCID_TYPE_INDIV_ADDR: + if ((!pcid_enabled && (operand.pcid != 0)) || + is_noncanonical_address(operand.gla, vcpu)) { + kvm_inject_gp(vcpu, 0); + return 1; + } + kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid); + return kvm_skip_emulated_instruction(vcpu); + + case INVPCID_TYPE_SINGLE_CTXT: + if (!pcid_enabled && (operand.pcid != 0)) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + if (kvm_get_active_pcid(vcpu) == operand.pcid) { + kvm_mmu_sync_roots(vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); + } + + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].pgd) + == operand.pcid) + roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); + + kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free); + /* + * If neither the current cr3 nor any of the prev_roots use the + * given PCID, then nothing needs to be done here because a + * resync will happen anyway before switching to any other CR3. + */ + + return kvm_skip_emulated_instruction(vcpu); + + case INVPCID_TYPE_ALL_NON_GLOBAL: + /* + * Currently, KVM doesn't mark global entries in the shadow + * page tables, so a non-global flush just degenerates to a + * global flush. If needed, we could optimize this later by + * keeping track of global entries in shadow page tables. + */ + + fallthrough; + case INVPCID_TYPE_ALL_INCL_GLOBAL: + kvm_mmu_unload(vcpu); + return kvm_skip_emulated_instruction(vcpu); + + default: + BUG(); /* We have already checked above that type <= 3 */ + } +} +EXPORT_SYMBOL_GPL(kvm_handle_invpcid); + EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 995ab696dcf0..3900ab0c6004 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -246,7 +246,6 @@ static inline bool kvm_vcpu_latch_init(struct kvm_vcpu *vcpu) return is_smm(vcpu) || kvm_x86_ops.apic_init_signal_blocked(vcpu); } -void kvm_set_pending_timer(struct kvm_vcpu *vcpu); void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr); @@ -372,6 +371,10 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu); int kvm_spec_ctrl_test_value(u64 value); int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu); +int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r, + struct x86_exception *e); +int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva); +bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type); #define KVM_MSR_RET_INVALID 2 diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index aa067859a70b..bad4dee4f0e4 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -44,6 +44,7 @@ obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o lib-y := delay.o misc.o cmdline.o cpu.o lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o lib-y += memcpy_$(BITS).o +lib-$(CONFIG_ARCH_HAS_COPY_MC) += copy_mc.o copy_mc_64.o lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o insn-eval.o lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o lib-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S index d1d768912368..4304320e51f4 100644 --- a/arch/x86/lib/checksum_32.S +++ b/arch/x86/lib/checksum_32.S @@ -253,28 +253,17 @@ EXPORT_SYMBOL(csum_partial) /* unsigned int csum_partial_copy_generic (const char *src, char *dst, - int len, int sum, int *src_err_ptr, int *dst_err_ptr) + int len) */ /* * Copy from ds while checksumming, otherwise like csum_partial - * - * The macros SRC and DST specify the type of access for the instruction. - * thus we can call a custom exception handler for all access types. - * - * FIXME: could someone double-check whether I haven't mixed up some SRC and - * DST definitions? It's damn hard to trigger all cases. I hope I got - * them all but there's no guarantee. */ -#define SRC(y...) \ +#define EXC(y...) \ 9999: y; \ _ASM_EXTABLE_UA(9999b, 6001f) -#define DST(y...) \ - 9999: y; \ - _ASM_EXTABLE_UA(9999b, 6002f) - #ifndef CONFIG_X86_USE_PPRO_CHECKSUM #define ARGBASE 16 @@ -285,20 +274,20 @@ SYM_FUNC_START(csum_partial_copy_generic) pushl %edi pushl %esi pushl %ebx - movl ARGBASE+16(%esp),%eax # sum movl ARGBASE+12(%esp),%ecx # len movl ARGBASE+4(%esp),%esi # src movl ARGBASE+8(%esp),%edi # dst + movl $-1, %eax # sum testl $2, %edi # Check alignment. jz 2f # Jump if alignment is ok. subl $2, %ecx # Alignment uses up two bytes. jae 1f # Jump if we had at least two bytes. addl $2, %ecx # ecx was < 2. Deal with it. jmp 4f -SRC(1: movw (%esi), %bx ) +EXC(1: movw (%esi), %bx ) addl $2, %esi -DST( movw %bx, (%edi) ) +EXC( movw %bx, (%edi) ) addl $2, %edi addw %bx, %ax adcl $0, %eax @@ -306,34 +295,34 @@ DST( movw %bx, (%edi) ) movl %ecx, FP(%esp) shrl $5, %ecx jz 2f - testl %esi, %esi -SRC(1: movl (%esi), %ebx ) -SRC( movl 4(%esi), %edx ) + testl %esi, %esi # what's wrong with clc? +EXC(1: movl (%esi), %ebx ) +EXC( movl 4(%esi), %edx ) adcl %ebx, %eax -DST( movl %ebx, (%edi) ) +EXC( movl %ebx, (%edi) ) adcl %edx, %eax -DST( movl %edx, 4(%edi) ) +EXC( movl %edx, 4(%edi) ) -SRC( movl 8(%esi), %ebx ) -SRC( movl 12(%esi), %edx ) +EXC( movl 8(%esi), %ebx ) +EXC( movl 12(%esi), %edx ) adcl %ebx, %eax -DST( movl %ebx, 8(%edi) ) +EXC( movl %ebx, 8(%edi) ) adcl %edx, %eax -DST( movl %edx, 12(%edi) ) +EXC( movl %edx, 12(%edi) ) -SRC( movl 16(%esi), %ebx ) -SRC( movl 20(%esi), %edx ) +EXC( movl 16(%esi), %ebx ) +EXC( movl 20(%esi), %edx ) adcl %ebx, %eax -DST( movl %ebx, 16(%edi) ) +EXC( movl %ebx, 16(%edi) ) adcl %edx, %eax -DST( movl %edx, 20(%edi) ) +EXC( movl %edx, 20(%edi) ) -SRC( movl 24(%esi), %ebx ) -SRC( movl 28(%esi), %edx ) +EXC( movl 24(%esi), %ebx ) +EXC( movl 28(%esi), %edx ) adcl %ebx, %eax -DST( movl %ebx, 24(%edi) ) +EXC( movl %ebx, 24(%edi) ) adcl %edx, %eax -DST( movl %edx, 28(%edi) ) +EXC( movl %edx, 28(%edi) ) lea 32(%esi), %esi lea 32(%edi), %edi @@ -345,9 +334,9 @@ DST( movl %edx, 28(%edi) ) andl $0x1c, %edx je 4f shrl $2, %edx # This clears CF -SRC(3: movl (%esi), %ebx ) +EXC(3: movl (%esi), %ebx ) adcl %ebx, %eax -DST( movl %ebx, (%edi) ) +EXC( movl %ebx, (%edi) ) lea 4(%esi), %esi lea 4(%edi), %edi dec %edx @@ -357,39 +346,24 @@ DST( movl %ebx, (%edi) ) jz 7f cmpl $2, %ecx jb 5f -SRC( movw (%esi), %cx ) +EXC( movw (%esi), %cx ) leal 2(%esi), %esi -DST( movw %cx, (%edi) ) +EXC( movw %cx, (%edi) ) leal 2(%edi), %edi je 6f shll $16,%ecx -SRC(5: movb (%esi), %cl ) -DST( movb %cl, (%edi) ) +EXC(5: movb (%esi), %cl ) +EXC( movb %cl, (%edi) ) 6: addl %ecx, %eax adcl $0, %eax 7: -5000: # Exception handler: .section .fixup, "ax" 6001: - movl ARGBASE+20(%esp), %ebx # src_err_ptr - movl $-EFAULT, (%ebx) - - # zero the complete destination - computing the rest - # is too much work - movl ARGBASE+8(%esp), %edi # dst - movl ARGBASE+12(%esp), %ecx # len - xorl %eax,%eax - rep ; stosb - - jmp 5000b - -6002: - movl ARGBASE+24(%esp), %ebx # dst_err_ptr - movl $-EFAULT,(%ebx) - jmp 5000b + xorl %eax, %eax + jmp 7b .previous @@ -405,14 +379,14 @@ SYM_FUNC_END(csum_partial_copy_generic) /* Version for PentiumII/PPro */ #define ROUND1(x) \ - SRC(movl x(%esi), %ebx ) ; \ + EXC(movl x(%esi), %ebx ) ; \ addl %ebx, %eax ; \ - DST(movl %ebx, x(%edi) ) ; + EXC(movl %ebx, x(%edi) ) ; #define ROUND(x) \ - SRC(movl x(%esi), %ebx ) ; \ + EXC(movl x(%esi), %ebx ) ; \ adcl %ebx, %eax ; \ - DST(movl %ebx, x(%edi) ) ; + EXC(movl %ebx, x(%edi) ) ; #define ARGBASE 12 @@ -423,7 +397,7 @@ SYM_FUNC_START(csum_partial_copy_generic) movl ARGBASE+4(%esp),%esi #src movl ARGBASE+8(%esp),%edi #dst movl ARGBASE+12(%esp),%ecx #len - movl ARGBASE+16(%esp),%eax #sum + movl $-1, %eax #sum # movl %ecx, %edx movl %ecx, %ebx movl %esi, %edx @@ -439,7 +413,7 @@ SYM_FUNC_START(csum_partial_copy_generic) JMP_NOSPEC ebx 1: addl $64,%esi addl $64,%edi - SRC(movb -32(%edx),%bl) ; SRC(movb (%edx),%bl) + EXC(movb -32(%edx),%bl) ; EXC(movb (%edx),%bl) ROUND1(-64) ROUND(-60) ROUND(-56) ROUND(-52) ROUND (-48) ROUND(-44) ROUND(-40) ROUND(-36) ROUND (-32) ROUND(-28) ROUND(-24) ROUND(-20) @@ -453,29 +427,20 @@ SYM_FUNC_START(csum_partial_copy_generic) jz 7f cmpl $2, %edx jb 5f -SRC( movw (%esi), %dx ) +EXC( movw (%esi), %dx ) leal 2(%esi), %esi -DST( movw %dx, (%edi) ) +EXC( movw %dx, (%edi) ) leal 2(%edi), %edi je 6f shll $16,%edx 5: -SRC( movb (%esi), %dl ) -DST( movb %dl, (%edi) ) +EXC( movb (%esi), %dl ) +EXC( movb %dl, (%edi) ) 6: addl %edx, %eax adcl $0, %eax 7: .section .fixup, "ax" -6001: movl ARGBASE+20(%esp), %ebx # src_err_ptr - movl $-EFAULT, (%ebx) - # zero the complete destination (computing the rest is too much work) - movl ARGBASE+8(%esp),%edi # dst - movl ARGBASE+12(%esp),%ecx # len - xorl %eax,%eax - rep; stosb - jmp 7b -6002: movl ARGBASE+24(%esp), %ebx # dst_err_ptr - movl $-EFAULT, (%ebx) +6001: xorl %eax, %eax jmp 7b .previous diff --git a/arch/x86/lib/copy_mc.c b/arch/x86/lib/copy_mc.c new file mode 100644 index 000000000000..c13e8c9ee926 --- /dev/null +++ b/arch/x86/lib/copy_mc.c @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2016-2020 Intel Corporation. All rights reserved. */ + +#include <linux/jump_label.h> +#include <linux/uaccess.h> +#include <linux/export.h> +#include <linux/string.h> +#include <linux/types.h> + +#include <asm/mce.h> + +#ifdef CONFIG_X86_MCE +/* + * See COPY_MC_TEST for self-test of the copy_mc_fragile() + * implementation. + */ +static DEFINE_STATIC_KEY_FALSE(copy_mc_fragile_key); + +void enable_copy_mc_fragile(void) +{ + static_branch_inc(©_mc_fragile_key); +} +#define copy_mc_fragile_enabled (static_branch_unlikely(©_mc_fragile_key)) + +/* + * Similar to copy_user_handle_tail, probe for the write fault point, or + * source exception point. + */ +__visible notrace unsigned long +copy_mc_fragile_handle_tail(char *to, char *from, unsigned len) +{ + for (; len; --len, to++, from++) + if (copy_mc_fragile(to, from, 1)) + break; + return len; +} +#else +/* + * No point in doing careful copying, or consulting a static key when + * there is no #MC handler in the CONFIG_X86_MCE=n case. + */ +void enable_copy_mc_fragile(void) +{ +} +#define copy_mc_fragile_enabled (0) +#endif + +unsigned long copy_mc_enhanced_fast_string(void *dst, const void *src, unsigned len); + +/** + * copy_mc_to_kernel - memory copy that handles source exceptions + * + * @dst: destination address + * @src: source address + * @len: number of bytes to copy + * + * Call into the 'fragile' version on systems that benefit from avoiding + * corner case poison consumption scenarios, For example, accessing + * poison across 2 cachelines with a single instruction. Almost all + * other uses case can use copy_mc_enhanced_fast_string() for a fast + * recoverable copy, or fallback to plain memcpy. + * + * Return 0 for success, or number of bytes not copied if there was an + * exception. + */ +unsigned long __must_check copy_mc_to_kernel(void *dst, const void *src, unsigned len) +{ + if (copy_mc_fragile_enabled) + return copy_mc_fragile(dst, src, len); + if (static_cpu_has(X86_FEATURE_ERMS)) + return copy_mc_enhanced_fast_string(dst, src, len); + memcpy(dst, src, len); + return 0; +} +EXPORT_SYMBOL_GPL(copy_mc_to_kernel); + +unsigned long __must_check copy_mc_to_user(void *dst, const void *src, unsigned len) +{ + unsigned long ret; + + if (copy_mc_fragile_enabled) { + __uaccess_begin(); + ret = copy_mc_fragile(dst, src, len); + __uaccess_end(); + return ret; + } + + if (static_cpu_has(X86_FEATURE_ERMS)) { + __uaccess_begin(); + ret = copy_mc_enhanced_fast_string(dst, src, len); + __uaccess_end(); + return ret; + } + + return copy_user_generic(dst, src, len); +} diff --git a/arch/x86/lib/copy_mc_64.S b/arch/x86/lib/copy_mc_64.S new file mode 100644 index 000000000000..892d8915f609 --- /dev/null +++ b/arch/x86/lib/copy_mc_64.S @@ -0,0 +1,163 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright(c) 2016-2020 Intel Corporation. All rights reserved. */ + +#include <linux/linkage.h> +#include <asm/copy_mc_test.h> +#include <asm/export.h> +#include <asm/asm.h> + +#ifndef CONFIG_UML + +#ifdef CONFIG_X86_MCE +COPY_MC_TEST_CTL + +/* + * copy_mc_fragile - copy memory with indication if an exception / fault happened + * + * The 'fragile' version is opted into by platform quirks and takes + * pains to avoid unrecoverable corner cases like 'fast-string' + * instruction sequences, and consuming poison across a cacheline + * boundary. The non-fragile version is equivalent to memcpy() + * regardless of CPU machine-check-recovery capability. + */ +SYM_FUNC_START(copy_mc_fragile) + cmpl $8, %edx + /* Less than 8 bytes? Go to byte copy loop */ + jb .L_no_whole_words + + /* Check for bad alignment of source */ + testl $7, %esi + /* Already aligned */ + jz .L_8byte_aligned + + /* Copy one byte at a time until source is 8-byte aligned */ + movl %esi, %ecx + andl $7, %ecx + subl $8, %ecx + negl %ecx + subl %ecx, %edx +.L_read_leading_bytes: + movb (%rsi), %al + COPY_MC_TEST_SRC %rsi 1 .E_leading_bytes + COPY_MC_TEST_DST %rdi 1 .E_leading_bytes +.L_write_leading_bytes: + movb %al, (%rdi) + incq %rsi + incq %rdi + decl %ecx + jnz .L_read_leading_bytes + +.L_8byte_aligned: + movl %edx, %ecx + andl $7, %edx + shrl $3, %ecx + jz .L_no_whole_words + +.L_read_words: + movq (%rsi), %r8 + COPY_MC_TEST_SRC %rsi 8 .E_read_words + COPY_MC_TEST_DST %rdi 8 .E_write_words +.L_write_words: + movq %r8, (%rdi) + addq $8, %rsi + addq $8, %rdi + decl %ecx + jnz .L_read_words + + /* Any trailing bytes? */ +.L_no_whole_words: + andl %edx, %edx + jz .L_done_memcpy_trap + + /* Copy trailing bytes */ + movl %edx, %ecx +.L_read_trailing_bytes: + movb (%rsi), %al + COPY_MC_TEST_SRC %rsi 1 .E_trailing_bytes + COPY_MC_TEST_DST %rdi 1 .E_trailing_bytes +.L_write_trailing_bytes: + movb %al, (%rdi) + incq %rsi + incq %rdi + decl %ecx + jnz .L_read_trailing_bytes + + /* Copy successful. Return zero */ +.L_done_memcpy_trap: + xorl %eax, %eax +.L_done: + ret +SYM_FUNC_END(copy_mc_fragile) +EXPORT_SYMBOL_GPL(copy_mc_fragile) + + .section .fixup, "ax" + /* + * Return number of bytes not copied for any failure. Note that + * there is no "tail" handling since the source buffer is 8-byte + * aligned and poison is cacheline aligned. + */ +.E_read_words: + shll $3, %ecx +.E_leading_bytes: + addl %edx, %ecx +.E_trailing_bytes: + mov %ecx, %eax + jmp .L_done + + /* + * For write fault handling, given the destination is unaligned, + * we handle faults on multi-byte writes with a byte-by-byte + * copy up to the write-protected page. + */ +.E_write_words: + shll $3, %ecx + addl %edx, %ecx + movl %ecx, %edx + jmp copy_mc_fragile_handle_tail + + .previous + + _ASM_EXTABLE_FAULT(.L_read_leading_bytes, .E_leading_bytes) + _ASM_EXTABLE_FAULT(.L_read_words, .E_read_words) + _ASM_EXTABLE_FAULT(.L_read_trailing_bytes, .E_trailing_bytes) + _ASM_EXTABLE(.L_write_leading_bytes, .E_leading_bytes) + _ASM_EXTABLE(.L_write_words, .E_write_words) + _ASM_EXTABLE(.L_write_trailing_bytes, .E_trailing_bytes) +#endif /* CONFIG_X86_MCE */ + +/* + * copy_mc_enhanced_fast_string - memory copy with exception handling + * + * Fast string copy + fault / exception handling. If the CPU does + * support machine check exception recovery, but does not support + * recovering from fast-string exceptions then this CPU needs to be + * added to the copy_mc_fragile_key set of quirks. Otherwise, absent any + * machine check recovery support this version should be no slower than + * standard memcpy. + */ +SYM_FUNC_START(copy_mc_enhanced_fast_string) + movq %rdi, %rax + movq %rdx, %rcx +.L_copy: + rep movsb + /* Copy successful. Return zero */ + xorl %eax, %eax + ret +SYM_FUNC_END(copy_mc_enhanced_fast_string) + + .section .fixup, "ax" +.E_copy: + /* + * On fault %rcx is updated such that the copy instruction could + * optionally be restarted at the fault position, i.e. it + * contains 'bytes remaining'. A non-zero return indicates error + * to copy_mc_generic() users, or indicate short transfers to + * user-copy routines. + */ + movq %rcx, %rax + ret + + .previous + + _ASM_EXTABLE_FAULT(.L_copy, .E_copy) +#endif /* !CONFIG_UML */ diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 816f128a6d52..77b9b2a3b5c8 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -15,6 +15,7 @@ #include <asm/asm.h> #include <asm/smap.h> #include <asm/export.h> +#include <asm/trapnr.h> .macro ALIGN_DESTINATION /* check for bad alignment of destination */ @@ -36,8 +37,8 @@ jmp .Lcopy_user_handle_tail .previous - _ASM_EXTABLE_UA(100b, 103b) - _ASM_EXTABLE_UA(101b, 103b) + _ASM_EXTABLE_CPY(100b, 103b) + _ASM_EXTABLE_CPY(101b, 103b) .endm /* @@ -116,26 +117,26 @@ SYM_FUNC_START(copy_user_generic_unrolled) 60: jmp .Lcopy_user_handle_tail /* ecx is zerorest also */ .previous - _ASM_EXTABLE_UA(1b, 30b) - _ASM_EXTABLE_UA(2b, 30b) - _ASM_EXTABLE_UA(3b, 30b) - _ASM_EXTABLE_UA(4b, 30b) - _ASM_EXTABLE_UA(5b, 30b) - _ASM_EXTABLE_UA(6b, 30b) - _ASM_EXTABLE_UA(7b, 30b) - _ASM_EXTABLE_UA(8b, 30b) - _ASM_EXTABLE_UA(9b, 30b) - _ASM_EXTABLE_UA(10b, 30b) - _ASM_EXTABLE_UA(11b, 30b) - _ASM_EXTABLE_UA(12b, 30b) - _ASM_EXTABLE_UA(13b, 30b) - _ASM_EXTABLE_UA(14b, 30b) - _ASM_EXTABLE_UA(15b, 30b) - _ASM_EXTABLE_UA(16b, 30b) - _ASM_EXTABLE_UA(18b, 40b) - _ASM_EXTABLE_UA(19b, 40b) - _ASM_EXTABLE_UA(21b, 50b) - _ASM_EXTABLE_UA(22b, 50b) + _ASM_EXTABLE_CPY(1b, 30b) + _ASM_EXTABLE_CPY(2b, 30b) + _ASM_EXTABLE_CPY(3b, 30b) + _ASM_EXTABLE_CPY(4b, 30b) + _ASM_EXTABLE_CPY(5b, 30b) + _ASM_EXTABLE_CPY(6b, 30b) + _ASM_EXTABLE_CPY(7b, 30b) + _ASM_EXTABLE_CPY(8b, 30b) + _ASM_EXTABLE_CPY(9b, 30b) + _ASM_EXTABLE_CPY(10b, 30b) + _ASM_EXTABLE_CPY(11b, 30b) + _ASM_EXTABLE_CPY(12b, 30b) + _ASM_EXTABLE_CPY(13b, 30b) + _ASM_EXTABLE_CPY(14b, 30b) + _ASM_EXTABLE_CPY(15b, 30b) + _ASM_EXTABLE_CPY(16b, 30b) + _ASM_EXTABLE_CPY(18b, 40b) + _ASM_EXTABLE_CPY(19b, 40b) + _ASM_EXTABLE_CPY(21b, 50b) + _ASM_EXTABLE_CPY(22b, 50b) SYM_FUNC_END(copy_user_generic_unrolled) EXPORT_SYMBOL(copy_user_generic_unrolled) @@ -180,8 +181,8 @@ SYM_FUNC_START(copy_user_generic_string) jmp .Lcopy_user_handle_tail .previous - _ASM_EXTABLE_UA(1b, 11b) - _ASM_EXTABLE_UA(3b, 12b) + _ASM_EXTABLE_CPY(1b, 11b) + _ASM_EXTABLE_CPY(3b, 12b) SYM_FUNC_END(copy_user_generic_string) EXPORT_SYMBOL(copy_user_generic_string) @@ -213,7 +214,7 @@ SYM_FUNC_START(copy_user_enhanced_fast_string) jmp .Lcopy_user_handle_tail .previous - _ASM_EXTABLE_UA(1b, 12b) + _ASM_EXTABLE_CPY(1b, 12b) SYM_FUNC_END(copy_user_enhanced_fast_string) EXPORT_SYMBOL(copy_user_enhanced_fast_string) @@ -221,6 +222,7 @@ EXPORT_SYMBOL(copy_user_enhanced_fast_string) * Try to copy last bytes and clear the rest if needed. * Since protection fault in copy_from/to_user is not a normal situation, * it is not necessary to optimize tail handling. + * Don't try to copy the tail if machine check happened * * Input: * rdi destination @@ -232,12 +234,25 @@ EXPORT_SYMBOL(copy_user_enhanced_fast_string) */ SYM_CODE_START_LOCAL(.Lcopy_user_handle_tail) movl %edx,%ecx + cmp $X86_TRAP_MC,%eax /* check if X86_TRAP_MC */ + je 3f 1: rep movsb 2: mov %ecx,%eax ASM_CLAC ret - _ASM_EXTABLE_UA(1b, 2b) + /* + * Return zero to pretend that this copy succeeded. This + * is counter-intuitive, but needed to prevent the code + * in lib/iov_iter.c from retrying and running back into + * the poison cache line again. The machine check handler + * will ensure that a SIGBUS is sent to the task. + */ +3: xorl %eax,%eax + ASM_CLAC + ret + + _ASM_EXTABLE_CPY(1b, 2b) SYM_CODE_END(.Lcopy_user_handle_tail) /* @@ -366,27 +381,27 @@ SYM_FUNC_START(__copy_user_nocache) jmp .Lcopy_user_handle_tail .previous - _ASM_EXTABLE_UA(1b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_UA(2b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_UA(3b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_UA(4b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_UA(5b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_UA(6b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_UA(7b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_UA(8b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_UA(9b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_UA(10b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_UA(11b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_UA(12b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_UA(13b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_UA(14b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_UA(15b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_UA(16b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_UA(20b, .L_fixup_8b_copy) - _ASM_EXTABLE_UA(21b, .L_fixup_8b_copy) - _ASM_EXTABLE_UA(30b, .L_fixup_4b_copy) - _ASM_EXTABLE_UA(31b, .L_fixup_4b_copy) - _ASM_EXTABLE_UA(40b, .L_fixup_1b_copy) - _ASM_EXTABLE_UA(41b, .L_fixup_1b_copy) + _ASM_EXTABLE_CPY(1b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_CPY(2b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_CPY(3b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_CPY(4b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_CPY(5b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_CPY(6b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_CPY(7b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_CPY(8b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_CPY(9b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_CPY(10b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_CPY(11b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_CPY(12b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_CPY(13b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_CPY(14b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_CPY(15b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_CPY(16b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_CPY(20b, .L_fixup_8b_copy) + _ASM_EXTABLE_CPY(21b, .L_fixup_8b_copy) + _ASM_EXTABLE_CPY(30b, .L_fixup_4b_copy) + _ASM_EXTABLE_CPY(31b, .L_fixup_4b_copy) + _ASM_EXTABLE_CPY(40b, .L_fixup_1b_copy) + _ASM_EXTABLE_CPY(41b, .L_fixup_1b_copy) SYM_FUNC_END(__copy_user_nocache) EXPORT_SYMBOL(__copy_user_nocache) diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S index 3394a8ff7fd0..1fbd8ee9642d 100644 --- a/arch/x86/lib/csum-copy_64.S +++ b/arch/x86/lib/csum-copy_64.S @@ -18,9 +18,6 @@ * rdi source * rsi destination * edx len (32bit) - * ecx sum (32bit) - * r8 src_err_ptr (int) - * r9 dst_err_ptr (int) * * Output * eax 64bit sum. undefined in case of exception. @@ -31,44 +28,32 @@ .macro source 10: - _ASM_EXTABLE_UA(10b, .Lbad_source) + _ASM_EXTABLE_UA(10b, .Lfault) .endm .macro dest 20: - _ASM_EXTABLE_UA(20b, .Lbad_dest) + _ASM_EXTABLE_UA(20b, .Lfault) .endm - /* - * No _ASM_EXTABLE_UA; this is used for intentional prefetch on a - * potentially unmapped kernel address. - */ - .macro ignore L=.Lignore -30: - _ASM_EXTABLE(30b, \L) - .endm - - SYM_FUNC_START(csum_partial_copy_generic) - cmpl $3*64, %edx - jle .Lignore - -.Lignore: - subq $7*8, %rsp - movq %rbx, 2*8(%rsp) - movq %r12, 3*8(%rsp) - movq %r14, 4*8(%rsp) - movq %r13, 5*8(%rsp) - movq %r15, 6*8(%rsp) + subq $5*8, %rsp + movq %rbx, 0*8(%rsp) + movq %r12, 1*8(%rsp) + movq %r14, 2*8(%rsp) + movq %r13, 3*8(%rsp) + movq %r15, 4*8(%rsp) - movq %r8, (%rsp) - movq %r9, 1*8(%rsp) - - movl %ecx, %eax + movl $-1, %eax + xorl %r9d, %r9d movl %edx, %ecx + cmpl $8, %ecx + jb .Lshort - xorl %r9d, %r9d - movq %rcx, %r12 + testb $7, %sil + jne .Lunaligned +.Laligned: + movl %ecx, %r12d shrq $6, %r12 jz .Lhandle_tail /* < 64 */ @@ -99,7 +84,12 @@ SYM_FUNC_START(csum_partial_copy_generic) source movq 56(%rdi), %r13 - ignore 2f +30: + /* + * No _ASM_EXTABLE_UA; this is used for intentional prefetch on a + * potentially unmapped kernel address. + */ + _ASM_EXTABLE(30b, 2f) prefetcht0 5*64(%rdi) 2: adcq %rbx, %rax @@ -131,8 +121,6 @@ SYM_FUNC_START(csum_partial_copy_generic) dest movq %r13, 56(%rsi) -3: - leaq 64(%rdi), %rdi leaq 64(%rsi), %rsi @@ -142,8 +130,8 @@ SYM_FUNC_START(csum_partial_copy_generic) /* do last up to 56 bytes */ .Lhandle_tail: - /* ecx: count */ - movl %ecx, %r10d + /* ecx: count, rcx.63: the end result needs to be rol8 */ + movq %rcx, %r10 andl $63, %ecx shrl $3, %ecx jz .Lfold @@ -172,6 +160,7 @@ SYM_FUNC_START(csum_partial_copy_generic) .Lhandle_7: movl %r10d, %ecx andl $7, %ecx +.L1: /* .Lshort rejoins the common path here */ shrl $1, %ecx jz .Lhandle_1 movl $2, %edx @@ -203,26 +192,65 @@ SYM_FUNC_START(csum_partial_copy_generic) adcl %r9d, %eax /* carry */ .Lende: - movq 2*8(%rsp), %rbx - movq 3*8(%rsp), %r12 - movq 4*8(%rsp), %r14 - movq 5*8(%rsp), %r13 - movq 6*8(%rsp), %r15 - addq $7*8, %rsp + testq %r10, %r10 + js .Lwas_odd +.Lout: + movq 0*8(%rsp), %rbx + movq 1*8(%rsp), %r12 + movq 2*8(%rsp), %r14 + movq 3*8(%rsp), %r13 + movq 4*8(%rsp), %r15 + addq $5*8, %rsp ret +.Lshort: + movl %ecx, %r10d + jmp .L1 +.Lunaligned: + xorl %ebx, %ebx + testb $1, %sil + jne .Lodd +1: testb $2, %sil + je 2f + source + movw (%rdi), %bx + dest + movw %bx, (%rsi) + leaq 2(%rdi), %rdi + subq $2, %rcx + leaq 2(%rsi), %rsi + addq %rbx, %rax +2: testb $4, %sil + je .Laligned + source + movl (%rdi), %ebx + dest + movl %ebx, (%rsi) + leaq 4(%rdi), %rdi + subq $4, %rcx + leaq 4(%rsi), %rsi + addq %rbx, %rax + jmp .Laligned + +.Lodd: + source + movb (%rdi), %bl + dest + movb %bl, (%rsi) + leaq 1(%rdi), %rdi + leaq 1(%rsi), %rsi + /* decrement, set MSB */ + leaq -1(%rcx, %rcx), %rcx + rorq $1, %rcx + shll $8, %ebx + addq %rbx, %rax + jmp 1b + +.Lwas_odd: + roll $8, %eax + jmp .Lout - /* Exception handlers. Very simple, zeroing is done in the wrappers */ -.Lbad_source: - movq (%rsp), %rax - testq %rax, %rax - jz .Lende - movl $-EFAULT, (%rax) - jmp .Lende - -.Lbad_dest: - movq 8(%rsp), %rax - testq %rax, %rax - jz .Lende - movl $-EFAULT, (%rax) - jmp .Lende + /* Exception: just return 0 */ +.Lfault: + xorl %eax, %eax + jmp .Lout SYM_FUNC_END(csum_partial_copy_generic) diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c index ee63d7576fd2..189344924a2b 100644 --- a/arch/x86/lib/csum-wrappers_64.c +++ b/arch/x86/lib/csum-wrappers_64.c @@ -21,52 +21,16 @@ * src and dst are best aligned to 64bits. */ __wsum -csum_and_copy_from_user(const void __user *src, void *dst, - int len, __wsum isum, int *errp) +csum_and_copy_from_user(const void __user *src, void *dst, int len) { - might_sleep(); - *errp = 0; + __wsum sum; + might_sleep(); if (!user_access_begin(src, len)) - goto out_err; - - /* - * Why 6, not 7? To handle odd addresses aligned we - * would need to do considerable complications to fix the - * checksum which is defined as an 16bit accumulator. The - * fix alignment code is primarily for performance - * compatibility with 32bit and that will handle odd - * addresses slowly too. - */ - if (unlikely((unsigned long)src & 6)) { - while (((unsigned long)src & 6) && len >= 2) { - __u16 val16; - - unsafe_get_user(val16, (const __u16 __user *)src, out); - - *(__u16 *)dst = val16; - isum = (__force __wsum)add32_with_carry( - (__force unsigned)isum, val16); - src += 2; - dst += 2; - len -= 2; - } - } - isum = csum_partial_copy_generic((__force const void *)src, - dst, len, isum, errp, NULL); - user_access_end(); - if (unlikely(*errp)) - goto out_err; - - return isum; - -out: + return 0; + sum = csum_partial_copy_generic((__force const void *)src, dst, len); user_access_end(); -out_err: - *errp = -EFAULT; - memset(dst, 0, len); - - return isum; + return sum; } EXPORT_SYMBOL(csum_and_copy_from_user); @@ -82,40 +46,16 @@ EXPORT_SYMBOL(csum_and_copy_from_user); * src and dst are best aligned to 64bits. */ __wsum -csum_and_copy_to_user(const void *src, void __user *dst, - int len, __wsum isum, int *errp) +csum_and_copy_to_user(const void *src, void __user *dst, int len) { - __wsum ret; + __wsum sum; might_sleep(); - - if (!user_access_begin(dst, len)) { - *errp = -EFAULT; + if (!user_access_begin(dst, len)) return 0; - } - - if (unlikely((unsigned long)dst & 6)) { - while (((unsigned long)dst & 6) && len >= 2) { - __u16 val16 = *(__u16 *)src; - - isum = (__force __wsum)add32_with_carry( - (__force unsigned)isum, val16); - unsafe_put_user(val16, (__u16 __user *)dst, out); - src += 2; - dst += 2; - len -= 2; - } - } - - *errp = 0; - ret = csum_partial_copy_generic(src, (void __force *)dst, - len, isum, NULL, errp); - user_access_end(); - return ret; -out: + sum = csum_partial_copy_generic(src, (void __force *)dst, len); user_access_end(); - *errp = -EFAULT; - return isum; + return sum; } EXPORT_SYMBOL(csum_and_copy_to_user); @@ -129,9 +69,9 @@ EXPORT_SYMBOL(csum_and_copy_to_user); * Returns an 32bit unfolded checksum of the buffer. */ __wsum -csum_partial_copy_nocheck(const void *src, void *dst, int len, __wsum sum) +csum_partial_copy_nocheck(const void *src, void *dst, int len) { - return csum_partial_copy_generic(src, dst, len, sum, NULL, NULL); + return csum_partial_copy_generic(src, dst, len); } EXPORT_SYMBOL(csum_partial_copy_nocheck); diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S index c8a85b512796..fa1bc2104b32 100644 --- a/arch/x86/lib/getuser.S +++ b/arch/x86/lib/getuser.S @@ -35,10 +35,21 @@ #include <asm/smap.h> #include <asm/export.h> +#define ASM_BARRIER_NOSPEC ALTERNATIVE "", "lfence", X86_FEATURE_LFENCE_RDTSC + +#ifdef CONFIG_X86_5LEVEL +#define LOAD_TASK_SIZE_MINUS_N(n) \ + ALTERNATIVE __stringify(mov $((1 << 47) - 4096 - (n)),%rdx), \ + __stringify(mov $((1 << 56) - 4096 - (n)),%rdx), X86_FEATURE_LA57 +#else +#define LOAD_TASK_SIZE_MINUS_N(n) \ + mov $(TASK_SIZE_MAX - (n)),%_ASM_DX +#endif + .text SYM_FUNC_START(__get_user_1) - mov PER_CPU_VAR(current_task), %_ASM_DX - cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX + LOAD_TASK_SIZE_MINUS_N(0) + cmp %_ASM_DX,%_ASM_AX jae bad_get_user sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */ and %_ASM_DX, %_ASM_AX @@ -51,15 +62,13 @@ SYM_FUNC_END(__get_user_1) EXPORT_SYMBOL(__get_user_1) SYM_FUNC_START(__get_user_2) - add $1,%_ASM_AX - jc bad_get_user - mov PER_CPU_VAR(current_task), %_ASM_DX - cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX + LOAD_TASK_SIZE_MINUS_N(1) + cmp %_ASM_DX,%_ASM_AX jae bad_get_user sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */ and %_ASM_DX, %_ASM_AX ASM_STAC -2: movzwl -1(%_ASM_AX),%edx +2: movzwl (%_ASM_AX),%edx xor %eax,%eax ASM_CLAC ret @@ -67,15 +76,13 @@ SYM_FUNC_END(__get_user_2) EXPORT_SYMBOL(__get_user_2) SYM_FUNC_START(__get_user_4) - add $3,%_ASM_AX - jc bad_get_user - mov PER_CPU_VAR(current_task), %_ASM_DX - cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX + LOAD_TASK_SIZE_MINUS_N(3) + cmp %_ASM_DX,%_ASM_AX jae bad_get_user sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */ and %_ASM_DX, %_ASM_AX ASM_STAC -3: movl -3(%_ASM_AX),%edx +3: movl (%_ASM_AX),%edx xor %eax,%eax ASM_CLAC ret @@ -84,29 +91,25 @@ EXPORT_SYMBOL(__get_user_4) SYM_FUNC_START(__get_user_8) #ifdef CONFIG_X86_64 - add $7,%_ASM_AX - jc bad_get_user - mov PER_CPU_VAR(current_task), %_ASM_DX - cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX + LOAD_TASK_SIZE_MINUS_N(7) + cmp %_ASM_DX,%_ASM_AX jae bad_get_user sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */ and %_ASM_DX, %_ASM_AX ASM_STAC -4: movq -7(%_ASM_AX),%rdx +4: movq (%_ASM_AX),%rdx xor %eax,%eax ASM_CLAC ret #else - add $7,%_ASM_AX - jc bad_get_user_8 - mov PER_CPU_VAR(current_task), %_ASM_DX - cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX + LOAD_TASK_SIZE_MINUS_N(7) + cmp %_ASM_DX,%_ASM_AX jae bad_get_user_8 sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */ and %_ASM_DX, %_ASM_AX ASM_STAC -4: movl -7(%_ASM_AX),%edx -5: movl -3(%_ASM_AX),%ecx +4: movl (%_ASM_AX),%edx +5: movl 4(%_ASM_AX),%ecx xor %eax,%eax ASM_CLAC ret @@ -114,6 +117,52 @@ SYM_FUNC_START(__get_user_8) SYM_FUNC_END(__get_user_8) EXPORT_SYMBOL(__get_user_8) +/* .. and the same for __get_user, just without the range checks */ +SYM_FUNC_START(__get_user_nocheck_1) + ASM_STAC + ASM_BARRIER_NOSPEC +6: movzbl (%_ASM_AX),%edx + xor %eax,%eax + ASM_CLAC + ret +SYM_FUNC_END(__get_user_nocheck_1) +EXPORT_SYMBOL(__get_user_nocheck_1) + +SYM_FUNC_START(__get_user_nocheck_2) + ASM_STAC + ASM_BARRIER_NOSPEC +7: movzwl (%_ASM_AX),%edx + xor %eax,%eax + ASM_CLAC + ret +SYM_FUNC_END(__get_user_nocheck_2) +EXPORT_SYMBOL(__get_user_nocheck_2) + +SYM_FUNC_START(__get_user_nocheck_4) + ASM_STAC + ASM_BARRIER_NOSPEC +8: movl (%_ASM_AX),%edx + xor %eax,%eax + ASM_CLAC + ret +SYM_FUNC_END(__get_user_nocheck_4) +EXPORT_SYMBOL(__get_user_nocheck_4) + +SYM_FUNC_START(__get_user_nocheck_8) + ASM_STAC + ASM_BARRIER_NOSPEC +#ifdef CONFIG_X86_64 +9: movq (%_ASM_AX),%rdx +#else +9: movl (%_ASM_AX),%edx +10: movl 4(%_ASM_AX),%ecx +#endif + xor %eax,%eax + ASM_CLAC + ret +SYM_FUNC_END(__get_user_nocheck_8) +EXPORT_SYMBOL(__get_user_nocheck_8) + SYM_CODE_START_LOCAL(.Lbad_get_user_clac) ASM_CLAC @@ -134,6 +183,7 @@ bad_get_user_8: SYM_CODE_END(.Lbad_get_user_8_clac) #endif +/* get_user */ _ASM_EXTABLE_UA(1b, .Lbad_get_user_clac) _ASM_EXTABLE_UA(2b, .Lbad_get_user_clac) _ASM_EXTABLE_UA(3b, .Lbad_get_user_clac) @@ -143,3 +193,14 @@ SYM_CODE_END(.Lbad_get_user_8_clac) _ASM_EXTABLE_UA(4b, .Lbad_get_user_8_clac) _ASM_EXTABLE_UA(5b, .Lbad_get_user_8_clac) #endif + +/* __get_user */ + _ASM_EXTABLE_UA(6b, .Lbad_get_user_clac) + _ASM_EXTABLE_UA(7b, .Lbad_get_user_clac) + _ASM_EXTABLE_UA(8b, .Lbad_get_user_clac) +#ifdef CONFIG_X86_64 + _ASM_EXTABLE_UA(9b, .Lbad_get_user_clac) +#else + _ASM_EXTABLE_UA(9b, .Lbad_get_user_8_clac) + _ASM_EXTABLE_UA(10b, .Lbad_get_user_8_clac) +#endif diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c index 5e69603ff63f..58f7fb95c7f4 100644 --- a/arch/x86/lib/insn-eval.c +++ b/arch/x86/lib/insn-eval.c @@ -20,6 +20,7 @@ enum reg_type { REG_TYPE_RM = 0, + REG_TYPE_REG, REG_TYPE_INDEX, REG_TYPE_BASE, }; @@ -53,6 +54,30 @@ static bool is_string_insn(struct insn *insn) } /** + * insn_has_rep_prefix() - Determine if instruction has a REP prefix + * @insn: Instruction containing the prefix to inspect + * + * Returns: + * + * true if the instruction has a REP prefix, false if not. + */ +bool insn_has_rep_prefix(struct insn *insn) +{ + int i; + + insn_get_prefixes(insn); + + for (i = 0; i < insn->prefixes.nbytes; i++) { + insn_byte_t p = insn->prefixes.bytes[i]; + + if (p == 0xf2 || p == 0xf3) + return true; + } + + return false; +} + +/** * get_seg_reg_override_idx() - obtain segment register override index * @insn: Valid instruction with segment override prefixes * @@ -439,6 +464,13 @@ static int get_reg_offset(struct insn *insn, struct pt_regs *regs, regno += 8; break; + case REG_TYPE_REG: + regno = X86_MODRM_REG(insn->modrm.value); + + if (X86_REX_R(insn->rex_prefix.value)) + regno += 8; + break; + case REG_TYPE_INDEX: regno = X86_SIB_INDEX(insn->sib.value); if (X86_REX_X(insn->rex_prefix.value)) @@ -808,6 +840,21 @@ int insn_get_modrm_rm_off(struct insn *insn, struct pt_regs *regs) } /** + * insn_get_modrm_reg_off() - Obtain register in reg part of the ModRM byte + * @insn: Instruction containing the ModRM byte + * @regs: Register values as seen when entering kernel mode + * + * Returns: + * + * The register indicated by the reg part of the ModRM byte. The + * register is obtained as an offset from the base of pt_regs. + */ +int insn_get_modrm_reg_off(struct insn *insn, struct pt_regs *regs) +{ + return get_reg_offset(insn, regs, REG_TYPE_REG); +} + +/** * get_seg_base_limit() - obtain base address and limit of a segment * @insn: Instruction. Must be valid. * @regs: Register values as seen when entering kernel mode @@ -1367,3 +1414,86 @@ void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs) return (void __user *)-1L; } } + +/** + * insn_fetch_from_user() - Copy instruction bytes from user-space memory + * @regs: Structure with register values as seen when entering kernel mode + * @buf: Array to store the fetched instruction + * + * Gets the linear address of the instruction and copies the instruction bytes + * to the buf. + * + * Returns: + * + * Number of instruction bytes copied. + * + * 0 if nothing was copied. + */ +int insn_fetch_from_user(struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE]) +{ + unsigned long seg_base = 0; + int not_copied; + + /* + * If not in user-space long mode, a custom code segment could be in + * use. This is true in protected mode (if the process defined a local + * descriptor table), or virtual-8086 mode. In most of the cases + * seg_base will be zero as in USER_CS. + */ + if (!user_64bit_mode(regs)) { + seg_base = insn_get_seg_base(regs, INAT_SEG_REG_CS); + if (seg_base == -1L) + return 0; + } + + + not_copied = copy_from_user(buf, (void __user *)(seg_base + regs->ip), + MAX_INSN_SIZE); + + return MAX_INSN_SIZE - not_copied; +} + +/** + * insn_decode() - Decode an instruction + * @insn: Structure to store decoded instruction + * @regs: Structure with register values as seen when entering kernel mode + * @buf: Buffer containing the instruction bytes + * @buf_size: Number of instruction bytes available in buf + * + * Decodes the instruction provided in buf and stores the decoding results in + * insn. Also determines the correct address and operand sizes. + * + * Returns: + * + * True if instruction was decoded, False otherwise. + */ +bool insn_decode(struct insn *insn, struct pt_regs *regs, + unsigned char buf[MAX_INSN_SIZE], int buf_size) +{ + int seg_defs; + + insn_init(insn, buf, buf_size, user_64bit_mode(regs)); + + /* + * Override the default operand and address sizes with what is specified + * in the code segment descriptor. The instruction decoder only sets + * the address size it to either 4 or 8 address bytes and does nothing + * for the operand bytes. This OK for most of the cases, but we could + * have special cases where, for instance, a 16-bit code segment + * descriptor is used. + * If there is an address override prefix, the instruction decoder + * correctly updates these values, even for 16-bit defaults. + */ + seg_defs = insn_get_code_seg_params(regs); + if (seg_defs == -EINVAL) + return false; + + insn->addr_bytes = INSN_CODE_SEG_ADDR_SZ(seg_defs); + insn->opnd_bytes = INSN_CODE_SEG_OPND_SZ(seg_defs); + + insn_get_length(insn); + if (buf_size < insn->length) + return false; + + return true; +} diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index bbcc05bcefad..037faac46b0c 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -4,7 +4,6 @@ #include <linux/linkage.h> #include <asm/errno.h> #include <asm/cpufeatures.h> -#include <asm/mcsafe_test.h> #include <asm/alternative-asm.h> #include <asm/export.h> @@ -187,117 +186,3 @@ SYM_FUNC_START_LOCAL(memcpy_orig) SYM_FUNC_END(memcpy_orig) .popsection - -#ifndef CONFIG_UML - -MCSAFE_TEST_CTL - -/* - * __memcpy_mcsafe - memory copy with machine check exception handling - * Note that we only catch machine checks when reading the source addresses. - * Writes to target are posted and don't generate machine checks. - */ -SYM_FUNC_START(__memcpy_mcsafe) - cmpl $8, %edx - /* Less than 8 bytes? Go to byte copy loop */ - jb .L_no_whole_words - - /* Check for bad alignment of source */ - testl $7, %esi - /* Already aligned */ - jz .L_8byte_aligned - - /* Copy one byte at a time until source is 8-byte aligned */ - movl %esi, %ecx - andl $7, %ecx - subl $8, %ecx - negl %ecx - subl %ecx, %edx -.L_read_leading_bytes: - movb (%rsi), %al - MCSAFE_TEST_SRC %rsi 1 .E_leading_bytes - MCSAFE_TEST_DST %rdi 1 .E_leading_bytes -.L_write_leading_bytes: - movb %al, (%rdi) - incq %rsi - incq %rdi - decl %ecx - jnz .L_read_leading_bytes - -.L_8byte_aligned: - movl %edx, %ecx - andl $7, %edx - shrl $3, %ecx - jz .L_no_whole_words - -.L_read_words: - movq (%rsi), %r8 - MCSAFE_TEST_SRC %rsi 8 .E_read_words - MCSAFE_TEST_DST %rdi 8 .E_write_words -.L_write_words: - movq %r8, (%rdi) - addq $8, %rsi - addq $8, %rdi - decl %ecx - jnz .L_read_words - - /* Any trailing bytes? */ -.L_no_whole_words: - andl %edx, %edx - jz .L_done_memcpy_trap - - /* Copy trailing bytes */ - movl %edx, %ecx -.L_read_trailing_bytes: - movb (%rsi), %al - MCSAFE_TEST_SRC %rsi 1 .E_trailing_bytes - MCSAFE_TEST_DST %rdi 1 .E_trailing_bytes -.L_write_trailing_bytes: - movb %al, (%rdi) - incq %rsi - incq %rdi - decl %ecx - jnz .L_read_trailing_bytes - - /* Copy successful. Return zero */ -.L_done_memcpy_trap: - xorl %eax, %eax -.L_done: - ret -SYM_FUNC_END(__memcpy_mcsafe) -EXPORT_SYMBOL_GPL(__memcpy_mcsafe) - - .section .fixup, "ax" - /* - * Return number of bytes not copied for any failure. Note that - * there is no "tail" handling since the source buffer is 8-byte - * aligned and poison is cacheline aligned. - */ -.E_read_words: - shll $3, %ecx -.E_leading_bytes: - addl %edx, %ecx -.E_trailing_bytes: - mov %ecx, %eax - jmp .L_done - - /* - * For write fault handling, given the destination is unaligned, - * we handle faults on multi-byte writes with a byte-by-byte - * copy up to the write-protected page. - */ -.E_write_words: - shll $3, %ecx - addl %edx, %ecx - movl %ecx, %edx - jmp mcsafe_handle_tail - - .previous - - _ASM_EXTABLE_FAULT(.L_read_leading_bytes, .E_leading_bytes) - _ASM_EXTABLE_FAULT(.L_read_words, .E_read_words) - _ASM_EXTABLE_FAULT(.L_read_trailing_bytes, .E_trailing_bytes) - _ASM_EXTABLE(.L_write_leading_bytes, .E_leading_bytes) - _ASM_EXTABLE(.L_write_words, .E_write_words) - _ASM_EXTABLE(.L_write_trailing_bytes, .E_trailing_bytes) -#endif diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S index 7c7c92db8497..0ea344c5ea43 100644 --- a/arch/x86/lib/putuser.S +++ b/arch/x86/lib/putuser.S @@ -25,76 +25,87 @@ * Inputs: %eax[:%edx] contains the data * %ecx contains the address * - * Outputs: %eax is error code (0 or -EFAULT) + * Outputs: %ecx is error code (0 or -EFAULT) + * + * Clobbers: %ebx needed for task pointer * * These functions should not modify any other registers, * as they get called from within inline assembly. */ -#define ENTER mov PER_CPU_VAR(current_task), %_ASM_BX +#ifdef CONFIG_X86_5LEVEL +#define LOAD_TASK_SIZE_MINUS_N(n) \ + ALTERNATIVE __stringify(mov $((1 << 47) - 4096 - (n)),%rbx), \ + __stringify(mov $((1 << 56) - 4096 - (n)),%rbx), X86_FEATURE_LA57 +#else +#define LOAD_TASK_SIZE_MINUS_N(n) \ + mov $(TASK_SIZE_MAX - (n)),%_ASM_BX +#endif .text SYM_FUNC_START(__put_user_1) - ENTER - cmp TASK_addr_limit(%_ASM_BX),%_ASM_CX + LOAD_TASK_SIZE_MINUS_N(0) + cmp %_ASM_BX,%_ASM_CX jae .Lbad_put_user +SYM_INNER_LABEL(__put_user_nocheck_1, SYM_L_GLOBAL) ASM_STAC 1: movb %al,(%_ASM_CX) - xor %eax,%eax + xor %ecx,%ecx ASM_CLAC ret SYM_FUNC_END(__put_user_1) EXPORT_SYMBOL(__put_user_1) +EXPORT_SYMBOL(__put_user_nocheck_1) SYM_FUNC_START(__put_user_2) - ENTER - mov TASK_addr_limit(%_ASM_BX),%_ASM_BX - sub $1,%_ASM_BX + LOAD_TASK_SIZE_MINUS_N(1) cmp %_ASM_BX,%_ASM_CX jae .Lbad_put_user +SYM_INNER_LABEL(__put_user_nocheck_2, SYM_L_GLOBAL) ASM_STAC 2: movw %ax,(%_ASM_CX) - xor %eax,%eax + xor %ecx,%ecx ASM_CLAC ret SYM_FUNC_END(__put_user_2) EXPORT_SYMBOL(__put_user_2) +EXPORT_SYMBOL(__put_user_nocheck_2) SYM_FUNC_START(__put_user_4) - ENTER - mov TASK_addr_limit(%_ASM_BX),%_ASM_BX - sub $3,%_ASM_BX + LOAD_TASK_SIZE_MINUS_N(3) cmp %_ASM_BX,%_ASM_CX jae .Lbad_put_user +SYM_INNER_LABEL(__put_user_nocheck_4, SYM_L_GLOBAL) ASM_STAC 3: movl %eax,(%_ASM_CX) - xor %eax,%eax + xor %ecx,%ecx ASM_CLAC ret SYM_FUNC_END(__put_user_4) EXPORT_SYMBOL(__put_user_4) +EXPORT_SYMBOL(__put_user_nocheck_4) SYM_FUNC_START(__put_user_8) - ENTER - mov TASK_addr_limit(%_ASM_BX),%_ASM_BX - sub $7,%_ASM_BX + LOAD_TASK_SIZE_MINUS_N(7) cmp %_ASM_BX,%_ASM_CX jae .Lbad_put_user +SYM_INNER_LABEL(__put_user_nocheck_8, SYM_L_GLOBAL) ASM_STAC 4: mov %_ASM_AX,(%_ASM_CX) #ifdef CONFIG_X86_32 5: movl %edx,4(%_ASM_CX) #endif - xor %eax,%eax + xor %ecx,%ecx ASM_CLAC RET SYM_FUNC_END(__put_user_8) EXPORT_SYMBOL(__put_user_8) +EXPORT_SYMBOL(__put_user_nocheck_8) SYM_CODE_START_LOCAL(.Lbad_put_user_clac) ASM_CLAC .Lbad_put_user: - movl $-EFAULT,%eax + movl $-EFAULT,%ecx RET SYM_CODE_END(.Lbad_put_user_clac) diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index b0dfac3d3df7..508c81e97ab1 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -56,27 +56,6 @@ unsigned long clear_user(void __user *to, unsigned long n) } EXPORT_SYMBOL(clear_user); -/* - * Similar to copy_user_handle_tail, probe for the write fault point, - * but reuse __memcpy_mcsafe in case a new read error is encountered. - * clac() is handled in _copy_to_iter_mcsafe(). - */ -__visible notrace unsigned long -mcsafe_handle_tail(char *to, char *from, unsigned len) -{ - for (; len; --len, to++, from++) { - /* - * Call the assembly routine back directly since - * memcpy_mcsafe() may silently fallback to memcpy. - */ - unsigned long rem = __memcpy_mcsafe(to, from, 1); - - if (rem) - break; - } - return len; -} - #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE /** * clean_cache_range - write back a cache range with CLWB @@ -120,7 +99,7 @@ long __copy_user_flushcache(void *dst, const void __user *src, unsigned size) */ if (size < 8) { if (!IS_ALIGNED(dest, 4) || size != 4) - clean_cache_range(dst, 1); + clean_cache_range(dst, size); } else { if (!IS_ALIGNED(dest, 8)) { dest = ALIGN(dest, boot_cpu_data.x86_clflush_size); diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c index 770b613790b3..f5e1e60c9095 100644 --- a/arch/x86/mm/cpu_entry_area.c +++ b/arch/x86/mm/cpu_entry_area.c @@ -21,7 +21,8 @@ DEFINE_PER_CPU(struct cea_exception_stacks*, cea_exception_stacks); DECLARE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack); #endif -struct cpu_entry_area *get_cpu_entry_area(int cpu) +/* Is called from entry code, so must be noinstr */ +noinstr struct cpu_entry_area *get_cpu_entry_area(int cpu) { unsigned long va = CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE; BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0); diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 1d6cb07f4f86..b93d6cd08a7f 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -5,6 +5,7 @@ #include <xen/xen.h> #include <asm/fpu/internal.h> +#include <asm/sev-es.h> #include <asm/traps.h> #include <asm/kdebug.h> @@ -80,6 +81,18 @@ __visible bool ex_handler_uaccess(const struct exception_table_entry *fixup, } EXPORT_SYMBOL(ex_handler_uaccess); +__visible bool ex_handler_copy(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr, + unsigned long error_code, + unsigned long fault_addr) +{ + WARN_ONCE(trapnr == X86_TRAP_GP, "General protection fault in user access. Non-canonical address?"); + regs->ip = ex_fixup_addr(fixup); + regs->ax = trapnr; + return true; +} +EXPORT_SYMBOL(ex_handler_copy); + __visible bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup, struct pt_regs *regs, int trapnr, unsigned long error_code, @@ -125,17 +138,21 @@ __visible bool ex_handler_clear_fs(const struct exception_table_entry *fixup, } EXPORT_SYMBOL(ex_handler_clear_fs); -__visible bool ex_has_fault_handler(unsigned long ip) +enum handler_type ex_get_fault_handler_type(unsigned long ip) { const struct exception_table_entry *e; ex_handler_t handler; e = search_exception_tables(ip); if (!e) - return false; + return EX_HANDLER_NONE; handler = ex_fixup_handler(e); - - return handler == ex_handler_fault; + if (handler == ex_handler_fault) + return EX_HANDLER_FAULT; + else if (handler == ex_handler_uaccess || handler == ex_handler_copy) + return EX_HANDLER_UACCESS; + else + return EX_HANDLER_OTHER; } int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code, diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 6e3e8a124903..82bf37a5c9ec 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1128,7 +1128,7 @@ access_error(unsigned long error_code, struct vm_area_struct *vma) return 0; } -static int fault_in_kernel_space(unsigned long address) +bool fault_in_kernel_space(unsigned long address) { /* * On 64-bit systems, the vsyscall page is at an address above @@ -1446,11 +1446,14 @@ DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault) prefetchw(¤t->mm->mmap_lock); /* - * KVM has two types of events that are, logically, interrupts, but - * are unfortunately delivered using the #PF vector. These events are - * "you just accessed valid memory, but the host doesn't have it right - * now, so I'll put you to sleep if you continue" and "that memory - * you tried to access earlier is available now." + * KVM uses #PF vector to deliver 'page not present' events to guests + * (asynchronous page fault mechanism). The event happens when a + * userspace task is trying to access some valid (from guest's point of + * view) memory which is not currently mapped by the host (e.g. the + * memory is swapped out). Note, the corresponding "page ready" event + * which is injected when the memory becomes available, is delived via + * an interrupt mechanism and not a #PF exception + * (see arch/x86/kernel/kvm.c: sysvec_kvm_asyncpf_interrupt()). * * We are relying on the interrupted context being sane (valid RSP, * relevant locks not held, etc.), which is fine as long as the diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index a4ac13cc3fdc..b5a3fa4033d3 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -217,11 +217,6 @@ static void sync_global_pgds(unsigned long start, unsigned long end) sync_global_pgds_l4(start, end); } -void arch_sync_kernel_mappings(unsigned long start, unsigned long end) -{ - sync_global_pgds(start, end); -} - /* * NOTE: This function is marked __ref because it calls __init function * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0. @@ -1257,14 +1252,19 @@ static void __init preallocate_vmalloc_pages(void) if (!p4d) goto failed; - /* - * With 5-level paging the P4D level is not folded. So the PGDs - * are now populated and there is no need to walk down to the - * PUD level. - */ if (pgtable_l5_enabled()) continue; + /* + * The goal here is to allocate all possibly required + * hardware page tables pointed to by the top hardware + * level. + * + * On 4-level systems, the P4D layer is folded away and + * the above code does no preallocation. Below, go down + * to the pud _software_ level to ensure the second + * hardware level is allocated on 4-level systems too. + */ lvl = "pud"; pud = pud_alloc(&init_mm, p4d, addr); if (!pud) diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 9f1177edc2e7..efbb3de472df 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -37,12 +37,13 @@ * reside in the .data section so as not to be zeroed out when the .bss * section is later cleared. */ -u64 sme_me_mask __section(.data) = 0; +u64 sme_me_mask __section(".data") = 0; +u64 sev_status __section(".data") = 0; EXPORT_SYMBOL(sme_me_mask); DEFINE_STATIC_KEY_FALSE(sev_enable_key); EXPORT_SYMBOL_GPL(sev_enable_key); -bool sev_enabled __section(.data); +bool sev_enabled __section(".data"); /* Buffer used for early in-place encryption by BSP, no locking needed */ static char sme_early_buffer[PAGE_SIZE] __initdata __aligned(PAGE_SIZE); @@ -347,7 +348,13 @@ bool sme_active(void) bool sev_active(void) { - return sme_me_mask && sev_enabled; + return sev_status & MSR_AMD64_SEV_ENABLED; +} + +/* Needs to be called from non-instrumentable code */ +bool noinstr sev_es_active(void) +{ + return sev_status & MSR_AMD64_SEV_ES_ENABLED; } /* Override for DMA direct allocation check - ARCH_HAS_FORCE_DMA_UNENCRYPTED */ @@ -400,6 +407,31 @@ void __init mem_encrypt_free_decrypted_mem(void) free_init_pages("unused decrypted", vaddr, vaddr_end); } +static void print_mem_encrypt_feature_info(void) +{ + pr_info("AMD Memory Encryption Features active:"); + + /* Secure Memory Encryption */ + if (sme_active()) { + /* + * SME is mutually exclusive with any of the SEV + * features below. + */ + pr_cont(" SME\n"); + return; + } + + /* Secure Encrypted Virtualization */ + if (sev_active()) + pr_cont(" SEV"); + + /* Encrypted Register State */ + if (sev_es_active()) + pr_cont(" SEV-ES"); + + pr_cont("\n"); +} + /* Architecture __weak replacement functions */ void __init mem_encrypt_init(void) { @@ -415,8 +447,6 @@ void __init mem_encrypt_init(void) if (sev_active()) static_branch_enable(&sev_enable_key); - pr_info("AMD %s active\n", - sev_active() ? "Secure Encrypted Virtualization (SEV)" - : "Secure Memory Encryption (SME)"); + print_mem_encrypt_feature_info(); } diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c index e2b0e2ac07bb..733b983f3a89 100644 --- a/arch/x86/mm/mem_encrypt_identity.c +++ b/arch/x86/mm/mem_encrypt_identity.c @@ -81,7 +81,7 @@ struct sme_populate_pgd_data { * section is 2MB aligned to allow for simple pagetable setup using only * PMD entries (see vmlinux.lds.S). */ -static char sme_workarea[2 * PMD_PAGE_SIZE] __section(.init.scratch); +static char sme_workarea[2 * PMD_PAGE_SIZE] __section(".init.scratch"); static char sme_cmdline_arg[] __initdata = "mem_encrypt"; static char sme_cmdline_on[] __initdata = "on"; @@ -540,6 +540,9 @@ void __init sme_enable(struct boot_params *bp) if (!(msr & MSR_AMD64_SEV_ENABLED)) return; + /* Save SEV_STATUS to avoid reading MSR again */ + sev_status = msr; + /* SEV state cannot be controlled by a command line option */ sme_me_mask = me_mask; sev_enabled = true; diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index aa76ec2d359b..44148691d78b 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -37,14 +37,12 @@ static __init int numa_setup(char *opt) return -EINVAL; if (!strncmp(opt, "off", 3)) numa_off = 1; -#ifdef CONFIG_NUMA_EMU if (!strncmp(opt, "fake=", 5)) - numa_emu_cmdline(opt + 5); -#endif -#ifdef CONFIG_ACPI_NUMA + return numa_emu_cmdline(opt + 5); if (!strncmp(opt, "noacpi", 6)) - acpi_numa = -1; -#endif + disable_srat(); + if (!strncmp(opt, "nohmat", 6)) + disable_hmat(); return 0; } early_param("numa", numa_setup); @@ -516,7 +514,7 @@ static void __init numa_clear_kernel_node_hotplug(void) * memory ranges, because quirks such as trim_snb_memory() * reserve specific pages for Sandy Bridge graphics. ] */ - for_each_memblock(reserved, mb_region) { + for_each_reserved_mem_region(mb_region) { int nid = memblock_get_region_node(mb_region); if (nid != MAX_NUMNODES) @@ -748,6 +746,27 @@ static void __init init_memory_less_node(int nid) } /* + * A node may exist which has one or more Generic Initiators but no CPUs and no + * memory. + * + * This function must be called after init_cpu_to_node(), to ensure that any + * memoryless CPU nodes have already been brought online, and before the + * node_data[nid] is needed for zone list setup in build_all_zonelists(). + * + * When this function is called, any nodes containing either memory and/or CPUs + * will already be online and there is no need to do anything extra, even if + * they also contain one or more Generic Initiators. + */ +void __init init_gi_nodes(void) +{ + int nid; + + for_each_node_state(nid, N_GENERIC_INITIATOR) + if (!node_online(nid)) + init_memory_less_node(nid); +} + +/* * Setup early cpu_to_node. * * Populate cpu_to_node[] only if x86_cpu_to_apicid[], @@ -919,7 +938,6 @@ int phys_to_target_node(phys_addr_t start) return meminfo_to_nid(&numa_reserved_meminfo, start); } -EXPORT_SYMBOL_GPL(phys_to_target_node); int memory_add_physaddr_to_nid(u64 start) { diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index 683cd12f4793..87d77cc52f86 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -13,9 +13,10 @@ static int emu_nid_to_phys[MAX_NUMNODES]; static char *emu_cmdline __initdata; -void __init numa_emu_cmdline(char *str) +int __init numa_emu_cmdline(char *str) { emu_cmdline = str; + return 0; } static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi) diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index d1b2a889f035..40baa90e74f4 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -1999,7 +1999,7 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) /* * Before changing the encryption attribute, we need to flush caches. */ - cpa_flush(&cpa, 1); + cpa_flush(&cpa, !this_cpu_has(X86_FEATURE_SME_COHERENT)); ret = __change_page_attr_set_clr(&cpa, 1); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 0951b47e64c1..11666ba19b62 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -14,7 +14,6 @@ #include <asm/nospec-branch.h> #include <asm/cache.h> #include <asm/apic.h> -#include <asm/uv/uv.h> #include "mm_internal.h" @@ -800,29 +799,6 @@ STATIC_NOPV void native_flush_tlb_others(const struct cpumask *cpumask, trace_tlb_flush(TLB_REMOTE_SEND_IPI, (info->end - info->start) >> PAGE_SHIFT); - if (is_uv_system()) { - /* - * This whole special case is confused. UV has a "Broadcast - * Assist Unit", which seems to be a fancy way to send IPIs. - * Back when x86 used an explicit TLB flush IPI, UV was - * optimized to use its own mechanism. These days, x86 uses - * smp_call_function_many(), but UV still uses a manual IPI, - * and that IPI's action is out of date -- it does a manual - * flush instead of calling flush_tlb_func_remote(). This - * means that the percpu tlb_gen variables won't be updated - * and we'll do pointless flushes on future context switches. - * - * Rather than hooking native_flush_tlb_others() here, I think - * that UV should be updated so that smp_call_function_many(), - * etc, are optimal on UV. - */ - cpumask = uv_flush_tlb_others(cpumask, info); - if (cpumask) - smp_call_function_many(cpumask, flush_tlb_func_remote, - (void *)info, 1); - return; - } - /* * If no page tables were freed, we can skip sending IPIs to * CPUs in lazy TLB mode. They will flush the CPU themselves diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 42b6709e6dc7..796506dcfc42 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -221,14 +221,48 @@ struct jit_context { /* Number of bytes emit_patch() needs to generate instructions */ #define X86_PATCH_SIZE 5 +/* Number of bytes that will be skipped on tailcall */ +#define X86_TAIL_CALL_OFFSET 11 -#define PROLOGUE_SIZE 25 +static void push_callee_regs(u8 **pprog, bool *callee_regs_used) +{ + u8 *prog = *pprog; + int cnt = 0; + + if (callee_regs_used[0]) + EMIT1(0x53); /* push rbx */ + if (callee_regs_used[1]) + EMIT2(0x41, 0x55); /* push r13 */ + if (callee_regs_used[2]) + EMIT2(0x41, 0x56); /* push r14 */ + if (callee_regs_used[3]) + EMIT2(0x41, 0x57); /* push r15 */ + *pprog = prog; +} + +static void pop_callee_regs(u8 **pprog, bool *callee_regs_used) +{ + u8 *prog = *pprog; + int cnt = 0; + + if (callee_regs_used[3]) + EMIT2(0x41, 0x5F); /* pop r15 */ + if (callee_regs_used[2]) + EMIT2(0x41, 0x5E); /* pop r14 */ + if (callee_regs_used[1]) + EMIT2(0x41, 0x5D); /* pop r13 */ + if (callee_regs_used[0]) + EMIT1(0x5B); /* pop rbx */ + *pprog = prog; +} /* - * Emit x86-64 prologue code for BPF program and check its size. - * bpf_tail_call helper will skip it while jumping into another program + * Emit x86-64 prologue code for BPF program. + * bpf_tail_call helper will skip the first X86_TAIL_CALL_OFFSET bytes + * while jumping to another program */ -static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf) +static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf, + bool tail_call_reachable, bool is_subprog) { u8 *prog = *pprog; int cnt = X86_PATCH_SIZE; @@ -238,19 +272,19 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf) */ memcpy(prog, ideal_nops[NOP_ATOMIC5], cnt); prog += cnt; + if (!ebpf_from_cbpf) { + if (tail_call_reachable && !is_subprog) + EMIT2(0x31, 0xC0); /* xor eax, eax */ + else + EMIT2(0x66, 0x90); /* nop2 */ + } EMIT1(0x55); /* push rbp */ EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */ /* sub rsp, rounded_stack_depth */ - EMIT3_off32(0x48, 0x81, 0xEC, round_up(stack_depth, 8)); - EMIT1(0x53); /* push rbx */ - EMIT2(0x41, 0x55); /* push r13 */ - EMIT2(0x41, 0x56); /* push r14 */ - EMIT2(0x41, 0x57); /* push r15 */ - if (!ebpf_from_cbpf) { - /* zero init tail_call_cnt */ - EMIT2(0x6a, 0x00); - BUILD_BUG_ON(cnt != PROLOGUE_SIZE); - } + if (stack_depth) + EMIT3_off32(0x48, 0x81, 0xEC, round_up(stack_depth, 8)); + if (tail_call_reachable) + EMIT1(0x50); /* push rax */ *pprog = prog; } @@ -314,13 +348,14 @@ static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, mutex_lock(&text_mutex); if (memcmp(ip, old_insn, X86_PATCH_SIZE)) goto out; + ret = 1; if (memcmp(ip, new_insn, X86_PATCH_SIZE)) { if (text_live) text_poke_bp(ip, new_insn, X86_PATCH_SIZE, NULL); else memcpy(ip, new_insn, X86_PATCH_SIZE); + ret = 0; } - ret = 0; out: mutex_unlock(&text_mutex); return ret; @@ -337,6 +372,22 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, return __bpf_arch_text_poke(ip, t, old_addr, new_addr, true); } +static int get_pop_bytes(bool *callee_regs_used) +{ + int bytes = 0; + + if (callee_regs_used[3]) + bytes += 2; + if (callee_regs_used[2]) + bytes += 2; + if (callee_regs_used[1]) + bytes += 2; + if (callee_regs_used[0]) + bytes += 1; + + return bytes; +} + /* * Generate the following code: * @@ -351,12 +402,32 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, * goto *(prog->bpf_func + prologue_size); * out: */ -static void emit_bpf_tail_call_indirect(u8 **pprog) +static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used, + u32 stack_depth) { + int tcc_off = -4 - round_up(stack_depth, 8); u8 *prog = *pprog; - int label1, label2, label3; + int pop_bytes = 0; + int off1 = 42; + int off2 = 31; + int off3 = 9; int cnt = 0; + /* count the additional bytes used for popping callee regs from stack + * that need to be taken into account for each of the offsets that + * are used for bailing out of the tail call + */ + pop_bytes = get_pop_bytes(callee_regs_used); + off1 += pop_bytes; + off2 += pop_bytes; + off3 += pop_bytes; + + if (stack_depth) { + off1 += 7; + off2 += 7; + off3 += 7; + } + /* * rdi - pointer to ctx * rsi - pointer to bpf_array @@ -370,72 +441,112 @@ static void emit_bpf_tail_call_indirect(u8 **pprog) EMIT2(0x89, 0xD2); /* mov edx, edx */ EMIT3(0x39, 0x56, /* cmp dword ptr [rsi + 16], edx */ offsetof(struct bpf_array, map.max_entries)); -#define OFFSET1 (41 + RETPOLINE_RAX_BPF_JIT_SIZE) /* Number of bytes to jump */ +#define OFFSET1 (off1 + RETPOLINE_RCX_BPF_JIT_SIZE) /* Number of bytes to jump */ EMIT2(X86_JBE, OFFSET1); /* jbe out */ - label1 = cnt; /* * if (tail_call_cnt > MAX_TAIL_CALL_CNT) * goto out; */ - EMIT2_off32(0x8B, 0x85, -36 - MAX_BPF_STACK); /* mov eax, dword ptr [rbp - 548] */ + EMIT2_off32(0x8B, 0x85, tcc_off); /* mov eax, dword ptr [rbp - tcc_off] */ EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */ -#define OFFSET2 (30 + RETPOLINE_RAX_BPF_JIT_SIZE) +#define OFFSET2 (off2 + RETPOLINE_RCX_BPF_JIT_SIZE) EMIT2(X86_JA, OFFSET2); /* ja out */ - label2 = cnt; EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ - EMIT2_off32(0x89, 0x85, -36 - MAX_BPF_STACK); /* mov dword ptr [rbp -548], eax */ + EMIT2_off32(0x89, 0x85, tcc_off); /* mov dword ptr [rbp - tcc_off], eax */ /* prog = array->ptrs[index]; */ - EMIT4_off32(0x48, 0x8B, 0x84, 0xD6, /* mov rax, [rsi + rdx * 8 + offsetof(...)] */ + EMIT4_off32(0x48, 0x8B, 0x8C, 0xD6, /* mov rcx, [rsi + rdx * 8 + offsetof(...)] */ offsetof(struct bpf_array, ptrs)); /* * if (prog == NULL) * goto out; */ - EMIT3(0x48, 0x85, 0xC0); /* test rax,rax */ -#define OFFSET3 (8 + RETPOLINE_RAX_BPF_JIT_SIZE) + EMIT3(0x48, 0x85, 0xC9); /* test rcx,rcx */ +#define OFFSET3 (off3 + RETPOLINE_RCX_BPF_JIT_SIZE) EMIT2(X86_JE, OFFSET3); /* je out */ - label3 = cnt; - /* goto *(prog->bpf_func + prologue_size); */ - EMIT4(0x48, 0x8B, 0x40, /* mov rax, qword ptr [rax + 32] */ - offsetof(struct bpf_prog, bpf_func)); - EMIT4(0x48, 0x83, 0xC0, PROLOGUE_SIZE); /* add rax, prologue_size */ + *pprog = prog; + pop_callee_regs(pprog, callee_regs_used); + prog = *pprog; + + EMIT1(0x58); /* pop rax */ + if (stack_depth) + EMIT3_off32(0x48, 0x81, 0xC4, /* add rsp, sd */ + round_up(stack_depth, 8)); + /* goto *(prog->bpf_func + X86_TAIL_CALL_OFFSET); */ + EMIT4(0x48, 0x8B, 0x49, /* mov rcx, qword ptr [rcx + 32] */ + offsetof(struct bpf_prog, bpf_func)); + EMIT4(0x48, 0x83, 0xC1, /* add rcx, X86_TAIL_CALL_OFFSET */ + X86_TAIL_CALL_OFFSET); /* - * Wow we're ready to jump into next BPF program + * Now we're ready to jump into next BPF program * rdi == ctx (1st arg) - * rax == prog->bpf_func + prologue_size + * rcx == prog->bpf_func + X86_TAIL_CALL_OFFSET */ - RETPOLINE_RAX_BPF_JIT(); + RETPOLINE_RCX_BPF_JIT(); /* out: */ - BUILD_BUG_ON(cnt - label1 != OFFSET1); - BUILD_BUG_ON(cnt - label2 != OFFSET2); - BUILD_BUG_ON(cnt - label3 != OFFSET3); *pprog = prog; } static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke, - u8 **pprog, int addr, u8 *image) + u8 **pprog, int addr, u8 *image, + bool *callee_regs_used, u32 stack_depth) { + int tcc_off = -4 - round_up(stack_depth, 8); u8 *prog = *pprog; + int pop_bytes = 0; + int off1 = 20; + int poke_off; int cnt = 0; + /* count the additional bytes used for popping callee regs to stack + * that need to be taken into account for jump offset that is used for + * bailing out from of the tail call when limit is reached + */ + pop_bytes = get_pop_bytes(callee_regs_used); + off1 += pop_bytes; + + /* + * total bytes for: + * - nop5/ jmpq $off + * - pop callee regs + * - sub rsp, $val if depth > 0 + * - pop rax + */ + poke_off = X86_PATCH_SIZE + pop_bytes + 1; + if (stack_depth) { + poke_off += 7; + off1 += 7; + } + /* * if (tail_call_cnt > MAX_TAIL_CALL_CNT) * goto out; */ - EMIT2_off32(0x8B, 0x85, -36 - MAX_BPF_STACK); /* mov eax, dword ptr [rbp - 548] */ + EMIT2_off32(0x8B, 0x85, tcc_off); /* mov eax, dword ptr [rbp - tcc_off] */ EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */ - EMIT2(X86_JA, 14); /* ja out */ + EMIT2(X86_JA, off1); /* ja out */ EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ - EMIT2_off32(0x89, 0x85, -36 - MAX_BPF_STACK); /* mov dword ptr [rbp -548], eax */ + EMIT2_off32(0x89, 0x85, tcc_off); /* mov dword ptr [rbp - tcc_off], eax */ - poke->ip = image + (addr - X86_PATCH_SIZE); - poke->adj_off = PROLOGUE_SIZE; + poke->tailcall_bypass = image + (addr - poke_off - X86_PATCH_SIZE); + poke->adj_off = X86_TAIL_CALL_OFFSET; + poke->tailcall_target = image + (addr - X86_PATCH_SIZE); + poke->bypass_addr = (u8 *)poke->tailcall_target + X86_PATCH_SIZE; + + emit_jump(&prog, (u8 *)poke->tailcall_target + X86_PATCH_SIZE, + poke->tailcall_bypass); + + *pprog = prog; + pop_callee_regs(pprog, callee_regs_used); + prog = *pprog; + EMIT1(0x58); /* pop rax */ + if (stack_depth) + EMIT3_off32(0x48, 0x81, 0xC4, round_up(stack_depth, 8)); memcpy(prog, ideal_nops[NOP_ATOMIC5], X86_PATCH_SIZE); prog += X86_PATCH_SIZE; @@ -453,7 +564,7 @@ static void bpf_tail_call_direct_fixup(struct bpf_prog *prog) for (i = 0; i < prog->aux->size_poke_tab; i++) { poke = &prog->aux->poke_tab[i]; - WARN_ON_ONCE(READ_ONCE(poke->ip_stable)); + WARN_ON_ONCE(READ_ONCE(poke->tailcall_target_stable)); if (poke->reason != BPF_POKE_REASON_TAIL_CALL) continue; @@ -464,18 +575,25 @@ static void bpf_tail_call_direct_fixup(struct bpf_prog *prog) if (target) { /* Plain memcpy is used when image is not live yet * and still not locked as read-only. Once poke - * location is active (poke->ip_stable), any parallel - * bpf_arch_text_poke() might occur still on the - * read-write image until we finally locked it as - * read-only. Both modifications on the given image - * are under text_mutex to avoid interference. + * location is active (poke->tailcall_target_stable), + * any parallel bpf_arch_text_poke() might occur + * still on the read-write image until we finally + * locked it as read-only. Both modifications on + * the given image are under text_mutex to avoid + * interference. */ - ret = __bpf_arch_text_poke(poke->ip, BPF_MOD_JUMP, NULL, + ret = __bpf_arch_text_poke(poke->tailcall_target, + BPF_MOD_JUMP, NULL, (u8 *)target->bpf_func + poke->adj_off, false); BUG_ON(ret < 0); + ret = __bpf_arch_text_poke(poke->tailcall_bypass, + BPF_MOD_JUMP, + (u8 *)poke->tailcall_target + + X86_PATCH_SIZE, NULL, false); + BUG_ON(ret < 0); } - WRITE_ONCE(poke->ip_stable, true); + WRITE_ONCE(poke->tailcall_target_stable, true); mutex_unlock(&array->aux->poke_mutex); } } @@ -652,19 +770,49 @@ static bool ex_handler_bpf(const struct exception_table_entry *x, return true; } +static void detect_reg_usage(struct bpf_insn *insn, int insn_cnt, + bool *regs_used, bool *tail_call_seen) +{ + int i; + + for (i = 1; i <= insn_cnt; i++, insn++) { + if (insn->code == (BPF_JMP | BPF_TAIL_CALL)) + *tail_call_seen = true; + if (insn->dst_reg == BPF_REG_6 || insn->src_reg == BPF_REG_6) + regs_used[0] = true; + if (insn->dst_reg == BPF_REG_7 || insn->src_reg == BPF_REG_7) + regs_used[1] = true; + if (insn->dst_reg == BPF_REG_8 || insn->src_reg == BPF_REG_8) + regs_used[2] = true; + if (insn->dst_reg == BPF_REG_9 || insn->src_reg == BPF_REG_9) + regs_used[3] = true; + } +} + static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, int oldproglen, struct jit_context *ctx) { + bool tail_call_reachable = bpf_prog->aux->tail_call_reachable; struct bpf_insn *insn = bpf_prog->insnsi; + bool callee_regs_used[4] = {}; int insn_cnt = bpf_prog->len; + bool tail_call_seen = false; bool seen_exit = false; u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY]; int i, cnt = 0, excnt = 0; int proglen = 0; u8 *prog = temp; + detect_reg_usage(insn, insn_cnt, callee_regs_used, + &tail_call_seen); + + /* tail call's presence in current prog implies it is reachable */ + tail_call_reachable |= tail_call_seen; + emit_prologue(&prog, bpf_prog->aux->stack_depth, - bpf_prog_was_classic(bpf_prog)); + bpf_prog_was_classic(bpf_prog), tail_call_reachable, + bpf_prog->aux->func_idx != 0); + push_callee_regs(&prog, callee_regs_used); addrs[0] = prog - temp; for (i = 1; i <= insn_cnt; i++, insn++) { @@ -1102,16 +1250,27 @@ xadd: if (is_imm8(insn->off)) /* call */ case BPF_JMP | BPF_CALL: func = (u8 *) __bpf_call_base + imm32; - if (!imm32 || emit_call(&prog, func, image + addrs[i - 1])) - return -EINVAL; + if (tail_call_reachable) { + EMIT3_off32(0x48, 0x8B, 0x85, + -(bpf_prog->aux->stack_depth + 8)); + if (!imm32 || emit_call(&prog, func, image + addrs[i - 1] + 7)) + return -EINVAL; + } else { + if (!imm32 || emit_call(&prog, func, image + addrs[i - 1])) + return -EINVAL; + } break; case BPF_JMP | BPF_TAIL_CALL: if (imm32) emit_bpf_tail_call_direct(&bpf_prog->aux->poke_tab[imm32 - 1], - &prog, addrs[i], image); + &prog, addrs[i], image, + callee_regs_used, + bpf_prog->aux->stack_depth); else - emit_bpf_tail_call_indirect(&prog); + emit_bpf_tail_call_indirect(&prog, + callee_regs_used, + bpf_prog->aux->stack_depth); break; /* cond jump */ @@ -1294,12 +1453,7 @@ emit_jmp: seen_exit = true; /* Update cleanup_addr */ ctx->cleanup_addr = proglen; - if (!bpf_prog_was_classic(bpf_prog)) - EMIT1(0x5B); /* get rid of tail_call_cnt */ - EMIT2(0x41, 0x5F); /* pop r15 */ - EMIT2(0x41, 0x5E); /* pop r14 */ - EMIT2(0x41, 0x5D); /* pop r13 */ - EMIT1(0x5B); /* pop rbx */ + pop_callee_regs(&prog, callee_regs_used); EMIT1(0xC9); /* leave */ EMIT1(0xC3); /* ret */ break; @@ -1379,10 +1533,15 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, u8 *prog = *pprog; int cnt = 0; - if (emit_call(&prog, __bpf_prog_enter, prog)) - return -EINVAL; - /* remember prog start time returned by __bpf_prog_enter */ - emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0); + if (p->aux->sleepable) { + if (emit_call(&prog, __bpf_prog_enter_sleepable, prog)) + return -EINVAL; + } else { + if (emit_call(&prog, __bpf_prog_enter, prog)) + return -EINVAL; + /* remember prog start time returned by __bpf_prog_enter */ + emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0); + } /* arg1: lea rdi, [rbp - stack_size] */ EMIT4(0x48, 0x8D, 0x7D, -stack_size); @@ -1402,13 +1561,18 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, if (mod_ret) emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); - /* arg1: mov rdi, progs[i] */ - emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, - (u32) (long) p); - /* arg2: mov rsi, rbx <- start time in nsec */ - emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6); - if (emit_call(&prog, __bpf_prog_exit, prog)) - return -EINVAL; + if (p->aux->sleepable) { + if (emit_call(&prog, __bpf_prog_exit_sleepable, prog)) + return -EINVAL; + } else { + /* arg1: mov rdi, progs[i] */ + emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, + (u32) (long) p); + /* arg2: mov rsi, rbx <- start time in nsec */ + emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6); + if (emit_call(&prog, __bpf_prog_exit, prog)) + return -EINVAL; + } *pprog = prog; return 0; diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index df1d95913d4e..3507f456fcd0 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -19,6 +19,7 @@ #include <asm/smp.h> #include <asm/pci_x86.h> #include <asm/setup.h> +#include <asm/irqdomain.h> unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 | PCI_PROBE_MMCONF; @@ -633,8 +634,9 @@ static void set_dev_domain_options(struct pci_dev *pdev) int pcibios_add_device(struct pci_dev *dev) { - struct setup_data *data; struct pci_setup_rom *rom; + struct irq_domain *msidom; + struct setup_data *data; u64 pa_data; pa_data = boot_params.hdr.setup_data; @@ -661,6 +663,20 @@ int pcibios_add_device(struct pci_dev *dev) memunmap(data); } set_dev_domain_options(dev); + + /* + * Setup the initial MSI domain of the device. If the underlying + * bus has a PCI/MSI irqdomain associated use the bus domain, + * otherwise set the default domain. This ensures that special irq + * domains e.g. VMD are preserved. The default ensures initial + * operation if irq remapping is not active. If irq remapping is + * active it will overwrite the domain pointer when the device is + * associated to a remapping domain. + */ + msidom = dev_get_msi_domain(&dev->bus->dev); + if (!msidom) + msidom = x86_pci_msi_default_domain; + dev_set_msi_domain(&dev->dev, msidom); return 0; } diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index b8c9a5b87f37..0a0e168be1cb 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -587,7 +587,7 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0xa26d, pci_invalid_bar); static void pci_fixup_amd_ehci_pme(struct pci_dev *dev) { dev_info(&dev->dev, "PME# does not work under D3, disabling it\n"); - dev->pme_support &= ~((PCI_PM_CAP_PME_D3 | PCI_PM_CAP_PME_D3cold) + dev->pme_support &= ~((PCI_PM_CAP_PME_D3hot | PCI_PM_CAP_PME_D3cold) >> PCI_PM_CAP_PME_SHIFT); } DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x7808, pci_fixup_amd_ehci_pme); diff --git a/arch/x86/pci/init.c b/arch/x86/pci/init.c index 5fc617edf108..00bfa1ebad6c 100644 --- a/arch/x86/pci/init.c +++ b/arch/x86/pci/init.c @@ -3,16 +3,17 @@ #include <linux/init.h> #include <asm/pci_x86.h> #include <asm/x86_init.h> +#include <asm/irqdomain.h> /* arch_initcall has too random ordering, so call the initializers in the right sequence from here. */ static __init int pci_arch_init(void) { -#ifdef CONFIG_PCI_DIRECT - int type = 0; + int type; + + x86_create_pci_msi_domain(); type = pci_direct_probe(); -#endif if (!(pci_probe & PCI_PROBE_NOEARLY)) pci_mmcfg_early_init(); @@ -20,18 +21,16 @@ static __init int pci_arch_init(void) if (x86_init.pci.arch_init && !x86_init.pci.arch_init()) return 0; -#ifdef CONFIG_PCI_BIOS pci_pcbios_init(); -#endif + /* * don't check for raw_pci_ops here because we want pcbios as last * fallback, yet it's needed to run first to set pcibios_last_bus * in case legacy PCI probing is used. otherwise detecting peer busses * fails. */ -#ifdef CONFIG_PCI_DIRECT pci_direct_init(type); -#endif + if (!raw_pci_ops && !raw_pci_ext_ops) printk(KERN_ERR "PCI: Fatal: No config space access function found\n"); diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c index 00c62115f39c..24ca4ee2802f 100644 --- a/arch/x86/pci/intel_mid_pci.c +++ b/arch/x86/pci/intel_mid_pci.c @@ -33,6 +33,7 @@ #include <asm/hw_irq.h> #include <asm/io_apic.h> #include <asm/intel-mid.h> +#include <asm/acpi.h> #define PCIE_CAP_OFFSET 0x100 @@ -322,7 +323,7 @@ static void pci_d3delay_fixup(struct pci_dev *dev) */ if (type1_access_ok(dev->bus->number, dev->devfn, PCI_DEVICE_ID)) return; - dev->d3_delay = 0; + dev->d3hot_delay = 0; } DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, pci_d3delay_fixup); diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c index c313d784efab..5701d5ba3df4 100644 --- a/arch/x86/pci/sta2x11-fixup.c +++ b/arch/x86/pci/sta2x11-fixup.c @@ -15,7 +15,6 @@ #include <asm/iommu.h> #define STA2X11_SWIOTLB_SIZE (4*1024*1024) -extern int swiotlb_late_init_with_default_size(size_t default_size); /* * We build a list of bus numbers that are under the ConneXt. The @@ -133,7 +132,7 @@ static void sta2x11_map_ep(struct pci_dev *pdev) struct sta2x11_instance *instance = sta2x11_pdev_to_instance(pdev); struct device *dev = &pdev->dev; u32 amba_base, max_amba_addr; - int i; + int i, ret; if (!instance) return; @@ -141,7 +140,9 @@ static void sta2x11_map_ep(struct pci_dev *pdev) pci_read_config_dword(pdev, AHB_BASE(0), &amba_base); max_amba_addr = amba_base + STA2X11_AMBA_SIZE - 1; - dev->dma_pfn_offset = PFN_DOWN(-amba_base); + ret = dma_direct_set_offset(dev, 0, amba_base, STA2X11_AMBA_SIZE); + if (ret) + dev_err(dev, "sta2x11: could not set DMA offset\n"); dev->bus_dma_limit = max_amba_addr; pci_set_consistent_dma_mask(pdev, max_amba_addr); diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 89395a5049bb..c552cd2d0632 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -157,6 +157,13 @@ static int acpi_register_gsi_xen(struct device *dev, u32 gsi, struct xen_pci_frontend_ops *xen_pci_frontend; EXPORT_SYMBOL_GPL(xen_pci_frontend); +struct xen_msi_ops { + int (*setup_msi_irqs)(struct pci_dev *dev, int nvec, int type); + void (*teardown_msi_irqs)(struct pci_dev *dev); +}; + +static struct xen_msi_ops xen_msi_ops __ro_after_init; + static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) { int irq, ret, i; @@ -372,28 +379,122 @@ static void xen_initdom_restore_msi_irqs(struct pci_dev *dev) WARN(ret && ret != -ENOSYS, "restore_msi -> %d\n", ret); } } -#endif +#else /* CONFIG_XEN_DOM0 */ +#define xen_initdom_setup_msi_irqs NULL +#define xen_initdom_restore_msi_irqs NULL +#endif /* !CONFIG_XEN_DOM0 */ static void xen_teardown_msi_irqs(struct pci_dev *dev) { struct msi_desc *msidesc; + int i; + + for_each_pci_msi_entry(msidesc, dev) { + if (msidesc->irq) { + for (i = 0; i < msidesc->nvec_used; i++) + xen_destroy_irq(msidesc->irq + i); + } + } +} + +static void xen_pv_teardown_msi_irqs(struct pci_dev *dev) +{ + struct msi_desc *msidesc = first_pci_msi_entry(dev); - msidesc = first_pci_msi_entry(dev); if (msidesc->msi_attrib.is_msix) xen_pci_frontend_disable_msix(dev); else xen_pci_frontend_disable_msi(dev); - /* Free the IRQ's and the msidesc using the generic code. */ - default_teardown_msi_irqs(dev); + xen_teardown_msi_irqs(dev); } -static void xen_teardown_msi_irq(unsigned int irq) +static int xen_msi_domain_alloc_irqs(struct irq_domain *domain, + struct device *dev, int nvec) { - xen_destroy_irq(irq); + int type; + + if (WARN_ON_ONCE(!dev_is_pci(dev))) + return -EINVAL; + + if (first_msi_entry(dev)->msi_attrib.is_msix) + type = PCI_CAP_ID_MSIX; + else + type = PCI_CAP_ID_MSI; + + return xen_msi_ops.setup_msi_irqs(to_pci_dev(dev), nvec, type); } -#endif +static void xen_msi_domain_free_irqs(struct irq_domain *domain, + struct device *dev) +{ + if (WARN_ON_ONCE(!dev_is_pci(dev))) + return; + + xen_msi_ops.teardown_msi_irqs(to_pci_dev(dev)); +} + +static struct msi_domain_ops xen_pci_msi_domain_ops = { + .domain_alloc_irqs = xen_msi_domain_alloc_irqs, + .domain_free_irqs = xen_msi_domain_free_irqs, +}; + +static struct msi_domain_info xen_pci_msi_domain_info = { + .ops = &xen_pci_msi_domain_ops, +}; + +/* + * This irq domain is a blatant violation of the irq domain design, but + * distangling XEN into real irq domains is not a job for mere mortals with + * limited XENology. But it's the least dangerous way for a mere mortal to + * get rid of the arch_*_msi_irqs() hackery in order to store the irq + * domain pointer in struct device. This irq domain wrappery allows to do + * that without breaking XEN terminally. + */ +static __init struct irq_domain *xen_create_pci_msi_domain(void) +{ + struct irq_domain *d = NULL; + struct fwnode_handle *fn; + + fn = irq_domain_alloc_named_fwnode("XEN-MSI"); + if (fn) + d = msi_create_irq_domain(fn, &xen_pci_msi_domain_info, NULL); + + /* FIXME: No idea how to survive if this fails */ + BUG_ON(!d); + + return d; +} + +static __init void xen_setup_pci_msi(void) +{ + if (xen_pv_domain()) { + if (xen_initial_domain()) { + xen_msi_ops.setup_msi_irqs = xen_initdom_setup_msi_irqs; + x86_msi.restore_msi_irqs = xen_initdom_restore_msi_irqs; + } else { + xen_msi_ops.setup_msi_irqs = xen_setup_msi_irqs; + } + xen_msi_ops.teardown_msi_irqs = xen_pv_teardown_msi_irqs; + pci_msi_ignore_mask = 1; + } else if (xen_hvm_domain()) { + xen_msi_ops.setup_msi_irqs = xen_hvm_setup_msi_irqs; + xen_msi_ops.teardown_msi_irqs = xen_teardown_msi_irqs; + } else { + WARN_ON_ONCE(1); + return; + } + + /* + * Override the PCI/MSI irq domain init function. No point + * in allocating the native domain and never use it. + */ + x86_init.irqs.create_pci_msi_domain = xen_create_pci_msi_domain; +} + +#else /* CONFIG_PCI_MSI */ +static inline void xen_setup_pci_msi(void) { } +#endif /* CONFIG_PCI_MSI */ int __init pci_xen_init(void) { @@ -410,17 +511,12 @@ int __init pci_xen_init(void) /* Keep ACPI out of the picture */ acpi_noirq_set(); -#ifdef CONFIG_PCI_MSI - x86_msi.setup_msi_irqs = xen_setup_msi_irqs; - x86_msi.teardown_msi_irq = xen_teardown_msi_irq; - x86_msi.teardown_msi_irqs = xen_teardown_msi_irqs; - pci_msi_ignore_mask = 1; -#endif + xen_setup_pci_msi(); return 0; } #ifdef CONFIG_PCI_MSI -void __init xen_msi_init(void) +static void __init xen_hvm_msi_init(void) { if (!disable_apic) { /* @@ -435,9 +531,7 @@ void __init xen_msi_init(void) ((eax & XEN_HVM_CPUID_APIC_ACCESS_VIRT) && boot_cpu_has(X86_FEATURE_APIC))) return; } - - x86_msi.setup_msi_irqs = xen_hvm_setup_msi_irqs; - x86_msi.teardown_msi_irq = xen_teardown_msi_irq; + xen_setup_pci_msi(); } #endif @@ -460,7 +554,7 @@ int __init pci_xen_hvm_init(void) * We need to wait until after x2apic is initialized * before we can set MSI IRQ ops. */ - x86_platform.apic_post_init = xen_msi_init; + x86_platform.apic_post_init = xen_hvm_msi_init; #endif return 0; } @@ -470,12 +564,7 @@ int __init pci_xen_initial_domain(void) { int irq; -#ifdef CONFIG_PCI_MSI - x86_msi.setup_msi_irqs = xen_initdom_setup_msi_irqs; - x86_msi.teardown_msi_irq = xen_teardown_msi_irq; - x86_msi.restore_msi_irqs = xen_initdom_restore_msi_irqs; - pci_msi_ignore_mask = 1; -#endif + xen_setup_pci_msi(); __acpi_register_gsi = acpi_register_gsi_xen; __acpi_unregister_gsi = NULL; /* diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index d37ebe6e70d7..8a26e705cb06 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -90,6 +90,9 @@ static const unsigned long * const efi_tables[] = { &efi.tpm_log, &efi.tpm_final_log, &efi_rng_seed, +#ifdef CONFIG_LOAD_UEFI_KEYS + &efi.mokvar_table, +#endif }; u64 efi_setup; /* efi setup_data physical address */ diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 6af4da1149ba..8f5759df7776 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -47,6 +47,7 @@ #include <asm/realmode.h> #include <asm/time.h> #include <asm/pgalloc.h> +#include <asm/sev-es.h> /* * We allocate runtime services regions top-down, starting from -4G, i.e. @@ -230,6 +231,15 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) } /* + * When SEV-ES is active, the GHCB as set by the kernel will be used + * by firmware. Create a 1:1 unencrypted mapping for each GHCB. + */ + if (sev_es_efi_map_ghcbs(pgd)) { + pr_err("Failed to create 1:1 mapping for the GHCBs!\n"); + return 1; + } + + /* * When making calls to the firmware everything needs to be 1:1 * mapped and addressable with 32-bit pointers. Map the kernel * text and allocate a new stack because we can't rely on the diff --git a/arch/x86/platform/pvh/enlighten.c b/arch/x86/platform/pvh/enlighten.c index c0a502f7e3a7..9ac7457f52a3 100644 --- a/arch/x86/platform/pvh/enlighten.c +++ b/arch/x86/platform/pvh/enlighten.c @@ -19,8 +19,8 @@ * pvh_bootparams and pvh_start_info need to live in the data segment since * they are used after startup_{32|64}, which clear .bss, are invoked. */ -struct boot_params pvh_bootparams __attribute__((section(".data"))); -struct hvm_start_info pvh_start_info __attribute__((section(".data"))); +struct boot_params pvh_bootparams __section(".data"); +struct hvm_start_info pvh_start_info __section(".data"); unsigned int pvh_start_info_sz = sizeof(pvh_start_info); diff --git a/arch/x86/platform/uv/Makefile b/arch/x86/platform/uv/Makefile index a3693c829e2e..224ff0504890 100644 --- a/arch/x86/platform/uv/Makefile +++ b/arch/x86/platform/uv/Makefile @@ -1,2 +1,2 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o uv_nmi.o +obj-$(CONFIG_X86_UV) += bios_uv.o uv_irq.o uv_sysfs.o uv_time.o uv_nmi.o diff --git a/arch/x86/platform/uv/bios_uv.c b/arch/x86/platform/uv/bios_uv.c index a2f447dffea6..54511eaccf4d 100644 --- a/arch/x86/platform/uv/bios_uv.c +++ b/arch/x86/platform/uv/bios_uv.c @@ -2,8 +2,9 @@ /* * BIOS run time interface routines. * - * Copyright (c) 2008-2009 Silicon Graphics, Inc. All Rights Reserved. - * Copyright (c) Russ Anderson <rja@sgi.com> + * (C) Copyright 2020 Hewlett Packard Enterprise Development LP + * Copyright (C) 2007-2017 Silicon Graphics, Inc. All rights reserved. + * Copyright (c) Russ Anderson <rja@sgi.com> */ #include <linux/efi.h> @@ -170,16 +171,27 @@ int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus) (u64)decode, (u64)domain, (u64)bus, 0, 0); } -int uv_bios_init(void) +unsigned long get_uv_systab_phys(bool msg) { - uv_systab = NULL; if ((uv_systab_phys == EFI_INVALID_TABLE_ADDR) || !uv_systab_phys || efi_runtime_disabled()) { - pr_crit("UV: UVsystab: missing\n"); - return -EEXIST; + if (msg) + pr_crit("UV: UVsystab: missing\n"); + return 0; } + return uv_systab_phys; +} + +int uv_bios_init(void) +{ + unsigned long uv_systab_phys_addr; + + uv_systab = NULL; + uv_systab_phys_addr = get_uv_systab_phys(1); + if (!uv_systab_phys_addr) + return -EEXIST; - uv_systab = ioremap(uv_systab_phys, sizeof(struct uv_systab)); + uv_systab = ioremap(uv_systab_phys_addr, sizeof(struct uv_systab)); if (!uv_systab || strncmp(uv_systab->signature, UV_SYSTAB_SIG, 4)) { pr_err("UV: UVsystab: bad signature!\n"); iounmap(uv_systab); @@ -191,7 +203,7 @@ int uv_bios_init(void) int size = uv_systab->size; iounmap(uv_systab); - uv_systab = ioremap(uv_systab_phys, size); + uv_systab = ioremap(uv_systab_phys_addr, size); if (!uv_systab) { pr_err("UV: UVsystab: ioremap(%d) failed!\n", size); return -EFAULT; diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c deleted file mode 100644 index 62ea907668f8..000000000000 --- a/arch/x86/platform/uv/tlb_uv.c +++ /dev/null @@ -1,2097 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * SGI UltraViolet TLB flush routines. - * - * (c) 2008-2014 Cliff Wickman <cpw@sgi.com>, SGI. - */ -#include <linux/seq_file.h> -#include <linux/proc_fs.h> -#include <linux/debugfs.h> -#include <linux/kernel.h> -#include <linux/slab.h> -#include <linux/delay.h> - -#include <asm/mmu_context.h> -#include <asm/uv/uv.h> -#include <asm/uv/uv_mmrs.h> -#include <asm/uv/uv_hub.h> -#include <asm/uv/uv_bau.h> -#include <asm/apic.h> -#include <asm/tsc.h> -#include <asm/irq_vectors.h> -#include <asm/timer.h> - -static struct bau_operations ops __ro_after_init; - -static int timeout_us; -static bool nobau = true; -static int nobau_perm; - -/* tunables: */ -static int max_concurr = MAX_BAU_CONCURRENT; -static int max_concurr_const = MAX_BAU_CONCURRENT; -static int plugged_delay = PLUGGED_DELAY; -static int plugsb4reset = PLUGSB4RESET; -static int giveup_limit = GIVEUP_LIMIT; -static int timeoutsb4reset = TIMEOUTSB4RESET; -static int ipi_reset_limit = IPI_RESET_LIMIT; -static int complete_threshold = COMPLETE_THRESHOLD; -static int congested_respns_us = CONGESTED_RESPONSE_US; -static int congested_reps = CONGESTED_REPS; -static int disabled_period = DISABLED_PERIOD; - -static struct tunables tunables[] = { - {&max_concurr, MAX_BAU_CONCURRENT}, /* must be [0] */ - {&plugged_delay, PLUGGED_DELAY}, - {&plugsb4reset, PLUGSB4RESET}, - {&timeoutsb4reset, TIMEOUTSB4RESET}, - {&ipi_reset_limit, IPI_RESET_LIMIT}, - {&complete_threshold, COMPLETE_THRESHOLD}, - {&congested_respns_us, CONGESTED_RESPONSE_US}, - {&congested_reps, CONGESTED_REPS}, - {&disabled_period, DISABLED_PERIOD}, - {&giveup_limit, GIVEUP_LIMIT} -}; - -static struct dentry *tunables_dir; - -/* these correspond to the statistics printed by ptc_seq_show() */ -static char *stat_description[] = { - "sent: number of shootdown messages sent", - "stime: time spent sending messages", - "numuvhubs: number of hubs targeted with shootdown", - "numuvhubs16: number times 16 or more hubs targeted", - "numuvhubs8: number times 8 or more hubs targeted", - "numuvhubs4: number times 4 or more hubs targeted", - "numuvhubs2: number times 2 or more hubs targeted", - "numuvhubs1: number times 1 hub targeted", - "numcpus: number of cpus targeted with shootdown", - "dto: number of destination timeouts", - "retries: destination timeout retries sent", - "rok: : destination timeouts successfully retried", - "resetp: ipi-style resource resets for plugs", - "resett: ipi-style resource resets for timeouts", - "giveup: fall-backs to ipi-style shootdowns", - "sto: number of source timeouts", - "bz: number of stay-busy's", - "throt: number times spun in throttle", - "swack: image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE", - "recv: shootdown messages received", - "rtime: time spent processing messages", - "all: shootdown all-tlb messages", - "one: shootdown one-tlb messages", - "mult: interrupts that found multiple messages", - "none: interrupts that found no messages", - "retry: number of retry messages processed", - "canc: number messages canceled by retries", - "nocan: number retries that found nothing to cancel", - "reset: number of ipi-style reset requests processed", - "rcan: number messages canceled by reset requests", - "disable: number times use of the BAU was disabled", - "enable: number times use of the BAU was re-enabled" -}; - -static int __init setup_bau(char *arg) -{ - int result; - - if (!arg) - return -EINVAL; - - result = strtobool(arg, &nobau); - if (result) - return result; - - /* we need to flip the logic here, so that bau=y sets nobau to false */ - nobau = !nobau; - - if (!nobau) - pr_info("UV BAU Enabled\n"); - else - pr_info("UV BAU Disabled\n"); - - return 0; -} -early_param("bau", setup_bau); - -/* base pnode in this partition */ -static int uv_base_pnode __read_mostly; - -static DEFINE_PER_CPU(struct ptc_stats, ptcstats); -static DEFINE_PER_CPU(struct bau_control, bau_control); -static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask); - -static void -set_bau_on(void) -{ - int cpu; - struct bau_control *bcp; - - if (nobau_perm) { - pr_info("BAU not initialized; cannot be turned on\n"); - return; - } - nobau = false; - for_each_present_cpu(cpu) { - bcp = &per_cpu(bau_control, cpu); - bcp->nobau = false; - } - pr_info("BAU turned on\n"); - return; -} - -static void -set_bau_off(void) -{ - int cpu; - struct bau_control *bcp; - - nobau = true; - for_each_present_cpu(cpu) { - bcp = &per_cpu(bau_control, cpu); - bcp->nobau = true; - } - pr_info("BAU turned off\n"); - return; -} - -/* - * Determine the first node on a uvhub. 'Nodes' are used for kernel - * memory allocation. - */ -static int __init uvhub_to_first_node(int uvhub) -{ - int node, b; - - for_each_online_node(node) { - b = uv_node_to_blade_id(node); - if (uvhub == b) - return node; - } - return -1; -} - -/* - * Determine the apicid of the first cpu on a uvhub. - */ -static int __init uvhub_to_first_apicid(int uvhub) -{ - int cpu; - - for_each_present_cpu(cpu) - if (uvhub == uv_cpu_to_blade_id(cpu)) - return per_cpu(x86_cpu_to_apicid, cpu); - return -1; -} - -/* - * Free a software acknowledge hardware resource by clearing its Pending - * bit. This will return a reply to the sender. - * If the message has timed out, a reply has already been sent by the - * hardware but the resource has not been released. In that case our - * clear of the Timeout bit (as well) will free the resource. No reply will - * be sent (the hardware will only do one reply per message). - */ -static void reply_to_message(struct msg_desc *mdp, struct bau_control *bcp, - int do_acknowledge) -{ - unsigned long dw; - struct bau_pq_entry *msg; - - msg = mdp->msg; - if (!msg->canceled && do_acknowledge) { - dw = (msg->swack_vec << UV_SW_ACK_NPENDING) | msg->swack_vec; - ops.write_l_sw_ack(dw); - } - msg->replied_to = 1; - msg->swack_vec = 0; -} - -/* - * Process the receipt of a RETRY message - */ -static void bau_process_retry_msg(struct msg_desc *mdp, - struct bau_control *bcp) -{ - int i; - int cancel_count = 0; - unsigned long msg_res; - unsigned long mmr = 0; - struct bau_pq_entry *msg = mdp->msg; - struct bau_pq_entry *msg2; - struct ptc_stats *stat = bcp->statp; - - stat->d_retries++; - /* - * cancel any message from msg+1 to the retry itself - */ - for (msg2 = msg+1, i = 0; i < DEST_Q_SIZE; msg2++, i++) { - if (msg2 > mdp->queue_last) - msg2 = mdp->queue_first; - if (msg2 == msg) - break; - - /* same conditions for cancellation as do_reset */ - if ((msg2->replied_to == 0) && (msg2->canceled == 0) && - (msg2->swack_vec) && ((msg2->swack_vec & - msg->swack_vec) == 0) && - (msg2->sending_cpu == msg->sending_cpu) && - (msg2->msg_type != MSG_NOOP)) { - mmr = ops.read_l_sw_ack(); - msg_res = msg2->swack_vec; - /* - * This is a message retry; clear the resources held - * by the previous message only if they timed out. - * If it has not timed out we have an unexpected - * situation to report. - */ - if (mmr & (msg_res << UV_SW_ACK_NPENDING)) { - unsigned long mr; - /* - * Is the resource timed out? - * Make everyone ignore the cancelled message. - */ - msg2->canceled = 1; - stat->d_canceled++; - cancel_count++; - mr = (msg_res << UV_SW_ACK_NPENDING) | msg_res; - ops.write_l_sw_ack(mr); - } - } - } - if (!cancel_count) - stat->d_nocanceled++; -} - -/* - * Do all the things a cpu should do for a TLB shootdown message. - * Other cpu's may come here at the same time for this message. - */ -static void bau_process_message(struct msg_desc *mdp, struct bau_control *bcp, - int do_acknowledge) -{ - short socket_ack_count = 0; - short *sp; - struct atomic_short *asp; - struct ptc_stats *stat = bcp->statp; - struct bau_pq_entry *msg = mdp->msg; - struct bau_control *smaster = bcp->socket_master; - - /* - * This must be a normal message, or retry of a normal message - */ - if (msg->address == TLB_FLUSH_ALL) { - flush_tlb_local(); - stat->d_alltlb++; - } else { - flush_tlb_one_user(msg->address); - stat->d_onetlb++; - } - stat->d_requestee++; - - /* - * One cpu on each uvhub has the additional job on a RETRY - * of releasing the resource held by the message that is - * being retried. That message is identified by sending - * cpu number. - */ - if (msg->msg_type == MSG_RETRY && bcp == bcp->uvhub_master) - bau_process_retry_msg(mdp, bcp); - - /* - * This is a swack message, so we have to reply to it. - * Count each responding cpu on the socket. This avoids - * pinging the count's cache line back and forth between - * the sockets. - */ - sp = &smaster->socket_acknowledge_count[mdp->msg_slot]; - asp = (struct atomic_short *)sp; - socket_ack_count = atom_asr(1, asp); - if (socket_ack_count == bcp->cpus_in_socket) { - int msg_ack_count; - /* - * Both sockets dump their completed count total into - * the message's count. - */ - *sp = 0; - asp = (struct atomic_short *)&msg->acknowledge_count; - msg_ack_count = atom_asr(socket_ack_count, asp); - - if (msg_ack_count == bcp->cpus_in_uvhub) { - /* - * All cpus in uvhub saw it; reply - * (unless we are in the UV2 workaround) - */ - reply_to_message(mdp, bcp, do_acknowledge); - } - } - - return; -} - -/* - * Determine the first cpu on a pnode. - */ -static int pnode_to_first_cpu(int pnode, struct bau_control *smaster) -{ - int cpu; - struct hub_and_pnode *hpp; - - for_each_present_cpu(cpu) { - hpp = &smaster->thp[cpu]; - if (pnode == hpp->pnode) - return cpu; - } - return -1; -} - -/* - * Last resort when we get a large number of destination timeouts is - * to clear resources held by a given cpu. - * Do this with IPI so that all messages in the BAU message queue - * can be identified by their nonzero swack_vec field. - * - * This is entered for a single cpu on the uvhub. - * The sender want's this uvhub to free a specific message's - * swack resources. - */ -static void do_reset(void *ptr) -{ - int i; - struct bau_control *bcp = &per_cpu(bau_control, smp_processor_id()); - struct reset_args *rap = (struct reset_args *)ptr; - struct bau_pq_entry *msg; - struct ptc_stats *stat = bcp->statp; - - stat->d_resets++; - /* - * We're looking for the given sender, and - * will free its swack resource. - * If all cpu's finally responded after the timeout, its - * message 'replied_to' was set. - */ - for (msg = bcp->queue_first, i = 0; i < DEST_Q_SIZE; msg++, i++) { - unsigned long msg_res; - /* do_reset: same conditions for cancellation as - bau_process_retry_msg() */ - if ((msg->replied_to == 0) && - (msg->canceled == 0) && - (msg->sending_cpu == rap->sender) && - (msg->swack_vec) && - (msg->msg_type != MSG_NOOP)) { - unsigned long mmr; - unsigned long mr; - /* - * make everyone else ignore this message - */ - msg->canceled = 1; - /* - * only reset the resource if it is still pending - */ - mmr = ops.read_l_sw_ack(); - msg_res = msg->swack_vec; - mr = (msg_res << UV_SW_ACK_NPENDING) | msg_res; - if (mmr & msg_res) { - stat->d_rcanceled++; - ops.write_l_sw_ack(mr); - } - } - } - return; -} - -/* - * Use IPI to get all target uvhubs to release resources held by - * a given sending cpu number. - */ -static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp) -{ - int pnode; - int apnode; - int maskbits; - int sender = bcp->cpu; - cpumask_t *mask = bcp->uvhub_master->cpumask; - struct bau_control *smaster = bcp->socket_master; - struct reset_args reset_args; - - reset_args.sender = sender; - cpumask_clear(mask); - /* find a single cpu for each uvhub in this distribution mask */ - maskbits = sizeof(struct pnmask) * BITSPERBYTE; - /* each bit is a pnode relative to the partition base pnode */ - for (pnode = 0; pnode < maskbits; pnode++) { - int cpu; - if (!bau_uvhub_isset(pnode, distribution)) - continue; - apnode = pnode + bcp->partition_base_pnode; - cpu = pnode_to_first_cpu(apnode, smaster); - cpumask_set_cpu(cpu, mask); - } - - /* IPI all cpus; preemption is already disabled */ - smp_call_function_many(mask, do_reset, (void *)&reset_args, 1); - return; -} - -/* - * Not to be confused with cycles_2_ns() from tsc.c; this gives a relative - * number, not an absolute. It converts a duration in cycles to a duration in - * ns. - */ -static inline unsigned long long cycles_2_ns(unsigned long long cyc) -{ - struct cyc2ns_data data; - unsigned long long ns; - - cyc2ns_read_begin(&data); - ns = mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift); - cyc2ns_read_end(); - - return ns; -} - -/* - * The reverse of the above; converts a duration in ns to a duration in cycles. - */ -static inline unsigned long long ns_2_cycles(unsigned long long ns) -{ - struct cyc2ns_data data; - unsigned long long cyc; - - cyc2ns_read_begin(&data); - cyc = (ns << data.cyc2ns_shift) / data.cyc2ns_mul; - cyc2ns_read_end(); - - return cyc; -} - -static inline unsigned long cycles_2_us(unsigned long long cyc) -{ - return cycles_2_ns(cyc) / NSEC_PER_USEC; -} - -static inline cycles_t sec_2_cycles(unsigned long sec) -{ - return ns_2_cycles(sec * NSEC_PER_SEC); -} - -static inline unsigned long long usec_2_cycles(unsigned long usec) -{ - return ns_2_cycles(usec * NSEC_PER_USEC); -} - -/* - * wait for all cpus on this hub to finish their sends and go quiet - * leaves uvhub_quiesce set so that no new broadcasts are started by - * bau_flush_send_and_wait() - */ -static inline void quiesce_local_uvhub(struct bau_control *hmaster) -{ - atom_asr(1, (struct atomic_short *)&hmaster->uvhub_quiesce); -} - -/* - * mark this quiet-requestor as done - */ -static inline void end_uvhub_quiesce(struct bau_control *hmaster) -{ - atom_asr(-1, (struct atomic_short *)&hmaster->uvhub_quiesce); -} - -/* - * UV2 could have an extra bit of status in the ACTIVATION_STATUS_2 register. - * But not currently used. - */ -static unsigned long uv2_3_read_status(unsigned long offset, int rshft, int desc) -{ - return ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK) << 1; -} - -/* - * Entered when a bau descriptor has gone into a permanent busy wait because - * of a hardware bug. - * Workaround the bug. - */ -static int handle_uv2_busy(struct bau_control *bcp) -{ - struct ptc_stats *stat = bcp->statp; - - stat->s_uv2_wars++; - bcp->busy = 1; - return FLUSH_GIVEUP; -} - -static int uv2_3_wait_completion(struct bau_desc *bau_desc, - struct bau_control *bcp, long try) -{ - unsigned long descriptor_stat; - cycles_t ttm; - u64 mmr_offset = bcp->status_mmr; - int right_shift = bcp->status_index; - int desc = bcp->uvhub_cpu; - long busy_reps = 0; - struct ptc_stats *stat = bcp->statp; - - descriptor_stat = uv2_3_read_status(mmr_offset, right_shift, desc); - - /* spin on the status MMR, waiting for it to go idle */ - while (descriptor_stat != UV2H_DESC_IDLE) { - if (descriptor_stat == UV2H_DESC_SOURCE_TIMEOUT) { - /* - * A h/w bug on the destination side may - * have prevented the message being marked - * pending, thus it doesn't get replied to - * and gets continually nacked until it times - * out with a SOURCE_TIMEOUT. - */ - stat->s_stimeout++; - return FLUSH_GIVEUP; - } else if (descriptor_stat == UV2H_DESC_DEST_TIMEOUT) { - ttm = get_cycles(); - - /* - * Our retries may be blocked by all destination - * swack resources being consumed, and a timeout - * pending. In that case hardware returns the - * ERROR that looks like a destination timeout. - * Without using the extended status we have to - * deduce from the short time that this was a - * strong nack. - */ - if (cycles_2_us(ttm - bcp->send_message) < timeout_us) { - bcp->conseccompletes = 0; - stat->s_plugged++; - /* FLUSH_RETRY_PLUGGED causes hang on boot */ - return FLUSH_GIVEUP; - } - stat->s_dtimeout++; - bcp->conseccompletes = 0; - /* FLUSH_RETRY_TIMEOUT causes hang on boot */ - return FLUSH_GIVEUP; - } else { - busy_reps++; - if (busy_reps > 1000000) { - /* not to hammer on the clock */ - busy_reps = 0; - ttm = get_cycles(); - if ((ttm - bcp->send_message) > bcp->timeout_interval) - return handle_uv2_busy(bcp); - } - /* - * descriptor_stat is still BUSY - */ - cpu_relax(); - } - descriptor_stat = uv2_3_read_status(mmr_offset, right_shift, desc); - } - bcp->conseccompletes++; - return FLUSH_COMPLETE; -} - -/* - * Returns the status of current BAU message for cpu desc as a bit field - * [Error][Busy][Aux] - */ -static u64 read_status(u64 status_mmr, int index, int desc) -{ - u64 stat; - - stat = ((read_lmmr(status_mmr) >> index) & UV_ACT_STATUS_MASK) << 1; - stat |= (read_lmmr(UVH_LB_BAU_SB_ACTIVATION_STATUS_2) >> desc) & 0x1; - - return stat; -} - -static int uv4_wait_completion(struct bau_desc *bau_desc, - struct bau_control *bcp, long try) -{ - struct ptc_stats *stat = bcp->statp; - u64 descriptor_stat; - u64 mmr = bcp->status_mmr; - int index = bcp->status_index; - int desc = bcp->uvhub_cpu; - - descriptor_stat = read_status(mmr, index, desc); - - /* spin on the status MMR, waiting for it to go idle */ - while (descriptor_stat != UV2H_DESC_IDLE) { - switch (descriptor_stat) { - case UV2H_DESC_SOURCE_TIMEOUT: - stat->s_stimeout++; - return FLUSH_GIVEUP; - - case UV2H_DESC_DEST_TIMEOUT: - stat->s_dtimeout++; - bcp->conseccompletes = 0; - return FLUSH_RETRY_TIMEOUT; - - case UV2H_DESC_DEST_STRONG_NACK: - stat->s_plugged++; - bcp->conseccompletes = 0; - return FLUSH_RETRY_PLUGGED; - - case UV2H_DESC_DEST_PUT_ERR: - bcp->conseccompletes = 0; - return FLUSH_GIVEUP; - - default: - /* descriptor_stat is still BUSY */ - cpu_relax(); - } - descriptor_stat = read_status(mmr, index, desc); - } - bcp->conseccompletes++; - return FLUSH_COMPLETE; -} - -/* - * Our retries are blocked by all destination sw ack resources being - * in use, and a timeout is pending. In that case hardware immediately - * returns the ERROR that looks like a destination timeout. - */ -static void destination_plugged(struct bau_desc *bau_desc, - struct bau_control *bcp, - struct bau_control *hmaster, struct ptc_stats *stat) -{ - udelay(bcp->plugged_delay); - bcp->plugged_tries++; - - if (bcp->plugged_tries >= bcp->plugsb4reset) { - bcp->plugged_tries = 0; - - quiesce_local_uvhub(hmaster); - - spin_lock(&hmaster->queue_lock); - reset_with_ipi(&bau_desc->distribution, bcp); - spin_unlock(&hmaster->queue_lock); - - end_uvhub_quiesce(hmaster); - - bcp->ipi_attempts++; - stat->s_resets_plug++; - } -} - -static void destination_timeout(struct bau_desc *bau_desc, - struct bau_control *bcp, struct bau_control *hmaster, - struct ptc_stats *stat) -{ - hmaster->max_concurr = 1; - bcp->timeout_tries++; - if (bcp->timeout_tries >= bcp->timeoutsb4reset) { - bcp->timeout_tries = 0; - - quiesce_local_uvhub(hmaster); - - spin_lock(&hmaster->queue_lock); - reset_with_ipi(&bau_desc->distribution, bcp); - spin_unlock(&hmaster->queue_lock); - - end_uvhub_quiesce(hmaster); - - bcp->ipi_attempts++; - stat->s_resets_timeout++; - } -} - -/* - * Stop all cpus on a uvhub from using the BAU for a period of time. - * This is reversed by check_enable. - */ -static void disable_for_period(struct bau_control *bcp, struct ptc_stats *stat) -{ - int tcpu; - struct bau_control *tbcp; - struct bau_control *hmaster; - cycles_t tm1; - - hmaster = bcp->uvhub_master; - spin_lock(&hmaster->disable_lock); - if (!bcp->baudisabled) { - stat->s_bau_disabled++; - tm1 = get_cycles(); - for_each_present_cpu(tcpu) { - tbcp = &per_cpu(bau_control, tcpu); - if (tbcp->uvhub_master == hmaster) { - tbcp->baudisabled = 1; - tbcp->set_bau_on_time = - tm1 + bcp->disabled_period; - } - } - } - spin_unlock(&hmaster->disable_lock); -} - -static void count_max_concurr(int stat, struct bau_control *bcp, - struct bau_control *hmaster) -{ - bcp->plugged_tries = 0; - bcp->timeout_tries = 0; - if (stat != FLUSH_COMPLETE) - return; - if (bcp->conseccompletes <= bcp->complete_threshold) - return; - if (hmaster->max_concurr >= hmaster->max_concurr_const) - return; - hmaster->max_concurr++; -} - -static void record_send_stats(cycles_t time1, cycles_t time2, - struct bau_control *bcp, struct ptc_stats *stat, - int completion_status, int try) -{ - cycles_t elapsed; - - if (time2 > time1) { - elapsed = time2 - time1; - stat->s_time += elapsed; - - if ((completion_status == FLUSH_COMPLETE) && (try == 1)) { - bcp->period_requests++; - bcp->period_time += elapsed; - if ((elapsed > usec_2_cycles(bcp->cong_response_us)) && - (bcp->period_requests > bcp->cong_reps) && - ((bcp->period_time / bcp->period_requests) > - usec_2_cycles(bcp->cong_response_us))) { - stat->s_congested++; - disable_for_period(bcp, stat); - } - } - } else - stat->s_requestor--; - - if (completion_status == FLUSH_COMPLETE && try > 1) - stat->s_retriesok++; - else if (completion_status == FLUSH_GIVEUP) { - stat->s_giveup++; - if (get_cycles() > bcp->period_end) - bcp->period_giveups = 0; - bcp->period_giveups++; - if (bcp->period_giveups == 1) - bcp->period_end = get_cycles() + bcp->disabled_period; - if (bcp->period_giveups > bcp->giveup_limit) { - disable_for_period(bcp, stat); - stat->s_giveuplimit++; - } - } -} - -/* - * Handle the completion status of a message send. - */ -static void handle_cmplt(int completion_status, struct bau_desc *bau_desc, - struct bau_control *bcp, struct bau_control *hmaster, - struct ptc_stats *stat) -{ - if (completion_status == FLUSH_RETRY_PLUGGED) - destination_plugged(bau_desc, bcp, hmaster, stat); - else if (completion_status == FLUSH_RETRY_TIMEOUT) - destination_timeout(bau_desc, bcp, hmaster, stat); -} - -/* - * Send a broadcast and wait for it to complete. - * - * The flush_mask contains the cpus the broadcast is to be sent to including - * cpus that are on the local uvhub. - * - * Returns 0 if all flushing represented in the mask was done. - * Returns 1 if it gives up entirely and the original cpu mask is to be - * returned to the kernel. - */ -static int uv_flush_send_and_wait(struct cpumask *flush_mask, - struct bau_control *bcp, - struct bau_desc *bau_desc) -{ - int seq_number = 0; - int completion_stat = 0; - long try = 0; - unsigned long index; - cycles_t time1; - cycles_t time2; - struct ptc_stats *stat = bcp->statp; - struct bau_control *hmaster = bcp->uvhub_master; - struct uv2_3_bau_msg_header *uv2_3_hdr = NULL; - - while (hmaster->uvhub_quiesce) - cpu_relax(); - - time1 = get_cycles(); - uv2_3_hdr = &bau_desc->header.uv2_3_hdr; - - do { - if (try == 0) { - uv2_3_hdr->msg_type = MSG_REGULAR; - seq_number = bcp->message_number++; - } else { - uv2_3_hdr->msg_type = MSG_RETRY; - stat->s_retry_messages++; - } - - uv2_3_hdr->sequence = seq_number; - index = (1UL << AS_PUSH_SHIFT) | bcp->uvhub_cpu; - bcp->send_message = get_cycles(); - - write_mmr_activation(index); - - try++; - completion_stat = ops.wait_completion(bau_desc, bcp, try); - - handle_cmplt(completion_stat, bau_desc, bcp, hmaster, stat); - - if (bcp->ipi_attempts >= bcp->ipi_reset_limit) { - bcp->ipi_attempts = 0; - stat->s_overipilimit++; - completion_stat = FLUSH_GIVEUP; - break; - } - cpu_relax(); - } while ((completion_stat == FLUSH_RETRY_PLUGGED) || - (completion_stat == FLUSH_RETRY_TIMEOUT)); - - time2 = get_cycles(); - - count_max_concurr(completion_stat, bcp, hmaster); - - while (hmaster->uvhub_quiesce) - cpu_relax(); - - atomic_dec(&hmaster->active_descriptor_count); - - record_send_stats(time1, time2, bcp, stat, completion_stat, try); - - if (completion_stat == FLUSH_GIVEUP) - /* FLUSH_GIVEUP will fall back to using IPI's for tlb flush */ - return 1; - return 0; -} - -/* - * The BAU is disabled for this uvhub. When the disabled time period has - * expired re-enable it. - * Return 0 if it is re-enabled for all cpus on this uvhub. - */ -static int check_enable(struct bau_control *bcp, struct ptc_stats *stat) -{ - int tcpu; - struct bau_control *tbcp; - struct bau_control *hmaster; - - hmaster = bcp->uvhub_master; - spin_lock(&hmaster->disable_lock); - if (bcp->baudisabled && (get_cycles() >= bcp->set_bau_on_time)) { - stat->s_bau_reenabled++; - for_each_present_cpu(tcpu) { - tbcp = &per_cpu(bau_control, tcpu); - if (tbcp->uvhub_master == hmaster) { - tbcp->baudisabled = 0; - tbcp->period_requests = 0; - tbcp->period_time = 0; - tbcp->period_giveups = 0; - } - } - spin_unlock(&hmaster->disable_lock); - return 0; - } - spin_unlock(&hmaster->disable_lock); - return -1; -} - -static void record_send_statistics(struct ptc_stats *stat, int locals, int hubs, - int remotes, struct bau_desc *bau_desc) -{ - stat->s_requestor++; - stat->s_ntargcpu += remotes + locals; - stat->s_ntargremotes += remotes; - stat->s_ntarglocals += locals; - - /* uvhub statistics */ - hubs = bau_uvhub_weight(&bau_desc->distribution); - if (locals) { - stat->s_ntarglocaluvhub++; - stat->s_ntargremoteuvhub += (hubs - 1); - } else - stat->s_ntargremoteuvhub += hubs; - - stat->s_ntarguvhub += hubs; - - if (hubs >= 16) - stat->s_ntarguvhub16++; - else if (hubs >= 8) - stat->s_ntarguvhub8++; - else if (hubs >= 4) - stat->s_ntarguvhub4++; - else if (hubs >= 2) - stat->s_ntarguvhub2++; - else - stat->s_ntarguvhub1++; -} - -/* - * Translate a cpu mask to the uvhub distribution mask in the BAU - * activation descriptor. - */ -static int set_distrib_bits(struct cpumask *flush_mask, struct bau_control *bcp, - struct bau_desc *bau_desc, int *localsp, int *remotesp) -{ - int cpu; - int pnode; - int cnt = 0; - struct hub_and_pnode *hpp; - - for_each_cpu(cpu, flush_mask) { - /* - * The distribution vector is a bit map of pnodes, relative - * to the partition base pnode (and the partition base nasid - * in the header). - * Translate cpu to pnode and hub using a local memory array. - */ - hpp = &bcp->socket_master->thp[cpu]; - pnode = hpp->pnode - bcp->partition_base_pnode; - bau_uvhub_set(pnode, &bau_desc->distribution); - cnt++; - if (hpp->uvhub == bcp->uvhub) - (*localsp)++; - else - (*remotesp)++; - } - if (!cnt) - return 1; - return 0; -} - -/* - * globally purge translation cache of a virtual address or all TLB's - * @cpumask: mask of all cpu's in which the address is to be removed - * @mm: mm_struct containing virtual address range - * @start: start virtual address to be removed from TLB - * @end: end virtual address to be remove from TLB - * @cpu: the current cpu - * - * This is the entry point for initiating any UV global TLB shootdown. - * - * Purges the translation caches of all specified processors of the given - * virtual address, or purges all TLB's on specified processors. - * - * The caller has derived the cpumask from the mm_struct. This function - * is called only if there are bits set in the mask. (e.g. flush_tlb_page()) - * - * The cpumask is converted into a uvhubmask of the uvhubs containing - * those cpus. - * - * Note that this function should be called with preemption disabled. - * - * Returns NULL if all remote flushing was done. - * Returns pointer to cpumask if some remote flushing remains to be - * done. The returned pointer is valid till preemption is re-enabled. - */ -const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, - const struct flush_tlb_info *info) -{ - unsigned int cpu = smp_processor_id(); - int locals = 0, remotes = 0, hubs = 0; - struct bau_desc *bau_desc; - struct cpumask *flush_mask; - struct ptc_stats *stat; - struct bau_control *bcp; - unsigned long descriptor_status, status, address; - - bcp = &per_cpu(bau_control, cpu); - - if (bcp->nobau) - return cpumask; - - stat = bcp->statp; - stat->s_enters++; - - if (bcp->busy) { - descriptor_status = - read_lmmr(UVH_LB_BAU_SB_ACTIVATION_STATUS_0); - status = ((descriptor_status >> (bcp->uvhub_cpu * - UV_ACT_STATUS_SIZE)) & UV_ACT_STATUS_MASK) << 1; - if (status == UV2H_DESC_BUSY) - return cpumask; - bcp->busy = 0; - } - - /* bau was disabled due to slow response */ - if (bcp->baudisabled) { - if (check_enable(bcp, stat)) { - stat->s_ipifordisabled++; - return cpumask; - } - } - - /* - * Each sending cpu has a per-cpu mask which it fills from the caller's - * cpu mask. All cpus are converted to uvhubs and copied to the - * activation descriptor. - */ - flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu); - /* don't actually do a shootdown of the local cpu */ - cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu)); - - if (cpumask_test_cpu(cpu, cpumask)) - stat->s_ntargself++; - - bau_desc = bcp->descriptor_base; - bau_desc += (ITEMS_PER_DESC * bcp->uvhub_cpu); - bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); - if (set_distrib_bits(flush_mask, bcp, bau_desc, &locals, &remotes)) - return NULL; - - record_send_statistics(stat, locals, hubs, remotes, bau_desc); - - if (!info->end || (info->end - info->start) <= PAGE_SIZE) - address = info->start; - else - address = TLB_FLUSH_ALL; - - switch (bcp->uvhub_version) { - case UV_BAU_V2: - case UV_BAU_V3: - bau_desc->payload.uv2_3.address = address; - bau_desc->payload.uv2_3.sending_cpu = cpu; - break; - case UV_BAU_V4: - bau_desc->payload.uv4.address = address; - bau_desc->payload.uv4.sending_cpu = cpu; - bau_desc->payload.uv4.qualifier = BAU_DESC_QUALIFIER; - break; - } - - /* - * uv_flush_send_and_wait returns 0 if all cpu's were messaged, - * or 1 if it gave up and the original cpumask should be returned. - */ - if (!uv_flush_send_and_wait(flush_mask, bcp, bau_desc)) - return NULL; - else - return cpumask; -} - -/* - * Search the message queue for any 'other' unprocessed message with the - * same software acknowledge resource bit vector as the 'msg' message. - */ -static struct bau_pq_entry *find_another_by_swack(struct bau_pq_entry *msg, - struct bau_control *bcp) -{ - struct bau_pq_entry *msg_next = msg + 1; - unsigned char swack_vec = msg->swack_vec; - - if (msg_next > bcp->queue_last) - msg_next = bcp->queue_first; - while (msg_next != msg) { - if ((msg_next->canceled == 0) && (msg_next->replied_to == 0) && - (msg_next->swack_vec == swack_vec)) - return msg_next; - msg_next++; - if (msg_next > bcp->queue_last) - msg_next = bcp->queue_first; - } - return NULL; -} - -/* - * UV2 needs to work around a bug in which an arriving message has not - * set a bit in the UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE register. - * Such a message must be ignored. - */ -static void process_uv2_message(struct msg_desc *mdp, struct bau_control *bcp) -{ - unsigned long mmr_image; - unsigned char swack_vec; - struct bau_pq_entry *msg = mdp->msg; - struct bau_pq_entry *other_msg; - - mmr_image = ops.read_l_sw_ack(); - swack_vec = msg->swack_vec; - - if ((swack_vec & mmr_image) == 0) { - /* - * This message was assigned a swack resource, but no - * reserved acknowlegment is pending. - * The bug has prevented this message from setting the MMR. - */ - /* - * Some message has set the MMR 'pending' bit; it might have - * been another message. Look for that message. - */ - other_msg = find_another_by_swack(msg, bcp); - if (other_msg) { - /* - * There is another. Process this one but do not - * ack it. - */ - bau_process_message(mdp, bcp, 0); - /* - * Let the natural processing of that other message - * acknowledge it. Don't get the processing of sw_ack's - * out of order. - */ - return; - } - } - - /* - * Either the MMR shows this one pending a reply or there is no - * other message using this sw_ack, so it is safe to acknowledge it. - */ - bau_process_message(mdp, bcp, 1); - - return; -} - -/* - * The BAU message interrupt comes here. (registered by set_intr_gate) - * See entry_64.S - * - * We received a broadcast assist message. - * - * Interrupts are disabled; this interrupt could represent - * the receipt of several messages. - * - * All cores/threads on this hub get this interrupt. - * The last one to see it does the software ack. - * (the resource will not be freed until noninterruptable cpus see this - * interrupt; hardware may timeout the s/w ack and reply ERROR) - */ -DEFINE_IDTENTRY_SYSVEC(sysvec_uv_bau_message) -{ - int count = 0; - cycles_t time_start; - struct bau_pq_entry *msg; - struct bau_control *bcp; - struct ptc_stats *stat; - struct msg_desc msgdesc; - - ack_APIC_irq(); - kvm_set_cpu_l1tf_flush_l1d(); - time_start = get_cycles(); - - bcp = &per_cpu(bau_control, smp_processor_id()); - stat = bcp->statp; - - msgdesc.queue_first = bcp->queue_first; - msgdesc.queue_last = bcp->queue_last; - - msg = bcp->bau_msg_head; - while (msg->swack_vec) { - count++; - - msgdesc.msg_slot = msg - msgdesc.queue_first; - msgdesc.msg = msg; - if (bcp->uvhub_version == UV_BAU_V2) - process_uv2_message(&msgdesc, bcp); - else - /* no error workaround for uv3 */ - bau_process_message(&msgdesc, bcp, 1); - - msg++; - if (msg > msgdesc.queue_last) - msg = msgdesc.queue_first; - bcp->bau_msg_head = msg; - } - stat->d_time += (get_cycles() - time_start); - if (!count) - stat->d_nomsg++; - else if (count > 1) - stat->d_multmsg++; -} - -/* - * Each target uvhub (i.e. a uvhub that has cpu's) needs to have - * shootdown message timeouts enabled. The timeout does not cause - * an interrupt, but causes an error message to be returned to - * the sender. - */ -static void __init enable_timeouts(void) -{ - int uvhub; - int nuvhubs; - int pnode; - unsigned long mmr_image; - - nuvhubs = uv_num_possible_blades(); - - for (uvhub = 0; uvhub < nuvhubs; uvhub++) { - if (!uv_blade_nr_possible_cpus(uvhub)) - continue; - - pnode = uv_blade_to_pnode(uvhub); - mmr_image = read_mmr_misc_control(pnode); - /* - * Set the timeout period and then lock it in, in three - * steps; captures and locks in the period. - * - * To program the period, the SOFT_ACK_MODE must be off. - */ - mmr_image &= ~(1L << SOFTACK_MSHIFT); - write_mmr_misc_control(pnode, mmr_image); - /* - * Set the 4-bit period. - */ - mmr_image &= ~((unsigned long)0xf << SOFTACK_PSHIFT); - mmr_image |= (SOFTACK_TIMEOUT_PERIOD << SOFTACK_PSHIFT); - write_mmr_misc_control(pnode, mmr_image); - - mmr_image |= (1L << SOFTACK_MSHIFT); - if (is_uv2_hub()) { - /* do not touch the legacy mode bit */ - /* hw bug workaround; do not use extended status */ - mmr_image &= ~(1L << UV2_EXT_SHFT); - } else if (is_uv3_hub()) { - mmr_image &= ~(1L << PREFETCH_HINT_SHFT); - mmr_image |= (1L << SB_STATUS_SHFT); - } - write_mmr_misc_control(pnode, mmr_image); - } -} - -static void *ptc_seq_start(struct seq_file *file, loff_t *offset) -{ - if (*offset < num_possible_cpus()) - return offset; - return NULL; -} - -static void *ptc_seq_next(struct seq_file *file, void *data, loff_t *offset) -{ - (*offset)++; - if (*offset < num_possible_cpus()) - return offset; - return NULL; -} - -static void ptc_seq_stop(struct seq_file *file, void *data) -{ -} - -/* - * Display the statistics thru /proc/sgi_uv/ptc_statistics - * 'data' points to the cpu number - * Note: see the descriptions in stat_description[]. - */ -static int ptc_seq_show(struct seq_file *file, void *data) -{ - struct ptc_stats *stat; - struct bau_control *bcp; - int cpu; - - cpu = *(loff_t *)data; - if (!cpu) { - seq_puts(file, - "# cpu bauoff sent stime self locals remotes ncpus localhub "); - seq_puts(file, "remotehub numuvhubs numuvhubs16 numuvhubs8 "); - seq_puts(file, - "numuvhubs4 numuvhubs2 numuvhubs1 dto snacks retries "); - seq_puts(file, - "rok resetp resett giveup sto bz throt disable "); - seq_puts(file, - "enable wars warshw warwaits enters ipidis plugged "); - seq_puts(file, - "ipiover glim cong swack recv rtime all one mult "); - seq_puts(file, "none retry canc nocan reset rcan\n"); - } - if (cpu < num_possible_cpus() && cpu_online(cpu)) { - bcp = &per_cpu(bau_control, cpu); - if (bcp->nobau) { - seq_printf(file, "cpu %d bau disabled\n", cpu); - return 0; - } - stat = bcp->statp; - /* source side statistics */ - seq_printf(file, - "cpu %d %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", - cpu, bcp->nobau, stat->s_requestor, - cycles_2_us(stat->s_time), - stat->s_ntargself, stat->s_ntarglocals, - stat->s_ntargremotes, stat->s_ntargcpu, - stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub, - stat->s_ntarguvhub, stat->s_ntarguvhub16); - seq_printf(file, "%ld %ld %ld %ld %ld %ld ", - stat->s_ntarguvhub8, stat->s_ntarguvhub4, - stat->s_ntarguvhub2, stat->s_ntarguvhub1, - stat->s_dtimeout, stat->s_strongnacks); - seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ", - stat->s_retry_messages, stat->s_retriesok, - stat->s_resets_plug, stat->s_resets_timeout, - stat->s_giveup, stat->s_stimeout, - stat->s_busy, stat->s_throttles); - seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", - stat->s_bau_disabled, stat->s_bau_reenabled, - stat->s_uv2_wars, stat->s_uv2_wars_hw, - stat->s_uv2_war_waits, stat->s_enters, - stat->s_ipifordisabled, stat->s_plugged, - stat->s_overipilimit, stat->s_giveuplimit, - stat->s_congested); - - /* destination side statistics */ - seq_printf(file, - "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n", - ops.read_g_sw_ack(uv_cpu_to_pnode(cpu)), - stat->d_requestee, cycles_2_us(stat->d_time), - stat->d_alltlb, stat->d_onetlb, stat->d_multmsg, - stat->d_nomsg, stat->d_retries, stat->d_canceled, - stat->d_nocanceled, stat->d_resets, - stat->d_rcanceled); - } - return 0; -} - -/* - * Display the tunables thru debugfs - */ -static ssize_t tunables_read(struct file *file, char __user *userbuf, - size_t count, loff_t *ppos) -{ - char *buf; - int ret; - - buf = kasprintf(GFP_KERNEL, "%s %s %s\n%d %d %d %d %d %d %d %d %d %d\n", - "max_concur plugged_delay plugsb4reset timeoutsb4reset", - "ipi_reset_limit complete_threshold congested_response_us", - "congested_reps disabled_period giveup_limit", - max_concurr, plugged_delay, plugsb4reset, - timeoutsb4reset, ipi_reset_limit, complete_threshold, - congested_respns_us, congested_reps, disabled_period, - giveup_limit); - - if (!buf) - return -ENOMEM; - - ret = simple_read_from_buffer(userbuf, count, ppos, buf, strlen(buf)); - kfree(buf); - return ret; -} - -/* - * handle a write to /proc/sgi_uv/ptc_statistics - * -1: reset the statistics - * 0: display meaning of the statistics - */ -static ssize_t ptc_proc_write(struct file *file, const char __user *user, - size_t count, loff_t *data) -{ - int cpu; - int i; - int elements; - long input_arg; - char optstr[64]; - struct ptc_stats *stat; - - if (count == 0 || count > sizeof(optstr)) - return -EINVAL; - if (copy_from_user(optstr, user, count)) - return -EFAULT; - optstr[count - 1] = '\0'; - - if (!strcmp(optstr, "on")) { - set_bau_on(); - return count; - } else if (!strcmp(optstr, "off")) { - set_bau_off(); - return count; - } - - if (kstrtol(optstr, 10, &input_arg) < 0) { - pr_debug("%s is invalid\n", optstr); - return -EINVAL; - } - - if (input_arg == 0) { - elements = ARRAY_SIZE(stat_description); - pr_debug("# cpu: cpu number\n"); - pr_debug("Sender statistics:\n"); - for (i = 0; i < elements; i++) - pr_debug("%s\n", stat_description[i]); - } else if (input_arg == -1) { - for_each_present_cpu(cpu) { - stat = &per_cpu(ptcstats, cpu); - memset(stat, 0, sizeof(struct ptc_stats)); - } - } - - return count; -} - -static int local_atoi(const char *name) -{ - int val = 0; - - for (;; name++) { - switch (*name) { - case '0' ... '9': - val = 10*val+(*name-'0'); - break; - default: - return val; - } - } -} - -/* - * Parse the values written to /sys/kernel/debug/sgi_uv/bau_tunables. - * Zero values reset them to defaults. - */ -static int parse_tunables_write(struct bau_control *bcp, char *instr, - int count) -{ - char *p; - char *q; - int cnt = 0; - int val; - int e = ARRAY_SIZE(tunables); - - p = instr + strspn(instr, WHITESPACE); - q = p; - for (; *p; p = q + strspn(q, WHITESPACE)) { - q = p + strcspn(p, WHITESPACE); - cnt++; - if (q == p) - break; - } - if (cnt != e) { - pr_info("bau tunable error: should be %d values\n", e); - return -EINVAL; - } - - p = instr + strspn(instr, WHITESPACE); - q = p; - for (cnt = 0; *p; p = q + strspn(q, WHITESPACE), cnt++) { - q = p + strcspn(p, WHITESPACE); - val = local_atoi(p); - switch (cnt) { - case 0: - if (val == 0) { - max_concurr = MAX_BAU_CONCURRENT; - max_concurr_const = MAX_BAU_CONCURRENT; - continue; - } - if (val < 1 || val > bcp->cpus_in_uvhub) { - pr_debug( - "Error: BAU max concurrent %d is invalid\n", - val); - return -EINVAL; - } - max_concurr = val; - max_concurr_const = val; - continue; - default: - if (val == 0) - *tunables[cnt].tunp = tunables[cnt].deflt; - else - *tunables[cnt].tunp = val; - continue; - } - } - return 0; -} - -/* - * Handle a write to debugfs. (/sys/kernel/debug/sgi_uv/bau_tunables) - */ -static ssize_t tunables_write(struct file *file, const char __user *user, - size_t count, loff_t *data) -{ - int cpu; - int ret; - char instr[100]; - struct bau_control *bcp; - - if (count == 0 || count > sizeof(instr)-1) - return -EINVAL; - if (copy_from_user(instr, user, count)) - return -EFAULT; - - instr[count] = '\0'; - - cpu = get_cpu(); - bcp = &per_cpu(bau_control, cpu); - ret = parse_tunables_write(bcp, instr, count); - put_cpu(); - if (ret) - return ret; - - for_each_present_cpu(cpu) { - bcp = &per_cpu(bau_control, cpu); - bcp->max_concurr = max_concurr; - bcp->max_concurr_const = max_concurr; - bcp->plugged_delay = plugged_delay; - bcp->plugsb4reset = plugsb4reset; - bcp->timeoutsb4reset = timeoutsb4reset; - bcp->ipi_reset_limit = ipi_reset_limit; - bcp->complete_threshold = complete_threshold; - bcp->cong_response_us = congested_respns_us; - bcp->cong_reps = congested_reps; - bcp->disabled_period = sec_2_cycles(disabled_period); - bcp->giveup_limit = giveup_limit; - } - return count; -} - -static const struct seq_operations uv_ptc_seq_ops = { - .start = ptc_seq_start, - .next = ptc_seq_next, - .stop = ptc_seq_stop, - .show = ptc_seq_show -}; - -static int ptc_proc_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &uv_ptc_seq_ops); -} - -static int tunables_open(struct inode *inode, struct file *file) -{ - return 0; -} - -static const struct proc_ops uv_ptc_proc_ops = { - .proc_open = ptc_proc_open, - .proc_read = seq_read, - .proc_write = ptc_proc_write, - .proc_lseek = seq_lseek, - .proc_release = seq_release, -}; - -static const struct file_operations tunables_fops = { - .open = tunables_open, - .read = tunables_read, - .write = tunables_write, - .llseek = default_llseek, -}; - -static int __init uv_ptc_init(void) -{ - struct proc_dir_entry *proc_uv_ptc; - - if (!is_uv_system()) - return 0; - - proc_uv_ptc = proc_create(UV_PTC_BASENAME, 0444, NULL, - &uv_ptc_proc_ops); - if (!proc_uv_ptc) { - pr_err("unable to create %s proc entry\n", - UV_PTC_BASENAME); - return -EINVAL; - } - - tunables_dir = debugfs_create_dir(UV_BAU_TUNABLES_DIR, NULL); - debugfs_create_file(UV_BAU_TUNABLES_FILE, 0600, tunables_dir, NULL, - &tunables_fops); - return 0; -} - -/* - * Initialize the sending side's sending buffers. - */ -static void activation_descriptor_init(int node, int pnode, int base_pnode) -{ - int i; - int cpu; - unsigned long gpa; - unsigned long m; - unsigned long n; - size_t dsize; - struct bau_desc *bau_desc; - struct bau_desc *bd2; - struct uv2_3_bau_msg_header *uv2_3_hdr; - struct bau_control *bcp; - - /* - * each bau_desc is 64 bytes; there are 8 (ITEMS_PER_DESC) - * per cpu; and one per cpu on the uvhub (ADP_SZ) - */ - dsize = sizeof(struct bau_desc) * ADP_SZ * ITEMS_PER_DESC; - bau_desc = kmalloc_node(dsize, GFP_KERNEL, node); - BUG_ON(!bau_desc); - - gpa = uv_gpa(bau_desc); - n = uv_gpa_to_gnode(gpa); - m = ops.bau_gpa_to_offset(gpa); - - /* the 14-bit pnode */ - write_mmr_descriptor_base(pnode, - (n << UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT | m)); - /* - * Initializing all 8 (ITEMS_PER_DESC) descriptors for each - * cpu even though we only use the first one; one descriptor can - * describe a broadcast to 256 uv hubs. - */ - for (i = 0, bd2 = bau_desc; i < (ADP_SZ * ITEMS_PER_DESC); i++, bd2++) { - memset(bd2, 0, sizeof(struct bau_desc)); - /* - * BIOS uses legacy mode, but uv2 and uv3 hardware always - * uses native mode for selective broadcasts. - */ - uv2_3_hdr = &bd2->header.uv2_3_hdr; - uv2_3_hdr->swack_flag = 1; - uv2_3_hdr->base_dest_nasid = UV_PNODE_TO_NASID(base_pnode); - uv2_3_hdr->dest_subnodeid = UV_LB_SUBNODEID; - uv2_3_hdr->command = UV_NET_ENDPOINT_INTD; - } - for_each_present_cpu(cpu) { - if (pnode != uv_blade_to_pnode(uv_cpu_to_blade_id(cpu))) - continue; - bcp = &per_cpu(bau_control, cpu); - bcp->descriptor_base = bau_desc; - } -} - -/* - * initialize the destination side's receiving buffers - * entered for each uvhub in the partition - * - node is first node (kernel memory notion) on the uvhub - * - pnode is the uvhub's physical identifier - */ -static void pq_init(int node, int pnode) -{ - int cpu; - size_t plsize; - char *cp; - void *vp; - unsigned long gnode, first, last, tail; - struct bau_pq_entry *pqp; - struct bau_control *bcp; - - plsize = (DEST_Q_SIZE + 1) * sizeof(struct bau_pq_entry); - vp = kmalloc_node(plsize, GFP_KERNEL, node); - BUG_ON(!vp); - - pqp = (struct bau_pq_entry *)vp; - cp = (char *)pqp + 31; - pqp = (struct bau_pq_entry *)(((unsigned long)cp >> 5) << 5); - - for_each_present_cpu(cpu) { - if (pnode != uv_cpu_to_pnode(cpu)) - continue; - /* for every cpu on this pnode: */ - bcp = &per_cpu(bau_control, cpu); - bcp->queue_first = pqp; - bcp->bau_msg_head = pqp; - bcp->queue_last = pqp + (DEST_Q_SIZE - 1); - } - - first = ops.bau_gpa_to_offset(uv_gpa(pqp)); - last = ops.bau_gpa_to_offset(uv_gpa(pqp + (DEST_Q_SIZE - 1))); - - /* - * Pre UV4, the gnode is required to locate the payload queue - * and the payload queue tail must be maintained by the kernel. - */ - bcp = &per_cpu(bau_control, smp_processor_id()); - if (bcp->uvhub_version <= UV_BAU_V3) { - tail = first; - gnode = uv_gpa_to_gnode(uv_gpa(pqp)); - first = (gnode << UV_PAYLOADQ_GNODE_SHIFT) | tail; - write_mmr_payload_tail(pnode, tail); - } - - ops.write_payload_first(pnode, first); - ops.write_payload_last(pnode, last); - - /* in effect, all msg_type's are set to MSG_NOOP */ - memset(pqp, 0, sizeof(struct bau_pq_entry) * DEST_Q_SIZE); -} - -/* - * Initialization of each UV hub's structures - */ -static void __init init_uvhub(int uvhub, int vector, int base_pnode) -{ - int node; - int pnode; - unsigned long apicid; - - node = uvhub_to_first_node(uvhub); - pnode = uv_blade_to_pnode(uvhub); - - activation_descriptor_init(node, pnode, base_pnode); - - pq_init(node, pnode); - /* - * The below initialization can't be in firmware because the - * messaging IRQ will be determined by the OS. - */ - apicid = uvhub_to_first_apicid(uvhub); - write_mmr_data_config(pnode, ((apicid << 32) | vector)); -} - -/* - * We will set BAU_MISC_CONTROL with a timeout period. - * But the BIOS has set UVH_AGING_PRESCALE_SEL and UVH_TRANSACTION_TIMEOUT. - * So the destination timeout period has to be calculated from them. - */ -static int calculate_destination_timeout(void) -{ - unsigned long mmr_image; - int mult1; - int base; - int ret; - - /* same destination timeout for uv2 and uv3 */ - /* 4 bits 0/1 for 10/80us base, 3 bits of multiplier */ - mmr_image = uv_read_local_mmr(UVH_LB_BAU_MISC_CONTROL); - mmr_image = (mmr_image & UV_SA_MASK) >> UV_SA_SHFT; - if (mmr_image & (1L << UV2_ACK_UNITS_SHFT)) - base = 80; - else - base = 10; - mult1 = mmr_image & UV2_ACK_MASK; - ret = mult1 * base; - - return ret; -} - -static void __init init_per_cpu_tunables(void) -{ - int cpu; - struct bau_control *bcp; - - for_each_present_cpu(cpu) { - bcp = &per_cpu(bau_control, cpu); - bcp->baudisabled = 0; - if (nobau) - bcp->nobau = true; - bcp->statp = &per_cpu(ptcstats, cpu); - /* time interval to catch a hardware stay-busy bug */ - bcp->timeout_interval = usec_2_cycles(2*timeout_us); - bcp->max_concurr = max_concurr; - bcp->max_concurr_const = max_concurr; - bcp->plugged_delay = plugged_delay; - bcp->plugsb4reset = plugsb4reset; - bcp->timeoutsb4reset = timeoutsb4reset; - bcp->ipi_reset_limit = ipi_reset_limit; - bcp->complete_threshold = complete_threshold; - bcp->cong_response_us = congested_respns_us; - bcp->cong_reps = congested_reps; - bcp->disabled_period = sec_2_cycles(disabled_period); - bcp->giveup_limit = giveup_limit; - spin_lock_init(&bcp->queue_lock); - spin_lock_init(&bcp->uvhub_lock); - spin_lock_init(&bcp->disable_lock); - } -} - -/* - * Scan all cpus to collect blade and socket summaries. - */ -static int __init get_cpu_topology(int base_pnode, - struct uvhub_desc *uvhub_descs, - unsigned char *uvhub_mask) -{ - int cpu; - int pnode; - int uvhub; - int socket; - struct bau_control *bcp; - struct uvhub_desc *bdp; - struct socket_desc *sdp; - - for_each_present_cpu(cpu) { - bcp = &per_cpu(bau_control, cpu); - - memset(bcp, 0, sizeof(struct bau_control)); - - pnode = uv_cpu_hub_info(cpu)->pnode; - if ((pnode - base_pnode) >= UV_DISTRIBUTION_SIZE) { - pr_emerg( - "cpu %d pnode %d-%d beyond %d; BAU disabled\n", - cpu, pnode, base_pnode, UV_DISTRIBUTION_SIZE); - return 1; - } - - bcp->osnode = cpu_to_node(cpu); - bcp->partition_base_pnode = base_pnode; - - uvhub = uv_cpu_hub_info(cpu)->numa_blade_id; - *(uvhub_mask + (uvhub/8)) |= (1 << (uvhub%8)); - bdp = &uvhub_descs[uvhub]; - - bdp->num_cpus++; - bdp->uvhub = uvhub; - bdp->pnode = pnode; - - /* kludge: 'assuming' one node per socket, and assuming that - disabling a socket just leaves a gap in node numbers */ - socket = bcp->osnode & 1; - bdp->socket_mask |= (1 << socket); - sdp = &bdp->socket[socket]; - sdp->cpu_number[sdp->num_cpus] = cpu; - sdp->num_cpus++; - if (sdp->num_cpus > MAX_CPUS_PER_SOCKET) { - pr_emerg("%d cpus per socket invalid\n", - sdp->num_cpus); - return 1; - } - } - return 0; -} - -/* - * Each socket is to get a local array of pnodes/hubs. - */ -static void make_per_cpu_thp(struct bau_control *smaster) -{ - int cpu; - size_t hpsz = sizeof(struct hub_and_pnode) * num_possible_cpus(); - - smaster->thp = kzalloc_node(hpsz, GFP_KERNEL, smaster->osnode); - for_each_present_cpu(cpu) { - smaster->thp[cpu].pnode = uv_cpu_hub_info(cpu)->pnode; - smaster->thp[cpu].uvhub = uv_cpu_hub_info(cpu)->numa_blade_id; - } -} - -/* - * Each uvhub is to get a local cpumask. - */ -static void make_per_hub_cpumask(struct bau_control *hmaster) -{ - int sz = sizeof(cpumask_t); - - hmaster->cpumask = kzalloc_node(sz, GFP_KERNEL, hmaster->osnode); -} - -/* - * Initialize all the per_cpu information for the cpu's on a given socket, - * given what has been gathered into the socket_desc struct. - * And reports the chosen hub and socket masters back to the caller. - */ -static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp, - struct bau_control **smasterp, - struct bau_control **hmasterp) -{ - int i, cpu, uvhub_cpu; - struct bau_control *bcp; - - for (i = 0; i < sdp->num_cpus; i++) { - cpu = sdp->cpu_number[i]; - bcp = &per_cpu(bau_control, cpu); - bcp->cpu = cpu; - if (i == 0) { - *smasterp = bcp; - if (!(*hmasterp)) - *hmasterp = bcp; - } - bcp->cpus_in_uvhub = bdp->num_cpus; - bcp->cpus_in_socket = sdp->num_cpus; - bcp->socket_master = *smasterp; - bcp->uvhub = bdp->uvhub; - if (is_uv2_hub()) - bcp->uvhub_version = UV_BAU_V2; - else if (is_uv3_hub()) - bcp->uvhub_version = UV_BAU_V3; - else if (is_uv4_hub()) - bcp->uvhub_version = UV_BAU_V4; - else { - pr_emerg("uvhub version not 1, 2, 3, or 4\n"); - return 1; - } - bcp->uvhub_master = *hmasterp; - uvhub_cpu = uv_cpu_blade_processor_id(cpu); - bcp->uvhub_cpu = uvhub_cpu; - - /* - * The ERROR and BUSY status registers are located pairwise over - * the STATUS_0 and STATUS_1 mmrs; each an array[32] of 2 bits. - */ - if (uvhub_cpu < UV_CPUS_PER_AS) { - bcp->status_mmr = UVH_LB_BAU_SB_ACTIVATION_STATUS_0; - bcp->status_index = uvhub_cpu * UV_ACT_STATUS_SIZE; - } else { - bcp->status_mmr = UVH_LB_BAU_SB_ACTIVATION_STATUS_1; - bcp->status_index = (uvhub_cpu - UV_CPUS_PER_AS) - * UV_ACT_STATUS_SIZE; - } - - if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) { - pr_emerg("%d cpus per uvhub invalid\n", - bcp->uvhub_cpu); - return 1; - } - } - return 0; -} - -/* - * Summarize the blade and socket topology into the per_cpu structures. - */ -static int __init summarize_uvhub_sockets(int nuvhubs, - struct uvhub_desc *uvhub_descs, - unsigned char *uvhub_mask) -{ - int socket; - int uvhub; - unsigned short socket_mask; - - for (uvhub = 0; uvhub < nuvhubs; uvhub++) { - struct uvhub_desc *bdp; - struct bau_control *smaster = NULL; - struct bau_control *hmaster = NULL; - - if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8)))) - continue; - - bdp = &uvhub_descs[uvhub]; - socket_mask = bdp->socket_mask; - socket = 0; - while (socket_mask) { - struct socket_desc *sdp; - if ((socket_mask & 1)) { - sdp = &bdp->socket[socket]; - if (scan_sock(sdp, bdp, &smaster, &hmaster)) - return 1; - make_per_cpu_thp(smaster); - } - socket++; - socket_mask = (socket_mask >> 1); - } - make_per_hub_cpumask(hmaster); - } - return 0; -} - -/* - * initialize the bau_control structure for each cpu - */ -static int __init init_per_cpu(int nuvhubs, int base_part_pnode) -{ - struct uvhub_desc *uvhub_descs; - unsigned char *uvhub_mask = NULL; - - if (is_uv3_hub() || is_uv2_hub()) - timeout_us = calculate_destination_timeout(); - - uvhub_descs = kcalloc(nuvhubs, sizeof(struct uvhub_desc), GFP_KERNEL); - if (!uvhub_descs) - goto fail; - - uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL); - if (!uvhub_mask) - goto fail; - - if (get_cpu_topology(base_part_pnode, uvhub_descs, uvhub_mask)) - goto fail; - - if (summarize_uvhub_sockets(nuvhubs, uvhub_descs, uvhub_mask)) - goto fail; - - kfree(uvhub_descs); - kfree(uvhub_mask); - init_per_cpu_tunables(); - return 0; - -fail: - kfree(uvhub_descs); - kfree(uvhub_mask); - return 1; -} - -static const struct bau_operations uv2_3_bau_ops __initconst = { - .bau_gpa_to_offset = uv_gpa_to_offset, - .read_l_sw_ack = read_mmr_sw_ack, - .read_g_sw_ack = read_gmmr_sw_ack, - .write_l_sw_ack = write_mmr_sw_ack, - .write_g_sw_ack = write_gmmr_sw_ack, - .write_payload_first = write_mmr_payload_first, - .write_payload_last = write_mmr_payload_last, - .wait_completion = uv2_3_wait_completion, -}; - -static const struct bau_operations uv4_bau_ops __initconst = { - .bau_gpa_to_offset = uv_gpa_to_soc_phys_ram, - .read_l_sw_ack = read_mmr_proc_sw_ack, - .read_g_sw_ack = read_gmmr_proc_sw_ack, - .write_l_sw_ack = write_mmr_proc_sw_ack, - .write_g_sw_ack = write_gmmr_proc_sw_ack, - .write_payload_first = write_mmr_proc_payload_first, - .write_payload_last = write_mmr_proc_payload_last, - .wait_completion = uv4_wait_completion, -}; - -/* - * Initialization of BAU-related structures - */ -static int __init uv_bau_init(void) -{ - int uvhub; - int pnode; - int nuvhubs; - int cur_cpu; - int cpus; - int vector; - cpumask_var_t *mask; - - if (!is_uv_system()) - return 0; - - if (is_uv4_hub()) - ops = uv4_bau_ops; - else if (is_uv3_hub()) - ops = uv2_3_bau_ops; - else if (is_uv2_hub()) - ops = uv2_3_bau_ops; - - nuvhubs = uv_num_possible_blades(); - if (nuvhubs < 2) { - pr_crit("UV: BAU disabled - insufficient hub count\n"); - goto err_bau_disable; - } - - for_each_possible_cpu(cur_cpu) { - mask = &per_cpu(uv_flush_tlb_mask, cur_cpu); - zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cur_cpu)); - } - - uv_base_pnode = 0x7fffffff; - for (uvhub = 0; uvhub < nuvhubs; uvhub++) { - cpus = uv_blade_nr_possible_cpus(uvhub); - if (cpus && (uv_blade_to_pnode(uvhub) < uv_base_pnode)) - uv_base_pnode = uv_blade_to_pnode(uvhub); - } - - /* software timeouts are not supported on UV4 */ - if (is_uv3_hub() || is_uv2_hub()) - enable_timeouts(); - - if (init_per_cpu(nuvhubs, uv_base_pnode)) { - pr_crit("UV: BAU disabled - per CPU init failed\n"); - goto err_bau_disable; - } - - vector = UV_BAU_MESSAGE; - for_each_possible_blade(uvhub) { - if (uv_blade_nr_possible_cpus(uvhub)) - init_uvhub(uvhub, vector, uv_base_pnode); - } - - for_each_possible_blade(uvhub) { - if (uv_blade_nr_possible_cpus(uvhub)) { - unsigned long val; - unsigned long mmr; - pnode = uv_blade_to_pnode(uvhub); - /* INIT the bau */ - val = 1L << 63; - write_gmmr_activation(pnode, val); - mmr = 1; /* should be 1 to broadcast to both sockets */ - write_mmr_data_broadcast(pnode, mmr); - } - } - - return 0; - -err_bau_disable: - - for_each_possible_cpu(cur_cpu) - free_cpumask_var(per_cpu(uv_flush_tlb_mask, cur_cpu)); - - set_bau_off(); - nobau_perm = 1; - - return -EINVAL; -} -core_initcall(uv_bau_init); -fs_initcall(uv_ptc_init); diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index abb6075397f0..18ca2261cc9a 100644 --- a/arch/x86/platform/uv/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c @@ -90,15 +90,15 @@ static int uv_domain_alloc(struct irq_domain *domain, unsigned int virq, ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); if (ret >= 0) { - if (info->uv_limit == UV_AFFINITY_CPU) + if (info->uv.limit == UV_AFFINITY_CPU) irq_set_status_flags(virq, IRQ_NO_BALANCING); else irq_set_status_flags(virq, IRQ_MOVE_PCNTXT); - chip_data->pnode = uv_blade_to_pnode(info->uv_blade); - chip_data->offset = info->uv_offset; + chip_data->pnode = uv_blade_to_pnode(info->uv.blade); + chip_data->offset = info->uv.offset; irq_domain_set_info(domain, virq, virq, &uv_irq_chip, chip_data, - handle_percpu_irq, NULL, info->uv_name); + handle_percpu_irq, NULL, info->uv.name); } else { kfree(chip_data); } @@ -193,10 +193,10 @@ int uv_setup_irq(char *irq_name, int cpu, int mmr_blade, init_irq_alloc_info(&info, cpumask_of(cpu)); info.type = X86_IRQ_ALLOC_TYPE_UV; - info.uv_limit = limit; - info.uv_blade = mmr_blade; - info.uv_offset = mmr_offset; - info.uv_name = irq_name; + info.uv.limit = limit; + info.uv.blade = mmr_blade; + info.uv.offset = mmr_offset; + info.uv.name = irq_name; return irq_domain_alloc_irqs(domain, 1, uv_blade_to_memory_nid(mmr_blade), &info); diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c index 9d08ff5a755e..eafc530c8767 100644 --- a/arch/x86/platform/uv/uv_nmi.c +++ b/arch/x86/platform/uv/uv_nmi.c @@ -2,8 +2,9 @@ /* * SGI NMI support routines * - * Copyright (c) 2009-2013 Silicon Graphics, Inc. All Rights Reserved. - * Copyright (c) Mike Travis + * (C) Copyright 2020 Hewlett Packard Enterprise Development LP + * Copyright (C) 2007-2017 Silicon Graphics, Inc. All rights reserved. + * Copyright (c) Mike Travis */ #include <linux/cpu.h> @@ -54,6 +55,19 @@ static struct uv_hub_nmi_s **uv_hub_nmi_list; DEFINE_PER_CPU(struct uv_cpu_nmi_s, uv_cpu_nmi); +/* Newer SMM NMI handler, not present in all systems */ +static unsigned long uvh_nmi_mmrx; /* UVH_EVENT_OCCURRED0/1 */ +static unsigned long uvh_nmi_mmrx_clear; /* UVH_EVENT_OCCURRED0/1_ALIAS */ +static int uvh_nmi_mmrx_shift; /* UVH_EVENT_OCCURRED0/1_EXTIO_INT0_SHFT */ +static char *uvh_nmi_mmrx_type; /* "EXTIO_INT0" */ + +/* Non-zero indicates newer SMM NMI handler present */ +static unsigned long uvh_nmi_mmrx_supported; /* UVH_EXTIO_INT0_BROADCAST */ + +/* Indicates to BIOS that we want to use the newer SMM NMI handler */ +static unsigned long uvh_nmi_mmrx_req; /* UVH_BIOS_KERNEL_MMR_ALIAS_2 */ +static int uvh_nmi_mmrx_req_shift; /* 62 */ + /* UV hubless values */ #define NMI_CONTROL_PORT 0x70 #define NMI_DUMMY_PORT 0x71 @@ -227,13 +241,41 @@ static inline bool uv_nmi_action_is(const char *action) /* Setup which NMI support is present in system */ static void uv_nmi_setup_mmrs(void) { - if (uv_read_local_mmr(UVH_NMI_MMRX_SUPPORTED)) { - uv_write_local_mmr(UVH_NMI_MMRX_REQ, - 1UL << UVH_NMI_MMRX_REQ_SHIFT); - nmi_mmr = UVH_NMI_MMRX; - nmi_mmr_clear = UVH_NMI_MMRX_CLEAR; - nmi_mmr_pending = 1UL << UVH_NMI_MMRX_SHIFT; - pr_info("UV: SMI NMI support: %s\n", UVH_NMI_MMRX_TYPE); + /* First determine arch specific MMRs to handshake with BIOS */ + if (UVH_EVENT_OCCURRED0_EXTIO_INT0_MASK) { + uvh_nmi_mmrx = UVH_EVENT_OCCURRED0; + uvh_nmi_mmrx_clear = UVH_EVENT_OCCURRED0_ALIAS; + uvh_nmi_mmrx_shift = UVH_EVENT_OCCURRED0_EXTIO_INT0_SHFT; + uvh_nmi_mmrx_type = "OCRD0-EXTIO_INT0"; + + uvh_nmi_mmrx_supported = UVH_EXTIO_INT0_BROADCAST; + uvh_nmi_mmrx_req = UVH_BIOS_KERNEL_MMR_ALIAS_2; + uvh_nmi_mmrx_req_shift = 62; + + } else if (UVH_EVENT_OCCURRED1_EXTIO_INT0_MASK) { + uvh_nmi_mmrx = UVH_EVENT_OCCURRED1; + uvh_nmi_mmrx_clear = UVH_EVENT_OCCURRED1_ALIAS; + uvh_nmi_mmrx_shift = UVH_EVENT_OCCURRED1_EXTIO_INT0_SHFT; + uvh_nmi_mmrx_type = "OCRD1-EXTIO_INT0"; + + uvh_nmi_mmrx_supported = UVH_EXTIO_INT0_BROADCAST; + uvh_nmi_mmrx_req = UVH_BIOS_KERNEL_MMR_ALIAS_2; + uvh_nmi_mmrx_req_shift = 62; + + } else { + pr_err("UV:%s:cannot find EVENT_OCCURRED*_EXTIO_INT0\n", + __func__); + return; + } + + /* Then find out if new NMI is supported */ + if (likely(uv_read_local_mmr(uvh_nmi_mmrx_supported))) { + uv_write_local_mmr(uvh_nmi_mmrx_req, + 1UL << uvh_nmi_mmrx_req_shift); + nmi_mmr = uvh_nmi_mmrx; + nmi_mmr_clear = uvh_nmi_mmrx_clear; + nmi_mmr_pending = 1UL << uvh_nmi_mmrx_shift; + pr_info("UV: SMI NMI support: %s\n", uvh_nmi_mmrx_type); } else { nmi_mmr = UVH_NMI_MMR; nmi_mmr_clear = UVH_NMI_MMR_CLEAR; @@ -1049,5 +1091,5 @@ void __init uv_nmi_setup_hubless(void) /* Ensure NMI enabled in Processor Interface Reg: */ uv_reassert_nmi(); uv_register_nmi_notifier(); - pr_info("UV: Hubless NMI enabled\n"); + pr_info("UV: PCH NMI enabled\n"); } diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c index f82a1337a608..54663f3e00cb 100644 --- a/arch/x86/platform/uv/uv_time.c +++ b/arch/x86/platform/uv/uv_time.c @@ -2,6 +2,7 @@ /* * SGI RTC clock/timer routines. * + * (C) Copyright 2020 Hewlett Packard Enterprise Development LP * Copyright (c) 2009-2013 Silicon Graphics, Inc. All Rights Reserved. * Copyright (c) Dimitri Sivanich */ @@ -52,7 +53,7 @@ struct uv_rtc_timer_head { struct { int lcpu; /* systemwide logical cpu number */ u64 expires; /* next timer expiration for this cpu */ - } cpu[1]; + } cpu[]; }; /* @@ -84,10 +85,8 @@ static void uv_rtc_send_IPI(int cpu) /* Check for an RTC interrupt pending */ static int uv_intr_pending(int pnode) { - if (is_uvx_hub()) - return uv_read_global_mmr64(pnode, UVXH_EVENT_OCCURRED2) & - UVXH_EVENT_OCCURRED2_RTC_1_MASK; - return 0; + return uv_read_global_mmr64(pnode, UVH_EVENT_OCCURRED2) & + UVH_EVENT_OCCURRED2_RTC_1_MASK; } /* Setup interrupt and return non-zero if early expiration occurred. */ @@ -101,8 +100,8 @@ static int uv_setup_intr(int cpu, u64 expires) UVH_RTC1_INT_CONFIG_M_MASK); uv_write_global_mmr64(pnode, UVH_INT_CMPB, -1L); - uv_write_global_mmr64(pnode, UVXH_EVENT_OCCURRED2_ALIAS, - UVXH_EVENT_OCCURRED2_RTC_1_MASK); + uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED2_ALIAS, + UVH_EVENT_OCCURRED2_RTC_1_MASK); val = (X86_PLATFORM_IPI_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) | ((u64)apicid << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT); @@ -148,9 +147,8 @@ static __init int uv_rtc_allocate_timers(void) struct uv_rtc_timer_head *head = blade_info[bid]; if (!head) { - head = kmalloc_node(sizeof(struct uv_rtc_timer_head) + - (uv_blade_nr_possible_cpus(bid) * - 2 * sizeof(u64)), + head = kmalloc_node(struct_size(head, cpu, + uv_blade_nr_possible_cpus(bid)), GFP_KERNEL, nid); if (!head) { uv_rtc_deallocate_timers(); diff --git a/arch/x86/purgatory/purgatory.c b/arch/x86/purgatory/purgatory.c index 2961234d0795..7b37a412f829 100644 --- a/arch/x86/purgatory/purgatory.c +++ b/arch/x86/purgatory/purgatory.c @@ -14,9 +14,9 @@ #include "../boot/string.h" -u8 purgatory_sha256_digest[SHA256_DIGEST_SIZE] __section(.kexec-purgatory); +u8 purgatory_sha256_digest[SHA256_DIGEST_SIZE] __section(".kexec-purgatory"); -struct kexec_sha_region purgatory_sha_regions[KEXEC_SEGMENT_MAX] __section(.kexec-purgatory); +struct kexec_sha_region purgatory_sha_regions[KEXEC_SEGMENT_MAX] __section(".kexec-purgatory"); static int verify_sha256_digest(void) { diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index 1ed1208931e0..22fda7d99159 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -9,6 +9,7 @@ #include <asm/realmode.h> #include <asm/tlbflush.h> #include <asm/crash.h> +#include <asm/sev-es.h> struct real_mode_header *real_mode_header; u32 *trampoline_cr4_features; @@ -38,6 +39,25 @@ void __init reserve_real_mode(void) crash_reserve_low_1M(); } +static void sme_sev_setup_real_mode(struct trampoline_header *th) +{ +#ifdef CONFIG_AMD_MEM_ENCRYPT + if (sme_active()) + th->flags |= TH_FLAGS_SME_ACTIVE; + + if (sev_es_active()) { + /* + * Skip the call to verify_cpu() in secondary_startup_64 as it + * will cause #VC exceptions when the AP can't handle them yet. + */ + th->start = (u64) secondary_startup_64_no_verify; + + if (sev_es_setup_ap_jump_table(real_mode_header)) + panic("Failed to get/update SEV-ES AP Jump Table"); + } +#endif +} + static void __init setup_real_mode(void) { u16 real_mode_seg; @@ -104,13 +124,13 @@ static void __init setup_real_mode(void) *trampoline_cr4_features = mmu_cr4_features; trampoline_header->flags = 0; - if (sme_active()) - trampoline_header->flags |= TH_FLAGS_SME_ACTIVE; trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd); trampoline_pgd[0] = trampoline_pgd_entry.pgd; trampoline_pgd[511] = init_top_pgt[511].pgd; #endif + + sme_sev_setup_real_mode(trampoline_header); } /* diff --git a/arch/x86/realmode/rm/header.S b/arch/x86/realmode/rm/header.S index af04512c02d9..8c1db5bf5d78 100644 --- a/arch/x86/realmode/rm/header.S +++ b/arch/x86/realmode/rm/header.S @@ -20,6 +20,9 @@ SYM_DATA_START(real_mode_header) /* SMP trampoline */ .long pa_trampoline_start .long pa_trampoline_header +#ifdef CONFIG_AMD_MEM_ENCRYPT + .long pa_sev_es_trampoline_start +#endif #ifdef CONFIG_X86_64 .long pa_trampoline_pgd; #endif diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S index 251758ed7443..84c5d1b33d10 100644 --- a/arch/x86/realmode/rm/trampoline_64.S +++ b/arch/x86/realmode/rm/trampoline_64.S @@ -56,6 +56,7 @@ SYM_CODE_START(trampoline_start) testl %eax, %eax # Check for return code jnz no_longmode +.Lswitch_to_protected: /* * GDT tables in non default location kernel can be beyond 16MB and * lgdt will not be able to load the address as in real mode default @@ -80,6 +81,25 @@ no_longmode: jmp no_longmode SYM_CODE_END(trampoline_start) +#ifdef CONFIG_AMD_MEM_ENCRYPT +/* SEV-ES supports non-zero IP for entry points - no alignment needed */ +SYM_CODE_START(sev_es_trampoline_start) + cli # We should be safe anyway + + LJMPW_RM(1f) +1: + mov %cs, %ax # Code and data in the same place + mov %ax, %ds + mov %ax, %es + mov %ax, %ss + + # Setup stack + movl $rm_stack_end, %esp + + jmp .Lswitch_to_protected +SYM_CODE_END(sev_es_trampoline_start) +#endif /* CONFIG_AMD_MEM_ENCRYPT */ + #include "../kernel/verify_cpu.S" .section ".text32","ax" diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk index a42015b305f4..af38469afd14 100644 --- a/arch/x86/tools/gen-insn-attr-x86.awk +++ b/arch/x86/tools/gen-insn-attr-x86.awk @@ -362,6 +362,9 @@ function convert_operands(count,opnd, i,j,imm,mod) END { if (awkchecked != "") exit 1 + + print "#ifndef __BOOT_COMPRESSED\n" + # print escape opcode map's array print "/* Escape opcode map array */" print "const insn_attr_t * const inat_escape_tables[INAT_ESC_MAX + 1]" \ @@ -388,6 +391,51 @@ END { for (j = 0; j < max_lprefix; j++) if (atable[i,j]) print " ["i"]["j"] = "atable[i,j]"," - print "};" + print "};\n" + + print "#else /* !__BOOT_COMPRESSED */\n" + + print "/* Escape opcode map array */" + print "static const insn_attr_t *inat_escape_tables[INAT_ESC_MAX + 1]" \ + "[INAT_LSTPFX_MAX + 1];" + print "" + + print "/* Group opcode map array */" + print "static const insn_attr_t *inat_group_tables[INAT_GRP_MAX + 1]"\ + "[INAT_LSTPFX_MAX + 1];" + print "" + + print "/* AVX opcode map array */" + print "static const insn_attr_t *inat_avx_tables[X86_VEX_M_MAX + 1]"\ + "[INAT_LSTPFX_MAX + 1];" + print "" + + print "static void inat_init_tables(void)" + print "{" + + # print escape opcode map's array + print "\t/* Print Escape opcode map array */" + for (i = 0; i < geid; i++) + for (j = 0; j < max_lprefix; j++) + if (etable[i,j]) + print "\tinat_escape_tables["i"]["j"] = "etable[i,j]";" + print "" + + # print group opcode map's array + print "\t/* Print Group opcode map array */" + for (i = 0; i < ggid; i++) + for (j = 0; j < max_lprefix; j++) + if (gtable[i,j]) + print "\tinat_group_tables["i"]["j"] = "gtable[i,j]";" + print "" + # print AVX opcode map's array + print "\t/* Print AVX opcode map array */" + for (i = 0; i < gaid; i++) + for (j = 0; j < max_lprefix; j++) + if (atable[i,j]) + print "\tinat_avx_tables["i"]["j"] = "atable[i,j]";" + + print "}" + print "#endif" } diff --git a/arch/x86/um/asm/checksum.h b/arch/x86/um/asm/checksum.h index ff6bba2c8ab6..b07824500363 100644 --- a/arch/x86/um/asm/checksum.h +++ b/arch/x86/um/asm/checksum.h @@ -20,22 +20,6 @@ */ extern __wsum csum_partial(const void *buff, int len, __wsum sum); -/* - * Note: when you get a NULL pointer exception here this means someone - * passed in an incorrect kernel address to one of these functions. - * - * If you use these functions directly please don't forget the - * access_ok(). - */ - -static __inline__ -__wsum csum_partial_copy_nocheck(const void *src, void *dst, - int len, __wsum sum) -{ - memcpy(dst, src, len); - return csum_partial(dst, len, sum); -} - /** * csum_fold - Fold and invert a 32bit checksum. * sum: 32bit unfolded sum diff --git a/arch/x86/um/asm/checksum_32.h b/arch/x86/um/asm/checksum_32.h index b9ac7c9eb72c..0b13c2947ad1 100644 --- a/arch/x86/um/asm/checksum_32.h +++ b/arch/x86/um/asm/checksum_32.h @@ -35,27 +35,4 @@ static __inline__ __sum16 csum_ipv6_magic(const struct in6_addr *saddr, return csum_fold(sum); } -/* - * Copy and checksum to user - */ -#define HAVE_CSUM_COPY_USER -static __inline__ __wsum csum_and_copy_to_user(const void *src, - void __user *dst, - int len, __wsum sum, int *err_ptr) -{ - if (access_ok(dst, len)) { - if (copy_to_user(dst, src, len)) { - *err_ptr = -EFAULT; - return (__force __wsum)-1; - } - - return csum_partial(src, len, sum); - } - - if (len) - *err_ptr = -EFAULT; - - return (__force __wsum)-1; /* invalid checksum */ -} - #endif diff --git a/arch/x86/um/ptrace_64.c b/arch/x86/um/ptrace_64.c index 09a085bde0d4..1401899dee9b 100644 --- a/arch/x86/um/ptrace_64.c +++ b/arch/x86/um/ptrace_64.c @@ -52,14 +52,6 @@ static const int reg_offsets[] = int putreg(struct task_struct *child, int regno, unsigned long value) { -#ifdef TIF_IA32 - /* - * Some code in the 64bit emulation may not be 64bit clean. - * Don't take any chances. - */ - if (test_tsk_thread_flag(child, TIF_IA32)) - value &= 0xffffffff; -#endif switch (regno) { case R8: case R9: @@ -137,10 +129,7 @@ int poke_user(struct task_struct *child, long addr, long data) unsigned long getreg(struct task_struct *child, int regno) { unsigned long mask = ~0UL; -#ifdef TIF_IA32 - if (test_tsk_thread_flag(child, TIF_IA32)) - mask = 0xffffffff; -#endif + switch (regno) { case R8: case R9: diff --git a/arch/x86/um/stub_segv.c b/arch/x86/um/stub_segv.c index 27361cbb7ca9..fdcd58af707a 100644 --- a/arch/x86/um/stub_segv.c +++ b/arch/x86/um/stub_segv.c @@ -8,7 +8,7 @@ #include <sysdep/mcontext.h> #include <sys/ucontext.h> -void __attribute__ ((__section__ (".__syscall_stub"))) +void __section(".__syscall_stub") stub_segv_handler(int sig, siginfo_t *info, void *p) { ucontext_t *uc = p; diff --git a/arch/x86/um/user-offsets.c b/arch/x86/um/user-offsets.c index c51dd8363d25..bae61554abcc 100644 --- a/arch/x86/um/user-offsets.c +++ b/arch/x86/um/user-offsets.c @@ -2,7 +2,7 @@ #include <stdio.h> #include <stddef.h> #include <signal.h> -#include <sys/poll.h> +#include <poll.h> #include <sys/mman.h> #include <sys/user.h> #define __FRAME_OFFSETS diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 205b1176084f..aa9f50fccc5d 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -71,7 +71,7 @@ EXPORT_SYMBOL_GPL(xen_have_vector_callback); * NB: needs to live in .data because it's used by xen_prepare_pvh which runs * before clearing the bss. */ -uint32_t xen_start_flags __attribute__((section(".data"))) = 0; +uint32_t xen_start_flags __section(".data") = 0; EXPORT_SYMBOL(xen_start_flags); /* diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 22e741e0b10c..4409306364dc 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -32,7 +32,7 @@ #include <linux/pci.h> #include <linux/gfp.h> #include <linux/edd.h> -#include <linux/frame.h> +#include <linux/objtool.h> #include <xen/xen.h> #include <xen/events.h> @@ -1014,8 +1014,6 @@ void __init xen_setup_vcpu_info_placement(void) } static const struct pv_info xen_info __initconst = { - .shared_kernel_pmd = 0, - .extra_user_64bit_cs = FLAT_USER_CS64, .name = "Xen", }; @@ -1302,7 +1300,7 @@ asmlinkage __visible void __init xen_start_kernel(void) * any NUMA information the kernel tries to get from ACPI will * be meaningless. Prevent it from trying. */ - acpi_numa = -1; + disable_srat(); #endif WARN_ON(xen_cpuhp_setup(xen_cpu_up_prepare_pv, xen_cpu_dead_pv)); @@ -1314,10 +1312,6 @@ asmlinkage __visible void __init xen_start_kernel(void) xen_start_info->nr_pages); xen_reserve_special_pages(); - /* keep using Xen gdt for now; no urgent need to change it */ - - pv_info.kernel_rpl = 0; - /* * We used to do this in xen_arch_setup, but that is too late * on AMD were early_cpu_init (run before ->arch_setup()) calls @@ -1376,6 +1370,15 @@ asmlinkage __visible void __init xen_start_kernel(void) x86_init.mpparse.get_smp_config = x86_init_uint_noop; xen_boot_params_init_edd(); + +#ifdef CONFIG_ACPI + /* + * Disable selecting "Firmware First mode" for correctable + * memory errors, as this is the duty of the hypervisor to + * decide. + */ + acpi_disable_cmcff = 1; +#endif } if (!boot_params.screen_info.orig_video_isVGA) diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c index 80a79db72fcf..0d5e34b9e6f9 100644 --- a/arch/x86/xen/enlighten_pvh.c +++ b/arch/x86/xen/enlighten_pvh.c @@ -21,7 +21,7 @@ * The variable xen_pvh needs to live in the data segment since it is used * after startup_{32|64} is invoked, which will clear the .bss segment. */ -bool xen_pvh __attribute__((section(".data"))) = 0; +bool xen_pvh __section(".data") = 0; void __init xen_pvh_init(struct boot_params *boot_params) { diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c index 4988e19598c8..1e681bf62561 100644 --- a/arch/x86/xen/grant-table.c +++ b/arch/x86/xen/grant-table.c @@ -25,6 +25,7 @@ static struct gnttab_vm_area { struct vm_struct *area; pte_t **ptes; + int idx; } gnttab_shared_vm_area, gnttab_status_vm_area; int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes, @@ -90,19 +91,31 @@ void arch_gnttab_unmap(void *shared, unsigned long nr_gframes) } } +static int gnttab_apply(pte_t *pte, unsigned long addr, void *data) +{ + struct gnttab_vm_area *area = data; + + area->ptes[area->idx++] = pte; + return 0; +} + static int arch_gnttab_valloc(struct gnttab_vm_area *area, unsigned nr_frames) { area->ptes = kmalloc_array(nr_frames, sizeof(*area->ptes), GFP_KERNEL); if (area->ptes == NULL) return -ENOMEM; - - area->area = alloc_vm_area(PAGE_SIZE * nr_frames, area->ptes); - if (area->area == NULL) { - kfree(area->ptes); - return -ENOMEM; - } - + area->area = get_vm_area(PAGE_SIZE * nr_frames, VM_IOREMAP); + if (!area->area) + goto out_free_ptes; + if (apply_to_page_range(&init_mm, (unsigned long)area->area->addr, + PAGE_SIZE * nr_frames, gnttab_apply, area)) + goto out_free_vm_area; return 0; +out_free_vm_area: + free_vm_area(area->area); +out_free_ptes: + kfree(area->ptes); + return -ENOMEM; } static void arch_gnttab_vfree(struct gnttab_vm_area *area) diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 3273c985d3dd..cf2ade864c30 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -285,13 +285,6 @@ static void xen_set_pte(pte_t *ptep, pte_t pteval) __xen_set_pte(ptep, pteval); } -static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pteval) -{ - trace_xen_mmu_set_pte_at(mm, addr, ptep, pteval); - __xen_set_pte(ptep, pteval); -} - pte_t xen_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { @@ -1149,7 +1142,7 @@ static void __init xen_pagetable_p2m_free(void) * We could be in __ka space. * We roundup to the PMD, which means that if anybody at this stage is * using the __ka address of xen_start_info or - * xen_start_info->shared_info they are in going to crash. Fortunatly + * xen_start_info->shared_info they are in going to crash. Fortunately * we have already revectored in xen_setup_kernel_pagetable. */ size = roundup(size, PMD_SIZE); @@ -2105,7 +2098,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { .release_pmd = xen_release_pmd_init, .set_pte = xen_set_pte_init, - .set_pte_at = xen_set_pte_at, .set_pmd = xen_set_pmd_hyper, .ptep_modify_prot_start = __ptep_modify_prot_start, diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c index 33293ce01d8d..19ae3e4fe4e9 100644 --- a/arch/x86/xen/pci-swiotlb-xen.c +++ b/arch/x86/xen/pci-swiotlb-xen.c @@ -2,7 +2,7 @@ /* Glue code to lib/swiotlb-xen.c */ -#include <linux/dma-mapping.h> +#include <linux/dma-map-ops.h> #include <linux/pci.h> #include <xen/swiotlb-xen.h> diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 2097fa0ebdb5..c1b2f764b29a 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -88,14 +88,17 @@ int xen_smp_intr_init(unsigned int cpu) per_cpu(xen_callfunc_irq, cpu).irq = rc; per_cpu(xen_callfunc_irq, cpu).name = callfunc_name; - debug_name = kasprintf(GFP_KERNEL, "debug%d", cpu); - rc = bind_virq_to_irqhandler(VIRQ_DEBUG, cpu, xen_debug_interrupt, - IRQF_PERCPU | IRQF_NOBALANCING, - debug_name, NULL); - if (rc < 0) - goto fail; - per_cpu(xen_debug_irq, cpu).irq = rc; - per_cpu(xen_debug_irq, cpu).name = debug_name; + if (!xen_fifo_events) { + debug_name = kasprintf(GFP_KERNEL, "debug%d", cpu); + rc = bind_virq_to_irqhandler(VIRQ_DEBUG, cpu, + xen_debug_interrupt, + IRQF_PERCPU | IRQF_NOBALANCING, + debug_name, NULL); + if (rc < 0) + goto fail; + per_cpu(xen_debug_irq, cpu).irq = rc; + per_cpu(xen_debug_irq, cpu).name = debug_name; + } callfunc_name = kasprintf(GFP_KERNEL, "callfuncsingle%d", cpu); rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_SINGLE_VECTOR, diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 45d556f71858..9546c3384c75 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -29,6 +29,8 @@ extern struct start_info *xen_start_info; extern struct shared_info xen_dummy_shared_info; extern struct shared_info *HYPERVISOR_shared_info; +extern bool xen_fifo_events; + void xen_setup_mfn_list_list(void); void xen_build_mfn_list_list(void); void xen_setup_machphys_mapping(void); |