Diffstat (limited to 'arch/x86')
-rw-r--r-- arch/x86/Kconfig | 19
-rw-r--r-- arch/x86/Kconfig.debug | 27
-rw-r--r-- arch/x86/Makefile | 12
-rw-r--r-- arch/x86/boot/boot.h | 2
-rw-r--r-- arch/x86/boot/compressed/kaslr.c | 11
-rw-r--r-- arch/x86/configs/i386_defconfig | 2
-rw-r--r-- arch/x86/configs/x86_64_defconfig | 2
-rw-r--r-- arch/x86/entry/entry_32.S | 171
-rw-r--r-- arch/x86/entry/entry_64.S | 15
-rw-r--r-- arch/x86/entry/vdso/vclock_gettime.c | 24
-rw-r--r-- arch/x86/entry/vdso/vdso-layout.lds.S | 3
-rw-r--r-- arch/x86/entry/vdso/vdso2c.c | 3
-rw-r--r-- arch/x86/entry/vdso/vdso32-setup.c | 11
-rw-r--r-- arch/x86/entry/vdso/vma.c | 7
-rw-r--r-- arch/x86/events/amd/iommu.c | 325
-rw-r--r-- arch/x86/events/amd/iommu.h | 18
-rw-r--r-- arch/x86/events/amd/uncore.c | 77
-rw-r--r-- arch/x86/events/intel/bts.c | 16
-rw-r--r-- arch/x86/events/intel/core.c | 24
-rw-r--r-- arch/x86/events/intel/ds.c | 2
-rw-r--r-- arch/x86/events/intel/lbr.c | 3
-rw-r--r-- arch/x86/events/intel/pt.c | 129
-rw-r--r-- arch/x86/events/intel/pt.h | 2
-rw-r--r-- arch/x86/events/perf_event.h | 1
-rw-r--r-- arch/x86/hyperv/hv_init.c | 48
-rw-r--r-- arch/x86/include/asm/apic.h | 7
-rw-r--r-- arch/x86/include/asm/atomic.h | 33
-rw-r--r-- arch/x86/include/asm/atomic64_64.h | 46
-rw-r--r-- arch/x86/include/asm/bug.h | 80
-rw-r--r-- arch/x86/include/asm/clocksource.h | 3
-rw-r--r-- arch/x86/include/asm/cmpxchg.h | 70
-rw-r--r-- arch/x86/include/asm/cpufeatures.h | 2
-rw-r--r-- arch/x86/include/asm/elf.h | 2
-rw-r--r-- arch/x86/include/asm/intel-family.h | 6
-rw-r--r-- arch/x86/include/asm/intel_rdt.h | 157
-rw-r--r-- arch/x86/include/asm/kprobes.h | 7
-rw-r--r-- arch/x86/include/asm/mce.h | 12
-rw-r--r-- arch/x86/include/asm/mshyperv.h | 54
-rw-r--r-- arch/x86/include/asm/page_64.h | 16
-rw-r--r-- arch/x86/include/asm/pmem.h | 42
-rw-r--r-- arch/x86/include/asm/reboot.h | 1
-rw-r--r-- arch/x86/include/asm/smp.h | 35
-rw-r--r-- arch/x86/include/asm/uaccess.h | 70
-rw-r--r-- arch/x86/include/asm/uaccess_32.h | 127
-rw-r--r-- arch/x86/include/asm/uaccess_64.h | 128
-rw-r--r-- arch/x86/include/asm/unwind.h | 2
-rw-r--r-- arch/x86/include/asm/uv/uv_bau.h | 82
-rw-r--r-- arch/x86/include/asm/vdso.h | 1
-rw-r--r-- arch/x86/kernel/Makefile | 5
-rw-r--r-- arch/x86/kernel/acpi/boot.c | 6
-rw-r--r-- arch/x86/kernel/apic/apic.c | 10
-rw-r--r-- arch/x86/kernel/cpu/intel_rdt.c | 350
-rw-r--r-- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 125
-rw-r--r-- arch/x86/kernel/cpu/intel_rdt_schemata.c | 181
-rw-r--r-- arch/x86/kernel/cpu/mcheck/Makefile | 2
-rw-r--r-- arch/x86/kernel/cpu/mcheck/dev-mcelog.c | 397
-rw-r--r-- arch/x86/kernel/cpu/mcheck/mce-genpool.c | 2
-rw-r--r-- arch/x86/kernel/cpu/mcheck/mce-internal.h | 10
-rw-r--r-- arch/x86/kernel/cpu/mcheck/mce.c | 573
-rw-r--r-- arch/x86/kernel/cpu/mcheck/mce_intel.c | 3
-rw-r--r-- arch/x86/kernel/cpu/scattered.c | 1
-rw-r--r-- arch/x86/kernel/dumpstack.c | 5
-rw-r--r-- arch/x86/kernel/dumpstack_32.c | 12
-rw-r--r-- arch/x86/kernel/dumpstack_64.c | 10
-rw-r--r-- arch/x86/kernel/early_printk.c | 5
-rw-r--r-- arch/x86/kernel/ftrace.c | 18
-rw-r--r-- arch/x86/kernel/ftrace_32.S | 244
-rw-r--r-- arch/x86/kernel/ftrace_64.S (renamed from arch/x86/kernel/mcount_64.S) | 6
-rw-r--r-- arch/x86/kernel/irq.c | 9
-rw-r--r-- arch/x86/kernel/irqinit.c | 2
-rw-r--r-- arch/x86/kernel/kprobes/common.h | 4
-rw-r--r-- arch/x86/kernel/kprobes/core.c | 149
-rw-r--r-- arch/x86/kernel/kprobes/ftrace.c | 2
-rw-r--r-- arch/x86/kernel/kprobes/opt.c | 13
-rw-r--r-- arch/x86/kernel/nmi.c | 11
-rw-r--r-- arch/x86/kernel/pci-calgary_64.c | 5
-rw-r--r-- arch/x86/kernel/reboot.c | 5
-rw-r--r-- arch/x86/kernel/setup.c | 26
-rw-r--r-- arch/x86/kernel/signal.c | 2
-rw-r--r-- arch/x86/kernel/signal_compat.c | 4
-rw-r--r-- arch/x86/kernel/smp.c | 5
-rw-r--r-- arch/x86/kernel/traps.c | 50
-rw-r--r-- arch/x86/kernel/unwind_frame.c | 234
-rw-r--r-- arch/x86/kernel/unwind_guess.c | 4
-rw-r--r-- arch/x86/kernel/vmlinux.lds.S | 1
-rw-r--r-- arch/x86/kvm/vmx.c | 15
-rw-r--r-- arch/x86/lguest/boot.c | 2
-rw-r--r-- arch/x86/lib/clear_page_64.S | 17
-rw-r--r-- arch/x86/lib/delay.c | 7
-rw-r--r-- arch/x86/lib/usercopy.c | 54
-rw-r--r-- arch/x86/lib/usercopy_32.c | 288
-rw-r--r-- arch/x86/lib/usercopy_64.c | 13
-rw-r--r-- arch/x86/mm/init.c | 41
-rw-r--r-- arch/x86/platform/efi/Makefile | 1
-rw-r--r-- arch/x86/platform/efi/efi-bgrt.c | 84
-rw-r--r-- arch/x86/platform/efi/efi_64.c | 2
-rw-r--r-- arch/x86/platform/efi/quirks.c | 4
-rw-r--r-- arch/x86/platform/intel-mid/device_libs/Makefile | 3
-rw-r--r-- arch/x86/platform/intel-mid/device_libs/platform_bt.c | 108
-rw-r--r-- arch/x86/platform/uv/tlb_uv.c | 195
-rw-r--r-- arch/x86/platform/uv/uv_time.c | 2
-rw-r--r-- arch/x86/ras/Kconfig | 14
-rw-r--r-- arch/x86/um/Makefile | 2
-rw-r--r-- arch/x86/um/bug.c | 21
-rw-r--r-- arch/x86/xen/time.c | 4
105 files changed, 2919 insertions, 2386 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2bde14451e54..8d4f87e5bba3 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -98,7 +98,6 @@ config X86
select HAVE_ACPI_APEI_NMI if ACPI
select HAVE_ALIGNED_STRUCT_PAGE if SLUB
select HAVE_ARCH_AUDITSYSCALL
- select HAVE_ARCH_HARDENED_USERCOPY
select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE
select HAVE_ARCH_JUMP_LABEL
select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP
@@ -128,7 +127,7 @@ config X86
select HAVE_EBPF_JIT if X86_64
select HAVE_EFFICIENT_UNALIGNED_ACCESS
select HAVE_EXIT_THREAD
- select HAVE_FENTRY if X86_64
+ select HAVE_FENTRY if X86_64 || DYNAMIC_FTRACE
select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_FUNCTION_TRACER
@@ -1045,6 +1044,14 @@ config X86_MCE
The action the kernel takes depends on the severity of the problem,
ranging from warning messages to halting the machine.
+config X86_MCELOG_LEGACY
+ bool "Support for deprecated /dev/mcelog character device"
+ depends on X86_MCE
+ ---help---
+ Enable support for /dev/mcelog which is needed by the old mcelog
+ userspace logging daemon. Consider switching to the new generation
+ rasdaemon solution.
+
config X86_MCE_INTEL
def_bool y
prompt "Intel MCE features"
@@ -1074,7 +1081,7 @@ config X86_MCE_THRESHOLD
def_bool y
config X86_MCE_INJECT
- depends on X86_MCE && X86_LOCAL_APIC
+ depends on X86_MCE && X86_LOCAL_APIC && X86_MCELOG_LEGACY
tristate "Machine check injector support"
---help---
Provide support for injecting machine checks for testing purposes.
@@ -1968,7 +1975,7 @@ config RELOCATABLE
config RANDOMIZE_BASE
bool "Randomize the address of the kernel image (KASLR)"
depends on RELOCATABLE
- default n
+ default y
---help---
In support of Kernel Address Space Layout Randomization (KASLR),
this randomizes the physical address at which the kernel image
@@ -1998,7 +2005,7 @@ config RANDOMIZE_BASE
theoretically possible, but the implementations are further
limited due to memory layouts.
- If unsure, say N.
+ If unsure, say Y.
# Relocation on x86 needs some additional build support
config X86_NEED_RELOCS
@@ -2047,7 +2054,7 @@ config RANDOMIZE_MEMORY
configuration have in average 30,000 different possible virtual
addresses for each memory section.
- If unsure, say N.
+ If unsure, say Y.
config RANDOMIZE_MEMORY_PHYSICAL_PADDING
hex "Physical memory mapping padding" if EXPERT
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 63c1d13aaf9f..fcb7604172ce 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -5,6 +5,9 @@ config TRACE_IRQFLAGS_SUPPORT
source "lib/Kconfig.debug"
+config EARLY_PRINTK_USB
+ bool
+
config X86_VERBOSE_BOOTUP
bool "Enable verbose x86 bootup info messages"
default y
@@ -23,19 +26,20 @@ config EARLY_PRINTK
This is useful for kernel debugging when your machine crashes very
early before the console code is initialized. For normal operation
it is not recommended because it looks ugly and doesn't cooperate
- with klogd/syslogd or the X server. You should normally N here,
+ with klogd/syslogd or the X server. You should normally say N here,
unless you want to debug such a crash.
config EARLY_PRINTK_DBGP
bool "Early printk via EHCI debug port"
depends on EARLY_PRINTK && PCI
+ select EARLY_PRINTK_USB
---help---
Write kernel log output directly into the EHCI debug port.
This is useful for kernel debugging when your machine crashes very
early before the console code is initialized. For normal operation
it is not recommended because it looks ugly and doesn't cooperate
- with klogd/syslogd or the X server. You should normally N here,
+ with klogd/syslogd or the X server. You should normally say N here,
unless you want to debug such a crash. You need usb debug device.
config EARLY_PRINTK_EFI
@@ -48,6 +52,25 @@ config EARLY_PRINTK_EFI
This is useful for kernel debugging when your machine crashes very
early before the console code is initialized.
+config EARLY_PRINTK_USB_XDBC
+ bool "Early printk via the xHCI debug port"
+ depends on EARLY_PRINTK && PCI
+ select EARLY_PRINTK_USB
+ ---help---
+ Write kernel log output directly into the xHCI debug port.
+
+ One use for this feature is kernel debugging, for example when your
+ machine crashes very early before the regular console code is
+ initialized. Other uses include simpler, lockless logging instead of
+ a full-blown printk console driver + klogd.
+
+ For normal production environments this is normally not recommended,
+ because it doesn't feed events into klogd/syslogd and doesn't try to
+ print anything on the screen.
+
+ You should normally say N here, unless you want to debug early
+ crashes or need a very simple printk logging facility.
+
config X86_PTDUMP_CORE
def_bool n
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index a94a4d10f2df..4430dd489620 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -88,10 +88,10 @@ else
KBUILD_CFLAGS += -m64
# Align jump targets to 1 byte, not the default 16 bytes:
- KBUILD_CFLAGS += -falign-jumps=1
+ KBUILD_CFLAGS += $(call cc-option,-falign-jumps=1)
# Pack loops tightly as well:
- KBUILD_CFLAGS += -falign-loops=1
+ KBUILD_CFLAGS += $(call cc-option,-falign-loops=1)
# Don't autogenerate traditional x87 instructions
KBUILD_CFLAGS += $(call cc-option,-mno-80387)
@@ -154,6 +154,14 @@ ifdef CONFIG_FUNCTION_GRAPH_TRACER
else
ifeq ($(call cc-option-yn, -mfentry), n)
ACCUMULATE_OUTGOING_ARGS := 1
+
+ # GCC ignores '-maccumulate-outgoing-args' when used with '-Os'.
+ # If '-Os' is enabled, disable it and print a warning.
+ ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE
+ undefine CONFIG_CC_OPTIMIZE_FOR_SIZE
+ $(warning Disabling CONFIG_CC_OPTIMIZE_FOR_SIZE. Your compiler does not have -mfentry so you cannot optimize for size with CONFIG_FUNCTION_GRAPH_TRACER.)
+ endif
+
endif
endif
endif
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index 9b42b6d1e902..ef5a9cc66fb8 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -16,7 +16,7 @@
#ifndef BOOT_BOOT_H
#define BOOT_BOOT_H
-#define STACK_SIZE 512 /* Minimum number of bytes for stack */
+#define STACK_SIZE 1024 /* Minimum number of bytes for stack */
#ifndef __ASSEMBLY__
diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index 6d9a546ec7ae..54c24f0a43d3 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -597,10 +597,17 @@ void choose_random_location(unsigned long input,
add_identity_map(random_addr, output_size);
*output = random_addr;
}
+
+ /*
+ * This loads the identity mapping page table.
+ * This should only be done if a new physical address
+ * is found for the kernel, otherwise we should keep
+ * the old page table to make it be like the "nokaslr"
+ * case.
+ */
+ finalize_identity_maps();
}
- /* This actually loads the identity pagetable on x86_64. */
- finalize_identity_maps();
/* Pick random virtual address starting from LOAD_PHYSICAL_ADDR. */
if (IS_ENABLED(CONFIG_X86_64))
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 5fa6ee2c2dde..6cf79e1a6830 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -57,6 +57,8 @@ CONFIG_EFI=y
CONFIG_HZ_1000=y
CONFIG_KEXEC=y
CONFIG_CRASH_DUMP=y
+CONFIG_RANDOMIZE_BASE=y
+CONFIG_RANDOMIZE_MEMORY=y
# CONFIG_COMPAT_VDSO is not set
CONFIG_HIBERNATION=y
CONFIG_PM_DEBUG=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 6205d3b81e6d..de45f57b410d 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -55,6 +55,8 @@ CONFIG_EFI=y
CONFIG_HZ_1000=y
CONFIG_KEXEC=y
CONFIG_CRASH_DUMP=y
+CONFIG_RANDOMIZE_BASE=y
+CONFIG_RANDOMIZE_MEMORY=y
# CONFIG_COMPAT_VDSO is not set
CONFIG_HIBERNATION=y
CONFIG_PM_DEBUG=y
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 57f7ec35216e..50bc26949e9e 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -35,16 +35,13 @@
#include <asm/errno.h>
#include <asm/segment.h>
#include <asm/smp.h>
-#include <asm/page_types.h>
#include <asm/percpu.h>
#include <asm/processor-flags.h>
-#include <asm/ftrace.h>
#include <asm/irq_vectors.h>
#include <asm/cpufeatures.h>
#include <asm/alternative-asm.h>
#include <asm/asm.h>
#include <asm/smap.h>
-#include <asm/export.h>
#include <asm/frame.h>
.section .entry.text, "ax"
@@ -585,7 +582,7 @@ ENTRY(iret_exc )
* will soon execute iret and the tracer was already set to
* the irqstate after the IRET:
*/
- DISABLE_INTERRUPTS(CLBR_EAX)
+ DISABLE_INTERRUPTS(CLBR_ANY)
lss (%esp), %esp /* switch to espfix segment */
jmp .Lrestore_nocheck
#endif
@@ -886,172 +883,6 @@ BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
#endif /* CONFIG_HYPERV */
-#ifdef CONFIG_FUNCTION_TRACER
-#ifdef CONFIG_DYNAMIC_FTRACE
-
-ENTRY(mcount)
- ret
-END(mcount)
-
-ENTRY(ftrace_caller)
- pushl %eax
- pushl %ecx
- pushl %edx
- pushl $0 /* Pass NULL as regs pointer */
- movl 4*4(%esp), %eax
- movl 0x4(%ebp), %edx
- movl function_trace_op, %ecx
- subl $MCOUNT_INSN_SIZE, %eax
-
-.globl ftrace_call
-ftrace_call:
- call ftrace_stub
-
- addl $4, %esp /* skip NULL pointer */
- popl %edx
- popl %ecx
- popl %eax
-.Lftrace_ret:
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-.globl ftrace_graph_call
-ftrace_graph_call:
- jmp ftrace_stub
-#endif
-
-/* This is weak to keep gas from relaxing the jumps */
-WEAK(ftrace_stub)
- ret
-END(ftrace_caller)
-
-ENTRY(ftrace_regs_caller)
- pushf /* push flags before compare (in cs location) */
-
- /*
- * i386 does not save SS and ESP when coming from kernel.
- * Instead, to get sp, &regs->sp is used (see ptrace.h).
- * Unfortunately, that means eflags must be at the same location
- * as the current return ip is. We move the return ip into the
- * ip location, and move flags into the return ip location.
- */
- pushl 4(%esp) /* save return ip into ip slot */
-
- pushl $0 /* Load 0 into orig_ax */
- pushl %gs
- pushl %fs
- pushl %es
- pushl %ds
- pushl %eax
- pushl %ebp
- pushl %edi
- pushl %esi
- pushl %edx
- pushl %ecx
- pushl %ebx
-
- movl 13*4(%esp), %eax /* Get the saved flags */
- movl %eax, 14*4(%esp) /* Move saved flags into regs->flags location */
- /* clobbering return ip */
- movl $__KERNEL_CS, 13*4(%esp)
-
- movl 12*4(%esp), %eax /* Load ip (1st parameter) */
- subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */
- movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */
- movl function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */
- pushl %esp /* Save pt_regs as 4th parameter */
-
-GLOBAL(ftrace_regs_call)
- call ftrace_stub
-
- addl $4, %esp /* Skip pt_regs */
- movl 14*4(%esp), %eax /* Move flags back into cs */
- movl %eax, 13*4(%esp) /* Needed to keep addl from modifying flags */
- movl 12*4(%esp), %eax /* Get return ip from regs->ip */
- movl %eax, 14*4(%esp) /* Put return ip back for ret */
-
- popl %ebx
- popl %ecx
- popl %edx
- popl %esi
- popl %edi
- popl %ebp
- popl %eax
- popl %ds
- popl %es
- popl %fs
- popl %gs
- addl $8, %esp /* Skip orig_ax and ip */
- popf /* Pop flags at end (no addl to corrupt flags) */
- jmp .Lftrace_ret
-
- popf
- jmp ftrace_stub
-#else /* ! CONFIG_DYNAMIC_FTRACE */
-
-ENTRY(mcount)
- cmpl $__PAGE_OFFSET, %esp
- jb ftrace_stub /* Paging not enabled yet? */
-
- cmpl $ftrace_stub, ftrace_trace_function
- jnz .Ltrace
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- cmpl $ftrace_stub, ftrace_graph_return
- jnz ftrace_graph_caller
-
- cmpl $ftrace_graph_entry_stub, ftrace_graph_entry
- jnz ftrace_graph_caller
-#endif
-.globl ftrace_stub
-ftrace_stub:
- ret
-
- /* taken from glibc */
-.Ltrace:
- pushl %eax
- pushl %ecx
- pushl %edx
- movl 0xc(%esp), %eax
- movl 0x4(%ebp), %edx
- subl $MCOUNT_INSN_SIZE, %eax
-
- call *ftrace_trace_function
-
- popl %edx
- popl %ecx
- popl %eax
- jmp ftrace_stub
-END(mcount)
-#endif /* CONFIG_DYNAMIC_FTRACE */
-EXPORT_SYMBOL(mcount)
-#endif /* CONFIG_FUNCTION_TRACER */
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-ENTRY(ftrace_graph_caller)
- pushl %eax
- pushl %ecx
- pushl %edx
- movl 0xc(%esp), %eax
- lea 0x4(%ebp), %edx
- movl (%ebp), %ecx
- subl $MCOUNT_INSN_SIZE, %eax
- call prepare_ftrace_return
- popl %edx
- popl %ecx
- popl %eax
- ret
-END(ftrace_graph_caller)
-
-.globl return_to_handler
-return_to_handler:
- pushl %eax
- pushl %edx
- movl %ebp, %eax
- call ftrace_return_to_handler
- movl %eax, %ecx
- popl %edx
- popl %eax
- jmp *%ecx
-#endif
-
#ifdef CONFIG_TRACING
ENTRY(trace_page_fault)
ASM_CLAC
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index f07b4efb34d5..607d72c4a485 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -212,7 +212,7 @@ entry_SYSCALL_64_fastpath:
* If we see that no exit work is required (which we are required
* to check with IRQs off), then we can go straight to SYSRET64.
*/
- DISABLE_INTERRUPTS(CLBR_NONE)
+ DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
movq PER_CPU_VAR(current_task), %r11
testl $_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
@@ -233,7 +233,7 @@ entry_SYSCALL_64_fastpath:
* raise(3) will trigger this, for example. IRQs are off.
*/
TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_NONE)
+ ENABLE_INTERRUPTS(CLBR_ANY)
SAVE_EXTRA_REGS
movq %rsp, %rdi
call syscall_return_slowpath /* returns with IRQs disabled */
@@ -340,7 +340,7 @@ ENTRY(stub_ptregs_64)
* Called from fast path -- disable IRQs again, pop return address
* and jump to slow path
*/
- DISABLE_INTERRUPTS(CLBR_NONE)
+ DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
popq %rax
jmp entry_SYSCALL64_slow_path
@@ -515,7 +515,7 @@ common_interrupt:
interrupt do_IRQ
/* 0(%rsp): old RSP */
ret_from_intr:
- DISABLE_INTERRUPTS(CLBR_NONE)
+ DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
decl PER_CPU_VAR(irq_count)
@@ -1048,7 +1048,7 @@ END(paranoid_entry)
* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it)
*/
ENTRY(paranoid_exit)
- DISABLE_INTERRUPTS(CLBR_NONE)
+ DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF_DEBUG
testl %ebx, %ebx /* swapgs needed? */
jnz paranoid_exit_no_swapgs
@@ -1153,10 +1153,9 @@ END(error_entry)
* 0: user gsbase is loaded, we need SWAPGS and standard preparation for return to usermode
*/
ENTRY(error_exit)
- movl %ebx, %eax
- DISABLE_INTERRUPTS(CLBR_NONE)
+ DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
- testl %eax, %eax
+ testl %ebx, %ebx
jnz retint_kernel
jmp retint_user
END(error_exit)
diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index 9d4d6e138311..fa8dbfcf7ed3 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -17,6 +17,7 @@
#include <asm/unistd.h>
#include <asm/msr.h>
#include <asm/pvclock.h>
+#include <asm/mshyperv.h>
#include <linux/math64.h>
#include <linux/time.h>
#include <linux/kernel.h>
@@ -32,6 +33,11 @@ extern u8 pvclock_page
__attribute__((visibility("hidden")));
#endif
+#ifdef CONFIG_HYPERV_TSCPAGE
+extern u8 hvclock_page
+ __attribute__((visibility("hidden")));
+#endif
+
#ifndef BUILD_VDSO32
notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
@@ -141,6 +147,20 @@ static notrace u64 vread_pvclock(int *mode)
return last;
}
#endif
+#ifdef CONFIG_HYPERV_TSCPAGE
+static notrace u64 vread_hvclock(int *mode)
+{
+ const struct ms_hyperv_tsc_page *tsc_pg =
+ (const struct ms_hyperv_tsc_page *)&hvclock_page;
+ u64 current_tick = hv_read_tsc_page(tsc_pg);
+
+ if (current_tick != U64_MAX)
+ return current_tick;
+
+ *mode = VCLOCK_NONE;
+ return 0;
+}
+#endif
notrace static u64 vread_tsc(void)
{
@@ -173,6 +193,10 @@ notrace static inline u64 vgetsns(int *mode)
else if (gtod->vclock_mode == VCLOCK_PVCLOCK)
cycles = vread_pvclock(mode);
#endif
+#ifdef CONFIG_HYPERV_TSCPAGE
+ else if (gtod->vclock_mode == VCLOCK_HVCLOCK)
+ cycles = vread_hvclock(mode);
+#endif
else
return 0;
v = (cycles - gtod->cycle_last) & gtod->mask;
diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S
index a708aa90b507..8ebb4b6454fe 100644
--- a/arch/x86/entry/vdso/vdso-layout.lds.S
+++ b/arch/x86/entry/vdso/vdso-layout.lds.S
@@ -25,7 +25,7 @@ SECTIONS
* segment.
*/
- vvar_start = . - 2 * PAGE_SIZE;
+ vvar_start = . - 3 * PAGE_SIZE;
vvar_page = vvar_start;
/* Place all vvars at the offsets in asm/vvar.h. */
@@ -36,6 +36,7 @@ SECTIONS
#undef EMIT_VVAR
pvclock_page = vvar_start + PAGE_SIZE;
+ hvclock_page = vvar_start + 2 * PAGE_SIZE;
. = SIZEOF_HEADERS;
diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c
index 491020b2826d..0780a443a53b 100644
--- a/arch/x86/entry/vdso/vdso2c.c
+++ b/arch/x86/entry/vdso/vdso2c.c
@@ -74,6 +74,7 @@ enum {
sym_vvar_page,
sym_hpet_page,
sym_pvclock_page,
+ sym_hvclock_page,
sym_VDSO_FAKE_SECTION_TABLE_START,
sym_VDSO_FAKE_SECTION_TABLE_END,
};
@@ -82,6 +83,7 @@ const int special_pages[] = {
sym_vvar_page,
sym_hpet_page,
sym_pvclock_page,
+ sym_hvclock_page,
};
struct vdso_sym {
@@ -94,6 +96,7 @@ struct vdso_sym required_syms[] = {
[sym_vvar_page] = {"vvar_page", true},
[sym_hpet_page] = {"hpet_page", true},
[sym_pvclock_page] = {"pvclock_page", true},
+ [sym_hvclock_page] = {"hvclock_page", true},
[sym_VDSO_FAKE_SECTION_TABLE_START] = {
"VDSO_FAKE_SECTION_TABLE_START", false
},
diff --git a/arch/x86/entry/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c
index 7853b53959cd..3f9d1a83891a 100644
--- a/arch/x86/entry/vdso/vdso32-setup.c
+++ b/arch/x86/entry/vdso/vdso32-setup.c
@@ -30,8 +30,10 @@ static int __init vdso32_setup(char *s)
{
vdso32_enabled = simple_strtoul(s, NULL, 0);
- if (vdso32_enabled > 1)
+ if (vdso32_enabled > 1) {
pr_warn("vdso32 values other than 0 and 1 are no longer allowed; vdso disabled\n");
+ vdso32_enabled = 0;
+ }
return 1;
}
@@ -62,13 +64,18 @@ subsys_initcall(sysenter_setup);
/* Register vsyscall32 into the ABI table */
#include <linux/sysctl.h>
+static const int zero;
+static const int one = 1;
+
static struct ctl_table abi_table2[] = {
{
.procname = "vsyscall32",
.data = &vdso32_enabled,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = (int *)&zero,
+ .extra2 = (int *)&one,
},
{}
};
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 5c5d4d7618e6..139ad7726e10 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -22,6 +22,7 @@
#include <asm/page.h>
#include <asm/desc.h>
#include <asm/cpufeature.h>
+#include <asm/mshyperv.h>
#if defined(CONFIG_X86_64)
unsigned int __read_mostly vdso64_enabled = 1;
@@ -121,6 +122,12 @@ static int vvar_fault(const struct vm_special_mapping *sm,
vmf->address,
__pa(pvti) >> PAGE_SHIFT);
}
+ } else if (sym_offset == image->sym_hvclock_page) {
+ struct ms_hyperv_tsc_page *tsc_pg = hv_get_tsc_page();
+
+ if (tsc_pg && vclock_was_used(VCLOCK_HVCLOCK))
+ ret = vm_insert_pfn(vma, vmf->address,
+ vmalloc_to_pfn(tsc_pg));
}
if (ret == 0 || ret == -EBUSY)
diff --git a/arch/x86/events/amd/iommu.c b/arch/x86/events/amd/iommu.c
index b28200dea715..3641e24fdac5 100644
--- a/arch/x86/events/amd/iommu.c
+++ b/arch/x86/events/amd/iommu.c
@@ -11,6 +11,8 @@
* published by the Free Software Foundation.
*/
+#define pr_fmt(fmt) "perf/amd_iommu: " fmt
+
#include <linux/perf_event.h>
#include <linux/init.h>
#include <linux/cpumask.h>
@@ -21,44 +23,42 @@
#define COUNTER_SHIFT 16
-#define _GET_BANK(ev) ((u8)(ev->hw.extra_reg.reg >> 8))
-#define _GET_CNTR(ev) ((u8)(ev->hw.extra_reg.reg))
+/* iommu pmu conf masks */
+#define GET_CSOURCE(x) ((x)->conf & 0xFFULL)
+#define GET_DEVID(x) (((x)->conf >> 8) & 0xFFFFULL)
+#define GET_DOMID(x) (((x)->conf >> 24) & 0xFFFFULL)
+#define GET_PASID(x) (((x)->conf >> 40) & 0xFFFFFULL)
-/* iommu pmu config masks */
-#define _GET_CSOURCE(ev) ((ev->hw.config & 0xFFULL))
-#define _GET_DEVID(ev) ((ev->hw.config >> 8) & 0xFFFFULL)
-#define _GET_PASID(ev) ((ev->hw.config >> 24) & 0xFFFFULL)
-#define _GET_DOMID(ev) ((ev->hw.config >> 40) & 0xFFFFULL)
-#define _GET_DEVID_MASK(ev) ((ev->hw.extra_reg.config) & 0xFFFFULL)
-#define _GET_PASID_MASK(ev) ((ev->hw.extra_reg.config >> 16) & 0xFFFFULL)
-#define _GET_DOMID_MASK(ev) ((ev->hw.extra_reg.config >> 32) & 0xFFFFULL)
+/* iommu pmu conf1 masks */
+#define GET_DEVID_MASK(x) ((x)->conf1 & 0xFFFFULL)
+#define GET_DOMID_MASK(x) (((x)->conf1 >> 16) & 0xFFFFULL)
+#define GET_PASID_MASK(x) (((x)->conf1 >> 32) & 0xFFFFFULL)
-static struct perf_amd_iommu __perf_iommu;
+#define IOMMU_NAME_SIZE 16
struct perf_amd_iommu {
+ struct list_head list;
struct pmu pmu;
+ struct amd_iommu *iommu;
+ char name[IOMMU_NAME_SIZE];
u8 max_banks;
u8 max_counters;
u64 cntr_assign_mask;
raw_spinlock_t lock;
- const struct attribute_group *attr_groups[4];
};
-#define format_group attr_groups[0]
-#define cpumask_group attr_groups[1]
-#define events_group attr_groups[2]
-#define null_group attr_groups[3]
+static LIST_HEAD(perf_amd_iommu_list);
/*---------------------------------------------
* sysfs format attributes
*---------------------------------------------*/
PMU_FORMAT_ATTR(csource, "config:0-7");
PMU_FORMAT_ATTR(devid, "config:8-23");
-PMU_FORMAT_ATTR(pasid, "config:24-39");
-PMU_FORMAT_ATTR(domid, "config:40-55");
+PMU_FORMAT_ATTR(domid, "config:24-39");
+PMU_FORMAT_ATTR(pasid, "config:40-59");
PMU_FORMAT_ATTR(devid_mask, "config1:0-15");
-PMU_FORMAT_ATTR(pasid_mask, "config1:16-31");
-PMU_FORMAT_ATTR(domid_mask, "config1:32-47");
+PMU_FORMAT_ATTR(domid_mask, "config1:16-31");
+PMU_FORMAT_ATTR(pasid_mask, "config1:32-51");
static struct attribute *iommu_format_attrs[] = {
&format_attr_csource.attr,
@@ -79,6 +79,10 @@ static struct attribute_group amd_iommu_format_group = {
/*---------------------------------------------
* sysfs events attributes
*---------------------------------------------*/
+static struct attribute_group amd_iommu_events_group = {
+ .name = "events",
+};
+
struct amd_iommu_event_desc {
struct kobj_attribute attr;
const char *event;
@@ -150,30 +154,34 @@ static struct attribute_group amd_iommu_cpumask_group = {
/*---------------------------------------------*/
-static int get_next_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu)
+static int get_next_avail_iommu_bnk_cntr(struct perf_event *event)
{
+ struct perf_amd_iommu *piommu = container_of(event->pmu, struct perf_amd_iommu, pmu);
+ int max_cntrs = piommu->max_counters;
+ int max_banks = piommu->max_banks;
+ u32 shift, bank, cntr;
unsigned long flags;
- int shift, bank, cntr, retval;
- int max_banks = perf_iommu->max_banks;
- int max_cntrs = perf_iommu->max_counters;
+ int retval;
- raw_spin_lock_irqsave(&perf_iommu->lock, flags);
+ raw_spin_lock_irqsave(&piommu->lock, flags);
for (bank = 0, shift = 0; bank < max_banks; bank++) {
for (cntr = 0; cntr < max_cntrs; cntr++) {
shift = bank + (bank*3) + cntr;
- if (perf_iommu->cntr_assign_mask & (1ULL<<shift)) {
+ if (piommu->cntr_assign_mask & BIT_ULL(shift)) {
continue;
} else {
- perf_iommu->cntr_assign_mask |= (1ULL<<shift);
- retval = ((u16)((u16)bank<<8) | (u8)(cntr));
+ piommu->cntr_assign_mask |= BIT_ULL(shift);
+ event->hw.iommu_bank = bank;
+ event->hw.iommu_cntr = cntr;
+ retval = 0;
goto out;
}
}
}
retval = -ENOSPC;
out:
- raw_spin_unlock_irqrestore(&perf_iommu->lock, flags);
+ raw_spin_unlock_irqrestore(&piommu->lock, flags);
return retval;
}
@@ -202,8 +210,6 @@ static int clear_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu,
static int perf_iommu_event_init(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
- struct perf_amd_iommu *perf_iommu;
- u64 config, config1;
/* test the event attr type check for PMU enumeration */
if (event->attr.type != event->pmu->type)
@@ -225,80 +231,62 @@ static int perf_iommu_event_init(struct perf_event *event)
if (event->cpu < 0)
return -EINVAL;
- perf_iommu = &__perf_iommu;
-
- if (event->pmu != &perf_iommu->pmu)
- return -ENOENT;
-
- if (perf_iommu) {
- config = event->attr.config;
- config1 = event->attr.config1;
- } else {
- return -EINVAL;
- }
-
- /* integrate with iommu base devid (0000), assume one iommu */
- perf_iommu->max_banks =
- amd_iommu_pc_get_max_banks(IOMMU_BASE_DEVID);
- perf_iommu->max_counters =
- amd_iommu_pc_get_max_counters(IOMMU_BASE_DEVID);
- if ((perf_iommu->max_banks == 0) || (perf_iommu->max_counters == 0))
- return -EINVAL;
-
/* update the hw_perf_event struct with the iommu config data */
- hwc->config = config;
- hwc->extra_reg.config = config1;
+ hwc->conf = event->attr.config;
+ hwc->conf1 = event->attr.config1;
return 0;
}
+static inline struct amd_iommu *perf_event_2_iommu(struct perf_event *ev)
+{
+ return (container_of(ev->pmu, struct perf_amd_iommu, pmu))->iommu;
+}
+
static void perf_iommu_enable_event(struct perf_event *ev)
{
- u8 csource = _GET_CSOURCE(ev);
- u16 devid = _GET_DEVID(ev);
+ struct amd_iommu *iommu = perf_event_2_iommu(ev);
+ struct hw_perf_event *hwc = &ev->hw;
+ u8 bank = hwc->iommu_bank;
+ u8 cntr = hwc->iommu_cntr;
u64 reg = 0ULL;
- reg = csource;
- amd_iommu_pc_get_set_reg_val(devid,
- _GET_BANK(ev), _GET_CNTR(ev) ,
- IOMMU_PC_COUNTER_SRC_REG, &reg, true);
+ reg = GET_CSOURCE(hwc);
+ amd_iommu_pc_set_reg(iommu, bank, cntr, IOMMU_PC_COUNTER_SRC_REG, &reg);
- reg = 0ULL | devid | (_GET_DEVID_MASK(ev) << 32);
+ reg = GET_DEVID_MASK(hwc);
+ reg = GET_DEVID(hwc) | (reg << 32);
if (reg)
- reg |= (1UL << 31);
- amd_iommu_pc_get_set_reg_val(devid,
- _GET_BANK(ev), _GET_CNTR(ev) ,
- IOMMU_PC_DEVID_MATCH_REG, &reg, true);
+ reg |= BIT(31);
+ amd_iommu_pc_set_reg(iommu, bank, cntr, IOMMU_PC_DEVID_MATCH_REG, &reg);
- reg = 0ULL | _GET_PASID(ev) | (_GET_PASID_MASK(ev) << 32);
+ reg = GET_PASID_MASK(hwc);
+ reg = GET_PASID(hwc) | (reg << 32);
if (reg)
- reg |= (1UL << 31);
- amd_iommu_pc_get_set_reg_val(devid,
- _GET_BANK(ev), _GET_CNTR(ev) ,
- IOMMU_PC_PASID_MATCH_REG, &reg, true);
+ reg |= BIT(31);
+ amd_iommu_pc_set_reg(iommu, bank, cntr, IOMMU_PC_PASID_MATCH_REG, &reg);
- reg = 0ULL | _GET_DOMID(ev) | (_GET_DOMID_MASK(ev) << 32);
+ reg = GET_DOMID_MASK(hwc);
+ reg = GET_DOMID(hwc) | (reg << 32);
if (reg)
- reg |= (1UL << 31);
- amd_iommu_pc_get_set_reg_val(devid,
- _GET_BANK(ev), _GET_CNTR(ev) ,
- IOMMU_PC_DOMID_MATCH_REG, &reg, true);
+ reg |= BIT(31);
+ amd_iommu_pc_set_reg(iommu, bank, cntr, IOMMU_PC_DOMID_MATCH_REG, &reg);
}
static void perf_iommu_disable_event(struct perf_event *event)
{
+ struct amd_iommu *iommu = perf_event_2_iommu(event);
+ struct hw_perf_event *hwc = &event->hw;
u64 reg = 0ULL;
- amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
- _GET_BANK(event), _GET_CNTR(event),
- IOMMU_PC_COUNTER_SRC_REG, &reg, true);
+ amd_iommu_pc_set_reg(iommu, hwc->iommu_bank, hwc->iommu_cntr,
+ IOMMU_PC_COUNTER_SRC_REG, &reg);
}
static void perf_iommu_start(struct perf_event *event, int flags)
{
struct hw_perf_event *hwc = &event->hw;
- pr_debug("perf: amd_iommu:perf_iommu_start\n");
if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
return;
@@ -306,10 +294,11 @@ static void perf_iommu_start(struct perf_event *event, int flags)
hwc->state = 0;
if (flags & PERF_EF_RELOAD) {
- u64 prev_raw_count = local64_read(&hwc->prev_count);
- amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
- _GET_BANK(event), _GET_CNTR(event),
- IOMMU_PC_COUNTER_REG, &prev_raw_count, true);
+ u64 prev_raw_count = local64_read(&hwc->prev_count);
+ struct amd_iommu *iommu = perf_event_2_iommu(event);
+
+ amd_iommu_pc_set_reg(iommu, hwc->iommu_bank, hwc->iommu_cntr,
+ IOMMU_PC_COUNTER_REG, &prev_raw_count);
}
perf_iommu_enable_event(event);
@@ -319,37 +308,30 @@ static void perf_iommu_start(struct perf_event *event, int flags)
static void perf_iommu_read(struct perf_event *event)
{
- u64 count = 0ULL;
- u64 prev_raw_count = 0ULL;
- u64 delta = 0ULL;
+ u64 count, prev, delta;
struct hw_perf_event *hwc = &event->hw;
- pr_debug("perf: amd_iommu:perf_iommu_read\n");
+ struct amd_iommu *iommu = perf_event_2_iommu(event);
- amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
- _GET_BANK(event), _GET_CNTR(event),
- IOMMU_PC_COUNTER_REG, &count, false);
+ if (amd_iommu_pc_get_reg(iommu, hwc->iommu_bank, hwc->iommu_cntr,
+ IOMMU_PC_COUNTER_REG, &count))
+ return;
/* IOMMU pc counter register is only 48 bits */
- count &= 0xFFFFFFFFFFFFULL;
+ count &= GENMASK_ULL(47, 0);
- prev_raw_count = local64_read(&hwc->prev_count);
- if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
- count) != prev_raw_count)
+ prev = local64_read(&hwc->prev_count);
+ if (local64_cmpxchg(&hwc->prev_count, prev, count) != prev)
return;
- /* Handling 48-bit counter overflowing */
- delta = (count << COUNTER_SHIFT) - (prev_raw_count << COUNTER_SHIFT);
+ /* Handle 48-bit counter overflow */
+ delta = (count << COUNTER_SHIFT) - (prev << COUNTER_SHIFT);
delta >>= COUNTER_SHIFT;
local64_add(delta, &event->count);
-
}
static void perf_iommu_stop(struct perf_event *event, int flags)
{
struct hw_perf_event *hwc = &event->hw;
- u64 config;
-
- pr_debug("perf: amd_iommu:perf_iommu_stop\n");
if (hwc->state & PERF_HES_UPTODATE)
return;
@@ -361,7 +343,6 @@ static void perf_iommu_stop(struct perf_event *event, int flags)
if (hwc->state & PERF_HES_UPTODATE)
return;
- config = hwc->config;
perf_iommu_read(event);
hwc->state |= PERF_HES_UPTODATE;
}
@@ -369,17 +350,12 @@ static void perf_iommu_stop(struct perf_event *event, int flags)
static int perf_iommu_add(struct perf_event *event, int flags)
{
int retval;
- struct perf_amd_iommu *perf_iommu =
- container_of(event->pmu, struct perf_amd_iommu, pmu);
- pr_debug("perf: amd_iommu:perf_iommu_add\n");
event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
/* request an iommu bank/counter */
- retval = get_next_avail_iommu_bnk_cntr(perf_iommu);
- if (retval != -ENOSPC)
- event->hw.extra_reg.reg = (u16)retval;
- else
+ retval = get_next_avail_iommu_bnk_cntr(event);
+ if (retval)
return retval;
if (flags & PERF_EF_START)
@@ -390,115 +366,124 @@ static int perf_iommu_add(struct perf_event *event, int flags)
static void perf_iommu_del(struct perf_event *event, int flags)
{
+ struct hw_perf_event *hwc = &event->hw;
struct perf_amd_iommu *perf_iommu =
container_of(event->pmu, struct perf_amd_iommu, pmu);
- pr_debug("perf: amd_iommu:perf_iommu_del\n");
perf_iommu_stop(event, PERF_EF_UPDATE);
/* clear the assigned iommu bank/counter */
clear_avail_iommu_bnk_cntr(perf_iommu,
- _GET_BANK(event),
- _GET_CNTR(event));
+ hwc->iommu_bank, hwc->iommu_cntr);
perf_event_update_userpage(event);
}
-static __init int _init_events_attrs(struct perf_amd_iommu *perf_iommu)
+static __init int _init_events_attrs(void)
{
- struct attribute **attrs;
- struct attribute_group *attr_group;
int i = 0, j;
+ struct attribute **attrs;
while (amd_iommu_v2_event_descs[i].attr.attr.name)
i++;
- attr_group = kzalloc(sizeof(struct attribute *)
- * (i + 1) + sizeof(*attr_group), GFP_KERNEL);
- if (!attr_group)
+ attrs = kzalloc(sizeof(struct attribute **) * (i + 1), GFP_KERNEL);
+ if (!attrs)
return -ENOMEM;
- attrs = (struct attribute **)(attr_group + 1);
for (j = 0; j < i; j++)
attrs[j] = &amd_iommu_v2_event_descs[j].attr.attr;
- attr_group->name = "events";
- attr_group->attrs = attrs;
- perf_iommu->events_group = attr_group;
-
+ amd_iommu_events_group.attrs = attrs;
return 0;
}
-static __init void amd_iommu_pc_exit(void)
-{
- if (__perf_iommu.events_group != NULL) {
- kfree(__perf_iommu.events_group);
- __perf_iommu.events_group = NULL;
- }
-}
+const struct attribute_group *amd_iommu_attr_groups[] = {
+ &amd_iommu_format_group,
+ &amd_iommu_cpumask_group,
+ &amd_iommu_events_group,
+ NULL,
+};
+
+static struct pmu iommu_pmu = {
+ .event_init = perf_iommu_event_init,
+ .add = perf_iommu_add,
+ .del = perf_iommu_del,
+ .start = perf_iommu_start,
+ .stop = perf_iommu_stop,
+ .read = perf_iommu_read,
+ .task_ctx_nr = perf_invalid_context,
+ .attr_groups = amd_iommu_attr_groups,
+};
-static __init int _init_perf_amd_iommu(
- struct perf_amd_iommu *perf_iommu, char *name)
+static __init int init_one_iommu(unsigned int idx)
{
+ struct perf_amd_iommu *perf_iommu;
int ret;
+ perf_iommu = kzalloc(sizeof(struct perf_amd_iommu), GFP_KERNEL);
+ if (!perf_iommu)
+ return -ENOMEM;
+
raw_spin_lock_init(&perf_iommu->lock);
- /* Init format attributes */
- perf_iommu->format_group = &amd_iommu_format_group;
+ perf_iommu->pmu = iommu_pmu;
+ perf_iommu->iommu = get_amd_iommu(idx);
+ perf_iommu->max_banks = amd_iommu_pc_get_max_banks(idx);
+ perf_iommu->max_counters = amd_iommu_pc_get_max_counters(idx);
- /* Init cpumask attributes to only core 0 */
- cpumask_set_cpu(0, &iommu_cpumask);
- perf_iommu->cpumask_group = &amd_iommu_cpumask_group;
-
- /* Init events attributes */
- if (_init_events_attrs(perf_iommu) != 0)
- pr_err("perf: amd_iommu: Only support raw events.\n");
+ if (!perf_iommu->iommu ||
+ !perf_iommu->max_banks ||
+ !perf_iommu->max_counters) {
+ kfree(perf_iommu);
+ return -EINVAL;
+ }
- /* Init null attributes */
- perf_iommu->null_group = NULL;
- perf_iommu->pmu.attr_groups = perf_iommu->attr_groups;
+ snprintf(perf_iommu->name, IOMMU_NAME_SIZE, "amd_iommu_%u", idx);
- ret = perf_pmu_register(&perf_iommu->pmu, name, -1);
- if (ret) {
- pr_err("perf: amd_iommu: Failed to initialized.\n");
- amd_iommu_pc_exit();
+ ret = perf_pmu_register(&perf_iommu->pmu, perf_iommu->name, -1);
+ if (!ret) {
+ pr_info("Detected AMD IOMMU #%d (%d banks, %d counters/bank).\n",
+ idx, perf_iommu->max_banks, perf_iommu->max_counters);
+ list_add_tail(&perf_iommu->list, &perf_amd_iommu_list);
} else {
- pr_info("perf: amd_iommu: Detected. (%d banks, %d counters/bank)\n",
- amd_iommu_pc_get_max_banks(IOMMU_BASE_DEVID),
- amd_iommu_pc_get_max_counters(IOMMU_BASE_DEVID));
+ pr_warn("Error initializing IOMMU %d.\n", idx);
+ kfree(perf_iommu);
}
-
return ret;
}
-static struct perf_amd_iommu __perf_iommu = {
- .pmu = {
- .task_ctx_nr = perf_invalid_context,
- .event_init = perf_iommu_event_init,
- .add = perf_iommu_add,
- .del = perf_iommu_del,
- .start = perf_iommu_start,
- .stop = perf_iommu_stop,
- .read = perf_iommu_read,
- },
- .max_banks = 0x00,
- .max_counters = 0x00,
- .cntr_assign_mask = 0ULL,
- .format_group = NULL,
- .cpumask_group = NULL,
- .events_group = NULL,
- .null_group = NULL,
-};
-
static __init int amd_iommu_pc_init(void)
{
+ unsigned int i, cnt = 0;
+ int ret;
+
/* Make sure the IOMMU PC resource is available */
if (!amd_iommu_pc_supported())
return -ENODEV;
- _init_perf_amd_iommu(&__perf_iommu, "amd_iommu");
+ ret = _init_events_attrs();
+ if (ret)
+ return ret;
+
+ /*
+ * An IOMMU PMU is specific to an IOMMU, and can function independently.
+ * So we go through all IOMMUs and ignore the ones that fail init
+ * unless all IOMMUs are failing.
+ */
+ for (i = 0; i < amd_iommu_get_num_iommus(); i++) {
+ ret = init_one_iommu(i);
+ if (!ret)
+ cnt++;
+ }
+
+ if (!cnt) {
+ kfree(amd_iommu_events_group.attrs);
+ return -ENODEV;
+ }
+ /* Init cpumask attributes to only core 0 */
+ cpumask_set_cpu(0, &iommu_cpumask);
return 0;
}
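
As a side note on the reworked field layout: per the format attributes above (csource = config:0-7, devid = config:8-23, domid = config:24-39, pasid = config:40-59), a raw event config for this PMU would be assembled roughly as in the sketch below. The helper is hypothetical, shown only to illustrate the GET_* decode masks; it is not part of the patch.

    /* Hypothetical helper mirroring the GET_CSOURCE/GET_DEVID/GET_DOMID/
     * GET_PASID masks above; illustration only. */
    #include <stdint.h>

    static uint64_t iommu_pmu_raw_config(uint8_t csource, uint16_t devid,
                                         uint16_t domid, uint32_t pasid)
    {
            return  (uint64_t)csource                 |  /* config:0-7   */
                   ((uint64_t)devid  <<  8)           |  /* config:8-23  */
                   ((uint64_t)domid  << 24)           |  /* config:24-39 */
                   (((uint64_t)pasid & 0xFFFFF) << 40);  /* config:40-59 */
    }

In practice a user would not build this by hand but go through the format attributes, e.g. something like perf stat -e 'amd_iommu_0/csource=0x01/' against the per-instance PMU names registered above.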
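The overflow handling in perf_iommu_read() above relies on COUNTER_SHIFT (16) to make the 48-bit hardware counter wrap naturally in 64-bit arithmetic: shifting both samples into the top 48 bits lets the unsigned subtraction wrap modulo 2^48 before the delta is shifted back down. A standalone sketch of the same arithmetic:

    /* Sketch of the 48-bit delta computation; COUNTER_SHIFT == 16 as in
     * this file, so count << 16 occupies bits 16..63 and the unsigned
     * subtraction wraps at 2^48 even when the counter rolled over. */
    #include <stdint.h>

    static uint64_t delta_48bit(uint64_t prev, uint64_t count)
    {
            uint64_t delta = (count << 16) - (prev << 16);
            return delta >> 16;
    }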
diff --git a/arch/x86/events/amd/iommu.h b/arch/x86/events/amd/iommu.h
index 845d173278e3..62e0702c4374 100644
--- a/arch/x86/events/amd/iommu.h
+++ b/arch/x86/events/amd/iommu.h
@@ -24,17 +24,23 @@
#define PC_MAX_SPEC_BNKS 64
#define PC_MAX_SPEC_CNTRS 16
-/* iommu pc reg masks*/
-#define IOMMU_BASE_DEVID 0x0000
+struct amd_iommu;
/* amd_iommu_init.c external support functions */
+extern int amd_iommu_get_num_iommus(void);
+
extern bool amd_iommu_pc_supported(void);
-extern u8 amd_iommu_pc_get_max_banks(u16 devid);
+extern u8 amd_iommu_pc_get_max_banks(unsigned int idx);
+
+extern u8 amd_iommu_pc_get_max_counters(unsigned int idx);
+
+extern int amd_iommu_pc_set_reg(struct amd_iommu *iommu, u8 bank, u8 cntr,
+ u8 fxn, u64 *value);
-extern u8 amd_iommu_pc_get_max_counters(u16 devid);
+extern int amd_iommu_pc_get_reg(struct amd_iommu *iommu, u8 bank, u8 cntr,
+ u8 fxn, u64 *value);
-extern int amd_iommu_pc_get_set_reg_val(u16 devid, u8 bank, u8 cntr,
- u8 fxn, u64 *value, bool is_write);
+extern struct amd_iommu *get_amd_iommu(int idx);
#endif /*_PERF_EVENT_AMD_IOMMU_H_*/
diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c
index 4d1f7f2d9aff..ad44af0dd667 100644
--- a/arch/x86/events/amd/uncore.c
+++ b/arch/x86/events/amd/uncore.c
@@ -30,6 +30,9 @@
#define COUNTER_SHIFT 16
+#undef pr_fmt
+#define pr_fmt(fmt) "amd_uncore: " fmt
+
static int num_counters_llc;
static int num_counters_nb;
@@ -509,51 +512,34 @@ static int __init amd_uncore_init(void)
int ret = -ENODEV;
if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
- goto fail_nodev;
-
- switch(boot_cpu_data.x86) {
- case 23:
- /* Family 17h: */
- num_counters_nb = NUM_COUNTERS_NB;
- num_counters_llc = NUM_COUNTERS_L3;
- /*
- * For Family17h, the NorthBridge counters are
- * re-purposed as Data Fabric counters. Also, support is
- * added for L3 counters. The pmus are exported based on
- * family as either L2 or L3 and NB or DF.
- */
- amd_nb_pmu.name = "amd_df";
- amd_llc_pmu.name = "amd_l3";
- format_attr_event_df.show = &event_show_df;
- format_attr_event_l3.show = &event_show_l3;
- break;
- case 22:
- /* Family 16h - may change: */
- num_counters_nb = NUM_COUNTERS_NB;
- num_counters_llc = NUM_COUNTERS_L2;
- amd_nb_pmu.name = "amd_nb";
- amd_llc_pmu.name = "amd_l2";
- format_attr_event_df = format_attr_event;
- format_attr_event_l3 = format_attr_event;
- break;
- default:
- /*
- * All prior families have the same number of
- * NorthBridge and Last Level Cache counters
- */
- num_counters_nb = NUM_COUNTERS_NB;
- num_counters_llc = NUM_COUNTERS_L2;
- amd_nb_pmu.name = "amd_nb";
- amd_llc_pmu.name = "amd_l2";
- format_attr_event_df = format_attr_event;
- format_attr_event_l3 = format_attr_event;
- break;
- }
- amd_nb_pmu.attr_groups = amd_uncore_attr_groups_df;
- amd_llc_pmu.attr_groups = amd_uncore_attr_groups_l3;
+ return -ENODEV;
if (!boot_cpu_has(X86_FEATURE_TOPOEXT))
- goto fail_nodev;
+ return -ENODEV;
+
+ if (boot_cpu_data.x86 == 0x17) {
+ /*
+ * For F17h, the Northbridge counters are repurposed as Data
+ * Fabric counters. Also, L3 counters are supported too. The PMUs
+ * are exported based on family as either L2 or L3 and NB or DF.
+ */
+ num_counters_nb = NUM_COUNTERS_NB;
+ num_counters_llc = NUM_COUNTERS_L3;
+ amd_nb_pmu.name = "amd_df";
+ amd_llc_pmu.name = "amd_l3";
+ format_attr_event_df.show = &event_show_df;
+ format_attr_event_l3.show = &event_show_l3;
+ } else {
+ num_counters_nb = NUM_COUNTERS_NB;
+ num_counters_llc = NUM_COUNTERS_L2;
+ amd_nb_pmu.name = "amd_nb";
+ amd_llc_pmu.name = "amd_l2";
+ format_attr_event_df = format_attr_event;
+ format_attr_event_l3 = format_attr_event;
+ }
+
+ amd_nb_pmu.attr_groups = amd_uncore_attr_groups_df;
+ amd_llc_pmu.attr_groups = amd_uncore_attr_groups_l3;
if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) {
amd_uncore_nb = alloc_percpu(struct amd_uncore *);
@@ -565,7 +551,7 @@ static int __init amd_uncore_init(void)
if (ret)
goto fail_nb;
- pr_info("perf: AMD NB counters detected\n");
+ pr_info("AMD NB counters detected\n");
ret = 0;
}
@@ -579,7 +565,7 @@ static int __init amd_uncore_init(void)
if (ret)
goto fail_llc;
- pr_info("perf: AMD LLC counters detected\n");
+ pr_info("AMD LLC counters detected\n");
ret = 0;
}
@@ -615,7 +601,6 @@ fail_nb:
if (amd_uncore_nb)
free_percpu(amd_uncore_nb);
-fail_nodev:
return ret;
}
device_initcall(amd_uncore_init);
diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c
index 982c9e31daca..8ae8c5ce3a1f 100644
--- a/arch/x86/events/intel/bts.c
+++ b/arch/x86/events/intel/bts.c
@@ -63,7 +63,6 @@ struct bts_buffer {
unsigned int cur_buf;
bool snapshot;
local_t data_size;
- local_t lost;
local_t head;
unsigned long end;
void **data_pages;
@@ -199,7 +198,8 @@ static void bts_update(struct bts_ctx *bts)
return;
if (ds->bts_index >= ds->bts_absolute_maximum)
- local_inc(&buf->lost);
+ perf_aux_output_flag(&bts->handle,
+ PERF_AUX_FLAG_TRUNCATED);
/*
* old and head are always in the same physical buffer, so we
@@ -276,7 +276,7 @@ static void bts_event_start(struct perf_event *event, int flags)
return;
fail_end_stop:
- perf_aux_output_end(&bts->handle, 0, false);
+ perf_aux_output_end(&bts->handle, 0);
fail_stop:
event->hw.state = PERF_HES_STOPPED;
@@ -319,9 +319,8 @@ static void bts_event_stop(struct perf_event *event, int flags)
bts->handle.head =
local_xchg(&buf->data_size,
buf->nr_pages << PAGE_SHIFT);
-
- perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0),
- !!local_xchg(&buf->lost, 0));
+ perf_aux_output_end(&bts->handle,
+ local_xchg(&buf->data_size, 0));
}
cpuc->ds->bts_index = bts->ds_back.bts_buffer_base;
@@ -484,8 +483,7 @@ int intel_bts_interrupt(void)
if (old_head == local_read(&buf->head))
return handled;
- perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0),
- !!local_xchg(&buf->lost, 0));
+ perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0));
buf = perf_aux_output_begin(&bts->handle, event);
if (buf)
@@ -500,7 +498,7 @@ int intel_bts_interrupt(void)
* cleared handle::event
*/
barrier();
- perf_aux_output_end(&bts->handle, 0, false);
+ perf_aux_output_end(&bts->handle, 0);
}
}
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index eb1484c86bb4..a6d91d4e37a1 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -1553,6 +1553,27 @@ static __initconst const u64 slm_hw_cache_event_ids
},
};
+EVENT_ATTR_STR(topdown-total-slots, td_total_slots_glm, "event=0x3c");
+EVENT_ATTR_STR(topdown-total-slots.scale, td_total_slots_scale_glm, "3");
+/* UOPS_NOT_DELIVERED.ANY */
+EVENT_ATTR_STR(topdown-fetch-bubbles, td_fetch_bubbles_glm, "event=0x9c");
+/* ISSUE_SLOTS_NOT_CONSUMED.RECOVERY */
+EVENT_ATTR_STR(topdown-recovery-bubbles, td_recovery_bubbles_glm, "event=0xca,umask=0x02");
+/* UOPS_RETIRED.ANY */
+EVENT_ATTR_STR(topdown-slots-retired, td_slots_retired_glm, "event=0xc2");
+/* UOPS_ISSUED.ANY */
+EVENT_ATTR_STR(topdown-slots-issued, td_slots_issued_glm, "event=0x0e");
+
+static struct attribute *glm_events_attrs[] = {
+ EVENT_PTR(td_total_slots_glm),
+ EVENT_PTR(td_total_slots_scale_glm),
+ EVENT_PTR(td_fetch_bubbles_glm),
+ EVENT_PTR(td_recovery_bubbles_glm),
+ EVENT_PTR(td_slots_issued_glm),
+ EVENT_PTR(td_slots_retired_glm),
+ NULL
+};
+
static struct extra_reg intel_glm_extra_regs[] __read_mostly = {
/* must define OFFCORE_RSP_X first, see intel_fixup_er() */
INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x760005ffbfull, RSP_0),
@@ -2130,7 +2151,7 @@ again:
* counters from the GLOBAL_STATUS mask and we always process PEBS
* events via drain_pebs().
*/
- status &= ~cpuc->pebs_enabled;
+ status &= ~(cpuc->pebs_enabled & PEBS_COUNTER_MASK);
/*
* PEBS overflow sets bit 62 in the global status register
@@ -3750,6 +3771,7 @@ __init int intel_pmu_init(void)
x86_pmu.pebs_prec_dist = true;
x86_pmu.lbr_pt_coexist = true;
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.cpu_events = glm_events_attrs;
pr_cont("Goldmont events, ");
break;
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 9dfeeeca0ea8..c6d23ffe422d 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -1222,7 +1222,7 @@ get_next_pebs_record_by_bit(void *base, void *top, int bit)
/* clear non-PEBS bit and re-check */
pebs_status = p->status & cpuc->pebs_enabled;
- pebs_status &= (1ULL << MAX_PEBS_EVENTS) - 1;
+ pebs_status &= PEBS_COUNTER_MASK;
if (pebs_status == (1 << bit))
return at;
}
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 81b321ace8e0..f924629836a8 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -507,6 +507,9 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
cpuc->lbr_entries[i].to = msr_lastbranch.to;
cpuc->lbr_entries[i].mispred = 0;
cpuc->lbr_entries[i].predicted = 0;
+ cpuc->lbr_entries[i].in_tx = 0;
+ cpuc->lbr_entries[i].abort = 0;
+ cpuc->lbr_entries[i].cycles = 0;
cpuc->lbr_entries[i].reserved = 0;
}
cpuc->lbr_stack.nr = i;
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index 5900471ee508..ae8324d65e61 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -28,6 +28,7 @@
#include <asm/insn.h>
#include <asm/io.h>
#include <asm/intel_pt.h>
+#include <asm/intel-family.h>
#include "../perf_event.h"
#include "pt.h"
@@ -98,6 +99,7 @@ static struct attribute_group pt_cap_group = {
.name = "caps",
};
+PMU_FORMAT_ATTR(pt, "config:0" );
PMU_FORMAT_ATTR(cyc, "config:1" );
PMU_FORMAT_ATTR(pwr_evt, "config:4" );
PMU_FORMAT_ATTR(fup_on_ptw, "config:5" );
@@ -105,11 +107,13 @@ PMU_FORMAT_ATTR(mtc, "config:9" );
PMU_FORMAT_ATTR(tsc, "config:10" );
PMU_FORMAT_ATTR(noretcomp, "config:11" );
PMU_FORMAT_ATTR(ptw, "config:12" );
+PMU_FORMAT_ATTR(branch, "config:13" );
PMU_FORMAT_ATTR(mtc_period, "config:14-17" );
PMU_FORMAT_ATTR(cyc_thresh, "config:19-22" );
PMU_FORMAT_ATTR(psb_period, "config:24-27" );
static struct attribute *pt_formats_attr[] = {
+ &format_attr_pt.attr,
&format_attr_cyc.attr,
&format_attr_pwr_evt.attr,
&format_attr_fup_on_ptw.attr,
@@ -117,6 +121,7 @@ static struct attribute *pt_formats_attr[] = {
&format_attr_tsc.attr,
&format_attr_noretcomp.attr,
&format_attr_ptw.attr,
+ &format_attr_branch.attr,
&format_attr_mtc_period.attr,
&format_attr_cyc_thresh.attr,
&format_attr_psb_period.attr,
@@ -197,6 +202,19 @@ static int __init pt_pmu_hw_init(void)
pt_pmu.tsc_art_den = eax;
}
+ /* model-specific quirks */
+ switch (boot_cpu_data.x86_model) {
+ case INTEL_FAM6_BROADWELL_CORE:
+ case INTEL_FAM6_BROADWELL_XEON_D:
+ case INTEL_FAM6_BROADWELL_GT3E:
+ case INTEL_FAM6_BROADWELL_X:
+ /* not setting BRANCH_EN will #GP, erratum BDM106 */
+ pt_pmu.branch_en_always_on = true;
+ break;
+ default:
+ break;
+ }
+
if (boot_cpu_has(X86_FEATURE_VMX)) {
/*
* Intel SDM, 36.5 "Tracing post-VMXON" says that
@@ -263,8 +281,20 @@ fail:
#define RTIT_CTL_PTW (RTIT_CTL_PTW_EN | \
RTIT_CTL_FUP_ON_PTW)
-#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN | \
+/*
+ * Bit 0 (TraceEn) in the attr.config is meaningless as the
+ * corresponding bit in the RTIT_CTL can only be controlled
+ * by the driver; therefore, repurpose it to mean: pass
+ * through the bit that was previously assumed to be always
+ * on for PT, thereby allowing the user to *not* set it if
+ * they so wish. See also pt_event_valid() and pt_config().
+ */
+#define RTIT_CTL_PASSTHROUGH RTIT_CTL_TRACEEN
+
+#define PT_CONFIG_MASK (RTIT_CTL_TRACEEN | \
+ RTIT_CTL_TSC_EN | \
RTIT_CTL_DISRETC | \
+ RTIT_CTL_BRANCH_EN | \
RTIT_CTL_CYC_PSB | \
RTIT_CTL_MTC | \
RTIT_CTL_PWR_EVT_EN | \
@@ -332,6 +362,33 @@ static bool pt_event_valid(struct perf_event *event)
return false;
}
+ /*
+ * Setting bit 0 (TraceEn in RTIT_CTL MSR) in the attr.config
+ * clears the assumption that BranchEn must always be enabled,
+ * as was the case with the first implementation of PT.
+ * If this bit is not set, the legacy behavior is preserved
+ * for compatibility with the older userspace.
+ *
+ * Re-using bit 0 for this purpose is fine because it is never
+ * directly set by the user; previous attempts at setting it in
+ * the attr.config resulted in -EINVAL.
+ */
+ if (config & RTIT_CTL_PASSTHROUGH) {
+ /*
+ * Disallow not setting BRANCH_EN where BRANCH_EN is
+ * always required.
+ */
+ if (pt_pmu.branch_en_always_on &&
+ !(config & RTIT_CTL_BRANCH_EN))
+ return false;
+ } else {
+ /*
+ * Disallow BRANCH_EN without the PASSTHROUGH.
+ */
+ if (config & RTIT_CTL_BRANCH_EN)
+ return false;
+ }
+
return true;
}
@@ -411,6 +468,7 @@ static u64 pt_config_filters(struct perf_event *event)
static void pt_config(struct perf_event *event)
{
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
u64 reg;
if (!event->hw.itrace_started) {
@@ -419,7 +477,20 @@ static void pt_config(struct perf_event *event)
}
reg = pt_config_filters(event);
- reg |= RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN;
+ reg |= RTIT_CTL_TOPA | RTIT_CTL_TRACEEN;
+
+ /*
+ * Previously, we had BRANCH_EN on by default, but now that PT has
+ * grown features outside of branch tracing, it is useful to allow
+ * the user to disable it. Setting bit 0 in the event's attr.config
+ * allows BRANCH_EN to pass through instead of being always on. See
+ * also the comment in pt_event_valid().
+ */
+ if (event->attr.config & BIT(0)) {
+ reg |= event->attr.config & RTIT_CTL_BRANCH_EN;
+ } else {
+ reg |= RTIT_CTL_BRANCH_EN;
+ }
if (!event->attr.exclude_kernel)
reg |= RTIT_CTL_OS;
@@ -429,11 +500,15 @@ static void pt_config(struct perf_event *event)
reg |= (event->attr.config & PT_CONFIG_MASK);
event->hw.config = reg;
- wrmsrl(MSR_IA32_RTIT_CTL, reg);
+ if (READ_ONCE(pt->vmx_on))
+ perf_aux_output_flag(&pt->handle, PERF_AUX_FLAG_PARTIAL);
+ else
+ wrmsrl(MSR_IA32_RTIT_CTL, reg);
}
static void pt_config_stop(struct perf_event *event)
{
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
u64 ctl = READ_ONCE(event->hw.config);
/* may be already stopped by a PMI */
@@ -441,7 +516,8 @@ static void pt_config_stop(struct perf_event *event)
return;
ctl &= ~RTIT_CTL_TRACEEN;
- wrmsrl(MSR_IA32_RTIT_CTL, ctl);
+ if (!READ_ONCE(pt->vmx_on))
+ wrmsrl(MSR_IA32_RTIT_CTL, ctl);
WRITE_ONCE(event->hw.config, ctl);
@@ -753,7 +829,8 @@ static void pt_handle_status(struct pt *pt)
*/
if (!pt_cap_get(PT_CAP_topa_multiple_entries) ||
buf->output_off == sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) {
- local_inc(&buf->lost);
+ perf_aux_output_flag(&pt->handle,
+ PERF_AUX_FLAG_TRUNCATED);
advance++;
}
}
@@ -846,8 +923,10 @@ static int pt_buffer_reset_markers(struct pt_buffer *buf,
/* can't stop in the middle of an output region */
if (buf->output_off + handle->size + 1 <
- sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size))
+ sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) {
+ perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
return -EINVAL;
+ }
/* single entry ToPA is handled by marking all regions STOP=1 INT=1 */
@@ -1171,12 +1250,6 @@ void intel_pt_interrupt(void)
if (!READ_ONCE(pt->handle_nmi))
return;
- /*
- * If VMX is on and PT does not support it, don't touch anything.
- */
- if (READ_ONCE(pt->vmx_on))
- return;
-
if (!event)
return;
@@ -1192,8 +1265,7 @@ void intel_pt_interrupt(void)
pt_update_head(pt);
- perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0),
- local_xchg(&buf->lost, 0));
+ perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0));
if (!event->hw.state) {
int ret;
@@ -1208,7 +1280,7 @@ void intel_pt_interrupt(void)
/* snapshot counters don't use PMI, so it's safe */
ret = pt_buffer_reset_markers(buf, &pt->handle);
if (ret) {
- perf_aux_output_end(&pt->handle, 0, true);
+ perf_aux_output_end(&pt->handle, 0);
return;
}
@@ -1237,12 +1309,19 @@ void intel_pt_handle_vmx(int on)
local_irq_save(flags);
WRITE_ONCE(pt->vmx_on, on);
- if (on) {
- /* prevent pt_config_stop() from writing RTIT_CTL */
- event = pt->handle.event;
- if (event)
- event->hw.config = 0;
- }
+ /*
+ * If an AUX transaction is in progress, it will contain
+ * gap(s), so flag it PARTIAL to inform the user.
+ */
+ event = pt->handle.event;
+ if (event)
+ perf_aux_output_flag(&pt->handle,
+ PERF_AUX_FLAG_PARTIAL);
+
+ /* Turn PTs back on */
+ if (!on && event)
+ wrmsrl(MSR_IA32_RTIT_CTL, event->hw.config);
+
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(intel_pt_handle_vmx);
@@ -1257,9 +1336,6 @@ static void pt_event_start(struct perf_event *event, int mode)
struct pt *pt = this_cpu_ptr(&pt_ctx);
struct pt_buffer *buf;
- if (READ_ONCE(pt->vmx_on))
- return;
-
buf = perf_aux_output_begin(&pt->handle, event);
if (!buf)
goto fail_stop;
@@ -1280,7 +1356,7 @@ static void pt_event_start(struct perf_event *event, int mode)
return;
fail_end_stop:
- perf_aux_output_end(&pt->handle, 0, true);
+ perf_aux_output_end(&pt->handle, 0);
fail_stop:
hwc->state = PERF_HES_STOPPED;
}
@@ -1321,8 +1397,7 @@ static void pt_event_stop(struct perf_event *event, int mode)
pt->handle.head =
local_xchg(&buf->data_size,
buf->nr_pages << PAGE_SHIFT);
- perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0),
- local_xchg(&buf->lost, 0));
+ perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0));
}
}
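
With the new pt (config:0) and branch (config:13) format bits above, a tool that wants PT's newer features without branch tracing could set up the event attribute as in the hedged sketch below; the function name and the way the PMU type is obtained are illustrative, not from the patch.

    /* Illustrative only: request PT with BRANCH_EN passed through and left
     * off.  pt_event_valid() accepts this unless the Broadwell BDM106
     * quirk (branch_en_always_on) forces BRANCH_EN, in which case the
     * event is rejected at open time. */
    #include <linux/perf_event.h>
    #include <string.h>

    static void pt_attr_without_branch_tracing(struct perf_event_attr *attr,
                                               int pt_pmu_type) /* from sysfs */
    {
            memset(attr, 0, sizeof(*attr));
            attr->size   = sizeof(*attr);
            attr->type   = pt_pmu_type;  /* .../devices/intel_pt/type */
            attr->config = 1ULL << 0;    /* "pt": pass bit 13 through as-is */
            /* bit 13 ("branch") deliberately left clear: no BRANCH_EN */
    }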
diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h
index 53473c21b554..0eb41d07b79a 100644
--- a/arch/x86/events/intel/pt.h
+++ b/arch/x86/events/intel/pt.h
@@ -110,6 +110,7 @@ struct pt_pmu {
struct pmu pmu;
u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES];
bool vmx;
+ bool branch_en_always_on;
unsigned long max_nonturbo_ratio;
unsigned int tsc_art_num;
unsigned int tsc_art_den;
@@ -143,7 +144,6 @@ struct pt_buffer {
size_t output_off;
unsigned long nr_pages;
local_t data_size;
- local_t lost;
local64_t head;
bool snapshot;
unsigned long stop_pos, intr_pos;
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index bcbb1d2ae10b..be3d36254040 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -79,6 +79,7 @@ struct amd_nb {
/* The maximal number of PEBS events: */
#define MAX_PEBS_EVENTS 8
+#define PEBS_COUNTER_MASK ((1ULL << MAX_PEBS_EVENTS) - 1)
/*
* Flags PEBS can handle without an PMI.
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 8bef70e7f3cc..2b01421f7d0f 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -27,45 +27,22 @@
#include <linux/clockchips.h>
-#ifdef CONFIG_X86_64
+#ifdef CONFIG_HYPERV_TSCPAGE
static struct ms_hyperv_tsc_page *tsc_pg;
+struct ms_hyperv_tsc_page *hv_get_tsc_page(void)
+{
+ return tsc_pg;
+}
+
static u64 read_hv_clock_tsc(struct clocksource *arg)
{
- u64 current_tick;
+ u64 current_tick = hv_read_tsc_page(tsc_pg);
+
+ if (current_tick == U64_MAX)
+ rdmsrl(HV_X64_MSR_TIME_REF_COUNT, current_tick);
- if (tsc_pg->tsc_sequence != 0) {
- /*
- * Use the tsc page to compute the value.
- */
-
- while (1) {
- u64 tmp;
- u32 sequence = tsc_pg->tsc_sequence;
- u64 cur_tsc;
- u64 scale = tsc_pg->tsc_scale;
- s64 offset = tsc_pg->tsc_offset;
-
- rdtscll(cur_tsc);
- /* current_tick = ((cur_tsc *scale) >> 64) + offset */
- asm("mulq %3"
- : "=d" (current_tick), "=a" (tmp)
- : "a" (cur_tsc), "r" (scale));
-
- current_tick += offset;
- if (tsc_pg->tsc_sequence == sequence)
- return current_tick;
-
- if (tsc_pg->tsc_sequence != 0)
- continue;
- /*
- * Fallback using MSR method.
- */
- break;
- }
- }
- rdmsrl(HV_X64_MSR_TIME_REF_COUNT, current_tick);
return current_tick;
}
@@ -139,7 +116,7 @@ void hyperv_init(void)
/*
* Register Hyper-V specific clocksource.
*/
-#ifdef CONFIG_X86_64
+#ifdef CONFIG_HYPERV_TSCPAGE
if (ms_hyperv.features & HV_X64_MSR_REFERENCE_TSC_AVAILABLE) {
union hv_x64_msr_hypercall_contents tsc_msr;
@@ -155,6 +132,9 @@ void hyperv_init(void)
tsc_msr.guest_physical_address = vmalloc_to_pfn(tsc_pg);
wrmsrl(HV_X64_MSR_REFERENCE_TSC, tsc_msr.as_uint64);
+
+ hyperv_cs_tsc.archdata.vclock_mode = VCLOCK_HVCLOCK;
+
clocksource_register_hz(&hyperv_cs_tsc, NSEC_PER_SEC/100);
return;
}
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 730ef65e8393..bdffcd9eab2b 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -252,12 +252,6 @@ static inline int x2apic_enabled(void) { return 0; }
#define x2apic_supported() (0)
#endif /* !CONFIG_X86_X2APIC */
-#ifdef CONFIG_X86_64
-#define SET_APIC_ID(x) (apic->set_apic_id(x))
-#else
-
-#endif
-
/*
* Copyright 2004 James Cleverdon, IBM.
* Subject to the GNU Public License, v.2
@@ -299,6 +293,7 @@ struct apic {
int (*phys_pkg_id)(int cpuid_apic, int index_msb);
unsigned int (*get_apic_id)(unsigned long x);
+ /* Can't be NULL on 64-bit */
unsigned long (*set_apic_id)(unsigned int id);
int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask,
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 14635c5ea025..caa5798c92f4 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -186,6 +186,12 @@ static __always_inline int atomic_cmpxchg(atomic_t *v, int old, int new)
return cmpxchg(&v->counter, old, new);
}
+#define atomic_try_cmpxchg atomic_try_cmpxchg
+static __always_inline bool atomic_try_cmpxchg(atomic_t *v, int *old, int new)
+{
+ return try_cmpxchg(&v->counter, old, new);
+}
+
static inline int atomic_xchg(atomic_t *v, int new)
{
return xchg(&v->counter, new);
@@ -201,16 +207,12 @@ static inline void atomic_##op(int i, atomic_t *v) \
}
#define ATOMIC_FETCH_OP(op, c_op) \
-static inline int atomic_fetch_##op(int i, atomic_t *v) \
+static inline int atomic_fetch_##op(int i, atomic_t *v) \
{ \
- int old, val = atomic_read(v); \
- for (;;) { \
- old = atomic_cmpxchg(v, val, val c_op i); \
- if (old == val) \
- break; \
- val = old; \
- } \
- return old; \
+ int val = atomic_read(v); \
+ do { \
+ } while (!atomic_try_cmpxchg(v, &val, val c_op i)); \
+ return val; \
}
#define ATOMIC_OPS(op, c_op) \
@@ -236,16 +238,11 @@ ATOMIC_OPS(xor, ^)
*/
static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u)
{
- int c, old;
- c = atomic_read(v);
- for (;;) {
- if (unlikely(c == (u)))
- break;
- old = atomic_cmpxchg((v), c, c + (a));
- if (likely(old == c))
+ int c = atomic_read(v);
+ do {
+ if (unlikely(c == u))
break;
- c = old;
- }
+ } while (!atomic_try_cmpxchg(v, &c, c + a));
return c;
}
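
The rewritten fetch-and-op loops depend on try_cmpxchg() storing the value actually found in memory back through its "old" pointer on failure, which is why the do/while bodies above are empty. GCC's __atomic_compare_exchange_n() builtin has the same contract, so the pattern can be sketched in plain user-space C:

	/* fetch_or(): returns the value observed before the OR */
	static int fetch_or(int *v, int i)
	{
		int val = __atomic_load_n(v, __ATOMIC_RELAXED);

		do {
			/* on failure, val is refreshed by the builtin */
		} while (!__atomic_compare_exchange_n(v, &val, val | i,
						      false,
						      __ATOMIC_SEQ_CST,
						      __ATOMIC_RELAXED));
		return val;
	}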
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index 89ed2f6ae2f7..6189a433c9a9 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -176,6 +176,12 @@ static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new)
return cmpxchg(&v->counter, old, new);
}
+#define atomic64_try_cmpxchg atomic64_try_cmpxchg
+static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, long *old, long new)
+{
+ return try_cmpxchg(&v->counter, old, new);
+}
+
static inline long atomic64_xchg(atomic64_t *v, long new)
{
return xchg(&v->counter, new);
@@ -192,17 +198,12 @@ static inline long atomic64_xchg(atomic64_t *v, long new)
*/
static inline bool atomic64_add_unless(atomic64_t *v, long a, long u)
{
- long c, old;
- c = atomic64_read(v);
- for (;;) {
- if (unlikely(c == (u)))
- break;
- old = atomic64_cmpxchg((v), c, c + (a));
- if (likely(old == c))
- break;
- c = old;
- }
- return c != (u);
+ long c = atomic64_read(v);
+ do {
+ if (unlikely(c == u))
+ return false;
+ } while (!atomic64_try_cmpxchg(v, &c, c + a));
+ return true;
}
#define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0)
@@ -216,17 +217,12 @@ static inline bool atomic64_add_unless(atomic64_t *v, long a, long u)
*/
static inline long atomic64_dec_if_positive(atomic64_t *v)
{
- long c, old, dec;
- c = atomic64_read(v);
- for (;;) {
+ long dec, c = atomic64_read(v);
+ do {
dec = c - 1;
if (unlikely(dec < 0))
break;
- old = atomic64_cmpxchg((v), c, dec);
- if (likely(old == c))
- break;
- c = old;
- }
+ } while (!atomic64_try_cmpxchg(v, &c, dec));
return dec;
}
@@ -242,14 +238,10 @@ static inline void atomic64_##op(long i, atomic64_t *v) \
#define ATOMIC64_FETCH_OP(op, c_op) \
static inline long atomic64_fetch_##op(long i, atomic64_t *v) \
{ \
- long old, val = atomic64_read(v); \
- for (;;) { \
- old = atomic64_cmpxchg(v, val, val c_op i); \
- if (old == val) \
- break; \
- val = old; \
- } \
- return old; \
+ long val = atomic64_read(v); \
+ do { \
+ } while (!atomic64_try_cmpxchg(v, &val, val c_op i)); \
+ return val; \
}
#define ATOMIC64_OPS(op, c_op) \
diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h
index ba38ebbaced3..39e702d90cdb 100644
--- a/arch/x86/include/asm/bug.h
+++ b/arch/x86/include/asm/bug.h
@@ -1,36 +1,82 @@
#ifndef _ASM_X86_BUG_H
#define _ASM_X86_BUG_H
-#define HAVE_ARCH_BUG
+#include <linux/stringify.h>
-#ifdef CONFIG_DEBUG_BUGVERBOSE
+/*
+ * Since some emulators terminate on UD2, we cannot use it for WARN.
+ * Since various instruction decoders disagree on the length of UD1,
+ * we cannot use it either. So use UD0 for WARN.
+ *
+ * (binutils knows about "ud1" but {en,de}codes it as 2 bytes, whereas
+ * our kernel decoder thinks it takes a ModRM byte, which seems consistent
+ * with various things like the Intel SDM instruction encoding rules)
+ */
+
+#define ASM_UD0 ".byte 0x0f, 0xff"
+#define ASM_UD1 ".byte 0x0f, 0xb9" /* + ModRM */
+#define ASM_UD2 ".byte 0x0f, 0x0b"
+
+#define INSN_UD0 0xff0f
+#define INSN_UD2 0x0b0f
+
+#define LEN_UD0 2
+
+#ifdef CONFIG_GENERIC_BUG
#ifdef CONFIG_X86_32
-# define __BUG_C0 "2:\t.long 1b, %c0\n"
+# define __BUG_REL(val) ".long " __stringify(val)
#else
-# define __BUG_C0 "2:\t.long 1b - 2b, %c0 - 2b\n"
+# define __BUG_REL(val) ".long " __stringify(val) " - 2b"
#endif
-#define BUG() \
-do { \
- asm volatile("1:\tud2\n" \
- ".pushsection __bug_table,\"a\"\n" \
- __BUG_C0 \
- "\t.word %c1, 0\n" \
- "\t.org 2b+%c2\n" \
- ".popsection" \
- : : "i" (__FILE__), "i" (__LINE__), \
- "i" (sizeof(struct bug_entry))); \
- unreachable(); \
+#ifdef CONFIG_DEBUG_BUGVERBOSE
+
+#define _BUG_FLAGS(ins, flags) \
+do { \
+ asm volatile("1:\t" ins "\n" \
+ ".pushsection __bug_table,\"a\"\n" \
+ "2:\t" __BUG_REL(1b) "\t# bug_entry::bug_addr\n" \
+ "\t" __BUG_REL(%c0) "\t# bug_entry::file\n" \
+ "\t.word %c1" "\t# bug_entry::line\n" \
+ "\t.word %c2" "\t# bug_entry::flags\n" \
+ "\t.org 2b+%c3\n" \
+ ".popsection" \
+ : : "i" (__FILE__), "i" (__LINE__), \
+ "i" (flags), \
+ "i" (sizeof(struct bug_entry))); \
} while (0)
+#else /* !CONFIG_DEBUG_BUGVERBOSE */
+
+#define _BUG_FLAGS(ins, flags) \
+do { \
+ asm volatile("1:\t" ins "\n" \
+ ".pushsection __bug_table,\"a\"\n" \
+ "2:\t" __BUG_REL(1b) "\t# bug_entry::bug_addr\n" \
+ "\t.word %c0" "\t# bug_entry::flags\n" \
+ "\t.org 2b+%c1\n" \
+ ".popsection" \
+ : : "i" (flags), \
+ "i" (sizeof(struct bug_entry))); \
+} while (0)
+
+#endif /* CONFIG_DEBUG_BUGVERBOSE */
+
#else
+
+#define _BUG_FLAGS(ins, flags) asm volatile(ins)
+
+#endif /* CONFIG_GENERIC_BUG */
+
+#define HAVE_ARCH_BUG
#define BUG() \
do { \
- asm volatile("ud2"); \
+ _BUG_FLAGS(ASM_UD2, 0); \
unreachable(); \
} while (0)
-#endif
+
+#define __WARN_FLAGS(flags) _BUG_FLAGS(ASM_UD0, BUGFLAG_WARNING|(flags))
#include <asm-generic/bug.h>
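
On the trap side, the relative encoding emitted by __BUG_REL() is undone by adding the 32-bit displacement back to the address of the bug_entry itself. This is essentially what lib/bug.c's bug_addr() does (shown here without the file/line handling; x86-64 selects CONFIG_GENERIC_BUG_RELATIVE_POINTERS):

	static inline unsigned long bug_addr(const struct bug_entry *bug)
	{
	#ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS
		return bug->bug_addr;	/* 32-bit: absolute .long */
	#else
		/* 64-bit: ".long 1b - 2b" is relative to the entry */
		return (unsigned long)bug + bug->bug_addr_disp;
	#endif
	}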
diff --git a/arch/x86/include/asm/clocksource.h b/arch/x86/include/asm/clocksource.h
index eae33c7170c8..47bea8cadbd0 100644
--- a/arch/x86/include/asm/clocksource.h
+++ b/arch/x86/include/asm/clocksource.h
@@ -6,7 +6,8 @@
#define VCLOCK_NONE 0 /* No vDSO clock available. */
#define VCLOCK_TSC 1 /* vDSO should use vread_tsc. */
#define VCLOCK_PVCLOCK 2 /* vDSO should use vread_pvclock. */
-#define VCLOCK_MAX 2
+#define VCLOCK_HVCLOCK 3 /* vDSO should use vread_hvclock. */
+#define VCLOCK_MAX 3
struct arch_clocksource_data {
int vclock_mode;
diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h
index 97848cdfcb1a..d90296d061e8 100644
--- a/arch/x86/include/asm/cmpxchg.h
+++ b/arch/x86/include/asm/cmpxchg.h
@@ -153,6 +153,76 @@ extern void __add_wrong_size(void)
#define cmpxchg_local(ptr, old, new) \
__cmpxchg_local(ptr, old, new, sizeof(*(ptr)))
+
+#define __raw_try_cmpxchg(_ptr, _pold, _new, size, lock) \
+({ \
+ bool success; \
+ __typeof__(_ptr) _old = (_pold); \
+ __typeof__(*(_ptr)) __old = *_old; \
+ __typeof__(*(_ptr)) __new = (_new); \
+ switch (size) { \
+ case __X86_CASE_B: \
+ { \
+ volatile u8 *__ptr = (volatile u8 *)(_ptr); \
+ asm volatile(lock "cmpxchgb %[new], %[ptr]" \
+ CC_SET(z) \
+ : CC_OUT(z) (success), \
+ [ptr] "+m" (*__ptr), \
+ [old] "+a" (__old) \
+ : [new] "q" (__new) \
+ : "memory"); \
+ break; \
+ } \
+ case __X86_CASE_W: \
+ { \
+ volatile u16 *__ptr = (volatile u16 *)(_ptr); \
+ asm volatile(lock "cmpxchgw %[new], %[ptr]" \
+ CC_SET(z) \
+ : CC_OUT(z) (success), \
+ [ptr] "+m" (*__ptr), \
+ [old] "+a" (__old) \
+ : [new] "r" (__new) \
+ : "memory"); \
+ break; \
+ } \
+ case __X86_CASE_L: \
+ { \
+ volatile u32 *__ptr = (volatile u32 *)(_ptr); \
+ asm volatile(lock "cmpxchgl %[new], %[ptr]" \
+ CC_SET(z) \
+ : CC_OUT(z) (success), \
+ [ptr] "+m" (*__ptr), \
+ [old] "+a" (__old) \
+ : [new] "r" (__new) \
+ : "memory"); \
+ break; \
+ } \
+ case __X86_CASE_Q: \
+ { \
+ volatile u64 *__ptr = (volatile u64 *)(_ptr); \
+ asm volatile(lock "cmpxchgq %[new], %[ptr]" \
+ CC_SET(z) \
+ : CC_OUT(z) (success), \
+ [ptr] "+m" (*__ptr), \
+ [old] "+a" (__old) \
+ : [new] "r" (__new) \
+ : "memory"); \
+ break; \
+ } \
+ default: \
+ __cmpxchg_wrong_size(); \
+ } \
+ if (unlikely(!success)) \
+ *_old = __old; \
+ likely(success); \
+})
+
+#define __try_cmpxchg(ptr, pold, new, size) \
+ __raw_try_cmpxchg((ptr), (pold), (new), (size), LOCK_PREFIX)
+
+#define try_cmpxchg(ptr, pold, new) \
+ __try_cmpxchg((ptr), (pold), (new), sizeof(*(ptr)))
+
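+
Note that "pold" is a pointer: when the cmpxchg fails, the macro writes the value actually found in memory back through it, so a retry loop always works on fresh data without an explicit reload. A hypothetical capped-increment helper (inc_below() is not an existing kernel function) shows the intended calling pattern:

	static bool inc_below(unsigned int *p, unsigned int limit)
	{
		unsigned int old = READ_ONCE(*p);

		do {
			if (old >= limit)
				return false;
		} while (!try_cmpxchg(p, &old, old + 1));

		return true;
	}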
/*
* xadd() adds "inc" to "*ptr" and atomically returns the previous
* value of "*ptr".
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 0fe00446f9ca..2701e5f8145b 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -202,6 +202,8 @@
#define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */
#define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */
+#define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */
+
/* Virtualization flags: Linux defined, word 8 */
#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index d4d3ed456cb7..e8ab9a46bc68 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -287,7 +287,7 @@ struct task_struct;
#define ARCH_DLINFO_IA32 \
do { \
- if (vdso32_enabled) { \
+ if (VDSO_CURRENT_BASE) { \
NEW_AUX_ENT(AT_SYSINFO, VDSO_ENTRY); \
NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_CURRENT_BASE); \
} \
diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h
index 9814db42b790..75b748a1deb8 100644
--- a/arch/x86/include/asm/intel-family.h
+++ b/arch/x86/include/asm/intel-family.h
@@ -12,6 +12,7 @@
*/
#define INTEL_FAM6_CORE_YONAH 0x0E
+
#define INTEL_FAM6_CORE2_MEROM 0x0F
#define INTEL_FAM6_CORE2_MEROM_L 0x16
#define INTEL_FAM6_CORE2_PENRYN 0x17
@@ -21,6 +22,7 @@
#define INTEL_FAM6_NEHALEM_G 0x1F /* Auburndale / Havendale */
#define INTEL_FAM6_NEHALEM_EP 0x1A
#define INTEL_FAM6_NEHALEM_EX 0x2E
+
#define INTEL_FAM6_WESTMERE 0x25
#define INTEL_FAM6_WESTMERE_EP 0x2C
#define INTEL_FAM6_WESTMERE_EX 0x2F
@@ -36,9 +38,9 @@
#define INTEL_FAM6_HASWELL_GT3E 0x46
#define INTEL_FAM6_BROADWELL_CORE 0x3D
-#define INTEL_FAM6_BROADWELL_XEON_D 0x56
#define INTEL_FAM6_BROADWELL_GT3E 0x47
#define INTEL_FAM6_BROADWELL_X 0x4F
+#define INTEL_FAM6_BROADWELL_XEON_D 0x56
#define INTEL_FAM6_SKYLAKE_MOBILE 0x4E
#define INTEL_FAM6_SKYLAKE_DESKTOP 0x5E
@@ -59,8 +61,8 @@
#define INTEL_FAM6_ATOM_MERRIFIELD 0x4A /* Tangier */
#define INTEL_FAM6_ATOM_MOOREFIELD 0x5A /* Anniedale */
#define INTEL_FAM6_ATOM_GOLDMONT 0x5C
-#define INTEL_FAM6_ATOM_GEMINI_LAKE 0x7A
#define INTEL_FAM6_ATOM_DENVERTON 0x5F /* Goldmont Microserver */
+#define INTEL_FAM6_ATOM_GEMINI_LAKE 0x7A
/* Xeon Phi */
diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h
index 0d64397cee58..597dc4995678 100644
--- a/arch/x86/include/asm/intel_rdt.h
+++ b/arch/x86/include/asm/intel_rdt.h
@@ -12,6 +12,7 @@
#define IA32_L3_QOS_CFG 0xc81
#define IA32_L3_CBM_BASE 0xc90
#define IA32_L2_CBM_BASE 0xd10
+#define IA32_MBA_THRTL_BASE 0xd50
#define L3_QOS_CDP_ENABLE 0x01ULL
@@ -37,23 +38,30 @@ struct rdtgroup {
/* rdtgroup.flags */
#define RDT_DELETED 1
+/* rftype.flags */
+#define RFTYPE_FLAGS_CPUS_LIST 1
+
/* List of all resource groups */
extern struct list_head rdt_all_groups;
+extern int max_name_width, max_data_width;
+
int __init rdtgroup_init(void);
/**
* struct rftype - describe each file in the resctrl file system
- * @name: file name
- * @mode: access mode
- * @kf_ops: operations
- * @seq_show: show content of the file
- * @write: write to the file
+ * @name: File name
+ * @mode: Access mode
+ * @kf_ops: File operations
+ * @flags: File specific RFTYPE_FLAGS_* flags
+ * @seq_show: Show content of the file
+ * @write: Write to the file
*/
struct rftype {
char *name;
umode_t mode;
struct kernfs_ops *kf_ops;
+ unsigned long flags;
int (*seq_show)(struct kernfs_open_file *of,
struct seq_file *sf, void *v);
@@ -67,54 +75,21 @@ struct rftype {
};
/**
- * struct rdt_resource - attributes of an RDT resource
- * @enabled: Is this feature enabled on this machine
- * @capable: Is this feature available on this machine
- * @name: Name to use in "schemata" file
- * @num_closid: Number of CLOSIDs available
- * @max_cbm: Largest Cache Bit Mask allowed
- * @min_cbm_bits: Minimum number of consecutive bits to be set
- * in a cache bit mask
- * @domains: All domains for this resource
- * @num_domains: Number of domains active
- * @msr_base: Base MSR address for CBMs
- * @tmp_cbms: Scratch space when updating schemata
- * @num_tmp_cbms: Number of CBMs in tmp_cbms
- * @cache_level: Which cache level defines scope of this domain
- * @cbm_idx_multi: Multiplier of CBM index
- * @cbm_idx_offset: Offset of CBM index. CBM index is computed by:
- * closid * cbm_idx_multi + cbm_idx_offset
- */
-struct rdt_resource {
- bool enabled;
- bool capable;
- char *name;
- int num_closid;
- int cbm_len;
- int min_cbm_bits;
- u32 max_cbm;
- struct list_head domains;
- int num_domains;
- int msr_base;
- u32 *tmp_cbms;
- int num_tmp_cbms;
- int cache_level;
- int cbm_idx_multi;
- int cbm_idx_offset;
-};
-
-/**
* struct rdt_domain - group of cpus sharing an RDT resource
* @list: all instances of this resource
* @id: unique id for this instance
* @cpu_mask: which cpus share this resource
- * @cbm: array of cache bit masks (indexed by CLOSID)
+ * @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID)
+ * @new_ctrl: new ctrl value to be loaded
+ * @have_new_ctrl: did user provide new_ctrl for this domain
*/
struct rdt_domain {
struct list_head list;
int id;
struct cpumask cpu_mask;
- u32 *cbm;
+ u32 *ctrl_val;
+ u32 new_ctrl;
+ bool have_new_ctrl;
};
/**
@@ -129,6 +104,83 @@ struct msr_param {
int high;
};
+/**
+ * struct rdt_cache - Cache allocation related data
+ * @cbm_len: Length of the cache bit mask
+ * @min_cbm_bits:	Minimum number of consecutive bits to be set
+ *			in a cache bit mask
+ * @cbm_idx_mult:	Multiplier of CBM index
+ * @cbm_idx_offset:	Offset of CBM index. CBM index is computed by:
+ *			closid * cbm_idx_mult + cbm_idx_offset
+ */
+struct rdt_cache {
+ unsigned int cbm_len;
+ unsigned int min_cbm_bits;
+ unsigned int cbm_idx_mult;
+ unsigned int cbm_idx_offset;
+};
+
+/**
+ * struct rdt_membw - Memory bandwidth allocation related data
+ * @max_delay: Max throttle delay. Delay is the hardware
+ * representation for memory bandwidth.
+ * @min_bw: Minimum memory bandwidth percentage user can request
+ * @bw_gran: Granularity at which the memory bandwidth is allocated
+ * @delay_linear: True if memory B/W delay is in linear scale
+ * @mb_map: Mapping of memory B/W percentage to memory B/W delay
+ */
+struct rdt_membw {
+ u32 max_delay;
+ u32 min_bw;
+ u32 bw_gran;
+ u32 delay_linear;
+ u32 *mb_map;
+};
+
+/**
+ * struct rdt_resource - attributes of an RDT resource
+ * @enabled: Is this feature enabled on this machine
+ * @capable: Is this feature available on this machine
+ * @name: Name to use in "schemata" file
+ * @num_closid: Number of CLOSIDs available
+ * @cache_level: Which cache level defines scope of this resource
+ * @default_ctrl: Specifies default cache cbm or memory B/W percent.
+ * @msr_base: Base MSR address for CBMs
+ * @msr_update: Function pointer to update QOS MSRs
+ * @data_width: Character width of data when displaying
+ * @domains: All domains for this resource
+ * @cache: Cache allocation related data
+ * @info_files: resctrl info files for the resource
+ * @nr_info_files: Number of info files
+ * @format_str: Per resource format string to show domain value
+ * @parse_ctrlval: Per resource function pointer to parse control values
+ */
+struct rdt_resource {
+ bool enabled;
+ bool capable;
+ char *name;
+ int num_closid;
+ int cache_level;
+ u32 default_ctrl;
+ unsigned int msr_base;
+ void (*msr_update) (struct rdt_domain *d, struct msr_param *m,
+ struct rdt_resource *r);
+ int data_width;
+ struct list_head domains;
+ struct rdt_cache cache;
+ struct rdt_membw membw;
+ struct rftype *info_files;
+ int nr_info_files;
+ const char *format_str;
+ int (*parse_ctrlval) (char *buf, struct rdt_resource *r,
+ struct rdt_domain *d);
+};
+
+void rdt_get_cache_infofile(struct rdt_resource *r);
+void rdt_get_mba_infofile(struct rdt_resource *r);
+int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d);
+int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d);
+
extern struct mutex rdtgroup_mutex;
extern struct rdt_resource rdt_resources_all[];
@@ -142,6 +194,7 @@ enum {
RDT_RESOURCE_L3DATA,
RDT_RESOURCE_L3CODE,
RDT_RESOURCE_L2,
+ RDT_RESOURCE_MBA,
/* Must be the last */
RDT_NUM_RESOURCES,
@@ -149,7 +202,7 @@ enum {
#define for_each_capable_rdt_resource(r) \
for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
- r++) \
+ r++) \
if (r->capable)
#define for_each_enabled_rdt_resource(r) \
@@ -165,8 +218,16 @@ union cpuid_0x10_1_eax {
unsigned int full;
};
-/* CPUID.(EAX=10H, ECX=ResID=1).EDX */
-union cpuid_0x10_1_edx {
+/* CPUID.(EAX=10H, ECX=ResID=3).EAX */
+union cpuid_0x10_3_eax {
+ struct {
+ unsigned int max_delay:12;
+ } split;
+ unsigned int full;
+};
+
+/* CPUID.(EAX=10H, ECX=ResID).EDX */
+union cpuid_0x10_x_edx {
struct {
unsigned int cos_max:16;
} split;
@@ -175,7 +236,7 @@ union cpuid_0x10_1_edx {
DECLARE_PER_CPU_READ_MOSTLY(int, cpu_closid);
-void rdt_cbm_update(void *arg);
+void rdt_ctrl_update(void *arg);
struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn);
void rdtgroup_kn_unlock(struct kernfs_node *kn);
ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index 200581691c6e..34b984c60790 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -72,14 +72,13 @@ struct arch_specific_insn {
/* copy of the original instruction */
kprobe_opcode_t *insn;
/*
- * boostable = -1: This instruction type is not boostable.
- * boostable = 0: This instruction type is boostable.
- * boostable = 1: This instruction has been boosted: we have
+ * boostable = false: This instruction type is not boostable.
+ * boostable = true: This instruction has been boosted: we have
* added a relative jump after the instruction copy in insn,
* so no single-step and fixup are needed (unless there's
* a post_handler or break_handler).
*/
- int boostable;
+ bool boostable;
bool if_modifier;
};
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index e63873683d4a..4fd5195deed0 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -128,7 +128,7 @@
* debugging tools. Each entry is only valid when its finished flag
* is set.
*/
-struct mce_log {
+struct mce_log_buffer {
char signature[12]; /* "MACHINECHECK" */
unsigned len; /* = MCE_LOG_LEN */
unsigned next;
@@ -191,10 +191,12 @@ extern struct mca_config mca_cfg;
extern struct mca_msr_regs msr_ops;
enum mce_notifier_prios {
- MCE_PRIO_SRAO = INT_MAX,
- MCE_PRIO_EXTLOG = INT_MAX - 1,
- MCE_PRIO_NFIT = INT_MAX - 2,
- MCE_PRIO_EDAC = INT_MAX - 3,
+ MCE_PRIO_FIRST = INT_MAX,
+ MCE_PRIO_SRAO = INT_MAX - 1,
+ MCE_PRIO_EXTLOG = INT_MAX - 2,
+ MCE_PRIO_NFIT = INT_MAX - 3,
+ MCE_PRIO_EDAC = INT_MAX - 4,
+ MCE_PRIO_MCELOG = 1,
MCE_PRIO_LOWEST = 0,
};
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 7c9c895432a9..fba100713924 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -176,4 +176,58 @@ void hyperv_report_panic(struct pt_regs *regs);
bool hv_is_hypercall_page_setup(void);
void hyperv_cleanup(void);
#endif
+#ifdef CONFIG_HYPERV_TSCPAGE
+struct ms_hyperv_tsc_page *hv_get_tsc_page(void);
+static inline u64 hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg)
+{
+ u64 scale, offset, cur_tsc;
+ u32 sequence;
+
+ /*
+ * The protocol for reading Hyper-V TSC page is specified in Hypervisor
+ * Top-Level Functional Specification ver. 3.0 and above. To get the
+ * reference time we must do the following:
+ * - READ ReferenceTscSequence
+ * A special '0' value indicates the time source is unreliable and we
+ * need to use something else. The currently published specification
+ * versions (up to 4.0b) contain a mistake and wrongly claim '-1'
+ * instead of '0' as the special value, see commit c35b82ef0294.
+ * - ReferenceTime =
+ * ((RDTSC() * ReferenceTscScale) >> 64) + ReferenceTscOffset
+ * - READ ReferenceTscSequence again. In case its value has changed
+ * since our first reading we need to discard ReferenceTime and repeat
+ * the whole sequence as the hypervisor was updating the page in
+ * between.
+ */
+ do {
+ sequence = READ_ONCE(tsc_pg->tsc_sequence);
+ if (!sequence)
+ return U64_MAX;
+ /*
+ * Make sure we read sequence before we read other values from
+ * TSC page.
+ */
+ smp_rmb();
+
+ scale = READ_ONCE(tsc_pg->tsc_scale);
+ offset = READ_ONCE(tsc_pg->tsc_offset);
+ cur_tsc = rdtsc_ordered();
+
+ /*
+ * Make sure we read sequence after we read all other values
+ * from TSC page.
+ */
+ smp_rmb();
+
+ } while (READ_ONCE(tsc_pg->tsc_sequence) != sequence);
+
+ return mul_u64_u64_shr(cur_tsc, scale, 64) + offset;
+}
+
+#else
+static inline struct ms_hyperv_tsc_page *hv_get_tsc_page(void)
+{
+ return NULL;
+}
+#endif
#endif
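
The read loop above is a sequence-counter protocol with 0 reserved as the "unreliable" marker. Stripped of the Hyper-V specifics, the shape is the following (a minimal sketch for a hypothetical shared page, with the smp_rmb() barriers elided):

	struct shared_page {
		unsigned int seq;
		unsigned long long scale, offset;
	};

	static bool read_consistent(const volatile struct shared_page *s,
				    unsigned long long *scale,
				    unsigned long long *offset)
	{
		unsigned int seq;

		do {
			seq = s->seq;
			if (!seq)
				return false;	/* fall back to the MSR */
			*scale = s->scale;
			*offset = s->offset;
		} while (s->seq != seq);	/* writer raced: retry */

		return true;
	}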
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index b3bebf9e5746..b4a0d43248cf 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -4,6 +4,7 @@
#include <asm/page_64_types.h>
#ifndef __ASSEMBLY__
+#include <asm/alternative.h>
/* duplicated to the one in bootmem.h */
extern unsigned long max_pfn;
@@ -34,7 +35,20 @@ extern unsigned long __phys_addr_symbol(unsigned long);
#define pfn_valid(pfn) ((pfn) < max_pfn)
#endif
-void clear_page(void *page);
+void clear_page_orig(void *page);
+void clear_page_rep(void *page);
+void clear_page_erms(void *page);
+
+static inline void clear_page(void *page)
+{
+ alternative_call_2(clear_page_orig,
+ clear_page_rep, X86_FEATURE_REP_GOOD,
+ clear_page_erms, X86_FEATURE_ERMS,
+ "=D" (page),
+ "0" (page)
+ : "memory", "rax", "rcx");
+}
+
void copy_page(void *to, void *from);
#endif /* !__ASSEMBLY__ */
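
alternative_call_2() patches the call target once at boot, so the feature test costs nothing per invocation. The selection policy can be restated with a function pointer resolved at init time (a sketch of the policy only, not the kernel's patching mechanism; ERMS takes precedence, matching the second alternative winning):

	static void (*clear_page_fn)(void *) = clear_page_orig;

	static void __init select_clear_page(void)
	{
		if (boot_cpu_has(X86_FEATURE_ERMS))
			clear_page_fn = clear_page_erms;	/* rep stosb */
		else if (boot_cpu_has(X86_FEATURE_REP_GOOD))
			clear_page_fn = clear_page_rep;		/* rep stosq */
	}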
diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h
index 2c1ebeb4d737..529bb4a6487a 100644
--- a/arch/x86/include/asm/pmem.h
+++ b/arch/x86/include/asm/pmem.h
@@ -55,7 +55,8 @@ static inline int arch_memcpy_from_pmem(void *dst, const void *src, size_t n)
* @size: number of bytes to write back
*
* Write back a cache range using the CLWB (cache line write back)
- * instruction.
+ * instruction. Note that @size is internally rounded up to be cache
+ * line size aligned.
*/
static inline void arch_wb_cache_pmem(void *addr, size_t size)
{
@@ -69,15 +70,6 @@ static inline void arch_wb_cache_pmem(void *addr, size_t size)
clwb(p);
}
-/*
- * copy_from_iter_nocache() on x86 only uses non-temporal stores for iovec
- * iterators, so for other types (bvec & kvec) we must do a cache write-back.
- */
-static inline bool __iter_needs_pmem_wb(struct iov_iter *i)
-{
- return iter_is_iovec(i) == false;
-}
-
/**
* arch_copy_from_iter_pmem - copy data from an iterator to PMEM
* @addr: PMEM destination address
@@ -94,7 +86,35 @@ static inline size_t arch_copy_from_iter_pmem(void *addr, size_t bytes,
/* TODO: skip the write-back by always using non-temporal stores */
len = copy_from_iter_nocache(addr, bytes, i);
- if (__iter_needs_pmem_wb(i))
+ /*
+ * In the iovec case on x86_64 copy_from_iter_nocache() uses
+ * non-temporal stores for the bulk of the transfer, but we need
+ * to manually flush if the transfer is unaligned. A cached
+ * memory copy is used when destination or size is not naturally
+ * aligned. That is:
+ * - Require 8-byte alignment when size is 8 bytes or larger.
+ * - Require 4-byte alignment when size is 4 bytes.
+ *
+ * In the non-iovec case the entire destination needs to be
+ * flushed.
+ */
+ if (iter_is_iovec(i)) {
+ unsigned long flushed, dest = (unsigned long) addr;
+
+ if (bytes < 8) {
+ if (!IS_ALIGNED(dest, 4) || (bytes != 4))
+ arch_wb_cache_pmem(addr, 1);
+ } else {
+ if (!IS_ALIGNED(dest, 8)) {
+ dest = ALIGN(dest, boot_cpu_data.x86_clflush_size);
+ arch_wb_cache_pmem(addr, 1);
+ }
+
+ flushed = dest - (unsigned long) addr;
+ if (bytes > flushed && !IS_ALIGNED(bytes - flushed, 8))
+ arch_wb_cache_pmem(addr + bytes - 1, 1);
+ }
+ } else
arch_wb_cache_pmem(addr, bytes);
return len;
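
The alignment rules above reduce to a single predicate: the manual write-back is skipped only for a 4-byte copy to a 4-byte-aligned address, or a multiple of 8 bytes to an 8-byte-aligned address. A self-contained restatement (a sketch, e.g. for a unit test; IS_ALIGNED as in include/linux/kernel.h):

	#include <stdbool.h>
	#include <stddef.h>

	#define IS_ALIGNED(x, a)	(((x) & ((a) - 1)) == 0)

	static bool needs_manual_flush(unsigned long dest, size_t bytes)
	{
		if (bytes < 8)
			return !IS_ALIGNED(dest, 4) || bytes != 4;
		return !IS_ALIGNED(dest, 8) || !IS_ALIGNED(bytes, 8);
	}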
diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h
index 2cb1cc253d51..fc62ba8dce93 100644
--- a/arch/x86/include/asm/reboot.h
+++ b/arch/x86/include/asm/reboot.h
@@ -15,6 +15,7 @@ struct machine_ops {
};
extern struct machine_ops machine_ops;
+extern int crashing_cpu;
void native_machine_crash_shutdown(struct pt_regs *regs);
void native_machine_shutdown(void);
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 026ea82ecc60..47103eca3775 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -149,6 +149,19 @@ void smp_store_cpu_info(int id);
#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
#define cpu_acpi_id(cpu) per_cpu(x86_cpu_to_acpiid, cpu)
+/*
+ * This function is needed by all SMP systems. It must _always_ be valid
+ * from the initial startup. We map APIC_BASE very early in page_setup(),
+ * so this is correct in the x86 case.
+ */
+#define raw_smp_processor_id() (this_cpu_read(cpu_number))
+
+#ifdef CONFIG_X86_32
+extern int safe_smp_processor_id(void);
+#else
+# define safe_smp_processor_id() smp_processor_id()
+#endif
+
#else /* !CONFIG_SMP */
#define wbinvd_on_cpu(cpu) wbinvd()
static inline int wbinvd_on_all_cpus(void)
@@ -161,22 +174,6 @@ static inline int wbinvd_on_all_cpus(void)
extern unsigned disabled_cpus;
-#ifdef CONFIG_X86_32_SMP
-/*
- * This function is needed by all SMP systems. It must _always_ be valid
- * from the initial startup. We map APIC_BASE very early in page_setup(),
- * so this is correct in the x86 case.
- */
-#define raw_smp_processor_id() (this_cpu_read(cpu_number))
-extern int safe_smp_processor_id(void);
-
-#elif defined(CONFIG_X86_64_SMP)
-#define raw_smp_processor_id() (this_cpu_read(cpu_number))
-
-#define safe_smp_processor_id() smp_processor_id()
-
-#endif
-
#ifdef CONFIG_X86_LOCAL_APIC
#ifndef CONFIG_X86_64
@@ -191,11 +188,7 @@ static inline int logical_smp_processor_id(void)
extern int hard_smp_processor_id(void);
#else /* CONFIG_X86_LOCAL_APIC */
-
-# ifndef CONFIG_SMP
-# define hard_smp_processor_id() 0
-# endif
-
+#define hard_smp_processor_id() 0
#endif /* CONFIG_X86_LOCAL_APIC */
#ifdef CONFIG_DEBUG_NMI_SELFTEST
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index ea148313570f..68766b276d9e 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -3,19 +3,14 @@
/*
* User space memory access functions
*/
-#include <linux/errno.h>
#include <linux/compiler.h>
#include <linux/kasan-checks.h>
-#include <linux/thread_info.h>
#include <linux/string.h>
#include <asm/asm.h>
#include <asm/page.h>
#include <asm/smap.h>
#include <asm/extable.h>
-#define VERIFY_READ 0
-#define VERIFY_WRITE 1
-
/*
* The fs value determines whether argument validity checking should be
* performed or not. If get_fs() == USER_DS, checking is performed, with
@@ -384,6 +379,18 @@ do { \
: "=r" (err), ltype(x) \
: "m" (__m(addr)), "i" (errret), "0" (err))
+#define __get_user_asm_nozero(x, addr, err, itype, rtype, ltype, errret) \
+ asm volatile("\n" \
+ "1: mov"itype" %2,%"rtype"1\n" \
+ "2:\n" \
+ ".section .fixup,\"ax\"\n" \
+ "3: mov %3,%0\n" \
+ " jmp 2b\n" \
+ ".previous\n" \
+ _ASM_EXTABLE(1b, 3b) \
+ : "=r" (err), ltype(x) \
+ : "m" (__m(addr)), "i" (errret), "0" (err))
+
/*
* This doesn't do __uaccess_begin/end - the exception handling
* around it must do that.
@@ -675,59 +682,6 @@ extern struct movsl_mask {
# include <asm/uaccess_64.h>
#endif
-unsigned long __must_check _copy_from_user(void *to, const void __user *from,
- unsigned n);
-unsigned long __must_check _copy_to_user(void __user *to, const void *from,
- unsigned n);
-
-extern void __compiletime_error("usercopy buffer size is too small")
-__bad_copy_user(void);
-
-static inline void copy_user_overflow(int size, unsigned long count)
-{
- WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count);
-}
-
-static __always_inline unsigned long __must_check
-copy_from_user(void *to, const void __user *from, unsigned long n)
-{
- int sz = __compiletime_object_size(to);
-
- might_fault();
-
- kasan_check_write(to, n);
-
- if (likely(sz < 0 || sz >= n)) {
- check_object_size(to, n, false);
- n = _copy_from_user(to, from, n);
- } else if (!__builtin_constant_p(n))
- copy_user_overflow(sz, n);
- else
- __bad_copy_user();
-
- return n;
-}
-
-static __always_inline unsigned long __must_check
-copy_to_user(void __user *to, const void *from, unsigned long n)
-{
- int sz = __compiletime_object_size(from);
-
- kasan_check_read(from, n);
-
- might_fault();
-
- if (likely(sz < 0 || sz >= n)) {
- check_object_size(from, n, true);
- n = _copy_to_user(to, from, n);
- } else if (!__builtin_constant_p(n))
- copy_user_overflow(sz, n);
- else
- __bad_copy_user();
-
- return n;
-}
-
/*
* We rely on the nested NMI work to allow atomic faults from the NMI path; the
* nested NMI paths are careful to preserve CR2.
diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
index 7d3bdd1ed697..aeda9bb8af50 100644
--- a/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -4,149 +4,52 @@
/*
* User space memory access functions
*/
-#include <linux/errno.h>
-#include <linux/thread_info.h>
#include <linux/string.h>
#include <asm/asm.h>
#include <asm/page.h>
-unsigned long __must_check __copy_to_user_ll
- (void __user *to, const void *from, unsigned long n);
-unsigned long __must_check __copy_from_user_ll
- (void *to, const void __user *from, unsigned long n);
-unsigned long __must_check __copy_from_user_ll_nozero
- (void *to, const void __user *from, unsigned long n);
-unsigned long __must_check __copy_from_user_ll_nocache
- (void *to, const void __user *from, unsigned long n);
+unsigned long __must_check __copy_user_ll
+ (void *to, const void *from, unsigned long n);
unsigned long __must_check __copy_from_user_ll_nocache_nozero
(void *to, const void __user *from, unsigned long n);
-/**
- * __copy_to_user_inatomic: - Copy a block of data into user space, with less checking.
- * @to: Destination address, in user space.
- * @from: Source address, in kernel space.
- * @n: Number of bytes to copy.
- *
- * Context: User context only.
- *
- * Copy data from kernel space to user space. Caller must check
- * the specified block with access_ok() before calling this function.
- * The caller should also make sure he pins the user space address
- * so that we don't result in page fault and sleep.
- */
-static __always_inline unsigned long __must_check
-__copy_to_user_inatomic(void __user *to, const void *from, unsigned long n)
-{
- check_object_size(from, n, true);
- return __copy_to_user_ll(to, from, n);
-}
-
-/**
- * __copy_to_user: - Copy a block of data into user space, with less checking.
- * @to: Destination address, in user space.
- * @from: Source address, in kernel space.
- * @n: Number of bytes to copy.
- *
- * Context: User context only. This function may sleep if pagefaults are
- * enabled.
- *
- * Copy data from kernel space to user space. Caller must check
- * the specified block with access_ok() before calling this function.
- *
- * Returns number of bytes that could not be copied.
- * On success, this will be zero.
- */
static __always_inline unsigned long __must_check
-__copy_to_user(void __user *to, const void *from, unsigned long n)
+raw_copy_to_user(void __user *to, const void *from, unsigned long n)
{
- might_fault();
- return __copy_to_user_inatomic(to, from, n);
+ return __copy_user_ll((__force void *)to, from, n);
}
static __always_inline unsigned long
-__copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
-{
- return __copy_from_user_ll_nozero(to, from, n);
-}
-
-/**
- * __copy_from_user: - Copy a block of data from user space, with less checking.
- * @to: Destination address, in kernel space.
- * @from: Source address, in user space.
- * @n: Number of bytes to copy.
- *
- * Context: User context only. This function may sleep if pagefaults are
- * enabled.
- *
- * Copy data from user space to kernel space. Caller must check
- * the specified block with access_ok() before calling this function.
- *
- * Returns number of bytes that could not be copied.
- * On success, this will be zero.
- *
- * If some data could not be copied, this function will pad the copied
- * data to the requested size using zero bytes.
- *
- * An alternate version - __copy_from_user_inatomic() - may be called from
- * atomic context and will fail rather than sleep. In this case the
- * uncopied bytes will *NOT* be padded with zeros. See fs/filemap.h
- * for explanation of why this is needed.
- */
-static __always_inline unsigned long
-__copy_from_user(void *to, const void __user *from, unsigned long n)
-{
- might_fault();
- check_object_size(to, n, false);
- if (__builtin_constant_p(n)) {
- unsigned long ret;
-
- switch (n) {
- case 1:
- __uaccess_begin();
- __get_user_size(*(u8 *)to, from, 1, ret, 1);
- __uaccess_end();
- return ret;
- case 2:
- __uaccess_begin();
- __get_user_size(*(u16 *)to, from, 2, ret, 2);
- __uaccess_end();
- return ret;
- case 4:
- __uaccess_begin();
- __get_user_size(*(u32 *)to, from, 4, ret, 4);
- __uaccess_end();
- return ret;
- }
- }
- return __copy_from_user_ll(to, from, n);
-}
-
-static __always_inline unsigned long __copy_from_user_nocache(void *to,
- const void __user *from, unsigned long n)
+raw_copy_from_user(void *to, const void __user *from, unsigned long n)
{
- might_fault();
if (__builtin_constant_p(n)) {
unsigned long ret;
switch (n) {
case 1:
+ ret = 0;
__uaccess_begin();
- __get_user_size(*(u8 *)to, from, 1, ret, 1);
+ __get_user_asm_nozero(*(u8 *)to, from, ret,
+ "b", "b", "=q", 1);
__uaccess_end();
return ret;
case 2:
+ ret = 0;
__uaccess_begin();
- __get_user_size(*(u16 *)to, from, 2, ret, 2);
+ __get_user_asm_nozero(*(u16 *)to, from, ret,
+ "w", "w", "=r", 2);
__uaccess_end();
return ret;
case 4:
+ ret = 0;
__uaccess_begin();
- __get_user_size(*(u32 *)to, from, 4, ret, 4);
+ __get_user_asm_nozero(*(u32 *)to, from, ret,
+ "l", "k", "=r", 4);
__uaccess_end();
return ret;
}
}
- return __copy_from_user_ll_nocache(to, from, n);
+ return __copy_user_ll(to, (__force const void *)from, n);
}
static __always_inline unsigned long
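
Because the size dispatch is guarded by __builtin_constant_p(), a caller that passes a sizeof() gets a single inlined move with exception fixup rather than a call into __copy_user_ll(). A hypothetical caller (struct and function invented for illustration):

	struct u32_frag { unsigned int word; };

	static int get_frag(struct u32_frag *dst,
			    const struct u32_frag __user *src)
	{
		/* n == 4 is a compile-time constant: one mov + fixup */
		if (raw_copy_from_user(dst, src, sizeof(*dst)))
			return -EFAULT;
		return 0;
	}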
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 673059a109fe..c5504b9a472e 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -5,7 +5,6 @@
* User space memory access functions
*/
#include <linux/compiler.h>
-#include <linux/errno.h>
#include <linux/lockdep.h>
#include <linux/kasan-checks.h>
#include <asm/alternative.h>
@@ -46,58 +45,54 @@ copy_user_generic(void *to, const void *from, unsigned len)
return ret;
}
-__must_check unsigned long
-copy_in_user(void __user *to, const void __user *from, unsigned len);
-
-static __always_inline __must_check
-int __copy_from_user_nocheck(void *dst, const void __user *src, unsigned size)
+static __always_inline __must_check unsigned long
+raw_copy_from_user(void *dst, const void __user *src, unsigned long size)
{
int ret = 0;
- check_object_size(dst, size, false);
if (!__builtin_constant_p(size))
return copy_user_generic(dst, (__force void *)src, size);
switch (size) {
case 1:
__uaccess_begin();
- __get_user_asm(*(u8 *)dst, (u8 __user *)src,
+ __get_user_asm_nozero(*(u8 *)dst, (u8 __user *)src,
ret, "b", "b", "=q", 1);
__uaccess_end();
return ret;
case 2:
__uaccess_begin();
- __get_user_asm(*(u16 *)dst, (u16 __user *)src,
+ __get_user_asm_nozero(*(u16 *)dst, (u16 __user *)src,
ret, "w", "w", "=r", 2);
__uaccess_end();
return ret;
case 4:
__uaccess_begin();
- __get_user_asm(*(u32 *)dst, (u32 __user *)src,
+ __get_user_asm_nozero(*(u32 *)dst, (u32 __user *)src,
ret, "l", "k", "=r", 4);
__uaccess_end();
return ret;
case 8:
__uaccess_begin();
- __get_user_asm(*(u64 *)dst, (u64 __user *)src,
+ __get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src,
ret, "q", "", "=r", 8);
__uaccess_end();
return ret;
case 10:
__uaccess_begin();
- __get_user_asm(*(u64 *)dst, (u64 __user *)src,
+ __get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src,
ret, "q", "", "=r", 10);
if (likely(!ret))
- __get_user_asm(*(u16 *)(8 + (char *)dst),
+ __get_user_asm_nozero(*(u16 *)(8 + (char *)dst),
(u16 __user *)(8 + (char __user *)src),
ret, "w", "w", "=r", 2);
__uaccess_end();
return ret;
case 16:
__uaccess_begin();
- __get_user_asm(*(u64 *)dst, (u64 __user *)src,
+ __get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src,
ret, "q", "", "=r", 16);
if (likely(!ret))
- __get_user_asm(*(u64 *)(8 + (char *)dst),
+ __get_user_asm_nozero(*(u64 *)(8 + (char *)dst),
(u64 __user *)(8 + (char __user *)src),
ret, "q", "", "=r", 8);
__uaccess_end();
@@ -107,20 +102,11 @@ int __copy_from_user_nocheck(void *dst, const void __user *src, unsigned size)
}
}
-static __always_inline __must_check
-int __copy_from_user(void *dst, const void __user *src, unsigned size)
-{
- might_fault();
- kasan_check_write(dst, size);
- return __copy_from_user_nocheck(dst, src, size);
-}
-
-static __always_inline __must_check
-int __copy_to_user_nocheck(void __user *dst, const void *src, unsigned size)
+static __always_inline __must_check unsigned long
+raw_copy_to_user(void __user *dst, const void *src, unsigned long size)
{
int ret = 0;
- check_object_size(src, size, true);
if (!__builtin_constant_p(size))
return copy_user_generic((__force void *)dst, src, size);
switch (size) {
@@ -176,100 +162,16 @@ int __copy_to_user_nocheck(void __user *dst, const void *src, unsigned size)
}
static __always_inline __must_check
-int __copy_to_user(void __user *dst, const void *src, unsigned size)
-{
- might_fault();
- kasan_check_read(src, size);
- return __copy_to_user_nocheck(dst, src, size);
-}
-
-static __always_inline __must_check
-int __copy_in_user(void __user *dst, const void __user *src, unsigned size)
-{
- int ret = 0;
-
- might_fault();
- if (!__builtin_constant_p(size))
- return copy_user_generic((__force void *)dst,
- (__force void *)src, size);
- switch (size) {
- case 1: {
- u8 tmp;
- __uaccess_begin();
- __get_user_asm(tmp, (u8 __user *)src,
- ret, "b", "b", "=q", 1);
- if (likely(!ret))
- __put_user_asm(tmp, (u8 __user *)dst,
- ret, "b", "b", "iq", 1);
- __uaccess_end();
- return ret;
- }
- case 2: {
- u16 tmp;
- __uaccess_begin();
- __get_user_asm(tmp, (u16 __user *)src,
- ret, "w", "w", "=r", 2);
- if (likely(!ret))
- __put_user_asm(tmp, (u16 __user *)dst,
- ret, "w", "w", "ir", 2);
- __uaccess_end();
- return ret;
- }
-
- case 4: {
- u32 tmp;
- __uaccess_begin();
- __get_user_asm(tmp, (u32 __user *)src,
- ret, "l", "k", "=r", 4);
- if (likely(!ret))
- __put_user_asm(tmp, (u32 __user *)dst,
- ret, "l", "k", "ir", 4);
- __uaccess_end();
- return ret;
- }
- case 8: {
- u64 tmp;
- __uaccess_begin();
- __get_user_asm(tmp, (u64 __user *)src,
- ret, "q", "", "=r", 8);
- if (likely(!ret))
- __put_user_asm(tmp, (u64 __user *)dst,
- ret, "q", "", "er", 8);
- __uaccess_end();
- return ret;
- }
- default:
- return copy_user_generic((__force void *)dst,
- (__force void *)src, size);
- }
-}
-
-static __must_check __always_inline int
-__copy_from_user_inatomic(void *dst, const void __user *src, unsigned size)
-{
- kasan_check_write(dst, size);
- return __copy_from_user_nocheck(dst, src, size);
-}
-
-static __must_check __always_inline int
-__copy_to_user_inatomic(void __user *dst, const void *src, unsigned size)
+unsigned long raw_copy_in_user(void __user *dst, const void __user *src, unsigned long size)
{
- kasan_check_read(src, size);
- return __copy_to_user_nocheck(dst, src, size);
+ return copy_user_generic((__force void *)dst,
+ (__force void *)src, size);
}
extern long __copy_user_nocache(void *dst, const void __user *src,
unsigned size, int zerorest);
static inline int
-__copy_from_user_nocache(void *dst, const void __user *src, unsigned size)
-{
- might_fault();
- kasan_check_write(dst, size);
- return __copy_user_nocache(dst, src, size, 1);
-}
-
-static inline int
__copy_from_user_inatomic_nocache(void *dst, const void __user *src,
unsigned size)
{
diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h
index 6fa75b17aec3..9b10dcd51716 100644
--- a/arch/x86/include/asm/unwind.h
+++ b/arch/x86/include/asm/unwind.h
@@ -12,8 +12,10 @@ struct unwind_state {
struct task_struct *task;
int graph_idx;
#ifdef CONFIG_FRAME_POINTER
+ bool got_irq;
unsigned long *bp, *orig_sp;
struct pt_regs *regs;
+ unsigned long ip;
#else
unsigned long *sp;
#endif
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index 57ab86d94d64..7cac79802ad2 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -185,6 +185,15 @@
#define MSG_REGULAR 1
#define MSG_RETRY 2
+#define BAU_DESC_QUALIFIER 0x534749
+
+enum uv_bau_version {
+ UV_BAU_V1 = 1,
+ UV_BAU_V2,
+ UV_BAU_V3,
+ UV_BAU_V4,
+};
+
/*
* Distribution: 32 bytes (256 bits) (bytes 0-0x1f of descriptor)
* If the 'multilevel' flag in the header portion of the descriptor
@@ -222,20 +231,32 @@ struct bau_local_cpumask {
* the s/w ack bit vector ]
*/
-/*
- * The payload is software-defined for INTD transactions
+/**
+ * struct uv1_2_3_bau_msg_payload - defines payload for INTD transactions
+ * @address: Signifies a page or all TLB's of the cpu
+ * @sending_cpu: CPU from which the message originates
+ * @acknowledge_count: CPUs on the destination Hub that received the interrupt
*/
-struct bau_msg_payload {
- unsigned long address; /* signifies a page or all
- TLB's of the cpu */
- /* 64 bits */
- unsigned short sending_cpu; /* filled in by sender */
- /* 16 bits */
- unsigned short acknowledge_count; /* filled in by destination */
- /* 16 bits */
- unsigned int reserved1:32; /* not usable */
+struct uv1_2_3_bau_msg_payload {
+ u64 address;
+ u16 sending_cpu;
+ u16 acknowledge_count;
};
+/**
+ * struct uv4_bau_msg_payload - defines payload for INTD transactions
+ * @address: Signifies a page or all TLB's of the cpu
+ * @sending_cpu: CPU from which the message originates
+ * @acknowledge_count: CPUs on the destination Hub that received the interrupt
+ * @qualifier: Set by source to verify origin of INTD broadcast
+ */
+struct uv4_bau_msg_payload {
+ u64 address;
+ u16 sending_cpu;
+ u16 acknowledge_count;
+ u32 reserved:8;
+ u32 qualifier:24;
+};
/*
* UV1 Message header: 16 bytes (128 bits) (bytes 0x30-0x3f of descriptor)
@@ -385,17 +406,6 @@ struct uv2_3_bau_msg_header {
/* bits 127:120 */
};
-/* Abstracted BAU functions */
-struct bau_operations {
- unsigned long (*read_l_sw_ack)(void);
- unsigned long (*read_g_sw_ack)(int pnode);
- unsigned long (*bau_gpa_to_offset)(unsigned long vaddr);
- void (*write_l_sw_ack)(unsigned long mmr);
- void (*write_g_sw_ack)(int pnode, unsigned long mmr);
- void (*write_payload_first)(int pnode, unsigned long mmr);
- void (*write_payload_last)(int pnode, unsigned long mmr);
-};
-
/*
* The activation descriptor:
* The format of the message to send, plus all accompanying control
@@ -411,7 +421,10 @@ struct bau_desc {
struct uv2_3_bau_msg_header uv2_3_hdr;
} header;
- struct bau_msg_payload payload;
+ union bau_payload_header {
+ struct uv1_2_3_bau_msg_payload uv1_2_3;
+ struct uv4_bau_msg_payload uv4;
+ } payload;
};
/* UV1:
* -payload-- ---------header------
@@ -588,8 +601,12 @@ struct uvhub_desc {
struct socket_desc socket[2];
};
-/*
- * one per-cpu; to locate the software tables
+/**
+ * struct bau_control
+ * @status_mmr: location of status mmr, determined by uvhub_cpu
+ * @status_index: index of ERR|BUSY bits in status mmr, determined by uvhub_cpu
+ *
+ * Per-cpu control struct containing CPU topology information and BAU tuneables.
*/
struct bau_control {
struct bau_desc *descriptor_base;
@@ -607,6 +624,8 @@ struct bau_control {
int timeout_tries;
int ipi_attempts;
int conseccompletes;
+ u64 status_mmr;
+ int status_index;
bool nobau;
short baudisabled;
short cpu;
@@ -644,6 +663,19 @@ struct bau_control {
struct hub_and_pnode *thp;
};
+/* Abstracted BAU functions */
+struct bau_operations {
+ unsigned long (*read_l_sw_ack)(void);
+ unsigned long (*read_g_sw_ack)(int pnode);
+ unsigned long (*bau_gpa_to_offset)(unsigned long vaddr);
+ void (*write_l_sw_ack)(unsigned long mmr);
+ void (*write_g_sw_ack)(int pnode, unsigned long mmr);
+ void (*write_payload_first)(int pnode, unsigned long mmr);
+ void (*write_payload_last)(int pnode, unsigned long mmr);
+ int (*wait_completion)(struct bau_desc*,
+ struct bau_control*, long try);
+};
+
static inline void write_mmr_data_broadcast(int pnode, unsigned long mmr_image)
{
write_gmmr(pnode, UVH_BAU_DATA_BROADCAST, mmr_image);
diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
index 2444189cbe28..bccdf4938ddf 100644
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -20,6 +20,7 @@ struct vdso_image {
long sym_vvar_page;
long sym_hpet_page;
long sym_pvclock_page;
+ long sym_hvclock_page;
long sym_VDSO32_NOTE_MASK;
long sym___kernel_sigreturn;
long sym___kernel_rt_sigreturn;
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 84c00592d359..4b994232cb57 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -27,7 +27,7 @@ KASAN_SANITIZE_stacktrace.o := n
OBJECT_FILES_NON_STANDARD_head_$(BITS).o := y
OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y
-OBJECT_FILES_NON_STANDARD_mcount_$(BITS).o := y
+OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y
OBJECT_FILES_NON_STANDARD_test_nx.o := y
# If instrumentation of this dir is enabled, boot hangs during first second.
@@ -46,7 +46,7 @@ obj-$(CONFIG_MODIFY_LDT_SYSCALL) += ldt.o
obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o
obj-$(CONFIG_IRQ_WORK) += irq_work.o
obj-y += probe_roms.o
-obj-$(CONFIG_X86_64) += sys_x86_64.o mcount_64.o
+obj-$(CONFIG_X86_64) += sys_x86_64.o
obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o
obj-$(CONFIG_SYSFS) += ksysfs.o
obj-y += bootflag.o e820.o
@@ -82,6 +82,7 @@ obj-y += apic/
obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
obj-$(CONFIG_LIVEPATCH) += livepatch.o
+obj-$(CONFIG_FUNCTION_TRACER) += ftrace_$(BITS).o
obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o
obj-$(CONFIG_X86_TSC) += trace_clock.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 294eab07c209..6bb680671088 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1565,12 +1565,6 @@ int __init early_acpi_boot_init(void)
return 0;
}
-static int __init acpi_parse_bgrt(struct acpi_table_header *table)
-{
- efi_bgrt_init(table);
- return 0;
-}
-
int __init acpi_boot_init(void)
{
/* those are executed after early-quirks are executed */
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 0b657bf3d350..2d75faf743f2 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -731,8 +731,10 @@ static int __init calibrate_APIC_clock(void)
TICK_NSEC, lapic_clockevent.shift);
lapic_clockevent.max_delta_ns =
clockevent_delta2ns(0x7FFFFF, &lapic_clockevent);
+ lapic_clockevent.max_delta_ticks = 0x7FFFFF;
lapic_clockevent.min_delta_ns =
clockevent_delta2ns(0xF, &lapic_clockevent);
+ lapic_clockevent.min_delta_ticks = 0xF;
lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
return 0;
}
@@ -778,8 +780,10 @@ static int __init calibrate_APIC_clock(void)
lapic_clockevent.shift);
lapic_clockevent.max_delta_ns =
clockevent_delta2ns(0x7FFFFFFF, &lapic_clockevent);
+ lapic_clockevent.max_delta_ticks = 0x7FFFFFFF;
lapic_clockevent.min_delta_ns =
clockevent_delta2ns(0xF, &lapic_clockevent);
+ lapic_clockevent.min_delta_ticks = 0xF;
lapic_timer_frequency = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS;
@@ -1789,8 +1793,8 @@ void __init init_apic_mappings(void)
apic_phys = mp_lapic_addr;
/*
- * acpi lapic path already maps that address in
- * acpi_register_lapic_address()
+ * If the system has ACPI MADT tables or MP info, the LAPIC
+ * address is already registered.
*/
if (!acpi_lapic && !smp_found_config)
register_lapic_address(apic_phys);
@@ -2237,7 +2241,7 @@ void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v))
static void __init apic_bsp_up_setup(void)
{
#ifdef CONFIG_X86_64
- apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid));
+ apic_write(APIC_ID, apic->set_apic_id(boot_cpu_physical_apicid));
#else
/*
* Hack: In case of kdump, after a crash, kernel might be booting
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
index 5a533fefefa0..5b366462f579 100644
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -32,55 +32,98 @@
#include <asm/intel-family.h>
#include <asm/intel_rdt.h>
+#define MAX_MBA_BW 100u
+#define MBA_IS_LINEAR 0x4
+
/* Mutex to protect rdtgroup access. */
DEFINE_MUTEX(rdtgroup_mutex);
DEFINE_PER_CPU_READ_MOSTLY(int, cpu_closid);
+/*
+ * Used to store the max resource name width and max resource data width
+ * to display the schemata in a tabular format
+ */
+int max_name_width, max_data_width;
+
+static void
+mba_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r);
+static void
+cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r);
+
#define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].domains)
struct rdt_resource rdt_resources_all[] = {
{
- .name = "L3",
- .domains = domain_init(RDT_RESOURCE_L3),
- .msr_base = IA32_L3_CBM_BASE,
- .min_cbm_bits = 1,
- .cache_level = 3,
- .cbm_idx_multi = 1,
- .cbm_idx_offset = 0
+ .name = "L3",
+ .domains = domain_init(RDT_RESOURCE_L3),
+ .msr_base = IA32_L3_CBM_BASE,
+ .msr_update = cat_wrmsr,
+ .cache_level = 3,
+ .cache = {
+ .min_cbm_bits = 1,
+ .cbm_idx_mult = 1,
+ .cbm_idx_offset = 0,
+ },
+ .parse_ctrlval = parse_cbm,
+ .format_str = "%d=%0*x",
+ },
+ {
+ .name = "L3DATA",
+ .domains = domain_init(RDT_RESOURCE_L3DATA),
+ .msr_base = IA32_L3_CBM_BASE,
+ .msr_update = cat_wrmsr,
+ .cache_level = 3,
+ .cache = {
+ .min_cbm_bits = 1,
+ .cbm_idx_mult = 2,
+ .cbm_idx_offset = 0,
+ },
+ .parse_ctrlval = parse_cbm,
+ .format_str = "%d=%0*x",
},
{
- .name = "L3DATA",
- .domains = domain_init(RDT_RESOURCE_L3DATA),
- .msr_base = IA32_L3_CBM_BASE,
- .min_cbm_bits = 1,
- .cache_level = 3,
- .cbm_idx_multi = 2,
- .cbm_idx_offset = 0
+ .name = "L3CODE",
+ .domains = domain_init(RDT_RESOURCE_L3CODE),
+ .msr_base = IA32_L3_CBM_BASE,
+ .msr_update = cat_wrmsr,
+ .cache_level = 3,
+ .cache = {
+ .min_cbm_bits = 1,
+ .cbm_idx_mult = 2,
+ .cbm_idx_offset = 1,
+ },
+ .parse_ctrlval = parse_cbm,
+ .format_str = "%d=%0*x",
},
{
- .name = "L3CODE",
- .domains = domain_init(RDT_RESOURCE_L3CODE),
- .msr_base = IA32_L3_CBM_BASE,
- .min_cbm_bits = 1,
- .cache_level = 3,
- .cbm_idx_multi = 2,
- .cbm_idx_offset = 1
+ .name = "L2",
+ .domains = domain_init(RDT_RESOURCE_L2),
+ .msr_base = IA32_L2_CBM_BASE,
+ .msr_update = cat_wrmsr,
+ .cache_level = 2,
+ .cache = {
+ .min_cbm_bits = 1,
+ .cbm_idx_mult = 1,
+ .cbm_idx_offset = 0,
+ },
+ .parse_ctrlval = parse_cbm,
+ .format_str = "%d=%0*x",
},
{
- .name = "L2",
- .domains = domain_init(RDT_RESOURCE_L2),
- .msr_base = IA32_L2_CBM_BASE,
- .min_cbm_bits = 1,
- .cache_level = 2,
- .cbm_idx_multi = 1,
- .cbm_idx_offset = 0
+ .name = "MB",
+ .domains = domain_init(RDT_RESOURCE_MBA),
+ .msr_base = IA32_MBA_THRTL_BASE,
+ .msr_update = mba_wrmsr,
+ .cache_level = 3,
+ .parse_ctrlval = parse_bw,
+ .format_str = "%d=%*d",
},
};
-static int cbm_idx(struct rdt_resource *r, int closid)
+static unsigned int cbm_idx(struct rdt_resource *r, unsigned int closid)
{
- return closid * r->cbm_idx_multi + r->cbm_idx_offset;
+ return closid * r->cache.cbm_idx_mult + r->cache.cbm_idx_offset;
}
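
Worked example of the index math: with CDP enabled, L3DATA and L3CODE interleave in the same MSR range (cbm_idx_mult = 2, offsets 0 and 1), so closid 0 maps to IA32_L3_CBM_BASE + 0 for data and + 1 for code, closid 1 to + 2 and + 3, and so on. Plain L3 and L2 (mult = 1, offset 0) keep the one-MSR-per-closid layout.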
/*
@@ -118,9 +161,9 @@ static inline bool cache_alloc_hsw_probe(void)
return false;
r->num_closid = 4;
- r->cbm_len = 20;
- r->max_cbm = max_cbm;
- r->min_cbm_bits = 2;
+ r->default_ctrl = max_cbm;
+ r->cache.cbm_len = 20;
+ r->cache.min_cbm_bits = 2;
r->capable = true;
r->enabled = true;
@@ -130,16 +173,66 @@ static inline bool cache_alloc_hsw_probe(void)
return false;
}
-static void rdt_get_config(int idx, struct rdt_resource *r)
+/*
+ * rdt_get_mb_table() - get a mapping between the bandwidth (b/w)
+ * percentage values exposed to the user interface and the h/w
+ * understandable delay values.
+ *
+ * The non-linear delay values have a granularity of a power of two,
+ * and the h/w does not guarantee a curve of configured delay values
+ * vs. the actual b/w enforced.
+ * Hence we need a pre-calibrated mapping so the user can express
+ * the memory b/w as a percentage value.
+ */
+static inline bool rdt_get_mb_table(struct rdt_resource *r)
+{
+ /*
+	 * There are no Intel SKUs as of now that support non-linear delay.
+	 */
+	pr_info("MBA b/w map not implemented for cpu:%d, model:%d\n",
+ boot_cpu_data.x86, boot_cpu_data.x86_model);
+
+ return false;
+}
+
+static bool rdt_get_mem_config(struct rdt_resource *r)
+{
+ union cpuid_0x10_3_eax eax;
+ union cpuid_0x10_x_edx edx;
+ u32 ebx, ecx;
+
+ cpuid_count(0x00000010, 3, &eax.full, &ebx, &ecx, &edx.full);
+ r->num_closid = edx.split.cos_max + 1;
+ r->membw.max_delay = eax.split.max_delay + 1;
+ r->default_ctrl = MAX_MBA_BW;
+ if (ecx & MBA_IS_LINEAR) {
+ r->membw.delay_linear = true;
+ r->membw.min_bw = MAX_MBA_BW - r->membw.max_delay;
+ r->membw.bw_gran = MAX_MBA_BW - r->membw.max_delay;
+ } else {
+ if (!rdt_get_mb_table(r))
+ return false;
+ }
+ r->data_width = 3;
+ rdt_get_mba_infofile(r);
+
+ r->capable = true;
+ r->enabled = true;
+
+ return true;
+}
+
+static void rdt_get_cache_config(int idx, struct rdt_resource *r)
{
union cpuid_0x10_1_eax eax;
- union cpuid_0x10_1_edx edx;
+ union cpuid_0x10_x_edx edx;
u32 ebx, ecx;
cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx, &edx.full);
r->num_closid = edx.split.cos_max + 1;
- r->cbm_len = eax.split.cbm_len + 1;
- r->max_cbm = BIT_MASK(eax.split.cbm_len + 1) - 1;
+ r->cache.cbm_len = eax.split.cbm_len + 1;
+ r->default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1;
+ r->data_width = (r->cache.cbm_len + 3) / 4;
+ rdt_get_cache_infofile(r);
r->capable = true;
r->enabled = true;
}
@@ -150,8 +243,9 @@ static void rdt_get_cdp_l3_config(int type)
struct rdt_resource *r = &rdt_resources_all[type];
r->num_closid = r_l3->num_closid / 2;
- r->cbm_len = r_l3->cbm_len;
- r->max_cbm = r_l3->max_cbm;
+ r->cache.cbm_len = r_l3->cache.cbm_len;
+ r->default_ctrl = r_l3->default_ctrl;
+ r->data_width = (r->cache.cbm_len + 3) / 4;
r->capable = true;
/*
* By default, CDP is disabled. CDP can be enabled by mount parameter
@@ -160,33 +254,6 @@ static void rdt_get_cdp_l3_config(int type)
r->enabled = false;
}
-static inline bool get_rdt_resources(void)
-{
- bool ret = false;
-
- if (cache_alloc_hsw_probe())
- return true;
-
- if (!boot_cpu_has(X86_FEATURE_RDT_A))
- return false;
-
- if (boot_cpu_has(X86_FEATURE_CAT_L3)) {
- rdt_get_config(1, &rdt_resources_all[RDT_RESOURCE_L3]);
- if (boot_cpu_has(X86_FEATURE_CDP_L3)) {
- rdt_get_cdp_l3_config(RDT_RESOURCE_L3DATA);
- rdt_get_cdp_l3_config(RDT_RESOURCE_L3CODE);
- }
- ret = true;
- }
- if (boot_cpu_has(X86_FEATURE_CAT_L2)) {
- /* CPUID 0x10.2 fields are same format at 0x10.1 */
- rdt_get_config(2, &rdt_resources_all[RDT_RESOURCE_L2]);
- ret = true;
- }
-
- return ret;
-}
-
static int get_cache_id(int cpu, int level)
{
struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu);
@@ -200,29 +267,55 @@ static int get_cache_id(int cpu, int level)
return -1;
}
-void rdt_cbm_update(void *arg)
+/*
+ * Map the memory b/w percentage value to delay values
+ * that can be written to QOS_MSRs.
+ * There are currently no SKUs that support non-linear delay values.
+ */
+static u32 delay_bw_map(unsigned long bw, struct rdt_resource *r)
{
- struct msr_param *m = (struct msr_param *)arg;
+ if (r->membw.delay_linear)
+ return MAX_MBA_BW - bw;
+
+	pr_warn_once("Non-linear delay-bw map not supported but queried\n");
+ return r->default_ctrl;
+}
+
+static void
+mba_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r)
+{
+ unsigned int i;
+
+ /* Write the delay values for mba. */
+ for (i = m->low; i < m->high; i++)
+ wrmsrl(r->msr_base + i, delay_bw_map(d->ctrl_val[i], r));
+}
+
+static void
+cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r)
+{
+ unsigned int i;
+
+ for (i = m->low; i < m->high; i++)
+ wrmsrl(r->msr_base + cbm_idx(r, i), d->ctrl_val[i]);
+}
+
+void rdt_ctrl_update(void *arg)
+{
+ struct msr_param *m = arg;
struct rdt_resource *r = m->res;
- int i, cpu = smp_processor_id();
+ int cpu = smp_processor_id();
struct rdt_domain *d;
list_for_each_entry(d, &r->domains, list) {
/* Find the domain that contains this CPU */
- if (cpumask_test_cpu(cpu, &d->cpu_mask))
- goto found;
+ if (cpumask_test_cpu(cpu, &d->cpu_mask)) {
+ r->msr_update(d, m, r);
+ return;
+ }
}
- pr_info_once("cpu %d not found in any domain for resource %s\n",
+ pr_warn_once("cpu %d not found in any domain for resource %s\n",
cpu, r->name);
-
- return;
-
-found:
- for (i = m->low; i < m->high; i++) {
- int idx = cbm_idx(r, i);
-
- wrmsrl(r->msr_base + idx, d->cbm[i]);
- }
}
/*
@@ -258,6 +351,32 @@ static struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
return NULL;
}
+static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d)
+{
+ struct msr_param m;
+ u32 *dc;
+ int i;
+
+ dc = kmalloc_array(r->num_closid, sizeof(*d->ctrl_val), GFP_KERNEL);
+ if (!dc)
+ return -ENOMEM;
+
+ d->ctrl_val = dc;
+
+ /*
+	 * Initialize the Control MSRs so that they exert no control:
+	 * for Cache Allocation, set all bits in the cbm;
+	 * for Memory Allocation, set the requested b/w to 100%.
+ */
+ for (i = 0; i < r->num_closid; i++, dc++)
+ *dc = r->default_ctrl;
+
+ m.low = 0;
+ m.high = r->num_closid;
+ r->msr_update(d, &m, r);
+ return 0;
+}
+
/*
* domain_add_cpu - Add a cpu to a resource's domain list.
*
@@ -273,7 +392,7 @@ static struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
*/
static void domain_add_cpu(int cpu, struct rdt_resource *r)
{
- int i, id = get_cache_id(cpu, r->cache_level);
+ int id = get_cache_id(cpu, r->cache_level);
struct list_head *add_pos = NULL;
struct rdt_domain *d;
@@ -294,22 +413,13 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r)
d->id = id;
- d->cbm = kmalloc_array(r->num_closid, sizeof(*d->cbm), GFP_KERNEL);
- if (!d->cbm) {
+ if (domain_setup_ctrlval(r, d)) {
kfree(d);
return;
}
- for (i = 0; i < r->num_closid; i++) {
- int idx = cbm_idx(r, i);
-
- d->cbm[i] = r->max_cbm;
- wrmsrl(r->msr_base + idx, d->cbm[i]);
- }
-
cpumask_set_cpu(cpu, &d->cpu_mask);
list_add_tail(&d->list, add_pos);
- r->num_domains++;
}
static void domain_remove_cpu(int cpu, struct rdt_resource *r)
@@ -325,8 +435,7 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r)
cpumask_clear_cpu(cpu, &d->cpu_mask);
if (cpumask_empty(&d->cpu_mask)) {
- r->num_domains--;
- kfree(d->cbm);
+ kfree(d->ctrl_val);
list_del(&d->list);
kfree(d);
}
@@ -374,6 +483,57 @@ static int intel_rdt_offline_cpu(unsigned int cpu)
return 0;
}
+/*
+ * Choose a width for the resource name and resource data based on the
+ * resource that has the widest name and cbm.
+ */
+static __init void rdt_init_padding(void)
+{
+ struct rdt_resource *r;
+ int cl;
+
+ for_each_capable_rdt_resource(r) {
+ cl = strlen(r->name);
+ if (cl > max_name_width)
+ max_name_width = cl;
+
+ if (r->data_width > max_data_width)
+ max_data_width = r->data_width;
+ }
+}
+
+static __init bool get_rdt_resources(void)
+{
+ bool ret = false;
+
+ if (cache_alloc_hsw_probe())
+ return true;
+
+ if (!boot_cpu_has(X86_FEATURE_RDT_A))
+ return false;
+
+ if (boot_cpu_has(X86_FEATURE_CAT_L3)) {
+ rdt_get_cache_config(1, &rdt_resources_all[RDT_RESOURCE_L3]);
+ if (boot_cpu_has(X86_FEATURE_CDP_L3)) {
+ rdt_get_cdp_l3_config(RDT_RESOURCE_L3DATA);
+ rdt_get_cdp_l3_config(RDT_RESOURCE_L3CODE);
+ }
+ ret = true;
+ }
+ if (boot_cpu_has(X86_FEATURE_CAT_L2)) {
+		/* CPUID 0x10.2 fields are same format as 0x10.1 */
+ rdt_get_cache_config(2, &rdt_resources_all[RDT_RESOURCE_L2]);
+ ret = true;
+ }
+
+ if (boot_cpu_has(X86_FEATURE_MBA)) {
+ if (rdt_get_mem_config(&rdt_resources_all[RDT_RESOURCE_MBA]))
+ ret = true;
+ }
+
+ return ret;
+}
+
static int __init intel_rdt_late_init(void)
{
struct rdt_resource *r;
@@ -382,6 +542,8 @@ static int __init intel_rdt_late_init(void)
if (!get_rdt_resources())
return -ENODEV;
+ rdt_init_padding();
+
state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
"x86/rdt/cat:online:",
intel_rdt_online_cpu, intel_rdt_offline_cpu);
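
An aside on the linear MBA mapping added above: rdt_get_mem_config() derives both membw.min_bw and membw.bw_gran as MAX_MBA_BW - max_delay, and delay_bw_map() programs a requested percentage as MAX_MBA_BW - bw. A standalone sketch of that arithmetic follows; the max_delay of 90 is a made-up CPUID result and MAX_MBA_BW is assumed to be 100, matching the percentage scale:

#include <stdio.h>

#define MAX_MBA_BW	100u	/* assumed: the 100% scale used above */

/* Mirror of delay_bw_map() for the linear case. */
static unsigned int bw_to_delay(unsigned int bw)
{
	return MAX_MBA_BW - bw;
}

int main(void)
{
	unsigned int max_delay = 90;			/* hypothetical CPUID 0x10.3 result + 1 */
	unsigned int min_bw = MAX_MBA_BW - max_delay;	/* 10% */
	unsigned int gran = MAX_MBA_BW - max_delay;	/* 10% steps */
	unsigned int bw;

	for (bw = min_bw; bw <= MAX_MBA_BW; bw += gran)
		printf("request %3u%% -> THRTL value %2u\n", bw, bw_to_delay(bw));
	return 0;
}

With these numbers a request of 70% writes a throttle value of 30, and only 10% steps are meaningful, which is exactly what bw_validate() in intel_rdt_schemata.c rounds up to.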
diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
index 9ac2a5cdd9c2..f5af0cc7eb0d 100644
--- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
+++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
@@ -174,6 +174,13 @@ static struct kernfs_ops rdtgroup_kf_single_ops = {
.seq_show = rdtgroup_seqfile_show,
};
+static bool is_cpu_list(struct kernfs_open_file *of)
+{
+ struct rftype *rft = of->kn->priv;
+
+ return rft->flags & RFTYPE_FLAGS_CPUS_LIST;
+}
+
static int rdtgroup_cpus_show(struct kernfs_open_file *of,
struct seq_file *s, void *v)
{
@@ -182,10 +189,12 @@ static int rdtgroup_cpus_show(struct kernfs_open_file *of,
rdtgrp = rdtgroup_kn_lock_live(of->kn);
- if (rdtgrp)
- seq_printf(s, "%*pb\n", cpumask_pr_args(&rdtgrp->cpu_mask));
- else
+ if (rdtgrp) {
+ seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n",
+ cpumask_pr_args(&rdtgrp->cpu_mask));
+ } else {
ret = -ENOENT;
+ }
rdtgroup_kn_unlock(of->kn);
return ret;
@@ -252,7 +261,11 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
goto unlock;
}
- ret = cpumask_parse(buf, newmask);
+ if (is_cpu_list(of))
+ ret = cpulist_parse(buf, newmask);
+ else
+ ret = cpumask_parse(buf, newmask);
+
if (ret)
goto unlock;
@@ -473,6 +486,14 @@ static struct rftype rdtgroup_base_files[] = {
.seq_show = rdtgroup_cpus_show,
},
{
+ .name = "cpus_list",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .write = rdtgroup_cpus_write,
+ .seq_show = rdtgroup_cpus_show,
+ .flags = RFTYPE_FLAGS_CPUS_LIST,
+ },
+ {
.name = "tasks",
.mode = 0644,
.kf_ops = &rdtgroup_kf_single_ops,
@@ -494,32 +515,56 @@ static int rdt_num_closids_show(struct kernfs_open_file *of,
struct rdt_resource *r = of->kn->parent->priv;
seq_printf(seq, "%d\n", r->num_closid);
+ return 0;
+}
+
+static int rdt_default_ctrl_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+ seq_printf(seq, "%x\n", r->default_ctrl);
return 0;
}
-static int rdt_cbm_mask_show(struct kernfs_open_file *of,
+static int rdt_min_cbm_bits_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
struct rdt_resource *r = of->kn->parent->priv;
- seq_printf(seq, "%x\n", r->max_cbm);
+ seq_printf(seq, "%u\n", r->cache.min_cbm_bits);
+ return 0;
+}
+
+static int rdt_min_bw_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+ seq_printf(seq, "%u\n", r->membw.min_bw);
return 0;
}
-static int rdt_min_cbm_bits_show(struct kernfs_open_file *of,
+static int rdt_bw_gran_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
struct rdt_resource *r = of->kn->parent->priv;
- seq_printf(seq, "%d\n", r->min_cbm_bits);
+ seq_printf(seq, "%u\n", r->membw.bw_gran);
+ return 0;
+}
+
+static int rdt_delay_linear_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+ seq_printf(seq, "%u\n", r->membw.delay_linear);
return 0;
}
/* rdtgroup information files for one cache resource. */
-static struct rftype res_info_files[] = {
+static struct rftype res_cache_info_files[] = {
{
.name = "num_closids",
.mode = 0444,
@@ -530,7 +575,7 @@ static struct rftype res_info_files[] = {
.name = "cbm_mask",
.mode = 0444,
.kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = rdt_cbm_mask_show,
+ .seq_show = rdt_default_ctrl_show,
},
{
.name = "min_cbm_bits",
@@ -540,11 +585,52 @@ static struct rftype res_info_files[] = {
},
};
+/* rdtgroup information files for memory bandwidth. */
+static struct rftype res_mba_info_files[] = {
+ {
+ .name = "num_closids",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_num_closids_show,
+ },
+ {
+ .name = "min_bandwidth",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_min_bw_show,
+ },
+ {
+ .name = "bandwidth_gran",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_bw_gran_show,
+ },
+ {
+ .name = "delay_linear",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_delay_linear_show,
+ },
+};
+
+void rdt_get_mba_infofile(struct rdt_resource *r)
+{
+ r->info_files = res_mba_info_files;
+ r->nr_info_files = ARRAY_SIZE(res_mba_info_files);
+}
+
+void rdt_get_cache_infofile(struct rdt_resource *r)
+{
+ r->info_files = res_cache_info_files;
+ r->nr_info_files = ARRAY_SIZE(res_cache_info_files);
+}
+
static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
{
struct kernfs_node *kn_subdir;
+ struct rftype *res_info_files;
struct rdt_resource *r;
- int ret;
+ int ret, len;
/* create the directory */
kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL);
@@ -563,8 +649,11 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
ret = rdtgroup_kn_set_ugid(kn_subdir);
if (ret)
goto out_destroy;
- ret = rdtgroup_add_files(kn_subdir, res_info_files,
- ARRAY_SIZE(res_info_files));
+
+ res_info_files = r->info_files;
+ len = r->nr_info_files;
+
+ ret = rdtgroup_add_files(kn_subdir, res_info_files, len);
if (ret)
goto out_destroy;
kernfs_activate(kn_subdir);
@@ -780,7 +869,7 @@ out:
return dentry;
}
-static int reset_all_cbms(struct rdt_resource *r)
+static int reset_all_ctrls(struct rdt_resource *r)
{
struct msr_param msr_param;
cpumask_var_t cpu_mask;
@@ -803,14 +892,14 @@ static int reset_all_cbms(struct rdt_resource *r)
cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
for (i = 0; i < r->num_closid; i++)
- d->cbm[i] = r->max_cbm;
+ d->ctrl_val[i] = r->default_ctrl;
}
cpu = get_cpu();
/* Update CBM on this cpu if it's in cpu_mask. */
if (cpumask_test_cpu(cpu, cpu_mask))
- rdt_cbm_update(&msr_param);
+ rdt_ctrl_update(&msr_param);
/* Update CBM on all other cpus in cpu_mask. */
- smp_call_function_many(cpu_mask, rdt_cbm_update, &msr_param, 1);
+ smp_call_function_many(cpu_mask, rdt_ctrl_update, &msr_param, 1);
put_cpu();
free_cpumask_var(cpu_mask);
@@ -896,7 +985,7 @@ static void rdt_kill_sb(struct super_block *sb)
	/* Put everything back to default values. */
for_each_enabled_rdt_resource(r)
- reset_all_cbms(r);
+ reset_all_ctrls(r);
cdp_disable();
rmdir_all_sub();
static_branch_disable(&rdt_enable_key);
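
The new cpus_list file above differs from cpus only in representation: the kernel's "%*pbl" format prints a cpumask as ranges ("0-3,8") where "%*pb" prints a hex bitmask ("10f"), and cpulist_parse()/cpumask_parse() accept the matching input forms. A userspace sketch of the two renderings of one mask, purely illustrative since the %pb extensions exist only in the kernel's printf:

#include <stdio.h>

/* Print a mask in cpulist form, e.g. 0x10f -> "0-3,8". */
static void print_cpulist(unsigned long mask)
{
	int cpu = 0, first = 1;

	while (mask) {
		if (mask & 1) {
			int start = cpu;

			while ((mask >> 1) & 1) {
				mask >>= 1;
				cpu++;
			}
			printf("%s%d", first ? "" : ",", start);
			if (cpu > start)
				printf("-%d", cpu);
			first = 0;
		}
		mask >>= 1;
		cpu++;
	}
	printf("\n");
}

int main(void)
{
	unsigned long mask = 0x10f;	/* CPUs 0-3 and 8 */

	printf("cpus:      %lx\n", mask);	/* what "%*pb" would show */
	printf("cpus_list: ");
	print_cpulist(mask);			/* what "%*pbl" would show */
	return 0;
}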
diff --git a/arch/x86/kernel/cpu/intel_rdt_schemata.c b/arch/x86/kernel/cpu/intel_rdt_schemata.c
index f369cb8db0d5..406d7a6532f9 100644
--- a/arch/x86/kernel/cpu/intel_rdt_schemata.c
+++ b/arch/x86/kernel/cpu/intel_rdt_schemata.c
@@ -29,26 +29,77 @@
#include <asm/intel_rdt.h>
/*
+ * Check whether the MBA bandwidth percentage value is valid. The value is
+ * checked against the minimum and maximum bandwidth values specified by the
+ * hardware. The allocated bandwidth percentage is rounded up to the next
+ * control step available on the hardware.
+ */
+static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r)
+{
+ unsigned long bw;
+ int ret;
+
+ /*
+	 * Only linear delay values are supported for current Intel SKUs.
+ */
+ if (!r->membw.delay_linear)
+ return false;
+
+ ret = kstrtoul(buf, 10, &bw);
+ if (ret)
+ return false;
+
+ if (bw < r->membw.min_bw || bw > r->default_ctrl)
+ return false;
+
+ *data = roundup(bw, (unsigned long)r->membw.bw_gran);
+ return true;
+}
+
+int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d)
+{
+ unsigned long data;
+
+ if (d->have_new_ctrl)
+ return -EINVAL;
+
+ if (!bw_validate(buf, &data, r))
+ return -EINVAL;
+ d->new_ctrl = data;
+ d->have_new_ctrl = true;
+
+ return 0;
+}
+
+/*
* Check whether a cache bit mask is valid. The SDM says:
* Please note that all (and only) contiguous '1' combinations
* are allowed (e.g. FFFFH, 0FF0H, 003CH, etc.).
* Additionally Haswell requires at least two bits set.
*/
-static bool cbm_validate(unsigned long var, struct rdt_resource *r)
+static bool cbm_validate(char *buf, unsigned long *data, struct rdt_resource *r)
{
- unsigned long first_bit, zero_bit;
+ unsigned long first_bit, zero_bit, val;
+ unsigned int cbm_len = r->cache.cbm_len;
+ int ret;
+
+ ret = kstrtoul(buf, 16, &val);
+ if (ret)
+ return false;
- if (var == 0 || var > r->max_cbm)
+ if (val == 0 || val > r->default_ctrl)
return false;
- first_bit = find_first_bit(&var, r->cbm_len);
- zero_bit = find_next_zero_bit(&var, r->cbm_len, first_bit);
+ first_bit = find_first_bit(&val, cbm_len);
+ zero_bit = find_next_zero_bit(&val, cbm_len, first_bit);
- if (find_next_bit(&var, r->cbm_len, zero_bit) < r->cbm_len)
+ if (find_next_bit(&val, cbm_len, zero_bit) < cbm_len)
return false;
- if ((zero_bit - first_bit) < r->min_cbm_bits)
+ if ((zero_bit - first_bit) < r->cache.min_cbm_bits)
return false;
+
+ *data = val;
return true;
}
@@ -56,17 +107,17 @@ static bool cbm_validate(unsigned long var, struct rdt_resource *r)
* Read one cache bit mask (hex). Check that it is valid for the current
* resource type.
*/
-static int parse_cbm(char *buf, struct rdt_resource *r)
+int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d)
{
unsigned long data;
- int ret;
- ret = kstrtoul(buf, 16, &data);
- if (ret)
- return ret;
- if (!cbm_validate(data, r))
+ if (d->have_new_ctrl)
return -EINVAL;
- r->tmp_cbms[r->num_tmp_cbms++] = data;
+
+	if (!cbm_validate(buf, &data, r))
+ return -EINVAL;
+ d->new_ctrl = data;
+ d->have_new_ctrl = true;
return 0;
}
@@ -74,8 +125,8 @@ static int parse_cbm(char *buf, struct rdt_resource *r)
/*
* For each domain in this resource we expect to find a series of:
* id=mask
- * separated by ";". The "id" is in decimal, and must appear in the
- * right order.
+ * separated by ";". The "id" is in decimal, and must match one of
+ * the "id"s for this resource.
*/
static int parse_line(char *line, struct rdt_resource *r)
{
@@ -83,21 +134,22 @@ static int parse_line(char *line, struct rdt_resource *r)
struct rdt_domain *d;
unsigned long dom_id;
+next:
+ if (!line || line[0] == '\0')
+ return 0;
+ dom = strsep(&line, ";");
+ id = strsep(&dom, "=");
+ if (!dom || kstrtoul(id, 10, &dom_id))
+ return -EINVAL;
+ dom = strim(dom);
list_for_each_entry(d, &r->domains, list) {
- dom = strsep(&line, ";");
- if (!dom)
- return -EINVAL;
- id = strsep(&dom, "=");
- if (kstrtoul(id, 10, &dom_id) || dom_id != d->id)
- return -EINVAL;
- if (parse_cbm(dom, r))
- return -EINVAL;
+ if (d->id == dom_id) {
+ if (r->parse_ctrlval(dom, r, d))
+ return -EINVAL;
+ goto next;
+ }
}
-
- /* Any garbage at the end of the line? */
- if (line && line[0])
- return -EINVAL;
- return 0;
+ return -EINVAL;
}
static int update_domains(struct rdt_resource *r, int closid)
@@ -105,7 +157,7 @@ static int update_domains(struct rdt_resource *r, int closid)
struct msr_param msr_param;
cpumask_var_t cpu_mask;
struct rdt_domain *d;
- int cpu, idx = 0;
+ int cpu;
if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
return -ENOMEM;
@@ -115,30 +167,46 @@ static int update_domains(struct rdt_resource *r, int closid)
msr_param.res = r;
list_for_each_entry(d, &r->domains, list) {
- cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
- d->cbm[msr_param.low] = r->tmp_cbms[idx++];
+ if (d->have_new_ctrl && d->new_ctrl != d->ctrl_val[closid]) {
+ cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
+ d->ctrl_val[closid] = d->new_ctrl;
+ }
}
+ if (cpumask_empty(cpu_mask))
+ goto done;
cpu = get_cpu();
/* Update CBM on this cpu if it's in cpu_mask. */
if (cpumask_test_cpu(cpu, cpu_mask))
- rdt_cbm_update(&msr_param);
+ rdt_ctrl_update(&msr_param);
/* Update CBM on other cpus. */
- smp_call_function_many(cpu_mask, rdt_cbm_update, &msr_param, 1);
+ smp_call_function_many(cpu_mask, rdt_ctrl_update, &msr_param, 1);
put_cpu();
+done:
free_cpumask_var(cpu_mask);
return 0;
}
+static int rdtgroup_parse_resource(char *resname, char *tok, int closid)
+{
+ struct rdt_resource *r;
+
+ for_each_enabled_rdt_resource(r) {
+ if (!strcmp(resname, r->name) && closid < r->num_closid)
+ return parse_line(tok, r);
+ }
+ return -EINVAL;
+}
+
ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct rdtgroup *rdtgrp;
+ struct rdt_domain *dom;
struct rdt_resource *r;
char *tok, *resname;
int closid, ret = 0;
- u32 *l3_cbms = NULL;
/* Valid input requires a trailing newline */
if (nbytes == 0 || buf[nbytes - 1] != '\n')
@@ -153,44 +221,20 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
closid = rdtgrp->closid;
- /* get scratch space to save all the masks while we validate input */
for_each_enabled_rdt_resource(r) {
- r->tmp_cbms = kcalloc(r->num_domains, sizeof(*l3_cbms),
- GFP_KERNEL);
- if (!r->tmp_cbms) {
- ret = -ENOMEM;
- goto out;
- }
- r->num_tmp_cbms = 0;
+ list_for_each_entry(dom, &r->domains, list)
+ dom->have_new_ctrl = false;
}
while ((tok = strsep(&buf, "\n")) != NULL) {
- resname = strsep(&tok, ":");
+ resname = strim(strsep(&tok, ":"));
if (!tok) {
ret = -EINVAL;
goto out;
}
- for_each_enabled_rdt_resource(r) {
- if (!strcmp(resname, r->name) &&
- closid < r->num_closid) {
- ret = parse_line(tok, r);
- if (ret)
- goto out;
- break;
- }
- }
- if (!r->name) {
- ret = -EINVAL;
- goto out;
- }
- }
-
- /* Did the parser find all the masks we need? */
- for_each_enabled_rdt_resource(r) {
- if (r->num_tmp_cbms != r->num_domains) {
- ret = -EINVAL;
+ ret = rdtgroup_parse_resource(resname, tok, closid);
+ if (ret)
goto out;
- }
}
for_each_enabled_rdt_resource(r) {
@@ -201,10 +245,6 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
out:
rdtgroup_kn_unlock(of->kn);
- for_each_enabled_rdt_resource(r) {
- kfree(r->tmp_cbms);
- r->tmp_cbms = NULL;
- }
return ret ?: nbytes;
}
@@ -213,11 +253,12 @@ static void show_doms(struct seq_file *s, struct rdt_resource *r, int closid)
struct rdt_domain *dom;
bool sep = false;
- seq_printf(s, "%s:", r->name);
+ seq_printf(s, "%*s:", max_name_width, r->name);
list_for_each_entry(dom, &r->domains, list) {
if (sep)
seq_puts(s, ";");
- seq_printf(s, "%d=%x", dom->id, dom->cbm[closid]);
+ seq_printf(s, r->format_str, dom->id, max_data_width,
+ dom->ctrl_val[closid]);
sep = true;
}
seq_puts(s, "\n");
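
A sketch of the contiguity rule cbm_validate() enforces above, written with bit tricks instead of find_first_bit()/find_next_zero_bit(): a mask is valid only if its set bits form one contiguous run within cbm_len and the run has at least min_cbm_bits bits. The cbm_len of 20 and min_cbm_bits of 1 below are illustrative; Haswell would use min_cbm_bits of 2:

#include <stdio.h>
#include <stdbool.h>

static bool cbm_is_valid(unsigned long val, unsigned int cbm_len,
			 unsigned int min_cbm_bits)
{
	unsigned long low, span;

	if (val == 0 || val >= (1ul << cbm_len))
		return false;

	low = val & -val;		/* isolate the lowest set bit */
	span = val / low;		/* shift the run down to bit 0 */
	if (span & (span + 1))		/* contiguous iff span == 2^n - 1 */
		return false;

	return (unsigned int)__builtin_popcountl(val) >= min_cbm_bits;
}

int main(void)
{
	unsigned long tests[] = { 0xfffff, 0x0ff0, 0x003c, 0xa0 };
	unsigned int i;

	for (i = 0; i < sizeof(tests) / sizeof(tests[0]); i++)
		printf("%#7lx -> %s\n", tests[i],
		       cbm_is_valid(tests[i], 20, 1) ? "valid" : "invalid");
	return 0;
}

The first three masks are the SDM's own examples of allowed values; 0xa0 has a hole in its run and is rejected, just as the find_next_bit() check past the first zero would reject it.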
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index a3311c886194..43051f0777d4 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -9,3 +9,5 @@ obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o
obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o
obj-$(CONFIG_ACPI_APEI) += mce-apei.o
+
+obj-$(CONFIG_X86_MCELOG_LEGACY) += dev-mcelog.o
diff --git a/arch/x86/kernel/cpu/mcheck/dev-mcelog.c b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c
new file mode 100644
index 000000000000..9c632cb88546
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c
@@ -0,0 +1,397 @@
+/*
+ * /dev/mcelog driver
+ *
+ * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ * Rest from unknown author(s).
+ * 2004 Andi Kleen. Rewrote most of it.
+ * Copyright 2008 Intel Corporation
+ * Author: Andi Kleen
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/miscdevice.h>
+#include <linux/slab.h>
+#include <linux/kmod.h>
+#include <linux/poll.h>
+
+#include "mce-internal.h"
+
+static DEFINE_MUTEX(mce_chrdev_read_mutex);
+
+/* User mode helper program triggered by machine check event */
+static char mce_helper[128];
+static char *mce_helper_argv[2] = { mce_helper, NULL };
+
+#define mce_log_get_idx_check(p) \
+({ \
+ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
+ !lockdep_is_held(&mce_chrdev_read_mutex), \
+ "suspicious mce_log_get_idx_check() usage"); \
+ smp_load_acquire(&(p)); \
+})
+
+/*
+ * Lockless MCE logging infrastructure.
+ * This avoids deadlocks on printk locks without having to break locks. Also
+ * separate MCEs from kernel messages to avoid bogus bug reports.
+ */
+
+static struct mce_log_buffer mcelog = {
+ .signature = MCE_LOG_SIGNATURE,
+ .len = MCE_LOG_LEN,
+ .recordlen = sizeof(struct mce),
+};
+
+static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
+
+static int dev_mce_log(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct mce *mce = (struct mce *)data;
+ unsigned int next, entry;
+
+ wmb();
+ for (;;) {
+ entry = mce_log_get_idx_check(mcelog.next);
+ for (;;) {
+
+ /*
+			 * When the buffer fills up, discard new entries.
+ * Assume that the earlier errors are the more
+ * interesting ones:
+ */
+ if (entry >= MCE_LOG_LEN) {
+ set_bit(MCE_OVERFLOW,
+ (unsigned long *)&mcelog.flags);
+ return NOTIFY_OK;
+ }
+ /* Old left over entry. Skip: */
+ if (mcelog.entry[entry].finished) {
+ entry++;
+ continue;
+ }
+ break;
+ }
+ smp_rmb();
+ next = entry + 1;
+ if (cmpxchg(&mcelog.next, entry, next) == entry)
+ break;
+ }
+ memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
+ wmb();
+ mcelog.entry[entry].finished = 1;
+ wmb();
+
+ /* wake processes polling /dev/mcelog */
+ wake_up_interruptible(&mce_chrdev_wait);
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block dev_mcelog_nb = {
+ .notifier_call = dev_mce_log,
+ .priority = MCE_PRIO_MCELOG,
+};
+
+static void mce_do_trigger(struct work_struct *work)
+{
+ call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
+}
+
+static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
+
+
+void mce_work_trigger(void)
+{
+ if (mce_helper[0])
+ schedule_work(&mce_trigger_work);
+}
+
+static ssize_t
+show_trigger(struct device *s, struct device_attribute *attr, char *buf)
+{
+ strcpy(buf, mce_helper);
+ strcat(buf, "\n");
+ return strlen(mce_helper) + 1;
+}
+
+static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
+ const char *buf, size_t siz)
+{
+ char *p;
+
+ strncpy(mce_helper, buf, sizeof(mce_helper));
+ mce_helper[sizeof(mce_helper)-1] = 0;
+ p = strchr(mce_helper, '\n');
+
+ if (p)
+ *p = 0;
+
+ return strlen(mce_helper) + !!p;
+}
+
+DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
+
+/*
+ * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
+ */
+
+static DEFINE_SPINLOCK(mce_chrdev_state_lock);
+static int mce_chrdev_open_count; /* #times opened */
+static int mce_chrdev_open_exclu; /* already open exclusive? */
+
+static int mce_chrdev_open(struct inode *inode, struct file *file)
+{
+ spin_lock(&mce_chrdev_state_lock);
+
+ if (mce_chrdev_open_exclu ||
+ (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
+ spin_unlock(&mce_chrdev_state_lock);
+
+ return -EBUSY;
+ }
+
+ if (file->f_flags & O_EXCL)
+ mce_chrdev_open_exclu = 1;
+ mce_chrdev_open_count++;
+
+ spin_unlock(&mce_chrdev_state_lock);
+
+ return nonseekable_open(inode, file);
+}
+
+static int mce_chrdev_release(struct inode *inode, struct file *file)
+{
+ spin_lock(&mce_chrdev_state_lock);
+
+ mce_chrdev_open_count--;
+ mce_chrdev_open_exclu = 0;
+
+ spin_unlock(&mce_chrdev_state_lock);
+
+ return 0;
+}
+
+static void collect_tscs(void *data)
+{
+ unsigned long *cpu_tsc = (unsigned long *)data;
+
+ cpu_tsc[smp_processor_id()] = rdtsc();
+}
+
+static int mce_apei_read_done;
+
+/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
+static int __mce_read_apei(char __user **ubuf, size_t usize)
+{
+ int rc;
+ u64 record_id;
+ struct mce m;
+
+ if (usize < sizeof(struct mce))
+ return -EINVAL;
+
+ rc = apei_read_mce(&m, &record_id);
+ /* Error or no more MCE record */
+ if (rc <= 0) {
+ mce_apei_read_done = 1;
+ /*
+ * When ERST is disabled, mce_chrdev_read() should return
+ * "no record" instead of "no device."
+ */
+ if (rc == -ENODEV)
+ return 0;
+ return rc;
+ }
+ rc = -EFAULT;
+ if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
+ return rc;
+ /*
+	 * In fact, we should have cleared the record after it has
+	 * been flushed to disk or sent to the network by
+ * /sbin/mcelog, but we have no interface to support that now,
+ * so just clear it to avoid duplication.
+ */
+ rc = apei_clear_mce(record_id);
+ if (rc) {
+ mce_apei_read_done = 1;
+ return rc;
+ }
+ *ubuf += sizeof(struct mce);
+
+ return 0;
+}
+
+static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
+ size_t usize, loff_t *off)
+{
+ char __user *buf = ubuf;
+ unsigned long *cpu_tsc;
+ unsigned prev, next;
+ int i, err;
+
+ cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
+ if (!cpu_tsc)
+ return -ENOMEM;
+
+ mutex_lock(&mce_chrdev_read_mutex);
+
+ if (!mce_apei_read_done) {
+ err = __mce_read_apei(&buf, usize);
+ if (err || buf != ubuf)
+ goto out;
+ }
+
+ next = mce_log_get_idx_check(mcelog.next);
+
+ /* Only supports full reads right now */
+ err = -EINVAL;
+ if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
+ goto out;
+
+ err = 0;
+ prev = 0;
+ do {
+ for (i = prev; i < next; i++) {
+ unsigned long start = jiffies;
+ struct mce *m = &mcelog.entry[i];
+
+ while (!m->finished) {
+ if (time_after_eq(jiffies, start + 2)) {
+ memset(m, 0, sizeof(*m));
+ goto timeout;
+ }
+ cpu_relax();
+ }
+ smp_rmb();
+ err |= copy_to_user(buf, m, sizeof(*m));
+ buf += sizeof(*m);
+timeout:
+ ;
+ }
+
+ memset(mcelog.entry + prev, 0,
+ (next - prev) * sizeof(struct mce));
+ prev = next;
+ next = cmpxchg(&mcelog.next, prev, 0);
+ } while (next != prev);
+
+ synchronize_sched();
+
+ /*
+ * Collect entries that were still getting written before the
+ * synchronize.
+ */
+ on_each_cpu(collect_tscs, cpu_tsc, 1);
+
+ for (i = next; i < MCE_LOG_LEN; i++) {
+ struct mce *m = &mcelog.entry[i];
+
+ if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
+ err |= copy_to_user(buf, m, sizeof(*m));
+ smp_rmb();
+ buf += sizeof(*m);
+ memset(m, 0, sizeof(*m));
+ }
+ }
+
+ if (err)
+ err = -EFAULT;
+
+out:
+ mutex_unlock(&mce_chrdev_read_mutex);
+ kfree(cpu_tsc);
+
+ return err ? err : buf - ubuf;
+}
+
+static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
+{
+ poll_wait(file, &mce_chrdev_wait, wait);
+ if (READ_ONCE(mcelog.next))
+ return POLLIN | POLLRDNORM;
+ if (!mce_apei_read_done && apei_check_mce())
+ return POLLIN | POLLRDNORM;
+ return 0;
+}
+
+static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
+ unsigned long arg)
+{
+ int __user *p = (int __user *)arg;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case MCE_GET_RECORD_LEN:
+ return put_user(sizeof(struct mce), p);
+ case MCE_GET_LOG_LEN:
+ return put_user(MCE_LOG_LEN, p);
+ case MCE_GETCLEAR_FLAGS: {
+ unsigned flags;
+
+ do {
+ flags = mcelog.flags;
+ } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
+
+ return put_user(flags, p);
+ }
+ default:
+ return -ENOTTY;
+ }
+}
+
+static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf,
+ size_t usize, loff_t *off);
+
+void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
+ const char __user *ubuf,
+ size_t usize, loff_t *off))
+{
+ mce_write = fn;
+}
+EXPORT_SYMBOL_GPL(register_mce_write_callback);
+
+static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
+ size_t usize, loff_t *off)
+{
+ if (mce_write)
+ return mce_write(filp, ubuf, usize, off);
+ else
+ return -EINVAL;
+}
+
+static const struct file_operations mce_chrdev_ops = {
+ .open = mce_chrdev_open,
+ .release = mce_chrdev_release,
+ .read = mce_chrdev_read,
+ .write = mce_chrdev_write,
+ .poll = mce_chrdev_poll,
+ .unlocked_ioctl = mce_chrdev_ioctl,
+ .llseek = no_llseek,
+};
+
+static struct miscdevice mce_chrdev_device = {
+ MISC_MCELOG_MINOR,
+ "mcelog",
+ &mce_chrdev_ops,
+};
+
+static __init int dev_mcelog_init_device(void)
+{
+ int err;
+
+ /* register character device /dev/mcelog */
+ err = misc_register(&mce_chrdev_device);
+ if (err) {
+ pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);
+ return err;
+ }
+ mce_register_decode_chain(&dev_mcelog_nb);
+ return 0;
+}
+device_initcall_sync(dev_mcelog_init_device);
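
The core of dev_mce_log() above is a lock-free slot reservation: read mcelog.next, pick an entry, and claim it with cmpxchg(), retrying if another writer got there first. A minimal userspace sketch of that pattern using GCC's atomic builtins in place of the kernel's cmpxchg(); the LEN of 32 is arbitrary, and the real code's skipping of stale "finished" entries and its overflow flag are omitted:

#include <stdio.h>

#define LEN 32

static unsigned int next_slot;

/* Returns a claimed slot index, or -1 when the buffer is full. */
static int reserve_slot(void)
{
	unsigned int entry;

	for (;;) {
		entry = __atomic_load_n(&next_slot, __ATOMIC_ACQUIRE);
		if (entry >= LEN)
			return -1;	/* full: discard new entries */
		if (__atomic_compare_exchange_n(&next_slot, &entry, entry + 1,
						false, __ATOMIC_ACQ_REL,
						__ATOMIC_ACQUIRE))
			return entry;	/* slot is exclusively ours to fill */
	}
}

int main(void)
{
	printf("first slot:  %d\n", reserve_slot());	/* 0 */
	printf("second slot: %d\n", reserve_slot());	/* 1 */
	return 0;
}

Once a writer owns a slot it fills the record and only then sets "finished", which is why mce_chrdev_read() spins briefly on entries whose "finished" flag is still clear.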
diff --git a/arch/x86/kernel/cpu/mcheck/mce-genpool.c b/arch/x86/kernel/cpu/mcheck/mce-genpool.c
index 1e5a50c11d3c..217cd4449bc9 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-genpool.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-genpool.c
@@ -85,7 +85,7 @@ void mce_gen_pool_process(struct work_struct *__unused)
head = llist_reverse_order(head);
llist_for_each_entry_safe(node, tmp, head, llnode) {
mce = &node->mce;
- atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
+ blocking_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
gen_pool_free(mce_evt_pool, (unsigned long)node, sizeof(*node));
}
}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 903043e6a62b..654ad0668d72 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -13,7 +13,7 @@ enum severity_level {
MCE_PANIC_SEVERITY,
};
-extern struct atomic_notifier_head x86_mce_decoder_chain;
+extern struct blocking_notifier_head x86_mce_decoder_chain;
#define ATTR_LEN 16
#define INITIAL_CHECK_INTERVAL 5 * 60 /* 5 minutes */
@@ -96,3 +96,11 @@ static inline bool mce_cmp(struct mce *m1, struct mce *m2)
m1->addr != m2->addr ||
m1->misc != m2->misc;
}
+
+extern struct device_attribute dev_attr_trigger;
+
+#ifdef CONFIG_X86_MCELOG_LEGACY
+extern void mce_work_trigger(void);
+#else
+static inline void mce_work_trigger(void) { }
+#endif
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 8e9725c607ea..5abd4bf73d6e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -35,6 +35,7 @@
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/cpu.h>
+#include <linux/ras.h>
#include <linux/smp.h>
#include <linux/fs.h>
#include <linux/mm.h>
@@ -49,18 +50,11 @@
#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/msr.h>
+#include <asm/reboot.h>
#include "mce-internal.h"
-static DEFINE_MUTEX(mce_chrdev_read_mutex);
-
-#define mce_log_get_idx_check(p) \
-({ \
- RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
- !lockdep_is_held(&mce_chrdev_read_mutex), \
- "suspicious mce_log_get_idx_check() usage"); \
- smp_load_acquire(&(p)); \
-})
+static DEFINE_MUTEX(mce_log_mutex);
#define CREATE_TRACE_POINTS
#include <trace/events/mce.h>
@@ -85,15 +79,9 @@ struct mca_config mca_cfg __read_mostly = {
.monarch_timeout = -1
};
-/* User mode helper program triggered by machine check event */
-static unsigned long mce_need_notify;
-static char mce_helper[128];
-static char *mce_helper_argv[2] = { mce_helper, NULL };
-
-static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
-
static DEFINE_PER_CPU(struct mce, mces_seen);
-static int cpu_missing;
+static unsigned long mce_need_notify;
+static int cpu_missing;
/*
* MCA banks polled by the period polling timer for corrected events.
@@ -121,7 +109,7 @@ static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
* CPU/chipset specific EDAC code can register a notifier call here to print
* MCE errors in a human-readable form.
*/
-ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
+BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);
/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
@@ -143,82 +131,38 @@ void mce_setup(struct mce *m)
DEFINE_PER_CPU(struct mce, injectm);
EXPORT_PER_CPU_SYMBOL_GPL(injectm);
-/*
- * Lockless MCE logging infrastructure.
- * This avoids deadlocks on printk locks without having to break locks. Also
- * separate MCEs from kernel messages to avoid bogus bug reports.
- */
-
-static struct mce_log mcelog = {
- .signature = MCE_LOG_SIGNATURE,
- .len = MCE_LOG_LEN,
- .recordlen = sizeof(struct mce),
-};
-
-void mce_log(struct mce *mce)
+void mce_log(struct mce *m)
{
- unsigned next, entry;
-
- /* Emit the trace record: */
- trace_mce_record(mce);
-
- if (!mce_gen_pool_add(mce))
+ if (!mce_gen_pool_add(m))
irq_work_queue(&mce_irq_work);
-
- wmb();
- for (;;) {
- entry = mce_log_get_idx_check(mcelog.next);
- for (;;) {
-
- /*
- * When the buffer fills up discard new entries.
- * Assume that the earlier errors are the more
- * interesting ones:
- */
- if (entry >= MCE_LOG_LEN) {
- set_bit(MCE_OVERFLOW,
- (unsigned long *)&mcelog.flags);
- return;
- }
- /* Old left over entry. Skip: */
- if (mcelog.entry[entry].finished) {
- entry++;
- continue;
- }
- break;
- }
- smp_rmb();
- next = entry + 1;
- if (cmpxchg(&mcelog.next, entry, next) == entry)
- break;
- }
- memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
- wmb();
- mcelog.entry[entry].finished = 1;
- wmb();
-
- set_bit(0, &mce_need_notify);
}
void mce_inject_log(struct mce *m)
{
- mutex_lock(&mce_chrdev_read_mutex);
+ mutex_lock(&mce_log_mutex);
mce_log(m);
- mutex_unlock(&mce_chrdev_read_mutex);
+ mutex_unlock(&mce_log_mutex);
}
EXPORT_SYMBOL_GPL(mce_inject_log);
static struct notifier_block mce_srao_nb;
+/*
+ * We run the default notifier if we have only the SRAO, the first and the
+ * default notifiers registered, i.e. only the mandatory NUM_DEFAULT_NOTIFIERS
+ * notifiers are registered on the chain.
+ */
+#define NUM_DEFAULT_NOTIFIERS 3
static atomic_t num_notifiers;
void mce_register_decode_chain(struct notifier_block *nb)
{
- atomic_inc(&num_notifiers);
+ if (WARN_ON(nb->priority > MCE_PRIO_MCELOG && nb->priority < MCE_PRIO_EDAC))
+ return;
- WARN_ON(nb->priority > MCE_PRIO_LOWEST && nb->priority < MCE_PRIO_EDAC);
+ atomic_inc(&num_notifiers);
- atomic_notifier_chain_register(&x86_mce_decoder_chain, nb);
+ blocking_notifier_chain_register(&x86_mce_decoder_chain, nb);
}
EXPORT_SYMBOL_GPL(mce_register_decode_chain);
@@ -226,7 +170,7 @@ void mce_unregister_decode_chain(struct notifier_block *nb)
{
atomic_dec(&num_notifiers);
- atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
+ blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
}
EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
@@ -319,18 +263,7 @@ static void __print_mce(struct mce *m)
static void print_mce(struct mce *m)
{
- int ret = 0;
-
__print_mce(m);
-
- /*
- * Print out human-readable details about the MCE error,
- * (if the CPU has an implementation for that)
- */
- ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
- if (ret == NOTIFY_STOP)
- return;
-
pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
}
@@ -519,7 +452,6 @@ static void mce_schedule_work(void)
static void mce_irq_work_cb(struct irq_work *entry)
{
- mce_notify_irq();
mce_schedule_work();
}
@@ -548,20 +480,97 @@ static void mce_report_event(struct pt_regs *regs)
*/
static int mce_usable_address(struct mce *m)
{
- if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
+ if (!(m->status & MCI_STATUS_ADDRV))
return 0;
/* Checks after this one are Intel-specific: */
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
return 1;
+ if (!(m->status & MCI_STATUS_MISCV))
+ return 0;
+
if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
return 0;
+
if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
return 0;
+
return 1;
}
+static bool memory_error(struct mce *m)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+
+ if (c->x86_vendor == X86_VENDOR_AMD) {
+ /* ErrCodeExt[20:16] */
+ u8 xec = (m->status >> 16) & 0x1f;
+
+ return (xec == 0x0 || xec == 0x8);
+ } else if (c->x86_vendor == X86_VENDOR_INTEL) {
+ /*
+ * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
+ *
+ * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
+ * indicating a memory error. Bit 8 is used for indicating a
+ * cache hierarchy error. The combination of bit 2 and bit 3
+		 * is used for indicating a `generic' cache hierarchy error.
+		 * But we can't just blindly check the above bits, because if
+		 * bit 11 is set, then it is a bus/interconnect error - and
+		 * either way the above bits just give more detail on what
+ * bus/interconnect error happened. Note that bit 12 can be
+ * ignored, as it's the "filter" bit.
+ */
+ return (m->status & 0xef80) == BIT(7) ||
+ (m->status & 0xef00) == BIT(8) ||
+ (m->status & 0xeffc) == 0xc;
+ }
+
+ return false;
+}
+
+static bool cec_add_mce(struct mce *m)
+{
+ if (!m)
+ return false;
+
+ /* We eat only correctable DRAM errors with usable addresses. */
+ if (memory_error(m) &&
+ !(m->status & MCI_STATUS_UC) &&
+ mce_usable_address(m))
+ if (!cec_add_elem(m->addr >> PAGE_SHIFT))
+ return true;
+
+ return false;
+}
+
+static int mce_first_notifier(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct mce *m = (struct mce *)data;
+
+ if (!m)
+ return NOTIFY_DONE;
+
+ if (cec_add_mce(m))
+ return NOTIFY_STOP;
+
+ /* Emit the trace record: */
+ trace_mce_record(m);
+
+ set_bit(0, &mce_need_notify);
+
+ mce_notify_irq();
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block first_nb = {
+ .notifier_call = mce_first_notifier,
+ .priority = MCE_PRIO_FIRST,
+};
+
static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
void *data)
{
@@ -591,11 +600,7 @@ static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
if (!m)
return NOTIFY_DONE;
- /*
- * Run the default notifier if we have only the SRAO
- * notifier and us registered.
- */
- if (atomic_read(&num_notifiers) > 2)
+ if (atomic_read(&num_notifiers) > NUM_DEFAULT_NOTIFIERS)
return NOTIFY_DONE;
__print_mce(m);
@@ -648,37 +653,6 @@ static void mce_read_aux(struct mce *m, int i)
}
}
-static bool memory_error(struct mce *m)
-{
- struct cpuinfo_x86 *c = &boot_cpu_data;
-
- if (c->x86_vendor == X86_VENDOR_AMD) {
- /* ErrCodeExt[20:16] */
- u8 xec = (m->status >> 16) & 0x1f;
-
- return (xec == 0x0 || xec == 0x8);
- } else if (c->x86_vendor == X86_VENDOR_INTEL) {
- /*
- * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
- *
- * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
- * indicating a memory error. Bit 8 is used for indicating a
- * cache hierarchy error. The combination of bit 2 and bit 3
- * is used for indicating a `generic' cache hierarchy error
- * But we can't just blindly check the above bits, because if
- * bit 11 is set, then it is a bus/interconnect error - and
- * either way the above bits just gives more detail on what
- * bus/interconnect error happened. Note that bit 12 can be
- * ignored, as it's the "filter" bit.
- */
- return (m->status & 0xef80) == BIT(7) ||
- (m->status & 0xef00) == BIT(8) ||
- (m->status & 0xeffc) == 0xc;
- }
-
- return false;
-}
-
DEFINE_PER_CPU(unsigned, mce_poll_count);
/*
@@ -1127,9 +1101,22 @@ void do_machine_check(struct pt_regs *regs, long error_code)
* on Intel.
*/
int lmce = 1;
+ int cpu = smp_processor_id();
- /* If this CPU is offline, just bail out. */
- if (cpu_is_offline(smp_processor_id())) {
+ /*
+ * Cases where we avoid rendezvous handler timeout:
+ * 1) If this CPU is offline.
+ *
+ * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
+ * skip those CPUs which remain looping in the 1st kernel - see
+ * crash_nmi_callback().
+ *
+ * Note: there still is a small window between kexec-ing and the new,
+ * kdump kernel establishing a new #MC handler where a broadcasted MCE
+ * might not get handled properly.
+ */
+ if (cpu_is_offline(cpu) ||
+ (crashing_cpu != -1 && crashing_cpu != cpu)) {
u64 mcgstatus;
mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
@@ -1399,13 +1386,6 @@ static void mce_timer_delete_all(void)
del_timer_sync(&per_cpu(mce_timer, cpu));
}
-static void mce_do_trigger(struct work_struct *work)
-{
- call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
-}
-
-static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
-
/*
* Notify the user(s) about new machine check events.
* Can be called from interrupt context, but not from machine check/NMI
@@ -1417,11 +1397,7 @@ int mce_notify_irq(void)
static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
if (test_and_clear_bit(0, &mce_need_notify)) {
- /* wake processes polling /dev/mcelog */
- wake_up_interruptible(&mce_chrdev_wait);
-
- if (mce_helper[0])
- schedule_work(&mce_trigger_work);
+ mce_work_trigger();
if (__ratelimit(&ratelimit))
pr_info(HW_ERR "Machine check events logged\n");
@@ -1688,30 +1664,35 @@ static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
return 0;
}
-static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
+/*
+ * Init basic CPU features needed for early decoding of MCEs.
+ */
+static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c)
{
- switch (c->x86_vendor) {
- case X86_VENDOR_INTEL:
- mce_intel_feature_init(c);
- mce_adjust_timer = cmci_intel_adjust_timer;
- break;
-
- case X86_VENDOR_AMD: {
+ if (c->x86_vendor == X86_VENDOR_AMD) {
mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV);
mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR);
mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA);
- /*
- * Install proper ops for Scalable MCA enabled processors
- */
if (mce_flags.smca) {
msr_ops.ctl = smca_ctl_reg;
msr_ops.status = smca_status_reg;
msr_ops.addr = smca_addr_reg;
msr_ops.misc = smca_misc_reg;
}
- mce_amd_feature_init(c);
+ }
+}
+
+static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
+{
+ switch (c->x86_vendor) {
+ case X86_VENDOR_INTEL:
+ mce_intel_feature_init(c);
+ mce_adjust_timer = cmci_intel_adjust_timer;
+ break;
+ case X86_VENDOR_AMD: {
+ mce_amd_feature_init(c);
break;
}
@@ -1798,6 +1779,7 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c)
machine_check_vector = do_machine_check;
+ __mcheck_cpu_init_early(c);
__mcheck_cpu_init_generic();
__mcheck_cpu_init_vendor(c);
__mcheck_cpu_init_clear_banks();
@@ -1823,252 +1805,6 @@ void mcheck_cpu_clear(struct cpuinfo_x86 *c)
}
-/*
- * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
- */
-
-static DEFINE_SPINLOCK(mce_chrdev_state_lock);
-static int mce_chrdev_open_count; /* #times opened */
-static int mce_chrdev_open_exclu; /* already open exclusive? */
-
-static int mce_chrdev_open(struct inode *inode, struct file *file)
-{
- spin_lock(&mce_chrdev_state_lock);
-
- if (mce_chrdev_open_exclu ||
- (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
- spin_unlock(&mce_chrdev_state_lock);
-
- return -EBUSY;
- }
-
- if (file->f_flags & O_EXCL)
- mce_chrdev_open_exclu = 1;
- mce_chrdev_open_count++;
-
- spin_unlock(&mce_chrdev_state_lock);
-
- return nonseekable_open(inode, file);
-}
-
-static int mce_chrdev_release(struct inode *inode, struct file *file)
-{
- spin_lock(&mce_chrdev_state_lock);
-
- mce_chrdev_open_count--;
- mce_chrdev_open_exclu = 0;
-
- spin_unlock(&mce_chrdev_state_lock);
-
- return 0;
-}
-
-static void collect_tscs(void *data)
-{
- unsigned long *cpu_tsc = (unsigned long *)data;
-
- cpu_tsc[smp_processor_id()] = rdtsc();
-}
-
-static int mce_apei_read_done;
-
-/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
-static int __mce_read_apei(char __user **ubuf, size_t usize)
-{
- int rc;
- u64 record_id;
- struct mce m;
-
- if (usize < sizeof(struct mce))
- return -EINVAL;
-
- rc = apei_read_mce(&m, &record_id);
- /* Error or no more MCE record */
- if (rc <= 0) {
- mce_apei_read_done = 1;
- /*
- * When ERST is disabled, mce_chrdev_read() should return
- * "no record" instead of "no device."
- */
- if (rc == -ENODEV)
- return 0;
- return rc;
- }
- rc = -EFAULT;
- if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
- return rc;
- /*
- * In fact, we should have cleared the record after that has
- * been flushed to the disk or sent to network in
- * /sbin/mcelog, but we have no interface to support that now,
- * so just clear it to avoid duplication.
- */
- rc = apei_clear_mce(record_id);
- if (rc) {
- mce_apei_read_done = 1;
- return rc;
- }
- *ubuf += sizeof(struct mce);
-
- return 0;
-}
-
-static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
- size_t usize, loff_t *off)
-{
- char __user *buf = ubuf;
- unsigned long *cpu_tsc;
- unsigned prev, next;
- int i, err;
-
- cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
- if (!cpu_tsc)
- return -ENOMEM;
-
- mutex_lock(&mce_chrdev_read_mutex);
-
- if (!mce_apei_read_done) {
- err = __mce_read_apei(&buf, usize);
- if (err || buf != ubuf)
- goto out;
- }
-
- next = mce_log_get_idx_check(mcelog.next);
-
- /* Only supports full reads right now */
- err = -EINVAL;
- if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
- goto out;
-
- err = 0;
- prev = 0;
- do {
- for (i = prev; i < next; i++) {
- unsigned long start = jiffies;
- struct mce *m = &mcelog.entry[i];
-
- while (!m->finished) {
- if (time_after_eq(jiffies, start + 2)) {
- memset(m, 0, sizeof(*m));
- goto timeout;
- }
- cpu_relax();
- }
- smp_rmb();
- err |= copy_to_user(buf, m, sizeof(*m));
- buf += sizeof(*m);
-timeout:
- ;
- }
-
- memset(mcelog.entry + prev, 0,
- (next - prev) * sizeof(struct mce));
- prev = next;
- next = cmpxchg(&mcelog.next, prev, 0);
- } while (next != prev);
-
- synchronize_sched();
-
- /*
- * Collect entries that were still getting written before the
- * synchronize.
- */
- on_each_cpu(collect_tscs, cpu_tsc, 1);
-
- for (i = next; i < MCE_LOG_LEN; i++) {
- struct mce *m = &mcelog.entry[i];
-
- if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
- err |= copy_to_user(buf, m, sizeof(*m));
- smp_rmb();
- buf += sizeof(*m);
- memset(m, 0, sizeof(*m));
- }
- }
-
- if (err)
- err = -EFAULT;
-
-out:
- mutex_unlock(&mce_chrdev_read_mutex);
- kfree(cpu_tsc);
-
- return err ? err : buf - ubuf;
-}
-
-static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
-{
- poll_wait(file, &mce_chrdev_wait, wait);
- if (READ_ONCE(mcelog.next))
- return POLLIN | POLLRDNORM;
- if (!mce_apei_read_done && apei_check_mce())
- return POLLIN | POLLRDNORM;
- return 0;
-}
-
-static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
- unsigned long arg)
-{
- int __user *p = (int __user *)arg;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- switch (cmd) {
- case MCE_GET_RECORD_LEN:
- return put_user(sizeof(struct mce), p);
- case MCE_GET_LOG_LEN:
- return put_user(MCE_LOG_LEN, p);
- case MCE_GETCLEAR_FLAGS: {
- unsigned flags;
-
- do {
- flags = mcelog.flags;
- } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
-
- return put_user(flags, p);
- }
- default:
- return -ENOTTY;
- }
-}
-
-static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf,
- size_t usize, loff_t *off);
-
-void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
- const char __user *ubuf,
- size_t usize, loff_t *off))
-{
- mce_write = fn;
-}
-EXPORT_SYMBOL_GPL(register_mce_write_callback);
-
-static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
- size_t usize, loff_t *off)
-{
- if (mce_write)
- return mce_write(filp, ubuf, usize, off);
- else
- return -EINVAL;
-}
-
-static const struct file_operations mce_chrdev_ops = {
- .open = mce_chrdev_open,
- .release = mce_chrdev_release,
- .read = mce_chrdev_read,
- .write = mce_chrdev_write,
- .poll = mce_chrdev_poll,
- .unlocked_ioctl = mce_chrdev_ioctl,
- .llseek = no_llseek,
-};
-
-static struct miscdevice mce_chrdev_device = {
- MISC_MCELOG_MINOR,
- "mcelog",
- &mce_chrdev_ops,
-};
-
static void __mce_disable_bank(void *arg)
{
int bank = *((int *)arg);
@@ -2142,6 +1878,7 @@ __setup("mce", mcheck_enable);
int __init mcheck_init(void)
{
mcheck_intel_therm_init();
+ mce_register_decode_chain(&first_nb);
mce_register_decode_chain(&mce_srao_nb);
mce_register_decode_chain(&mce_default_nb);
mcheck_vendor_init_severity();
@@ -2286,29 +2023,6 @@ static ssize_t set_bank(struct device *s, struct device_attribute *attr,
return size;
}
-static ssize_t
-show_trigger(struct device *s, struct device_attribute *attr, char *buf)
-{
- strcpy(buf, mce_helper);
- strcat(buf, "\n");
- return strlen(mce_helper) + 1;
-}
-
-static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
- const char *buf, size_t siz)
-{
- char *p;
-
- strncpy(mce_helper, buf, sizeof(mce_helper));
- mce_helper[sizeof(mce_helper)-1] = 0;
- p = strchr(mce_helper, '\n');
-
- if (p)
- *p = 0;
-
- return strlen(mce_helper) + !!p;
-}
-
static ssize_t set_ignore_ce(struct device *s,
struct device_attribute *attr,
const char *buf, size_t size)
@@ -2365,7 +2079,6 @@ static ssize_t store_int_with_restart(struct device *s,
return ret;
}
-static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
@@ -2388,7 +2101,9 @@ static struct dev_ext_attribute dev_attr_cmci_disabled = {
static struct device_attribute *mce_device_attrs[] = {
&dev_attr_tolerant.attr,
&dev_attr_check_interval.attr,
+#ifdef CONFIG_X86_MCELOG_LEGACY
&dev_attr_trigger,
+#endif
&dev_attr_monarch_timeout.attr,
&dev_attr_dont_log_ce.attr,
&dev_attr_ignore_ce.attr,
@@ -2562,7 +2277,6 @@ static __init void mce_init_banks(void)
static __init int mcheck_init_device(void)
{
- enum cpuhp_state hp_online;
int err;
if (!mce_available(&boot_cpu_data)) {
@@ -2590,21 +2304,11 @@ static __init int mcheck_init_device(void)
mce_cpu_online, mce_cpu_pre_down);
if (err < 0)
goto err_out_online;
- hp_online = err;
register_syscore_ops(&mce_syscore_ops);
- /* register character device /dev/mcelog */
- err = misc_register(&mce_chrdev_device);
- if (err)
- goto err_register;
-
return 0;
-err_register:
- unregister_syscore_ops(&mce_syscore_ops);
- cpuhp_remove_state(hp_online);
-
err_out_online:
cpuhp_remove_state(CPUHP_X86_MCE_DEAD);
@@ -2612,7 +2316,7 @@ err_out_mem:
free_cpumask_var(mce_device_initialized);
err_out:
- pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);
+ pr_err("Unable to init MCE device (rc: %d)\n", err);
return err;
}
@@ -2691,6 +2395,7 @@ static int __init mcheck_late_init(void)
static_branch_inc(&mcsafe_key);
mcheck_debugfs_init();
+ cec_init();
/*
* Flush out everything that has been logged during early boot, now that
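
A compact illustration of the Intel branch of memory_error(), moved earlier in the file so the new first-in-chain notifier can use it: all three masks include bit 11 but exclude the bit-12 filter, so a set bus/interconnect bit 11 disqualifies every match while the filter bit is ignored. This is a standalone sketch with invented status values:

#include <stdio.h>
#include <stdbool.h>

/* Same predicates as the Intel case in memory_error(). */
static bool intel_memory_error(unsigned long long status)
{
	return (status & 0xef80) == (1u << 7) ||	/* memory error */
	       (status & 0xef00) == (1u << 8) ||	/* cache hierarchy */
	       (status & 0xeffc) == 0xc;		/* generic cache */
}

int main(void)
{
	/* MCACOD 0x009f: bit 7 set, bus bit 11 clear -> memory error */
	printf("0x009f -> %d\n", intel_memory_error(0x009full));
	/* MCACOD 0x0889: bit 11 set -> bus/interconnect, not memory */
	printf("0x0889 -> %d\n", intel_memory_error(0x0889ull));
	return 0;
}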
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 190b3e6cef4d..e84db79ef272 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -481,6 +481,9 @@ static void intel_ppin_init(struct cpuinfo_x86 *c)
case INTEL_FAM6_BROADWELL_XEON_D:
case INTEL_FAM6_BROADWELL_X:
case INTEL_FAM6_SKYLAKE_X:
+ case INTEL_FAM6_XEON_PHI_KNL:
+ case INTEL_FAM6_XEON_PHI_KNM:
+
if (rdmsrl_safe(MSR_PPIN_CTL, &val))
return;
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index d9794060fe22..23c23508c012 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -27,6 +27,7 @@ static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 },
{ X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 },
{ X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 },
+ { X86_FEATURE_MBA, CPUID_EBX, 3, 0x00000010, 0 },
{ X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 },
{ X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 },
{ X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 },
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 09d4ac0d2661..dbce3cca94cb 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -77,7 +77,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
* - softirq stack
* - hardirq stack
*/
- for (regs = NULL; stack; stack = stack_info.next_sp) {
+ for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
const char *stack_name;
/*
@@ -289,9 +289,6 @@ void die(const char *str, struct pt_regs *regs, long err)
unsigned long flags = oops_begin();
int sig = SIGSEGV;
- if (!user_mode(regs))
- report_bug(regs->ip, regs);
-
if (__die(str, regs, err))
sig = 0;
oops_end(flags, regs, sig);
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index b0b3a3df7c20..e5f0b40e66d2 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -162,15 +162,3 @@ void show_regs(struct pt_regs *regs)
}
pr_cont("\n");
}
-
-int is_valid_bugaddr(unsigned long ip)
-{
- unsigned short ud2;
-
- if (ip < PAGE_OFFSET)
- return 0;
- if (probe_kernel_address((unsigned short *)ip, ud2))
- return 0;
-
- return ud2 == 0x0b0f;
-}
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index a8b117e93b46..3e1471d57487 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -178,13 +178,3 @@ void show_regs(struct pt_regs *regs)
}
pr_cont("\n");
}
-
-int is_valid_bugaddr(unsigned long ip)
-{
- unsigned short ud2;
-
- if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2)))
- return 0;
-
- return ud2 == 0x0b0f;
-}
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 8a121991e5ba..0f0840304452 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -17,6 +17,7 @@
#include <asm/intel-mid.h>
#include <asm/pgtable.h>
#include <linux/usb/ehci_def.h>
+#include <linux/usb/xhci-dbgp.h>
#include <linux/efi.h>
#include <asm/efi.h>
#include <asm/pci_x86.h>
@@ -381,6 +382,10 @@ static int __init setup_early_printk(char *buf)
if (!strncmp(buf, "efi", 3))
early_console_register(&early_efi_console, keep);
#endif
+#ifdef CONFIG_EARLY_PRINTK_USB_XDBC
+ if (!strncmp(buf, "xdbc", 4))
+ early_xdbc_parse_parameter(buf + 4);
+#endif
buf++;
}
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index cbd73eb42170..5b7153540727 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -29,12 +29,6 @@
#include <asm/ftrace.h>
#include <asm/nops.h>
-#if defined(CONFIG_FUNCTION_GRAPH_TRACER) && \
- !defined(CC_USING_FENTRY) && \
- !defined(CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE)
-# error The following combination is not supported: ((compiler missing -mfentry) || (CONFIG_X86_32 and !CONFIG_DYNAMIC_FTRACE)) && CONFIG_FUNCTION_GRAPH_TRACER && CONFIG_CC_OPTIMIZE_FOR_SIZE
-#endif
-
#ifdef CONFIG_DYNAMIC_FTRACE
int ftrace_arch_code_modify_prepare(void)
@@ -989,6 +983,18 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent,
unsigned long return_hooker = (unsigned long)
&return_to_handler;
+ /*
+ * When resuming from suspend-to-ram, this function can be indirectly
+ * called from early CPU startup code while the CPU is in real mode,
+ * which would fail miserably. Make sure the stack pointer is a
+ * virtual address.
+ *
+ * This check isn't as accurate as virt_addr_valid(), but it should be
+ * good enough for this purpose, and it's fast.
+ */
+ if (unlikely((long)__builtin_frame_address(0) >= 0))
+ return;
+
if (unlikely(ftrace_graph_is_dead()))
return;
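
The __builtin_frame_address(0) test above relies on the x86-64 address-space
split: every kernel virtual address lies in the upper canonical half, so its
sign bit is set when viewed as a signed long. The same test in isolation
(hypothetical helper name, shown as a sketch):

	#include <stdbool.h>

	/* Non-negative means we are not on a virtual kernel stack yet. */
	static bool on_kernel_stack(unsigned long frame_addr)
	{
		return (long)frame_addr < 0;
	}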
diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S
new file mode 100644
index 000000000000..722a145b4139
--- /dev/null
+++ b/arch/x86/kernel/ftrace_32.S
@@ -0,0 +1,244 @@
+/*
+ * Copyright (C) 2017 Steven Rostedt, VMware Inc.
+ */
+
+#include <linux/linkage.h>
+#include <asm/page_types.h>
+#include <asm/segment.h>
+#include <asm/export.h>
+#include <asm/ftrace.h>
+
+#ifdef CC_USING_FENTRY
+# define function_hook __fentry__
+EXPORT_SYMBOL(__fentry__)
+#else
+# define function_hook mcount
+EXPORT_SYMBOL(mcount)
+#endif
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+/* mcount uses a frame pointer even if CONFIG_FRAME_POINTER is not set */
+#if !defined(CC_USING_FENTRY) || defined(CONFIG_FRAME_POINTER)
+# define USING_FRAME_POINTER
+#endif
+
+#ifdef USING_FRAME_POINTER
+# define MCOUNT_FRAME 1 /* using frame = true */
+#else
+# define MCOUNT_FRAME 0 /* using frame = false */
+#endif
+
+ENTRY(function_hook)
+ ret
+END(function_hook)
+
+ENTRY(ftrace_caller)
+
+#ifdef USING_FRAME_POINTER
+# ifdef CC_USING_FENTRY
+ /*
+ * A stack frame consists of the return ip followed by the saved bp.
+ * Since fentry is an immediate jump, we are left with
+ * parent-ip, function-ip. We need to add a frame with
+ * parent-ip followed by ebp.
+ */
+ pushl 4(%esp) /* parent ip */
+ pushl %ebp
+ movl %esp, %ebp
+ pushl 2*4(%esp) /* function ip */
+# endif
+ /* For mcount, the function ip is directly above */
+ pushl %ebp
+ movl %esp, %ebp
+#endif
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+ pushl $0 /* Pass NULL as regs pointer */
+
+#ifdef USING_FRAME_POINTER
+ /* Load parent ebp into edx */
+ movl 4*4(%esp), %edx
+#else
+ /* There's no frame pointer, load the appropriate stack addr instead */
+ lea 4*4(%esp), %edx
+#endif
+
+ movl (MCOUNT_FRAME+4)*4(%esp), %eax /* load the rip */
+ /* Get the parent ip */
+ movl 4(%edx), %edx /* edx has ebp */
+
+ movl function_trace_op, %ecx
+ subl $MCOUNT_INSN_SIZE, %eax
+
+.globl ftrace_call
+ftrace_call:
+ call ftrace_stub
+
+ addl $4, %esp /* skip NULL pointer */
+ popl %edx
+ popl %ecx
+ popl %eax
+#ifdef USING_FRAME_POINTER
+ popl %ebp
+# ifdef CC_USING_FENTRY
+ addl $4,%esp /* skip function ip */
+ popl %ebp /* this is the orig bp */
+ addl $4, %esp /* skip parent ip */
+# endif
+#endif
+.Lftrace_ret:
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+.globl ftrace_graph_call
+ftrace_graph_call:
+ jmp ftrace_stub
+#endif
+
+/* This is weak to keep gas from relaxing the jumps */
+WEAK(ftrace_stub)
+ ret
+END(ftrace_caller)
+
+ENTRY(ftrace_regs_caller)
+ /*
+ * i386 does not save SS and ESP when coming from kernel.
+ * Instead, to get sp, &regs->sp is used (see ptrace.h).
+ * Unfortunately, that means eflags must be at the same location
+ * as the current return ip is. We move the return ip into the
+ * regs->ip location, and move flags into the return ip location.
+ */
+ pushl $__KERNEL_CS
+ pushl 4(%esp) /* Save the return ip */
+ pushl $0 /* Load 0 into orig_ax */
+ pushl %gs
+ pushl %fs
+ pushl %es
+ pushl %ds
+ pushl %eax
+
+ /* Get flags and place them into the return ip slot */
+ pushf
+ popl %eax
+ movl %eax, 8*4(%esp)
+
+ pushl %ebp
+ pushl %edi
+ pushl %esi
+ pushl %edx
+ pushl %ecx
+ pushl %ebx
+
+ movl 12*4(%esp), %eax /* Load ip (1st parameter) */
+ subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */
+#ifdef CC_USING_FENTRY
+ movl 15*4(%esp), %edx /* Load parent ip (2nd parameter) */
+#else
+ movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */
+#endif
+ movl function_trace_op, %ecx /* Save ftrace_ops in 3rd parameter */
+ pushl %esp /* Save pt_regs as 4th parameter */
+
+GLOBAL(ftrace_regs_call)
+ call ftrace_stub
+
+ addl $4, %esp /* Skip pt_regs */
+
+ /* restore flags */
+ push 14*4(%esp)
+ popf
+
+ /* Move return ip back to its original location */
+ movl 12*4(%esp), %eax
+ movl %eax, 14*4(%esp)
+
+ popl %ebx
+ popl %ecx
+ popl %edx
+ popl %esi
+ popl %edi
+ popl %ebp
+ popl %eax
+ popl %ds
+ popl %es
+ popl %fs
+ popl %gs
+
+ /* use lea to not affect flags */
+ lea 3*4(%esp), %esp /* Skip orig_ax, ip and cs */
+
+ jmp .Lftrace_ret
+#else /* ! CONFIG_DYNAMIC_FTRACE */
+
+ENTRY(function_hook)
+ cmpl $__PAGE_OFFSET, %esp
+ jb ftrace_stub /* Paging not enabled yet? */
+
+ cmpl $ftrace_stub, ftrace_trace_function
+ jnz .Ltrace
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ cmpl $ftrace_stub, ftrace_graph_return
+ jnz ftrace_graph_caller
+
+ cmpl $ftrace_graph_entry_stub, ftrace_graph_entry
+ jnz ftrace_graph_caller
+#endif
+.globl ftrace_stub
+ftrace_stub:
+ ret
+
+ /* taken from glibc */
+.Ltrace:
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+ movl 0xc(%esp), %eax
+ movl 0x4(%ebp), %edx
+ subl $MCOUNT_INSN_SIZE, %eax
+
+ call *ftrace_trace_function
+
+ popl %edx
+ popl %ecx
+ popl %eax
+ jmp ftrace_stub
+END(function_hook)
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ENTRY(ftrace_graph_caller)
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+ movl 3*4(%esp), %eax
+ /* Even with frame pointers, fentry doesn't have one here */
+#ifdef CC_USING_FENTRY
+ lea 4*4(%esp), %edx
+ movl $0, %ecx
+#else
+ lea 0x4(%ebp), %edx
+ movl (%ebp), %ecx
+#endif
+ subl $MCOUNT_INSN_SIZE, %eax
+ call prepare_ftrace_return
+ popl %edx
+ popl %ecx
+ popl %eax
+ ret
+END(ftrace_graph_caller)
+
+.globl return_to_handler
+return_to_handler:
+ pushl %eax
+ pushl %edx
+#ifdef CC_USING_FENTRY
+ movl $0, %eax
+#else
+ movl %ebp, %eax
+#endif
+ call ftrace_return_to_handler
+ movl %eax, %ecx
+ popl %edx
+ popl %eax
+ jmp *%ecx
+#endif
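
The pt_regs juggling in ftrace_regs_caller above exists so that callbacks
registered with FTRACE_OPS_FL_SAVE_REGS receive a usable register snapshot.
A consumer-side sketch, assuming the 4.x four-argument callback signature
(my_tracer/my_ops are illustrative names, not part of this patch):

	#include <linux/ftrace.h>
	#include <linux/printk.h>
	#include <linux/ptrace.h>

	static void my_tracer(unsigned long ip, unsigned long parent_ip,
			      struct ftrace_ops *ops, struct pt_regs *regs)
	{
		if (regs)	/* populated by ftrace_regs_caller */
			pr_debug("traced %ps, ip=%lx\n", (void *)ip, regs->ip);
	}

	static struct ftrace_ops my_ops = {
		.func	= my_tracer,
		.flags	= FTRACE_OPS_FL_SAVE_REGS,
	};

	/* register_ftrace_function(&my_ops) arms it; unregister to disarm. */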
diff --git a/arch/x86/kernel/mcount_64.S b/arch/x86/kernel/ftrace_64.S
index 7b0d3da52fb4..1dfac634bbf7 100644
--- a/arch/x86/kernel/mcount_64.S
+++ b/arch/x86/kernel/ftrace_64.S
@@ -1,6 +1,4 @@
/*
- * linux/arch/x86_64/mcount_64.S
- *
* Copyright (C) 2014 Steven Rostedt, Red Hat Inc
*/
@@ -13,9 +11,6 @@
.code64
.section .entry.text, "ax"
-
-#ifdef CONFIG_FUNCTION_TRACER
-
#ifdef CC_USING_FENTRY
# define function_hook __fentry__
EXPORT_SYMBOL(__fentry__)
@@ -297,7 +292,6 @@ trace:
jmp fgraph_trace
END(function_hook)
#endif /* CONFIG_DYNAMIC_FTRACE */
-#endif /* CONFIG_FUNCTION_TRACER */
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
ENTRY(ftrace_graph_caller)
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 4d8183b5f113..f34fe7444836 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -394,6 +394,9 @@ int check_irq_vectors_for_cpu_disable(void)
!cpumask_subset(&affinity_new, &online_new))
this_count++;
}
+ /* No need to check any further. */
+ if (!this_count)
+ return 0;
count = 0;
for_each_online_cpu(cpu) {
@@ -411,8 +414,10 @@ int check_irq_vectors_for_cpu_disable(void)
for (vector = FIRST_EXTERNAL_VECTOR;
vector < first_system_vector; vector++) {
if (!test_bit(vector, used_vectors) &&
- IS_ERR_OR_NULL(per_cpu(vector_irq, cpu)[vector]))
- count++;
+ IS_ERR_OR_NULL(per_cpu(vector_irq, cpu)[vector])) {
+ if (++count == this_count)
+ return 0;
+ }
}
}
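
The two hunks above turn a full tally into an early exit: the scan stops as
soon as enough free vectors have been found. The pattern in isolation
(hypothetical helper, plain C):

	/* Return 1 as soon as 'needed' free slots have been seen. */
	static int have_enough_slots(const int *is_free, int n, int needed)
	{
		int count = 0, i;

		if (!needed)
			return 1;
		for (i = 0; i < n; i++)
			if (is_free[i] && ++count == needed)
				return 1;
		return 0;
	}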
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 1423ab1b0312..7468c6987547 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -195,7 +195,5 @@ void __init native_init_IRQ(void)
if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs())
setup_irq(2, &irq2);
-#ifdef CONFIG_X86_32
irq_ctx_init(smp_processor_id());
-#endif
}
diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h
index d688826e5736..db2182d63ed0 100644
--- a/arch/x86/kernel/kprobes/common.h
+++ b/arch/x86/kernel/kprobes/common.h
@@ -67,7 +67,7 @@
#endif
/* Check whether the instruction can be boosted */
-extern int can_boost(kprobe_opcode_t *instruction, void *addr);
+extern int can_boost(struct insn *insn, void *orig_addr);
/* Recover instruction if given address is probed */
extern unsigned long recover_probed_instruction(kprobe_opcode_t *buf,
unsigned long addr);
@@ -75,7 +75,7 @@ extern unsigned long recover_probed_instruction(kprobe_opcode_t *buf,
* Copy an instruction and adjust the displacement if the instruction
* uses the %rip-relative addressing mode.
*/
-extern int __copy_instruction(u8 *dest, u8 *src);
+extern int __copy_instruction(u8 *dest, u8 *src, struct insn *insn);
/* Generate a relative-jump/call instruction */
extern void synthesize_reljump(void *from, void *to);
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 993fa4fe4f68..19e1f2a6d7b0 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -164,42 +164,38 @@ static kprobe_opcode_t *skip_prefixes(kprobe_opcode_t *insn)
NOKPROBE_SYMBOL(skip_prefixes);
/*
- * Returns non-zero if opcode is boostable.
+ * Returns non-zero if INSN is boostable.
* RIP-relative instructions are adjusted at copy time in 64-bit mode.
*/
-int can_boost(kprobe_opcode_t *opcodes, void *addr)
+int can_boost(struct insn *insn, void *addr)
{
kprobe_opcode_t opcode;
- kprobe_opcode_t *orig_opcodes = opcodes;
if (search_exception_tables((unsigned long)addr))
return 0; /* Page fault may occur on this address. */
-retry:
- if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
- return 0;
- opcode = *(opcodes++);
-
/* 2nd-byte opcode */
- if (opcode == 0x0f) {
- if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
- return 0;
- return test_bit(*opcodes,
+ if (insn->opcode.nbytes == 2)
+ return test_bit(insn->opcode.bytes[1],
(unsigned long *)twobyte_is_boostable);
- }
+
+ if (insn->opcode.nbytes != 1)
+ return 0;
+
+ /* Can't boost Address-size override prefix */
+ if (unlikely(inat_is_address_size_prefix(insn->attr)))
+ return 0;
+
+ opcode = insn->opcode.bytes[0];
switch (opcode & 0xf0) {
-#ifdef CONFIG_X86_64
- case 0x40:
- goto retry; /* REX prefix is boostable */
-#endif
case 0x60:
- if (0x63 < opcode && opcode < 0x67)
- goto retry; /* prefixes */
- /* can't boost Address-size override and bound */
- return (opcode != 0x62 && opcode != 0x67);
+ /* can't boost "bound" */
+ return (opcode != 0x62);
case 0x70:
return 0; /* can't boost conditional jump */
+ case 0x90:
+ return opcode != 0x9a; /* can't boost call far */
case 0xc0:
/* can't boost software interrupts */
return (0xc1 < opcode && opcode < 0xcc) || opcode == 0xcf;
@@ -210,14 +206,9 @@ retry:
/* can boost in/out and absolute jmps */
return ((opcode & 0x04) || opcode == 0xea);
case 0xf0:
- if ((opcode & 0x0c) == 0 && opcode != 0xf1)
- goto retry; /* lock/rep(ne) prefix */
/* clear and set flags are boostable */
return (opcode == 0xf5 || (0xf7 < opcode && opcode < 0xfe));
default:
- /* segment override prefixes are boostable */
- if (opcode == 0x26 || opcode == 0x36 || opcode == 0x3e)
- goto retry; /* prefixes */
/* CS override prefix and call are not boostable */
return (opcode != 0x2e && opcode != 0x9a);
}
@@ -264,7 +255,10 @@ __recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr)
* Fortunately, we know that the original code is the ideal 5-byte
* long NOP.
*/
- memcpy(buf, (void *)addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
+ if (probe_kernel_read(buf, (void *)addr,
+ MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
+ return 0UL;
+
if (faddr)
memcpy(buf, ideal_nops[NOP_ATOMIC5], 5);
else
@@ -276,7 +270,7 @@ __recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr)
* Recover the probed instruction at addr for further analysis.
* Caller must lock kprobes by kprobe_mutex, or disable preemption
* to prevent referenced kprobes from being released.
- * Returns zero if the instruction can not get recovered.
+ * Returns zero if the instruction cannot be recovered (or the access failed).
*/
unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
{
@@ -348,37 +342,36 @@ static int is_IF_modifier(kprobe_opcode_t *insn)
}
/*
- * Copy an instruction and adjust the displacement if the instruction
- * uses the %rip-relative addressing mode.
- * If it does, Return the address of the 32-bit displacement word.
- * If not, return null.
- * Only applicable to 64-bit x86.
+ * Copy an instruction, recovering it if it has been modified by kprobes,
+ * and adjust the displacement if the instruction uses the %rip-relative
+ * addressing mode.
+ * This returns the length of copied instruction, or 0 if it has an error.
*/
-int __copy_instruction(u8 *dest, u8 *src)
+int __copy_instruction(u8 *dest, u8 *src, struct insn *insn)
{
- struct insn insn;
kprobe_opcode_t buf[MAX_INSN_SIZE];
- int length;
unsigned long recovered_insn =
recover_probed_instruction(buf, (unsigned long)src);
- if (!recovered_insn)
+ if (!recovered_insn || !insn)
+ return 0;
+
+ /* This can access kernel text if the given address was not recovered */
+ if (probe_kernel_read(dest, (void *)recovered_insn, MAX_INSN_SIZE))
return 0;
- kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE);
- insn_get_length(&insn);
- length = insn.length;
+
+ kernel_insn_init(insn, dest, MAX_INSN_SIZE);
+ insn_get_length(insn);
/* Another subsystem puts a breakpoint, failed to recover */
- if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
+ if (insn->opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
return 0;
- memcpy(dest, insn.kaddr, length);
#ifdef CONFIG_X86_64
- if (insn_rip_relative(&insn)) {
+ /* Only x86_64 has RIP relative instructions */
+ if (insn_rip_relative(insn)) {
s64 newdisp;
u8 *disp;
- kernel_insn_init(&insn, dest, length);
- insn_get_displacement(&insn);
/*
* The copied instruction uses the %rip-relative addressing
* mode. Adjust the displacement for the difference between
@@ -391,36 +384,57 @@ int __copy_instruction(u8 *dest, u8 *src)
* extension of the original signed 32-bit displacement would
* have given.
*/
- newdisp = (u8 *) src + (s64) insn.displacement.value - (u8 *) dest;
+ newdisp = (u8 *) src + (s64) insn->displacement.value
+ - (u8 *) dest;
if ((s64) (s32) newdisp != newdisp) {
pr_err("Kprobes error: new displacement does not fit into s32 (%llx)\n", newdisp);
- pr_err("\tSrc: %p, Dest: %p, old disp: %x\n", src, dest, insn.displacement.value);
+ pr_err("\tSrc: %p, Dest: %p, old disp: %x\n",
+ src, dest, insn->displacement.value);
return 0;
}
- disp = (u8 *) dest + insn_offset_displacement(&insn);
+ disp = (u8 *) dest + insn_offset_displacement(insn);
*(s32 *) disp = (s32) newdisp;
}
#endif
- return length;
+ return insn->length;
+}
+
+/* Prepare a relative jump right after the instruction to be boosted */
+static void prepare_boost(struct kprobe *p, struct insn *insn)
+{
+ if (can_boost(insn, p->addr) &&
+ MAX_INSN_SIZE - insn->length >= RELATIVEJUMP_SIZE) {
+ /*
+ * The copied instruction can be executed directly if it
+ * jumps back to the correct address.
+ */
+ synthesize_reljump(p->ainsn.insn + insn->length,
+ p->addr + insn->length);
+ p->ainsn.boostable = true;
+ } else {
+ p->ainsn.boostable = false;
+ }
}
static int arch_copy_kprobe(struct kprobe *p)
{
- int ret;
+ struct insn insn;
+ int len;
+
+ set_memory_rw((unsigned long)p->ainsn.insn & PAGE_MASK, 1);
/* Copy the instruction, recovering it if another kprobe modified it. */
- ret = __copy_instruction(p->ainsn.insn, p->addr);
- if (!ret)
+ len = __copy_instruction(p->ainsn.insn, p->addr, &insn);
+ if (!len)
return -EINVAL;
/*
* __copy_instruction can modify the displacement of the instruction,
* but it doesn't affect boostable check.
*/
- if (can_boost(p->ainsn.insn, p->addr))
- p->ainsn.boostable = 0;
- else
- p->ainsn.boostable = -1;
+ prepare_boost(p, &insn);
+
+ set_memory_ro((unsigned long)p->ainsn.insn & PAGE_MASK, 1);
/* Check whether the instruction modifies Interrupt Flag or not */
p->ainsn.if_modifier = is_IF_modifier(p->ainsn.insn);
@@ -459,7 +473,7 @@ void arch_disarm_kprobe(struct kprobe *p)
void arch_remove_kprobe(struct kprobe *p)
{
if (p->ainsn.insn) {
- free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1));
+ free_insn_slot(p->ainsn.insn, p->ainsn.boostable);
p->ainsn.insn = NULL;
}
}
@@ -531,7 +545,7 @@ static void setup_singlestep(struct kprobe *p, struct pt_regs *regs,
return;
#if !defined(CONFIG_PREEMPT)
- if (p->ainsn.boostable == 1 && !p->post_handler) {
+ if (p->ainsn.boostable && !p->post_handler) {
/* Boost up -- we can execute copied instructions directly */
if (!reenter)
reset_current_kprobe();
@@ -851,7 +865,7 @@ static void resume_execution(struct kprobe *p, struct pt_regs *regs,
case 0xcf:
case 0xea: /* jmp absolute -- ip is correct */
/* ip is already adjusted, no more changes required */
- p->ainsn.boostable = 1;
+ p->ainsn.boostable = true;
goto no_change;
case 0xe8: /* call relative - Fix return addr */
*tos = orig_ip + (*tos - copy_ip);
@@ -876,28 +890,13 @@ static void resume_execution(struct kprobe *p, struct pt_regs *regs,
* jmp near and far, absolute indirect
* ip is correct. And this is boostable
*/
- p->ainsn.boostable = 1;
+ p->ainsn.boostable = true;
goto no_change;
}
default:
break;
}
- if (p->ainsn.boostable == 0) {
- if ((regs->ip > copy_ip) &&
- (regs->ip - copy_ip) + 5 < MAX_INSN_SIZE) {
- /*
- * These instructions can be executed directly if it
- * jumps back to correct address.
- */
- synthesize_reljump((void *)regs->ip,
- (void *)orig_ip + (regs->ip - copy_ip));
- p->ainsn.boostable = 1;
- } else {
- p->ainsn.boostable = -1;
- }
- }
-
regs->ip += orig_ip - copy_ip;
no_change:
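
can_boost() and __copy_instruction() now lean on the in-kernel x86
instruction decoder instead of hand-rolled prefix walking. A minimal sketch
of that decode step (illustrative helper; error handling elided):

	#include <linux/types.h>
	#include <asm/insn.h>

	static int insn_has_two_byte_opcode(u8 *kaddr)
	{
		struct insn insn;

		kernel_insn_init(&insn, kaddr, MAX_INSN_SIZE);
		insn_get_length(&insn);	/* fills prefixes, opcode, length */

		return insn.opcode.nbytes == 2;
	}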
diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c
index 5f8f0b3cc674..041f7b6dfa0f 100644
--- a/arch/x86/kernel/kprobes/ftrace.c
+++ b/arch/x86/kernel/kprobes/ftrace.c
@@ -94,6 +94,6 @@ NOKPROBE_SYMBOL(kprobe_ftrace_handler);
int arch_prepare_kprobe_ftrace(struct kprobe *p)
{
p->ainsn.insn = NULL;
- p->ainsn.boostable = -1;
+ p->ainsn.boostable = false;
return 0;
}
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 3e7c6e5a08ff..9aadff3d0902 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -65,7 +65,10 @@ found:
* overwritten by jump destination address. In this case, original
* bytes must be recovered from op->optinsn.copied_insn buffer.
*/
- memcpy(buf, (void *)addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
+ if (probe_kernel_read(buf, (void *)addr,
+ MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
+ return 0UL;
+
if (addr == (unsigned long)kp->addr) {
buf[0] = kp->opcode;
memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
@@ -174,11 +177,12 @@ NOKPROBE_SYMBOL(optimized_callback);
static int copy_optimized_instructions(u8 *dest, u8 *src)
{
+ struct insn insn;
int len = 0, ret;
while (len < RELATIVEJUMP_SIZE) {
- ret = __copy_instruction(dest + len, src + len);
- if (!ret || !can_boost(dest + len, src + len))
+ ret = __copy_instruction(dest + len, src + len, &insn);
+ if (!ret || !can_boost(&insn, src + len))
return -EINVAL;
len += ret;
}
@@ -350,6 +354,7 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op,
}
buf = (u8 *)op->optinsn.insn;
+ set_memory_rw((unsigned long)buf & PAGE_MASK, 1);
/* Copy instructions into the out-of-line buffer */
ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr);
@@ -372,6 +377,8 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op,
synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size,
(u8 *)op->kp.addr + op->optinsn.size);
+ set_memory_ro((unsigned long)buf & PAGE_MASK, 1);
+
flush_icache_range((unsigned long) buf,
(unsigned long) buf + TMPL_END_IDX +
op->optinsn.size + RELATIVEJUMP_SIZE);
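
Both kprobe paths now wrap their text updates in the same
set_memory_rw()/set_memory_ro() pair, keeping instruction slots read-only
except while they are being patched. The pattern in isolation (a sketch
assuming a single page-aligned slot; the real code also flushes the icache,
and set_memory_*() is declared in <asm/cacheflush.h> at this point in the
tree):

	#include <linux/string.h>
	#include <linux/types.h>
	#include <asm/cacheflush.h>

	static void patch_slot(u8 *slot, const u8 *src, size_t len)
	{
		set_memory_rw((unsigned long)slot & PAGE_MASK, 1);
		memcpy(slot, src, len);
		set_memory_ro((unsigned long)slot & PAGE_MASK, 1);
	}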
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index a723ae9440ab..446c8aa09b9b 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -222,17 +222,6 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs)
pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n",
reason, smp_processor_id());
- /*
- * On some machines, PCI SERR line is used to report memory
- * errors. EDAC makes use of it.
- */
-#if defined(CONFIG_EDAC)
- if (edac_handler_set()) {
- edac_atomic_assert_error();
- return;
- }
-#endif
-
if (panic_on_unrecovered_nmi)
nmi_panic(regs, "NMI: Not continuing");
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 0c150c06fa5a..fda7867046d0 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -1007,9 +1007,8 @@ static void __init calgary_enable_translation(struct pci_dev *dev)
writel(cpu_to_be32(val32), target);
readl(target); /* flush */
- init_timer(&tbl->watchdog_timer);
- tbl->watchdog_timer.function = &calgary_watchdog;
- tbl->watchdog_timer.data = (unsigned long)dev;
+ setup_timer(&tbl->watchdog_timer, &calgary_watchdog,
+ (unsigned long)dev);
mod_timer(&tbl->watchdog_timer, jiffies);
}
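
setup_timer() collapses the init_timer()/.function/.data triple into one
call. A self-contained sketch with a hypothetical device structure (pre-4.15
timer API, matching the code above):

	#include <linux/timer.h>
	#include <linux/jiffies.h>

	struct my_dev {			/* hypothetical, for illustration */
		struct timer_list timer;
	};

	static void my_watchdog(unsigned long data)
	{
		struct my_dev *dev = (struct my_dev *)data;
		/* ... check dev, possibly re-arm ... */
	}

	static void my_dev_start_watchdog(struct my_dev *dev)
	{
		setup_timer(&dev->timer, my_watchdog, (unsigned long)dev);
		mod_timer(&dev->timer, jiffies + HZ);
	}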
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 067f9813fd2c..2544700a2a87 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -765,10 +765,11 @@ void machine_crash_shutdown(struct pt_regs *regs)
#endif
+/* This is the CPU performing the emergency shutdown work. */
+int crashing_cpu = -1;
+
#if defined(CONFIG_SMP)
-/* This keeps a track of which one is crashing cpu. */
-static int crashing_cpu;
static nmi_shootdown_cb shootdown_callback;
static atomic_t waiting_for_crash_ipi;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 21a3db1b9fec..603a1669a2ec 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -70,6 +70,7 @@
#include <linux/tboot.h>
#include <linux/jiffies.h>
+#include <linux/usb/xhci-dbgp.h>
#include <video/edid.h>
#include <asm/mtrr.h>
@@ -811,6 +812,26 @@ dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p)
return 0;
}
+static void __init simple_udelay_calibration(void)
+{
+ unsigned int tsc_khz, cpu_khz;
+ unsigned long lpj;
+
+ if (!boot_cpu_has(X86_FEATURE_TSC))
+ return;
+
+ cpu_khz = x86_platform.calibrate_cpu();
+ tsc_khz = x86_platform.calibrate_tsc();
+
+ tsc_khz = tsc_khz ? : cpu_khz;
+ if (!tsc_khz)
+ return;
+
+ lpj = tsc_khz * 1000;
+ do_div(lpj, HZ);
+ loops_per_jiffy = lpj;
+}
+
/*
* Determine if we were loaded by an EFI loader. If so, then we have also been
* passed the efi memmap, systab, etc., so we should use these data structures
@@ -959,6 +980,8 @@ void __init setup_arch(char **cmdline_p)
*/
x86_configure_nx();
+ simple_udelay_calibration();
+
parse_early_param();
#ifdef CONFIG_MEMORY_HOTPLUG
@@ -1095,6 +1118,9 @@ void __init setup_arch(char **cmdline_p)
memblock_set_current_limit(ISA_END_ADDRESS);
e820__memblock_setup();
+ if (!early_xdbc_setup_hardware())
+ early_xdbc_register_console();
+
reserve_bios_regions();
if (efi_enabled(EFI_MEMMAP)) {
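
simple_udelay_calibration() above just derives loops_per_jiffy from the TSC
rate: lpj = tsc_khz * 1000 / HZ. For example, a 2.4 GHz TSC (tsc_khz =
2400000) with HZ = 1000 gives 2,400,000 TSC ticks per jiffy. A standalone
check of the arithmetic (example values, not from the patch):

	#include <stdio.h>

	int main(void)
	{
		unsigned long tsc_khz = 2400000;	/* 2.4 GHz */
		unsigned long hz = 1000;		/* example CONFIG_HZ */
		unsigned long lpj = tsc_khz * 1000UL / hz;

		printf("loops_per_jiffy = %lu\n", lpj);	/* 2400000 */
		return 0;
	}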
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 396c042e9d0e..cc30a74e4adb 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -846,7 +846,7 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG,
me->comm, me->pid, where, frame,
regs->ip, regs->sp, regs->orig_ax);
- print_vma_addr(" in ", regs->ip);
+ print_vma_addr(KERN_CONT " in ", regs->ip);
pr_cont("\n");
}
diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c
index ec1f756f9dc9..71beb28600d4 100644
--- a/arch/x86/kernel/signal_compat.c
+++ b/arch/x86/kernel/signal_compat.c
@@ -151,8 +151,8 @@ int __copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from,
if (from->si_signo == SIGSEGV) {
if (from->si_code == SEGV_BNDERR) {
- compat_uptr_t lower = (unsigned long)&to->si_lower;
- compat_uptr_t upper = (unsigned long)&to->si_upper;
+ compat_uptr_t lower = (unsigned long)from->si_lower;
+ compat_uptr_t upper = (unsigned long)from->si_upper;
put_user_ex(lower, &to->si_lower);
put_user_ex(upper, &to->si_upper);
}
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index d3c66a15bbde..d798c0da451c 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -33,6 +33,7 @@
#include <asm/mce.h>
#include <asm/trace/irq_vectors.h>
#include <asm/kexec.h>
+#include <asm/virtext.h>
/*
* Some notes on x86 processor bugs affecting SMP operation:
@@ -124,7 +125,7 @@ static bool smp_no_nmi_ipi = false;
static void native_smp_send_reschedule(int cpu)
{
if (unlikely(cpu_is_offline(cpu))) {
- WARN_ON(1);
+ WARN(1, "sched: Unexpected reschedule of offline CPU#%d!\n", cpu);
return;
}
apic->send_IPI(cpu, RESCHEDULE_VECTOR);
@@ -162,6 +163,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
if (raw_smp_processor_id() == atomic_read(&stopping_cpu))
return NMI_HANDLED;
+ cpu_emergency_vmxoff();
stop_this_cpu(NULL);
return NMI_HANDLED;
@@ -174,6 +176,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
asmlinkage __visible void smp_reboot_interrupt(void)
{
ipi_entering_ack_irq();
+ cpu_emergency_vmxoff();
stop_this_cpu(NULL);
irq_exit();
}
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 948443e115c1..3995d3a777d4 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -169,6 +169,37 @@ void ist_end_non_atomic(void)
preempt_disable();
}
+int is_valid_bugaddr(unsigned long addr)
+{
+ unsigned short ud;
+
+ if (addr < TASK_SIZE_MAX)
+ return 0;
+
+ if (probe_kernel_address((unsigned short *)addr, ud))
+ return 0;
+
+ return ud == INSN_UD0 || ud == INSN_UD2;
+}
+
+static int fixup_bug(struct pt_regs *regs, int trapnr)
+{
+ if (trapnr != X86_TRAP_UD)
+ return 0;
+
+ switch (report_bug(regs->ip, regs)) {
+ case BUG_TRAP_TYPE_NONE:
+ case BUG_TRAP_TYPE_BUG:
+ break;
+
+ case BUG_TRAP_TYPE_WARN:
+ regs->ip += LEN_UD0;
+ return 1;
+ }
+
+ return 0;
+}
+
static nokprobe_inline int
do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
struct pt_regs *regs, long error_code)
@@ -187,12 +218,15 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
}
if (!user_mode(regs)) {
- if (!fixup_exception(regs, trapnr)) {
- tsk->thread.error_code = error_code;
- tsk->thread.trap_nr = trapnr;
- die(str, regs, error_code);
- }
- return 0;
+ if (fixup_exception(regs, trapnr))
+ return 0;
+
+ if (fixup_bug(regs, trapnr))
+ return 0;
+
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_nr = trapnr;
+ die(str, regs, error_code);
}
return -1;
@@ -255,7 +289,7 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
pr_info("%s[%d] trap %s ip:%lx sp:%lx error:%lx",
tsk->comm, tsk->pid, str,
regs->ip, regs->sp, error_code);
- print_vma_addr(" in ", regs->ip);
+ print_vma_addr(KERN_CONT " in ", regs->ip);
pr_cont("\n");
}
@@ -519,7 +553,7 @@ do_general_protection(struct pt_regs *regs, long error_code)
pr_info("%s[%d] general protection ip:%lx sp:%lx error:%lx",
tsk->comm, task_pid_nr(tsk),
regs->ip, regs->sp, error_code);
- print_vma_addr(" in ", regs->ip);
+ print_vma_addr(KERN_CONT " in ", regs->ip);
pr_cont("\n");
}
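
For background on the fixup_bug() path added above: WARN()/BUG() plant a
trapping ud0/ud2 opcode plus a bug-table entry, so is_valid_bugaddr() only
has to read two bytes at the faulting ip and compare them. A userspace
sketch of that two-byte compare, using the INSN_UD0/INSN_UD2 values from the
x86 <asm/bug.h> of this series:

	#include <stdio.h>

	#define INSN_UD0 0xff0b		/* ud0: 0f ff */
	#define INSN_UD2 0x0b0f		/* ud2: 0f 0b */

	static int is_ud_opcode(const unsigned char *p)
	{
		unsigned short w = p[0] | (p[1] << 8);	/* little-endian */

		return w == INSN_UD0 || w == INSN_UD2;
	}

	int main(void)
	{
		unsigned char ud2[] = { 0x0f, 0x0b };

		printf("%d\n", is_ud_opcode(ud2));	/* prints 1 */
		return 0;
	}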
diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
index 08339262b666..fec70fe3b1ec 100644
--- a/arch/x86/kernel/unwind_frame.c
+++ b/arch/x86/kernel/unwind_frame.c
@@ -1,6 +1,8 @@
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
+#include <linux/interrupt.h>
+#include <asm/sections.h>
#include <asm/ptrace.h>
#include <asm/bitops.h>
#include <asm/stacktrace.h>
@@ -23,53 +25,53 @@
val; \
})
-static void unwind_dump(struct unwind_state *state, unsigned long *sp)
+static void unwind_dump(struct unwind_state *state)
{
static bool dumped_before = false;
bool prev_zero, zero = false;
- unsigned long word;
+ unsigned long word, *sp;
+ struct stack_info stack_info = {0};
+ unsigned long visit_mask = 0;
if (dumped_before)
return;
dumped_before = true;
- printk_deferred("unwind stack type:%d next_sp:%p mask:%lx graph_idx:%d\n",
+ printk_deferred("unwind stack type:%d next_sp:%p mask:0x%lx graph_idx:%d\n",
state->stack_info.type, state->stack_info.next_sp,
state->stack_mask, state->graph_idx);
- for (sp = state->orig_sp; sp < state->stack_info.end; sp++) {
- word = READ_ONCE_NOCHECK(*sp);
+ for (sp = state->orig_sp; sp; sp = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
+ if (get_stack_info(sp, state->task, &stack_info, &visit_mask))
+ break;
- prev_zero = zero;
- zero = word == 0;
+ for (; sp < stack_info.end; sp++) {
- if (zero) {
- if (!prev_zero)
- printk_deferred("%p: %016x ...\n", sp, 0);
- continue;
- }
+ word = READ_ONCE_NOCHECK(*sp);
+
+ prev_zero = zero;
+ zero = word == 0;
+
+ if (zero) {
+ if (!prev_zero)
+ printk_deferred("%p: %0*x ...\n",
+ sp, BITS_PER_LONG/4, 0);
+ continue;
+ }
- printk_deferred("%p: %016lx (%pB)\n", sp, word, (void *)word);
+ printk_deferred("%p: %0*lx (%pB)\n",
+ sp, BITS_PER_LONG/4, word, (void *)word);
+ }
}
}
unsigned long unwind_get_return_address(struct unwind_state *state)
{
- unsigned long addr;
- unsigned long *addr_p = unwind_get_return_address_ptr(state);
-
if (unwind_done(state))
return 0;
- if (state->regs && user_mode(state->regs))
- return 0;
-
- addr = READ_ONCE_TASK_STACK(state->task, *addr_p);
- addr = ftrace_graph_ret_addr(state->task, &state->graph_idx, addr,
- addr_p);
-
- return __kernel_text_address(addr) ? addr : 0;
+ return __kernel_text_address(state->ip) ? state->ip : 0;
}
EXPORT_SYMBOL_GPL(unwind_get_return_address);
@@ -82,16 +84,41 @@ static size_t regs_size(struct pt_regs *regs)
return sizeof(*regs);
}
+static bool in_entry_code(unsigned long ip)
+{
+ char *addr = (char *)ip;
+
+ if (addr >= __entry_text_start && addr < __entry_text_end)
+ return true;
+
+#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
+ if (addr >= __irqentry_text_start && addr < __irqentry_text_end)
+ return true;
+#endif
+
+ return false;
+}
+
+static inline unsigned long *last_frame(struct unwind_state *state)
+{
+ return (unsigned long *)task_pt_regs(state->task) - 2;
+}
+
#ifdef CONFIG_X86_32
#define GCC_REALIGN_WORDS 3
#else
#define GCC_REALIGN_WORDS 1
#endif
+static inline unsigned long *last_aligned_frame(struct unwind_state *state)
+{
+ return last_frame(state) - GCC_REALIGN_WORDS;
+}
+
static bool is_last_task_frame(struct unwind_state *state)
{
- unsigned long *last_bp = (unsigned long *)task_pt_regs(state->task) - 2;
- unsigned long *aligned_bp = last_bp - GCC_REALIGN_WORDS;
+ unsigned long *last_bp = last_frame(state);
+ unsigned long *aligned_bp = last_aligned_frame(state);
/*
* We have to check for the last task frame at two different locations
@@ -135,26 +162,70 @@ static struct pt_regs *decode_frame_pointer(unsigned long *bp)
return (struct pt_regs *)(regs & ~0x1);
}
-static bool update_stack_state(struct unwind_state *state, void *addr,
- size_t len)
+static bool update_stack_state(struct unwind_state *state,
+ unsigned long *next_bp)
{
struct stack_info *info = &state->stack_info;
- enum stack_type orig_type = info->type;
+ enum stack_type prev_type = info->type;
+ struct pt_regs *regs;
+ unsigned long *frame, *prev_frame_end, *addr_p, addr;
+ size_t len;
+
+ if (state->regs)
+ prev_frame_end = (void *)state->regs + regs_size(state->regs);
+ else
+ prev_frame_end = (void *)state->bp + FRAME_HEADER_SIZE;
+
+ /* Is the next frame pointer an encoded pointer to pt_regs? */
+ regs = decode_frame_pointer(next_bp);
+ if (regs) {
+ frame = (unsigned long *)regs;
+ len = regs_size(regs);
+ state->got_irq = true;
+ } else {
+ frame = next_bp;
+ len = FRAME_HEADER_SIZE;
+ }
/*
- * If addr isn't on the current stack, switch to the next one.
+ * If the next bp isn't on the current stack, switch to the next one.
*
* We may have to traverse multiple stacks to deal with the possibility
- * that 'info->next_sp' could point to an empty stack and 'addr' could
- * be on a subsequent stack.
+ * that info->next_sp could point to an empty stack and the next bp
+ * could be on a subsequent stack.
*/
- while (!on_stack(info, addr, len))
+ while (!on_stack(info, frame, len))
if (get_stack_info(info->next_sp, state->task, info,
&state->stack_mask))
return false;
- if (!state->orig_sp || info->type != orig_type)
- state->orig_sp = addr;
+ /* Make sure it only unwinds up and doesn't overlap the prev frame: */
+ if (state->orig_sp && state->stack_info.type == prev_type &&
+ frame < prev_frame_end)
+ return false;
+
+ /* Move state to the next frame: */
+ if (regs) {
+ state->regs = regs;
+ state->bp = NULL;
+ } else {
+ state->bp = next_bp;
+ state->regs = NULL;
+ }
+
+ /* Save the return address: */
+ if (state->regs && user_mode(state->regs))
+ state->ip = 0;
+ else {
+ addr_p = unwind_get_return_address_ptr(state);
+ addr = READ_ONCE_TASK_STACK(state->task, *addr_p);
+ state->ip = ftrace_graph_ret_addr(state->task, &state->graph_idx,
+ addr, addr_p);
+ }
+
+ /* Save the original stack pointer for unwind_dump(): */
+ if (!state->orig_sp)
+ state->orig_sp = frame;
return true;
}
@@ -162,14 +233,12 @@ static bool update_stack_state(struct unwind_state *state, void *addr,
bool unwind_next_frame(struct unwind_state *state)
{
struct pt_regs *regs;
- unsigned long *next_bp, *next_frame;
- size_t next_len;
- enum stack_type prev_type = state->stack_info.type;
+ unsigned long *next_bp;
if (unwind_done(state))
return false;
- /* have we reached the end? */
+ /* Have we reached the end? */
if (state->regs && user_mode(state->regs))
goto the_end;
@@ -197,54 +266,19 @@ bool unwind_next_frame(struct unwind_state *state)
*/
state->regs = regs;
state->bp = NULL;
+ state->ip = 0;
return true;
}
- /* get the next frame pointer */
+ /* Get the next frame pointer: */
if (state->regs)
next_bp = (unsigned long *)state->regs->bp;
else
- next_bp = (unsigned long *)READ_ONCE_TASK_STACK(state->task,*state->bp);
-
- /* is the next frame pointer an encoded pointer to pt_regs? */
- regs = decode_frame_pointer(next_bp);
- if (regs) {
- next_frame = (unsigned long *)regs;
- next_len = sizeof(*regs);
- } else {
- next_frame = next_bp;
- next_len = FRAME_HEADER_SIZE;
- }
-
- /* make sure the next frame's data is accessible */
- if (!update_stack_state(state, next_frame, next_len)) {
- /*
- * Don't warn on bad regs->bp. An interrupt in entry code
- * might cause a false positive warning.
- */
- if (state->regs)
- goto the_end;
+ next_bp = (unsigned long *)READ_ONCE_TASK_STACK(state->task, *state->bp);
+ /* Move to the next frame if it's safe: */
+ if (!update_stack_state(state, next_bp))
goto bad_address;
- }
-
- /* Make sure it only unwinds up and doesn't overlap the last frame: */
- if (state->stack_info.type == prev_type) {
- if (state->regs && (void *)next_frame < (void *)state->regs + regs_size(state->regs))
- goto bad_address;
-
- if (state->bp && (void *)next_frame < (void *)state->bp + FRAME_HEADER_SIZE)
- goto bad_address;
- }
-
- /* move to the next frame */
- if (regs) {
- state->regs = regs;
- state->bp = NULL;
- } else {
- state->bp = next_bp;
- state->regs = NULL;
- }
return true;
@@ -259,18 +293,29 @@ bad_address:
if (state->task != current)
goto the_end;
+ /*
+ * Don't warn if the unwinder got lost due to an interrupt in entry
+ * code or in the C handler before the first frame pointer got set up:
+ */
+ if (state->got_irq && in_entry_code(state->ip))
+ goto the_end;
+ if (state->regs &&
+ state->regs->sp >= (unsigned long)last_aligned_frame(state) &&
+ state->regs->sp < (unsigned long)task_pt_regs(state->task))
+ goto the_end;
+
if (state->regs) {
printk_deferred_once(KERN_WARNING
"WARNING: kernel stack regs at %p in %s:%d has bad 'bp' value %p\n",
state->regs, state->task->comm,
- state->task->pid, next_frame);
- unwind_dump(state, (unsigned long *)state->regs);
+ state->task->pid, next_bp);
+ unwind_dump(state);
} else {
printk_deferred_once(KERN_WARNING
"WARNING: kernel stack frame pointer at %p in %s:%d has bad value %p\n",
state->bp, state->task->comm,
- state->task->pid, next_frame);
- unwind_dump(state, state->bp);
+ state->task->pid, next_bp);
+ unwind_dump(state);
}
the_end:
state->stack_info.type = STACK_TYPE_UNKNOWN;
@@ -281,35 +326,24 @@ EXPORT_SYMBOL_GPL(unwind_next_frame);
void __unwind_start(struct unwind_state *state, struct task_struct *task,
struct pt_regs *regs, unsigned long *first_frame)
{
- unsigned long *bp, *frame;
- size_t len;
+ unsigned long *bp;
memset(state, 0, sizeof(*state));
state->task = task;
+ state->got_irq = (regs != NULL);
- /* don't even attempt to start from user mode regs */
+ /* Don't even attempt to start from user mode regs: */
if (regs && user_mode(regs)) {
state->stack_info.type = STACK_TYPE_UNKNOWN;
return;
}
- /* set up the starting stack frame */
bp = get_frame_pointer(task, regs);
- regs = decode_frame_pointer(bp);
- if (regs) {
- state->regs = regs;
- frame = (unsigned long *)regs;
- len = sizeof(*regs);
- } else {
- state->bp = bp;
- frame = bp;
- len = FRAME_HEADER_SIZE;
- }
- /* initialize stack info and make sure the frame data is accessible */
- get_stack_info(frame, state->task, &state->stack_info,
+ /* Initialize stack info and make sure the frame data is accessible: */
+ get_stack_info(bp, state->task, &state->stack_info,
&state->stack_mask);
- update_stack_state(state, frame, len);
+ update_stack_state(state, bp);
/*
* The caller can provide the address of the first frame directly
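
update_stack_state() above keys off decode_frame_pointer(): entry code
stores a pt_regs pointer with bit 0 set where a real saved frame pointer
would go, which is unambiguous because genuine frame pointers are always
word aligned. The decode step in isolation (mirrors the helper shown
earlier in this diff):

	#include <asm/ptrace.h>

	static struct pt_regs *decode_fp(unsigned long *bp)
	{
		unsigned long val = (unsigned long)bp;

		if (!(val & 0x1))
			return NULL;	/* ordinary frame pointer */

		return (struct pt_regs *)(val & ~0x1UL);
	}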
diff --git a/arch/x86/kernel/unwind_guess.c b/arch/x86/kernel/unwind_guess.c
index 22881ddcbb9f..039f36738e49 100644
--- a/arch/x86/kernel/unwind_guess.c
+++ b/arch/x86/kernel/unwind_guess.c
@@ -34,7 +34,7 @@ bool unwind_next_frame(struct unwind_state *state)
return true;
}
- state->sp = info->next_sp;
+ state->sp = PTR_ALIGN(info->next_sp, sizeof(long));
} while (!get_stack_info(state->sp, state->task, info,
&state->stack_mask));
@@ -49,7 +49,7 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
memset(state, 0, sizeof(*state));
state->task = task;
- state->sp = first_frame;
+ state->sp = PTR_ALIGN(first_frame, sizeof(long));
get_stack_info(first_frame, state->task, &state->stack_info,
&state->stack_mask);
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index c74ae9ce8dc4..c8a3b61be0aa 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -146,6 +146,7 @@ SECTIONS
_edata = .;
} :data
+ BUG_TABLE
. = ALIGN(PAGE_SIZE);
__vvar_page = .;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 535cc065b844..1a471e5f963f 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -8194,6 +8194,9 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
case EXIT_REASON_PREEMPTION_TIMER:
return false;
+ case EXIT_REASON_PML_FULL:
+ /* We don't expose PML support to L1. */
+ return false;
default:
return true;
}
@@ -10263,6 +10266,18 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
}
+ if (enable_pml) {
+ /*
+ * Conceptually we want to copy the PML address and index from
+ * vmcs01 here, and then back to vmcs01 on nested vmexit. But,
+ * since we always flush the log on each vmexit, this happens
+ * to be equivalent to simply resetting the fields in vmcs02.
+ */
+ ASSERT(vmx->pml_pg);
+ vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
+ vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
+ }
+
if (nested_cpu_has_ept(vmcs12)) {
kvm_mmu_unload(vcpu);
nested_ept_init_mmu_context(vcpu);
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 7aa633686295..99472698c931 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -994,7 +994,9 @@ static struct clock_event_device lguest_clockevent = {
.mult = 1,
.shift = 0,
.min_delta_ns = LG_CLOCK_MIN_DELTA,
+ .min_delta_ticks = LG_CLOCK_MIN_DELTA,
.max_delta_ns = LG_CLOCK_MAX_DELTA,
+ .max_delta_ticks = LG_CLOCK_MAX_DELTA,
};
/*
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index 5e2af3a88cf5..81b1635d67de 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -14,20 +14,15 @@
* Zero a page.
* %rdi - page
*/
-ENTRY(clear_page)
-
- ALTERNATIVE_2 "jmp clear_page_orig", "", X86_FEATURE_REP_GOOD, \
- "jmp clear_page_c_e", X86_FEATURE_ERMS
-
+ENTRY(clear_page_rep)
movl $4096/8,%ecx
xorl %eax,%eax
rep stosq
ret
-ENDPROC(clear_page)
-EXPORT_SYMBOL(clear_page)
+ENDPROC(clear_page_rep)
+EXPORT_SYMBOL_GPL(clear_page_rep)
ENTRY(clear_page_orig)
-
xorl %eax,%eax
movl $4096/64,%ecx
.p2align 4
@@ -47,10 +42,12 @@ ENTRY(clear_page_orig)
nop
ret
ENDPROC(clear_page_orig)
+EXPORT_SYMBOL_GPL(clear_page_orig)
-ENTRY(clear_page_c_e)
+ENTRY(clear_page_erms)
movl $4096,%ecx
xorl %eax,%eax
rep stosb
ret
-ENDPROC(clear_page_c_e)
+ENDPROC(clear_page_erms)
+EXPORT_SYMBOL_GPL(clear_page_erms)
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index a8e91ae89fb3..29df077cb089 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -93,6 +93,13 @@ static void delay_mwaitx(unsigned long __loops)
{
u64 start, end, delay, loops = __loops;
+ /*
+ * Timer value of 0 causes MWAITX to wait indefinitely, unless there
+ * is a store on the memory monitored by MONITORX.
+ */
+ if (loops == 0)
+ return;
+
start = rdtsc_ordered();
for (;;) {
diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c
index c074799bddae..c8c6ad0d58b8 100644
--- a/arch/x86/lib/usercopy.c
+++ b/arch/x86/lib/usercopy.c
@@ -4,12 +4,9 @@
* For licencing details see kernel-base/COPYING
*/
-#include <linux/highmem.h>
+#include <linux/uaccess.h>
#include <linux/export.h>
-#include <asm/word-at-a-time.h>
-#include <linux/sched.h>
-
/*
* We rely on the nested NMI work to allow atomic faults from the NMI path; the
* nested NMI paths are careful to preserve CR2.
@@ -34,52 +31,3 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
return ret;
}
EXPORT_SYMBOL_GPL(copy_from_user_nmi);
-
-/**
- * copy_to_user: - Copy a block of data into user space.
- * @to: Destination address, in user space.
- * @from: Source address, in kernel space.
- * @n: Number of bytes to copy.
- *
- * Context: User context only. This function may sleep if pagefaults are
- * enabled.
- *
- * Copy data from kernel space to user space.
- *
- * Returns number of bytes that could not be copied.
- * On success, this will be zero.
- */
-unsigned long _copy_to_user(void __user *to, const void *from, unsigned n)
-{
- if (access_ok(VERIFY_WRITE, to, n))
- n = __copy_to_user(to, from, n);
- return n;
-}
-EXPORT_SYMBOL(_copy_to_user);
-
-/**
- * copy_from_user: - Copy a block of data from user space.
- * @to: Destination address, in kernel space.
- * @from: Source address, in user space.
- * @n: Number of bytes to copy.
- *
- * Context: User context only. This function may sleep if pagefaults are
- * enabled.
- *
- * Copy data from user space to kernel space.
- *
- * Returns number of bytes that could not be copied.
- * On success, this will be zero.
- *
- * If some data could not be copied, this function will pad the copied
- * data to the requested size using zero bytes.
- */
-unsigned long _copy_from_user(void *to, const void __user *from, unsigned n)
-{
- if (access_ok(VERIFY_READ, from, n))
- n = __copy_from_user(to, from, n);
- else
- memset(to, 0, n);
- return n;
-}
-EXPORT_SYMBOL(_copy_from_user);
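
The two wrappers deleted here move to generic code; their shape is
unchanged: gate on access_ok(), hand off to the unchecked copy, and return
the number of bytes NOT copied. A sketch of that shape (illustrative name,
4.11-era access_ok() signature):

	#include <linux/uaccess.h>

	static unsigned long copy_to_user_sketch(void __user *to,
						 const void *from,
						 unsigned long n)
	{
		if (access_ok(VERIFY_WRITE, to, n))
			n = __copy_to_user(to, from, n);
		return n;	/* 0 on full success */
	}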
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index 1f65ff6540f0..bd057a4ffe6e 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -5,12 +5,7 @@
* Copyright 1997 Andi Kleen <ak@muc.de>
* Copyright 1997 Linus Torvalds
*/
-#include <linux/mm.h>
-#include <linux/highmem.h>
-#include <linux/blkdev.h>
#include <linux/export.h>
-#include <linux/backing-dev.h>
-#include <linux/interrupt.h>
#include <linux/uaccess.h>
#include <asm/mmx.h>
#include <asm/asm.h>
@@ -201,197 +196,6 @@ __copy_user_intel(void __user *to, const void *from, unsigned long size)
return size;
}
-static unsigned long
-__copy_user_zeroing_intel(void *to, const void __user *from, unsigned long size)
-{
- int d0, d1;
- __asm__ __volatile__(
- " .align 2,0x90\n"
- "0: movl 32(%4), %%eax\n"
- " cmpl $67, %0\n"
- " jbe 2f\n"
- "1: movl 64(%4), %%eax\n"
- " .align 2,0x90\n"
- "2: movl 0(%4), %%eax\n"
- "21: movl 4(%4), %%edx\n"
- " movl %%eax, 0(%3)\n"
- " movl %%edx, 4(%3)\n"
- "3: movl 8(%4), %%eax\n"
- "31: movl 12(%4),%%edx\n"
- " movl %%eax, 8(%3)\n"
- " movl %%edx, 12(%3)\n"
- "4: movl 16(%4), %%eax\n"
- "41: movl 20(%4), %%edx\n"
- " movl %%eax, 16(%3)\n"
- " movl %%edx, 20(%3)\n"
- "10: movl 24(%4), %%eax\n"
- "51: movl 28(%4), %%edx\n"
- " movl %%eax, 24(%3)\n"
- " movl %%edx, 28(%3)\n"
- "11: movl 32(%4), %%eax\n"
- "61: movl 36(%4), %%edx\n"
- " movl %%eax, 32(%3)\n"
- " movl %%edx, 36(%3)\n"
- "12: movl 40(%4), %%eax\n"
- "71: movl 44(%4), %%edx\n"
- " movl %%eax, 40(%3)\n"
- " movl %%edx, 44(%3)\n"
- "13: movl 48(%4), %%eax\n"
- "81: movl 52(%4), %%edx\n"
- " movl %%eax, 48(%3)\n"
- " movl %%edx, 52(%3)\n"
- "14: movl 56(%4), %%eax\n"
- "91: movl 60(%4), %%edx\n"
- " movl %%eax, 56(%3)\n"
- " movl %%edx, 60(%3)\n"
- " addl $-64, %0\n"
- " addl $64, %4\n"
- " addl $64, %3\n"
- " cmpl $63, %0\n"
- " ja 0b\n"
- "5: movl %0, %%eax\n"
- " shrl $2, %0\n"
- " andl $3, %%eax\n"
- " cld\n"
- "6: rep; movsl\n"
- " movl %%eax,%0\n"
- "7: rep; movsb\n"
- "8:\n"
- ".section .fixup,\"ax\"\n"
- "9: lea 0(%%eax,%0,4),%0\n"
- "16: pushl %0\n"
- " pushl %%eax\n"
- " xorl %%eax,%%eax\n"
- " rep; stosb\n"
- " popl %%eax\n"
- " popl %0\n"
- " jmp 8b\n"
- ".previous\n"
- _ASM_EXTABLE(0b,16b)
- _ASM_EXTABLE(1b,16b)
- _ASM_EXTABLE(2b,16b)
- _ASM_EXTABLE(21b,16b)
- _ASM_EXTABLE(3b,16b)
- _ASM_EXTABLE(31b,16b)
- _ASM_EXTABLE(4b,16b)
- _ASM_EXTABLE(41b,16b)
- _ASM_EXTABLE(10b,16b)
- _ASM_EXTABLE(51b,16b)
- _ASM_EXTABLE(11b,16b)
- _ASM_EXTABLE(61b,16b)
- _ASM_EXTABLE(12b,16b)
- _ASM_EXTABLE(71b,16b)
- _ASM_EXTABLE(13b,16b)
- _ASM_EXTABLE(81b,16b)
- _ASM_EXTABLE(14b,16b)
- _ASM_EXTABLE(91b,16b)
- _ASM_EXTABLE(6b,9b)
- _ASM_EXTABLE(7b,16b)
- : "=&c"(size), "=&D" (d0), "=&S" (d1)
- : "1"(to), "2"(from), "0"(size)
- : "eax", "edx", "memory");
- return size;
-}
-
-/*
- * Non Temporal Hint version of __copy_user_zeroing_intel. It is cache aware.
- * hyoshiok@miraclelinux.com
- */
-
-static unsigned long __copy_user_zeroing_intel_nocache(void *to,
- const void __user *from, unsigned long size)
-{
- int d0, d1;
-
- __asm__ __volatile__(
- " .align 2,0x90\n"
- "0: movl 32(%4), %%eax\n"
- " cmpl $67, %0\n"
- " jbe 2f\n"
- "1: movl 64(%4), %%eax\n"
- " .align 2,0x90\n"
- "2: movl 0(%4), %%eax\n"
- "21: movl 4(%4), %%edx\n"
- " movnti %%eax, 0(%3)\n"
- " movnti %%edx, 4(%3)\n"
- "3: movl 8(%4), %%eax\n"
- "31: movl 12(%4),%%edx\n"
- " movnti %%eax, 8(%3)\n"
- " movnti %%edx, 12(%3)\n"
- "4: movl 16(%4), %%eax\n"
- "41: movl 20(%4), %%edx\n"
- " movnti %%eax, 16(%3)\n"
- " movnti %%edx, 20(%3)\n"
- "10: movl 24(%4), %%eax\n"
- "51: movl 28(%4), %%edx\n"
- " movnti %%eax, 24(%3)\n"
- " movnti %%edx, 28(%3)\n"
- "11: movl 32(%4), %%eax\n"
- "61: movl 36(%4), %%edx\n"
- " movnti %%eax, 32(%3)\n"
- " movnti %%edx, 36(%3)\n"
- "12: movl 40(%4), %%eax\n"
- "71: movl 44(%4), %%edx\n"
- " movnti %%eax, 40(%3)\n"
- " movnti %%edx, 44(%3)\n"
- "13: movl 48(%4), %%eax\n"
- "81: movl 52(%4), %%edx\n"
- " movnti %%eax, 48(%3)\n"
- " movnti %%edx, 52(%3)\n"
- "14: movl 56(%4), %%eax\n"
- "91: movl 60(%4), %%edx\n"
- " movnti %%eax, 56(%3)\n"
- " movnti %%edx, 60(%3)\n"
- " addl $-64, %0\n"
- " addl $64, %4\n"
- " addl $64, %3\n"
- " cmpl $63, %0\n"
- " ja 0b\n"
- " sfence \n"
- "5: movl %0, %%eax\n"
- " shrl $2, %0\n"
- " andl $3, %%eax\n"
- " cld\n"
- "6: rep; movsl\n"
- " movl %%eax,%0\n"
- "7: rep; movsb\n"
- "8:\n"
- ".section .fixup,\"ax\"\n"
- "9: lea 0(%%eax,%0,4),%0\n"
- "16: pushl %0\n"
- " pushl %%eax\n"
- " xorl %%eax,%%eax\n"
- " rep; stosb\n"
- " popl %%eax\n"
- " popl %0\n"
- " jmp 8b\n"
- ".previous\n"
- _ASM_EXTABLE(0b,16b)
- _ASM_EXTABLE(1b,16b)
- _ASM_EXTABLE(2b,16b)
- _ASM_EXTABLE(21b,16b)
- _ASM_EXTABLE(3b,16b)
- _ASM_EXTABLE(31b,16b)
- _ASM_EXTABLE(4b,16b)
- _ASM_EXTABLE(41b,16b)
- _ASM_EXTABLE(10b,16b)
- _ASM_EXTABLE(51b,16b)
- _ASM_EXTABLE(11b,16b)
- _ASM_EXTABLE(61b,16b)
- _ASM_EXTABLE(12b,16b)
- _ASM_EXTABLE(71b,16b)
- _ASM_EXTABLE(13b,16b)
- _ASM_EXTABLE(81b,16b)
- _ASM_EXTABLE(14b,16b)
- _ASM_EXTABLE(91b,16b)
- _ASM_EXTABLE(6b,9b)
- _ASM_EXTABLE(7b,16b)
- : "=&c"(size), "=&D" (d0), "=&S" (d1)
- : "1"(to), "2"(from), "0"(size)
- : "eax", "edx", "memory");
- return size;
-}
-
static unsigned long __copy_user_intel_nocache(void *to,
const void __user *from, unsigned long size)
{
@@ -486,12 +290,8 @@ static unsigned long __copy_user_intel_nocache(void *to,
* Leave these declared but undefined. There should not be any references
* to them.
*/
-unsigned long __copy_user_zeroing_intel(void *to, const void __user *from,
- unsigned long size);
unsigned long __copy_user_intel(void __user *to, const void *from,
unsigned long size);
-unsigned long __copy_user_zeroing_intel_nocache(void *to,
- const void __user *from, unsigned long size);
#endif /* CONFIG_X86_INTEL_USERCOPY */
/* Generic arbitrary sized copy. */
@@ -528,47 +328,7 @@ do { \
: "memory"); \
} while (0)
-#define __copy_user_zeroing(to, from, size) \
-do { \
- int __d0, __d1, __d2; \
- __asm__ __volatile__( \
- " cmp $7,%0\n" \
- " jbe 1f\n" \
- " movl %1,%0\n" \
- " negl %0\n" \
- " andl $7,%0\n" \
- " subl %0,%3\n" \
- "4: rep; movsb\n" \
- " movl %3,%0\n" \
- " shrl $2,%0\n" \
- " andl $3,%3\n" \
- " .align 2,0x90\n" \
- "0: rep; movsl\n" \
- " movl %3,%0\n" \
- "1: rep; movsb\n" \
- "2:\n" \
- ".section .fixup,\"ax\"\n" \
- "5: addl %3,%0\n" \
- " jmp 6f\n" \
- "3: lea 0(%3,%0,4),%0\n" \
- "6: pushl %0\n" \
- " pushl %%eax\n" \
- " xorl %%eax,%%eax\n" \
- " rep; stosb\n" \
- " popl %%eax\n" \
- " popl %0\n" \
- " jmp 2b\n" \
- ".previous\n" \
- _ASM_EXTABLE(4b,5b) \
- _ASM_EXTABLE(0b,3b) \
- _ASM_EXTABLE(1b,6b) \
- : "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2) \
- : "3"(size), "0"(size), "1"(to), "2"(from) \
- : "memory"); \
-} while (0)
-
-unsigned long __copy_to_user_ll(void __user *to, const void *from,
- unsigned long n)
+unsigned long __copy_user_ll(void *to, const void *from, unsigned long n)
{
stac();
if (movsl_is_ok(to, from, n))
@@ -578,51 +338,7 @@ unsigned long __copy_to_user_ll(void __user *to, const void *from,
clac();
return n;
}
-EXPORT_SYMBOL(__copy_to_user_ll);
-
-unsigned long __copy_from_user_ll(void *to, const void __user *from,
- unsigned long n)
-{
- stac();
- if (movsl_is_ok(to, from, n))
- __copy_user_zeroing(to, from, n);
- else
- n = __copy_user_zeroing_intel(to, from, n);
- clac();
- return n;
-}
-EXPORT_SYMBOL(__copy_from_user_ll);
-
-unsigned long __copy_from_user_ll_nozero(void *to, const void __user *from,
- unsigned long n)
-{
- stac();
- if (movsl_is_ok(to, from, n))
- __copy_user(to, from, n);
- else
- n = __copy_user_intel((void __user *)to,
- (const void *)from, n);
- clac();
- return n;
-}
-EXPORT_SYMBOL(__copy_from_user_ll_nozero);
-
-unsigned long __copy_from_user_ll_nocache(void *to, const void __user *from,
- unsigned long n)
-{
- stac();
-#ifdef CONFIG_X86_INTEL_USERCOPY
- if (n > 64 && static_cpu_has(X86_FEATURE_XMM2))
- n = __copy_user_zeroing_intel_nocache(to, from, n);
- else
- __copy_user_zeroing(to, from, n);
-#else
- __copy_user_zeroing(to, from, n);
-#endif
- clac();
- return n;
-}
-EXPORT_SYMBOL(__copy_from_user_ll_nocache);
+EXPORT_SYMBOL(__copy_user_ll);
unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from,
unsigned long n)
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index 69873589c0ba..3b7c40a2e3e1 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -54,15 +54,6 @@ unsigned long clear_user(void __user *to, unsigned long n)
}
EXPORT_SYMBOL(clear_user);
-unsigned long copy_in_user(void __user *to, const void __user *from, unsigned len)
-{
- if (access_ok(VERIFY_WRITE, to, len) && access_ok(VERIFY_READ, from, len)) {
- return copy_user_generic((__force void *)to, (__force void *)from, len);
- }
- return len;
-}
-EXPORT_SYMBOL(copy_in_user);
-
/*
* Try to copy last bytes and clear the rest if needed.
* Since protection fault in copy_from/to_user is not a normal situation,
@@ -80,9 +71,5 @@ copy_user_handle_tail(char *to, char *from, unsigned len)
break;
}
clac();
-
- /* If the destination is a kernel buffer, we always clear the end */
- if (!__addr_ok(to))
- memset(to, 0, len);
return len;
}
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 2193799ca800..138bad2fb6bc 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -643,21 +643,40 @@ void __init init_mem_mapping(void)
* devmem_is_allowed() checks to see if /dev/mem access to a certain address
* is valid. The argument is a physical page number.
*
- *
- * On x86, access has to be given to the first megabyte of ram because that area
- * contains BIOS code and data regions used by X and dosemu and similar apps.
- * Access has to be given to non-kernel-ram areas as well, these contain the PCI
- * mmio resources as well as potential bios/acpi data regions.
+ * On x86, access has to be given to the first megabyte of RAM because that
+ * area traditionally contains BIOS code and data regions used by X, dosemu,
+ * and similar apps. Since they map the entire memory range, the whole range
+ * must be allowed (for mapping), but any areas that would otherwise be
+ * disallowed are flagged as being "zero filled" instead of rejected.
+ * Access has to be given to non-kernel-ram areas as well, these contain the
+ * PCI mmio resources as well as potential bios/acpi data regions.
*/
int devmem_is_allowed(unsigned long pagenr)
{
- if (pagenr < 256)
- return 1;
- if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
+ if (page_is_ram(pagenr)) {
+ /*
+ * For disallowed memory regions in the low 1MB range,
+ * request that the page be shown as all zeros.
+ */
+ if (pagenr < 256)
+ return 2;
+
+ return 0;
+ }
+
+ /*
+ * This must follow RAM test, since System RAM is considered a
+ * restricted resource under CONFIG_STRICT_IOMEM.
+ */
+ if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) {
+ /* Low 1MB bypasses iomem restrictions. */
+ if (pagenr < 256)
+ return 1;
+
return 0;
- if (!page_is_ram(pagenr))
- return 1;
- return 0;
+ }
+
+ return 1;
}
void free_init_pages(char *what, unsigned long begin, unsigned long end)
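
devmem_is_allowed() is now tri-state: 0 rejects the page, 1 allows it, and
2 allows the mapping but asks the /dev/mem read path to present the page as
zeros. A caller-side sketch (read_as_zeroes/read_real_page are hypothetical
stubs standing in for the real drivers/char/mem.c logic):

	#include <linux/errno.h>

	extern int devmem_is_allowed(unsigned long pagenr);

	static int read_as_zeroes(void) { return 0; }	/* hypothetical stub */
	static int read_real_page(void) { return 0; }	/* hypothetical stub */

	static int devmem_read_sketch(unsigned long pfn)
	{
		switch (devmem_is_allowed(pfn)) {
		case 0:
			return -EPERM;		/* rejected outright */
		case 2:
			return read_as_zeroes();	/* shown as zeros */
		default:
			return read_real_page();	/* 1: allowed */
		}
	}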
diff --git a/arch/x86/platform/efi/Makefile b/arch/x86/platform/efi/Makefile
index 066619b0700c..f1d83b34c329 100644
--- a/arch/x86/platform/efi/Makefile
+++ b/arch/x86/platform/efi/Makefile
@@ -1,6 +1,5 @@
OBJECT_FILES_NON_STANDARD_efi_thunk_$(BITS).o := y
obj-$(CONFIG_EFI) += quirks.o efi.o efi_$(BITS).o efi_stub_$(BITS).o
-obj-$(CONFIG_ACPI_BGRT) += efi-bgrt.o
obj-$(CONFIG_EARLY_PRINTK_EFI) += early_printk.o
obj-$(CONFIG_EFI_MIXED) += efi_thunk_$(BITS).o
diff --git a/arch/x86/platform/efi/efi-bgrt.c b/arch/x86/platform/efi/efi-bgrt.c
deleted file mode 100644
index 04ca8764f0c0..000000000000
--- a/arch/x86/platform/efi/efi-bgrt.c
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Copyright 2012 Intel Corporation
- * Author: Josh Triplett <josh@joshtriplett.org>
- *
- * Based on the bgrt driver:
- * Copyright 2012 Red Hat, Inc <mjg@redhat.com>
- * Author: Matthew Garrett
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/acpi.h>
-#include <linux/efi.h>
-#include <linux/efi-bgrt.h>
-
-struct acpi_table_bgrt bgrt_tab;
-size_t __initdata bgrt_image_size;
-
-struct bmp_header {
- u16 id;
- u32 size;
-} __packed;
-
-void __init efi_bgrt_init(struct acpi_table_header *table)
-{
- void *image;
- struct bmp_header bmp_header;
- struct acpi_table_bgrt *bgrt = &bgrt_tab;
-
- if (acpi_disabled)
- return;
-
- if (table->length < sizeof(bgrt_tab)) {
- pr_notice("Ignoring BGRT: invalid length %u (expected %zu)\n",
- table->length, sizeof(bgrt_tab));
- return;
- }
- *bgrt = *(struct acpi_table_bgrt *)table;
- if (bgrt->version != 1) {
- pr_notice("Ignoring BGRT: invalid version %u (expected 1)\n",
- bgrt->version);
- goto out;
- }
- if (bgrt->status & 0xfe) {
- pr_notice("Ignoring BGRT: reserved status bits are non-zero %u\n",
- bgrt->status);
- goto out;
- }
- if (bgrt->image_type != 0) {
- pr_notice("Ignoring BGRT: invalid image type %u (expected 0)\n",
- bgrt->image_type);
- goto out;
- }
- if (!bgrt->image_address) {
- pr_notice("Ignoring BGRT: null image address\n");
- goto out;
- }
-
- image = early_memremap(bgrt->image_address, sizeof(bmp_header));
- if (!image) {
- pr_notice("Ignoring BGRT: failed to map image header memory\n");
- goto out;
- }
-
- memcpy(&bmp_header, image, sizeof(bmp_header));
- early_memunmap(image, sizeof(bmp_header));
- if (bmp_header.id != 0x4d42) {
- pr_notice("Ignoring BGRT: Incorrect BMP magic number 0x%x (expected 0x4d42)\n",
- bmp_header.id);
- goto out;
- }
- bgrt_image_size = bmp_header.size;
- efi_mem_reserve(bgrt->image_address, bgrt_image_size);
-
- return;
-out:
- memset(bgrt, 0, sizeof(bgrt_tab));
-}
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 6b6b8e8d4ae7..c488625c9712 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -47,7 +47,7 @@
#include <asm/pgalloc.h>
/*
- * We allocate runtime services regions bottom-up, starting from -4G, i.e.
+ * We allocate runtime services regions top-down, starting from -4G, i.e.
* 0xffff_ffff_0000_0000 and limit EFI VA mapping space to 64G.
*/
static u64 efi_va = EFI_VA_START;
diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
index 3c8d8e511fd4..26615991d69c 100644
--- a/arch/x86/platform/efi/quirks.c
+++ b/arch/x86/platform/efi/quirks.c
@@ -203,6 +203,10 @@ void __init efi_arch_mem_reserve(phys_addr_t addr, u64 size)
return;
}
+ /* No need to reserve regions that will never be freed. */
+ if (md.attribute & EFI_MEMORY_RUNTIME)
+ return;
+
size += addr % EFI_PAGE_SIZE;
size = round_up(size, EFI_PAGE_SIZE);
addr = round_down(addr, EFI_PAGE_SIZE);
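[Editor's note] The rounding right after the new EFI_MEMORY_RUNTIME check widens the reservation to whole EFI pages (4KB per the UEFI spec). Worked with example values:

    /* Say addr = 0x1234 and size = 0x100. */
    size += addr % EFI_PAGE_SIZE;             /* 0x100 + 0x234 = 0x334 */
    size  = round_up(size, EFI_PAGE_SIZE);    /* -> 0x1000             */
    addr  = round_down(addr, EFI_PAGE_SIZE);  /* -> 0x1000             */
    /* Reserved region: [0x1000, 0x2000), covering the original [0x1234, 0x1334). */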
diff --git a/arch/x86/platform/intel-mid/device_libs/Makefile b/arch/x86/platform/intel-mid/device_libs/Makefile
index 3dbde04febdc..53e0235e308f 100644
--- a/arch/x86/platform/intel-mid/device_libs/Makefile
+++ b/arch/x86/platform/intel-mid/device_libs/Makefile
@@ -2,8 +2,9 @@
obj-$(subst m,y,$(CONFIG_PINCTRL_MERRIFIELD)) += platform_mrfld_pinctrl.o
# SDHCI Devices
obj-$(subst m,y,$(CONFIG_MMC_SDHCI_PCI)) += platform_mrfld_sd.o
-# WiFi
+# WiFi + BT
obj-$(subst m,y,$(CONFIG_BRCMFMAC_SDIO)) += platform_bcm43xx.o
+obj-$(subst m,y,$(CONFIG_BT_HCIUART_BCM)) += platform_bt.o
# IPC Devices
obj-$(subst m,y,$(CONFIG_MFD_INTEL_MSIC)) += platform_msic.o
obj-$(subst m,y,$(CONFIG_SND_MFLD_MACHINE)) += platform_msic_audio.o
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bt.c b/arch/x86/platform/intel-mid/device_libs/platform_bt.c
new file mode 100644
index 000000000000..5a0483e7bf66
--- /dev/null
+++ b/arch/x86/platform/intel-mid/device_libs/platform_bt.c
@@ -0,0 +1,108 @@
+/*
+ * Bluetooth platform data initialization file
+ *
+ * (C) Copyright 2017 Intel Corporation
+ * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/gpio/machine.h>
+#include <linux/pci.h>
+#include <linux/platform_device.h>
+
+#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
+#include <asm/intel-mid.h>
+
+struct bt_sfi_data {
+ struct device *dev;
+ const char *name;
+ int (*setup)(struct bt_sfi_data *ddata);
+};
+
+static struct gpiod_lookup_table tng_bt_sfi_gpio_table = {
+ .dev_id = "hci_bcm",
+ .table = {
+ GPIO_LOOKUP("0000:00:0c.0", -1, "device-wakeup", GPIO_ACTIVE_HIGH),
+ GPIO_LOOKUP("0000:00:0c.0", -1, "shutdown", GPIO_ACTIVE_HIGH),
+ GPIO_LOOKUP("0000:00:0c.0", -1, "host-wakeup", GPIO_ACTIVE_HIGH),
+ { },
+ },
+};
+
+#define TNG_BT_SFI_GPIO_DEVICE_WAKEUP "bt_wakeup"
+#define TNG_BT_SFI_GPIO_SHUTDOWN "BT-reset"
+#define TNG_BT_SFI_GPIO_HOST_WAKEUP "bt_uart_enable"
+
+static int __init tng_bt_sfi_setup(struct bt_sfi_data *ddata)
+{
+ struct gpiod_lookup_table *table = &tng_bt_sfi_gpio_table;
+ struct gpiod_lookup *lookup = table->table;
+ struct pci_dev *pdev;
+
+ /* Connected to /dev/ttyS0 */
+ pdev = pci_get_domain_bus_and_slot(0, 0, PCI_DEVFN(4, 1));
+ if (!pdev)
+ return -ENODEV;
+
+ ddata->dev = &pdev->dev;
+ ddata->name = table->dev_id;
+
+ lookup[0].chip_hwnum = get_gpio_by_name(TNG_BT_SFI_GPIO_DEVICE_WAKEUP);
+ lookup[1].chip_hwnum = get_gpio_by_name(TNG_BT_SFI_GPIO_SHUTDOWN);
+ lookup[2].chip_hwnum = get_gpio_by_name(TNG_BT_SFI_GPIO_HOST_WAKEUP);
+
+ gpiod_add_lookup_table(table);
+ return 0;
+}
+
+static struct bt_sfi_data tng_bt_sfi_data __initdata = {
+ .setup = tng_bt_sfi_setup,
+};
+
+#define ICPU(model, ddata) \
+ { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (kernel_ulong_t)&ddata }
+
+static const struct x86_cpu_id bt_sfi_cpu_ids[] = {
+ ICPU(INTEL_FAM6_ATOM_MERRIFIELD, tng_bt_sfi_data),
+ {}
+};
+
+static int __init bt_sfi_init(void)
+{
+ struct platform_device_info info;
+ struct platform_device *pdev;
+ const struct x86_cpu_id *id;
+ struct bt_sfi_data *ddata;
+ int ret;
+
+ id = x86_match_cpu(bt_sfi_cpu_ids);
+ if (!id)
+ return -ENODEV;
+
+ ddata = (struct bt_sfi_data *)id->driver_data;
+ if (!ddata)
+ return -ENODEV;
+
+ ret = ddata->setup(ddata);
+ if (ret)
+ return ret;
+
+ memset(&info, 0, sizeof(info));
+ info.fwnode = ddata->dev->fwnode;
+ info.parent = ddata->dev;
+ info.name = ddata->name;
+ info.id = PLATFORM_DEVID_NONE;
+
+ pdev = platform_device_register_full(&info);
+ if (IS_ERR(pdev))
+ return PTR_ERR(pdev);
+
+ dev_info(ddata->dev, "Registered Bluetooth device: %s\n", ddata->name);
+ return 0;
+}
+device_initcall(bt_sfi_init);
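[Editor's note] The gpiod lookup table registered above is what lets the hci_bcm driver find its control lines by function name instead of by chip and line number; the -1 chip_hwnum placeholders are patched in by tng_bt_sfi_setup() from the SFI firmware names before registration. A hedged sketch of the consumer side, using the standard GPIO descriptor API (not code from this patch):

    struct gpio_desc *wakeup;

    /* Resolves via the "device-wakeup" entry in tng_bt_sfi_gpio_table. */
    wakeup = gpiod_get(dev, "device-wakeup", GPIOD_OUT_LOW);
    if (IS_ERR(wakeup))
            return PTR_ERR(wakeup);

    gpiod_set_value(wakeup, 1);              /* wake the Bluetooth chip */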
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index f25982cdff90..42e65fee5673 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -23,28 +23,7 @@
#include <asm/irq_vectors.h>
#include <asm/timer.h>
-static struct bau_operations ops;
-
-static struct bau_operations uv123_bau_ops = {
- .bau_gpa_to_offset = uv_gpa_to_offset,
- .read_l_sw_ack = read_mmr_sw_ack,
- .read_g_sw_ack = read_gmmr_sw_ack,
- .write_l_sw_ack = write_mmr_sw_ack,
- .write_g_sw_ack = write_gmmr_sw_ack,
- .write_payload_first = write_mmr_payload_first,
- .write_payload_last = write_mmr_payload_last,
-};
-
-static struct bau_operations uv4_bau_ops = {
- .bau_gpa_to_offset = uv_gpa_to_soc_phys_ram,
- .read_l_sw_ack = read_mmr_proc_sw_ack,
- .read_g_sw_ack = read_gmmr_proc_sw_ack,
- .write_l_sw_ack = write_mmr_proc_sw_ack,
- .write_g_sw_ack = write_gmmr_proc_sw_ack,
- .write_payload_first = write_mmr_proc_payload_first,
- .write_payload_last = write_mmr_proc_payload_last,
-};
-
+static struct bau_operations ops __ro_after_init;
/* timeouts in nanoseconds (indexed by UVH_AGING_PRESCALE_SEL urgency7 30:28) */
static int timeout_base_ns[] = {
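[Editor's note] Making ops a single __ro_after_init instance (the per-version tables move to the bottom of the file, further down) means the function-pointer block is writable only during boot; once init finishes and rodata is sealed, the pointers can no longer be retargeted. A minimal illustration of the annotation, with a hypothetical variable:

    static int bau_mode __ro_after_init;

    static int __init pick_bau_mode(void)
    {
            bau_mode = 4;   /* fine: rodata not yet write-protected */
            return 0;
    }
    early_initcall(pick_bau_mode);

    /* Any write to bau_mode after boot would fault. */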
@@ -548,11 +527,12 @@ static unsigned long uv1_read_status(unsigned long mmr_offset, int right_shift)
* return COMPLETE, RETRY(PLUGGED or TIMEOUT) or GIVEUP
*/
static int uv1_wait_completion(struct bau_desc *bau_desc,
- unsigned long mmr_offset, int right_shift,
struct bau_control *bcp, long try)
{
unsigned long descriptor_status;
cycles_t ttm;
+ u64 mmr_offset = bcp->status_mmr;
+ int right_shift = bcp->status_index;
struct ptc_stats *stat = bcp->statp;
descriptor_status = uv1_read_status(mmr_offset, right_shift);
@@ -640,11 +620,12 @@ int handle_uv2_busy(struct bau_control *bcp)
}
static int uv2_3_wait_completion(struct bau_desc *bau_desc,
- unsigned long mmr_offset, int right_shift,
struct bau_control *bcp, long try)
{
unsigned long descriptor_stat;
cycles_t ttm;
+ u64 mmr_offset = bcp->status_mmr;
+ int right_shift = bcp->status_index;
int desc = bcp->uvhub_cpu;
long busy_reps = 0;
struct ptc_stats *stat = bcp->statp;
@@ -706,28 +687,59 @@ static int uv2_3_wait_completion(struct bau_desc *bau_desc,
}
/*
- * There are 2 status registers; each an array[32] of 2 bits. Set up for
- * which register to read and position in that register based on cpu in
- * current hub.
+ * Returns the status of the current BAU message for cpu desc as a bit field
+ * [Error][Busy][Aux]
*/
-static int wait_completion(struct bau_desc *bau_desc, struct bau_control *bcp, long try)
+static u64 read_status(u64 status_mmr, int index, int desc)
{
- int right_shift;
- unsigned long mmr_offset;
+ u64 stat;
+
+ stat = ((read_lmmr(status_mmr) >> index) & UV_ACT_STATUS_MASK) << 1;
+ stat |= (read_lmmr(UVH_LB_BAU_SB_ACTIVATION_STATUS_2) >> desc) & 0x1;
+
+ return stat;
+}
+
+static int uv4_wait_completion(struct bau_desc *bau_desc,
+ struct bau_control *bcp, long try)
+{
+ struct ptc_stats *stat = bcp->statp;
+ u64 descriptor_stat;
+ u64 mmr = bcp->status_mmr;
+ int index = bcp->status_index;
int desc = bcp->uvhub_cpu;
- if (desc < UV_CPUS_PER_AS) {
- mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
- right_shift = desc * UV_ACT_STATUS_SIZE;
- } else {
- mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
- right_shift = ((desc - UV_CPUS_PER_AS) * UV_ACT_STATUS_SIZE);
- }
+ descriptor_stat = read_status(mmr, index, desc);
- if (bcp->uvhub_version == 1)
- return uv1_wait_completion(bau_desc, mmr_offset, right_shift, bcp, try);
- else
- return uv2_3_wait_completion(bau_desc, mmr_offset, right_shift, bcp, try);
+ /* spin on the status MMR, waiting for it to go idle */
+ while (descriptor_stat != UV2H_DESC_IDLE) {
+ switch (descriptor_stat) {
+ case UV2H_DESC_SOURCE_TIMEOUT:
+ stat->s_stimeout++;
+ return FLUSH_GIVEUP;
+
+ case UV2H_DESC_DEST_TIMEOUT:
+ stat->s_dtimeout++;
+ bcp->conseccompletes = 0;
+ return FLUSH_RETRY_TIMEOUT;
+
+ case UV2H_DESC_DEST_STRONG_NACK:
+ stat->s_plugged++;
+ bcp->conseccompletes = 0;
+ return FLUSH_RETRY_PLUGGED;
+
+ case UV2H_DESC_DEST_PUT_ERR:
+ bcp->conseccompletes = 0;
+ return FLUSH_GIVEUP;
+
+ default:
+ /* descriptor_stat is still BUSY */
+ cpu_relax();
+ }
+ descriptor_stat = read_status(mmr, index, desc);
+ }
+ bcp->conseccompletes++;
+ return FLUSH_COMPLETE;
}
/*
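[Editor's note] read_status() composes the 3-bit [Error][Busy][Aux] field described above: the 2-bit ACT_STATUS value shifted left by one, plus the per-descriptor auxiliary bit from STATUS_2 in bit 0. A worked decoding with hypothetical register contents:

    /*
     * Suppose the ACT_STATUS field for this cpu reads 0b10 and the
     * STATUS_2 bit for this descriptor reads 1:
     *
     *     stat = (0b10 << 1) | 0b1 = 0b101
     *
     * uv4_wait_completion() then matches that value against the
     * UV2H_DESC_* codes to choose between RETRY, GIVEUP, and spinning
     * on BUSY.
     */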
@@ -918,7 +930,7 @@ int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp,
struct uv1_bau_msg_header *uv1_hdr = NULL;
struct uv2_3_bau_msg_header *uv2_3_hdr = NULL;
- if (bcp->uvhub_version == 1) {
+ if (bcp->uvhub_version == UV_BAU_V1) {
uv1 = 1;
uv1_throttle(hmaster, stat);
}
@@ -958,7 +970,7 @@ int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp,
write_mmr_activation(index);
try++;
- completion_stat = wait_completion(bau_desc, bcp, try);
+ completion_stat = ops.wait_completion(bau_desc, bcp, try);
handle_cmplt(completion_stat, bau_desc, bcp, hmaster, stat);
@@ -1114,15 +1126,12 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
unsigned long end,
unsigned int cpu)
{
- int locals = 0;
- int remotes = 0;
- int hubs = 0;
+ int locals = 0, remotes = 0, hubs = 0;
struct bau_desc *bau_desc;
struct cpumask *flush_mask;
struct ptc_stats *stat;
struct bau_control *bcp;
- unsigned long descriptor_status;
- unsigned long status;
+ unsigned long descriptor_status, status, address;
bcp = &per_cpu(bau_control, cpu);
@@ -1171,10 +1180,24 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
record_send_statistics(stat, locals, hubs, remotes, bau_desc);
if (!end || (end - start) <= PAGE_SIZE)
- bau_desc->payload.address = start;
+ address = start;
else
- bau_desc->payload.address = TLB_FLUSH_ALL;
- bau_desc->payload.sending_cpu = cpu;
+ address = TLB_FLUSH_ALL;
+
+ switch (bcp->uvhub_version) {
+ case UV_BAU_V1:
+ case UV_BAU_V2:
+ case UV_BAU_V3:
+ bau_desc->payload.uv1_2_3.address = address;
+ bau_desc->payload.uv1_2_3.sending_cpu = cpu;
+ break;
+ case UV_BAU_V4:
+ bau_desc->payload.uv4.address = address;
+ bau_desc->payload.uv4.sending_cpu = cpu;
+ bau_desc->payload.uv4.qualifier = BAU_DESC_QUALIFIER;
+ break;
+ }
+
/*
* uv_flush_send_and_wait returns 0 if all cpu's were messaged,
* or 1 if it gave up and the original cpumask should be returned.
@@ -1296,7 +1319,7 @@ void uv_bau_message_interrupt(struct pt_regs *regs)
msgdesc.msg_slot = msg - msgdesc.queue_first;
msgdesc.msg = msg;
- if (bcp->uvhub_version == 2)
+ if (bcp->uvhub_version == UV_BAU_V2)
process_uv2_message(&msgdesc, bcp);
else
/* no error workaround for uv1 or uv3 */
@@ -1838,7 +1861,7 @@ static void pq_init(int node, int pnode)
* and the payload queue tail must be maintained by the kernel.
*/
bcp = &per_cpu(bau_control, smp_processor_id());
- if (bcp->uvhub_version <= 3) {
+ if (bcp->uvhub_version <= UV_BAU_V3) {
tail = first;
gnode = uv_gpa_to_gnode(uv_gpa(pqp));
first = (gnode << UV_PAYLOADQ_GNODE_SHIFT) | tail;
@@ -2034,8 +2057,7 @@ static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp,
struct bau_control **smasterp,
struct bau_control **hmasterp)
{
- int i;
- int cpu;
+ int i, cpu, uvhub_cpu;
struct bau_control *bcp;
for (i = 0; i < sdp->num_cpus; i++) {
@@ -2052,19 +2074,33 @@ static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp,
bcp->socket_master = *smasterp;
bcp->uvhub = bdp->uvhub;
if (is_uv1_hub())
- bcp->uvhub_version = 1;
+ bcp->uvhub_version = UV_BAU_V1;
else if (is_uv2_hub())
- bcp->uvhub_version = 2;
+ bcp->uvhub_version = UV_BAU_V2;
else if (is_uv3_hub())
- bcp->uvhub_version = 3;
+ bcp->uvhub_version = UV_BAU_V3;
else if (is_uv4_hub())
- bcp->uvhub_version = 4;
+ bcp->uvhub_version = UV_BAU_V4;
else {
pr_emerg("uvhub version not 1, 2, 3, or 4\n");
return 1;
}
bcp->uvhub_master = *hmasterp;
- bcp->uvhub_cpu = uv_cpu_blade_processor_id(cpu);
+ uvhub_cpu = uv_cpu_blade_processor_id(cpu);
+ bcp->uvhub_cpu = uvhub_cpu;
+
+ /*
+ * The ERROR and BUSY status registers are located pairwise over
+ * the STATUS_0 and STATUS_1 mmrs; each an array[32] of 2 bits.
+ */
+ if (uvhub_cpu < UV_CPUS_PER_AS) {
+ bcp->status_mmr = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
+ bcp->status_index = uvhub_cpu * UV_ACT_STATUS_SIZE;
+ } else {
+ bcp->status_mmr = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
+ bcp->status_index = (uvhub_cpu - UV_CPUS_PER_AS)
+ * UV_ACT_STATUS_SIZE;
+ }
if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) {
pr_emerg("%d cpus per uvhub invalid\n",
@@ -2147,6 +2183,39 @@ fail:
return 1;
}
+static const struct bau_operations uv1_bau_ops __initconst = {
+ .bau_gpa_to_offset = uv_gpa_to_offset,
+ .read_l_sw_ack = read_mmr_sw_ack,
+ .read_g_sw_ack = read_gmmr_sw_ack,
+ .write_l_sw_ack = write_mmr_sw_ack,
+ .write_g_sw_ack = write_gmmr_sw_ack,
+ .write_payload_first = write_mmr_payload_first,
+ .write_payload_last = write_mmr_payload_last,
+ .wait_completion = uv1_wait_completion,
+};
+
+static const struct bau_operations uv2_3_bau_ops __initconst = {
+ .bau_gpa_to_offset = uv_gpa_to_offset,
+ .read_l_sw_ack = read_mmr_sw_ack,
+ .read_g_sw_ack = read_gmmr_sw_ack,
+ .write_l_sw_ack = write_mmr_sw_ack,
+ .write_g_sw_ack = write_gmmr_sw_ack,
+ .write_payload_first = write_mmr_payload_first,
+ .write_payload_last = write_mmr_payload_last,
+ .wait_completion = uv2_3_wait_completion,
+};
+
+static const struct bau_operations uv4_bau_ops __initconst = {
+ .bau_gpa_to_offset = uv_gpa_to_soc_phys_ram,
+ .read_l_sw_ack = read_mmr_proc_sw_ack,
+ .read_g_sw_ack = read_gmmr_proc_sw_ack,
+ .write_l_sw_ack = write_mmr_proc_sw_ack,
+ .write_g_sw_ack = write_gmmr_proc_sw_ack,
+ .write_payload_first = write_mmr_proc_payload_first,
+ .write_payload_last = write_mmr_proc_payload_last,
+ .wait_completion = uv4_wait_completion,
+};
+
/*
* Initialization of BAU-related structures
*/
@@ -2166,11 +2235,11 @@ static int __init uv_bau_init(void)
if (is_uv4_hub())
ops = uv4_bau_ops;
else if (is_uv3_hub())
- ops = uv123_bau_ops;
+ ops = uv2_3_bau_ops;
else if (is_uv2_hub())
- ops = uv123_bau_ops;
+ ops = uv2_3_bau_ops;
else if (is_uv1_hub())
- ops = uv123_bau_ops;
+ ops = uv1_bau_ops;
for_each_possible_cpu(cur_cpu) {
mask = &per_cpu(uv_flush_tlb_mask, cur_cpu);
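[Editor's note] scan_sock() now precomputes, per CPU, which activation-status MMR to poll and the bit offset within it, so the wait-completion paths above no longer rederive them on every call. The arithmetic, worked for two CPUs (assuming UV_CPUS_PER_AS = 32 and UV_ACT_STATUS_SIZE = 2, as the array[32]-of-2-bits comment implies):

    /* uvhub_cpu = 5:  STATUS_0, 2-bit field at bits 10..11 */
    status_mmr   = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
    status_index = 5 * UV_ACT_STATUS_SIZE;                      /* = 10 */

    /* uvhub_cpu = 40: STATUS_1, 2-bit field at bits 16..17 */
    status_mmr   = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
    status_index = (40 - UV_CPUS_PER_AS) * UV_ACT_STATUS_SIZE;  /* = 16 */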
diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c
index 2ee7632d4916..b082d71b08ee 100644
--- a/arch/x86/platform/uv/uv_time.c
+++ b/arch/x86/platform/uv/uv_time.c
@@ -390,9 +390,11 @@ static __init int uv_rtc_setup_clock(void)
clock_event_device_uv.min_delta_ns = NSEC_PER_SEC /
sn_rtc_cycles_per_second;
+ clock_event_device_uv.min_delta_ticks = 1;
clock_event_device_uv.max_delta_ns = clocksource_uv.mask *
(NSEC_PER_SEC / sn_rtc_cycles_per_second);
+ clock_event_device_uv.max_delta_ticks = clocksource_uv.mask;
rc = schedule_on_each_cpu(uv_rtc_register_clockevents);
if (rc) {
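[Editor's note] Each delta is now stated twice, in nanoseconds and in raw timer ticks; this appears to track the clockevents core's move, underway in this release, toward drivers supplying tick limits directly rather than having the core re-derive them from the ns fields. For the UV RTC the pairing encodes:

    min_delta_ticks = 1;
    min_delta_ns    = NSEC_PER_SEC / sn_rtc_cycles_per_second;  /* one tick */

    max_delta_ticks = clocksource_uv.mask;
    max_delta_ns    = max_delta_ticks *
                      (NSEC_PER_SEC / sn_rtc_cycles_per_second);

The Xen clock events at the end of this series get the same treatment; with mult = 1 and shift = 0 there, ticks and nanoseconds are numerically identical.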
diff --git a/arch/x86/ras/Kconfig b/arch/x86/ras/Kconfig
index 0bc60a308730..2a2d89d39af6 100644
--- a/arch/x86/ras/Kconfig
+++ b/arch/x86/ras/Kconfig
@@ -7,3 +7,17 @@ config MCE_AMD_INJ
aspects of the MCE handling code.
WARNING: Do not even assume this interface is staying stable!
+
+config RAS_CEC
+ bool "Correctable Errors Collector"
+ depends on X86_MCE && MEMORY_FAILURE && DEBUG_FS
+ ---help---
+ This is a small cache which collects correctable memory errors per 4K
+ page PFN and counts their repeated occurrence. Once the counter for a
+ PFN overflows, we try to soft-offline that page, as we take the
+ overflow to mean the page has reached a relatively high error count
+ and is probably best not used anymore.
+
+ Bear in mind that this is absolutely useless if your platform doesn't
+ have ECC DIMMs and doesn't have DRAM ECC checking enabled in the BIOS.
+
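[Editor's note] At its core, a collector like the one described above reduces to a PFN-indexed counter with an overflow action; the in-kernel version is more elaborate (decaying counters, a small array with eviction). A toy sketch under those caveats, with a hypothetical threshold and single-entry state:

    #define CEC_THRESHOLD  16               /* hypothetical overflow point */

    struct cec_entry {
            unsigned long pfn;
            unsigned int  count;
    };

    /* Record one correctable error reported against @pfn. */
    static void cec_record(struct cec_entry *e, unsigned long pfn)
    {
            if (e->pfn != pfn) {            /* new page: reset the counter */
                    e->pfn = pfn;
                    e->count = 0;
            }
            if (++e->count >= CEC_THRESHOLD)
                    soft_offline_page(pfn_to_page(pfn), 0);
    }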
diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile
index 69f0827d5f53..46cbbfe03285 100644
--- a/arch/x86/um/Makefile
+++ b/arch/x86/um/Makefile
@@ -8,7 +8,7 @@ else
BITS := 64
endif
-obj-y = bug.o bugs_$(BITS).o delay.o fault.o ldt.o \
+obj-y = bugs_$(BITS).o delay.o fault.o ldt.o \
ptrace_$(BITS).o ptrace_user.o setjmp_$(BITS).o signal.o \
stub_$(BITS).o stub_segv.o \
sys_call_table_$(BITS).o sysrq_$(BITS).o tls_$(BITS).o \
diff --git a/arch/x86/um/bug.c b/arch/x86/um/bug.c
deleted file mode 100644
index e8034e363d83..000000000000
--- a/arch/x86/um/bug.c
+++ /dev/null
@@ -1,21 +0,0 @@
-/*
- * Copyright (C) 2006 Jeff Dike (jdike@addtoit.com)
- * Licensed under the GPL V2
- */
-
-#include <linux/uaccess.h>
-
-/*
- * Mostly copied from i386/x86_64 - eliminated the eip < PAGE_OFFSET because
- * that's not relevant in skas mode.
- */
-
-int is_valid_bugaddr(unsigned long eip)
-{
- unsigned short ud2;
-
- if (probe_kernel_address((unsigned short __user *)eip, ud2))
- return 0;
-
- return ud2 == 0x0b0f;
-}
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 1e69956d7852..7a3089285c59 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -209,7 +209,9 @@ static const struct clock_event_device xen_timerop_clockevent = {
.features = CLOCK_EVT_FEAT_ONESHOT,
.max_delta_ns = 0xffffffff,
+ .max_delta_ticks = 0xffffffff,
.min_delta_ns = TIMER_SLOP,
+ .min_delta_ticks = TIMER_SLOP,
.mult = 1,
.shift = 0,
@@ -268,7 +270,9 @@ static const struct clock_event_device xen_vcpuop_clockevent = {
.features = CLOCK_EVT_FEAT_ONESHOT,
.max_delta_ns = 0xffffffff,
+ .max_delta_ticks = 0xffffffff,
.min_delta_ns = TIMER_SLOP,
+ .min_delta_ticks = TIMER_SLOP,
.mult = 1,
.shift = 0,