188 files changed, 4155 insertions, 2712 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 06dbb5da90c6..7aef2d52daa0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -27,7 +27,8 @@ config X86
 	select ARCH_HAS_ELF_RANDOMIZE
 	select ARCH_HAS_FAST_MULTIPLIER
 	select ARCH_HAS_GCOV_PROFILE_ALL
-	select ARCH_HAS_PMEM_API
+	select ARCH_HAS_PMEM_API		if X86_64
+	select ARCH_HAS_MMIO_FLUSH
 	select ARCH_HAS_SG_CHAIN
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
 	select ARCH_MIGHT_HAVE_ACPI_PDC		if ACPI
@@ -41,6 +42,7 @@ config X86
 	select ARCH_USE_CMPXCHG_LOCKREF		if X86_64
 	select ARCH_USE_QUEUED_RWLOCKS
 	select ARCH_USE_QUEUED_SPINLOCKS
+	select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if SMP
 	select ARCH_WANTS_DYNAMIC_TASK_STRUCT
 	select ARCH_WANT_FRAME_POINTERS
 	select ARCH_WANT_IPC_PARSE_VERSION	if X86_32
@@ -133,7 +135,7 @@ config X86
 	select HAVE_PERF_USER_STACK_DUMP
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_SYSCALL_TRACEPOINTS
-	select HAVE_UID16			if X86_32
+	select HAVE_UID16			if X86_32 || IA32_EMULATION
 	select HAVE_UNSTABLE_SCHED_CLOCK
 	select HAVE_USER_RETURN_NOTIFIER
 	select IRQ_FORCED_THREADING
@@ -1003,19 +1005,41 @@ config X86_THERMAL_VECTOR
 	def_bool y
 	depends on X86_MCE_INTEL
 
-config VM86
-	bool "Enable VM86 support" if EXPERT
-	default y
+config X86_LEGACY_VM86
+	bool "Legacy VM86 support (obsolete)"
+	default n
 	depends on X86_32
 	---help---
-	  This option is required by programs like DOSEMU to run
-	  16-bit real mode legacy code on x86 processors. It also may
-	  be needed by software like XFree86 to initialize some video
-	  cards via BIOS. Disabling this option saves about 6K.
+	  This option allows user programs to put the CPU into V8086
+	  mode, which is an 80386-era approximation of 16-bit real mode.
+
+	  Some very old versions of X and/or vbetool require this option
+	  for user mode setting.  Similarly, DOSEMU will use it if
+	  available to accelerate real mode DOS programs.  However, any
+	  recent version of DOSEMU, X, or vbetool should be fully
+	  functional even without kernel VM86 support, as they will all
+	  fall back to (pretty well performing) software emulation.
+
+	  Anything that works on a 64-bit kernel is unlikely to need
+	  this option, as 64-bit kernels don't, and can't, support V8086
+	  mode.  This option is also unrelated to 16-bit protected mode
+	  and is not needed to run most 16-bit programs under Wine.
+
+	  Enabling this option adds considerable attack surface to the
+	  kernel and slows down system calls and exception handling.
+
+	  Unless you use very old userspace or need the last drop of
+	  performance in your real mode DOS games and can't use KVM,
+	  say N here.
+
+config VM86
+	bool
+	default X86_LEGACY_VM86
 
 config X86_16BIT
 	bool "Enable support for 16-bit segments" if EXPERT
 	default y
+	depends on MODIFY_LDT_SYSCALL
 	---help---
 	  This option is required by programs like Wine to run 16-bit
 	  protected mode legacy code on x86 processors.  Disabling
@@ -1427,10 +1451,14 @@ config ILLEGAL_POINTER_VALUE
 
 source "mm/Kconfig"
 
+config X86_PMEM_LEGACY_DEVICE
+	bool
+
 config X86_PMEM_LEGACY
-	bool "Support non-standard NVDIMMs and ADR protected memory"
+	tristate "Support non-standard NVDIMMs and ADR protected memory"
 	depends on PHYS_ADDR_T_64BIT
 	depends on BLK_DEV
+	select X86_PMEM_LEGACY_DEVICE
 	select LIBNVDIMM
 	help
 	  Treat memory marked using the non-standard e820 type of 12 as used
@@ -1510,6 +1538,7 @@ config X86_RESERVE_LOW
 
 config MATH_EMULATION
 	bool
+	depends on MODIFY_LDT_SYSCALL
 	prompt "Math emulation" if X86_32
 	---help---
 	  Linux can emulate a math coprocessor (used for floating point
@@ -1725,6 +1754,7 @@ source kernel/Kconfig.hz
 
 config KEXEC
 	bool "kexec system call"
+	select KEXEC_CORE
 	---help---
 	  kexec is a system call that implements the ability to shutdown your
 	  current kernel, and to start another kernel.  It is like a reboot
@@ -1741,8 +1771,8 @@ config KEXEC
 
 config KEXEC_FILE
 	bool "kexec file based system call"
+	select KEXEC_CORE
 	select BUILD_BIN2C
-	depends on KEXEC
 	depends on X86_64
 	depends on CRYPTO=y
 	depends on CRYPTO_SHA256=y
@@ -2054,6 +2084,22 @@ config CMDLINE_OVERRIDE
 	  This is used to work around broken boot loaders.  This should
 	  be set to 'N' under normal conditions.
 
+config MODIFY_LDT_SYSCALL
+	bool "Enable the LDT (local descriptor table)" if EXPERT
+	default y
+	---help---
+	  Linux can allow user programs to install a per-process x86
+	  Local Descriptor Table (LDT) using the modify_ldt(2) system
+	  call.  This is required to run 16-bit or segmented code such as
+	  DOSEMU or some Wine programs.  It is also used by some very old
+	  threading libraries.
+
+	  Enabling this feature adds a small amount of overhead to
+	  context switches and increases the low-level kernel attack
+	  surface.  Disabling it removes the modify_ldt(2) system call.
+
+	  Saying 'N' here may make sense for embedded or server kernels.
+
 source "kernel/livepatch/Kconfig"
 
 endmenu
@@ -2523,7 +2569,7 @@ config IA32_EMULATION
 	depends on X86_64
 	select BINFMT_ELF
 	select COMPAT_BINFMT_ELF
-	select HAVE_UID16
+	select ARCH_WANT_OLD_COMPAT_IPC
 	---help---
 	  Include code to run legacy 32-bit programs under a
 	  64-bit kernel. You should likely turn this on, unless you're
@@ -2537,7 +2583,7 @@ config IA32_AOUT
 
 config X86_X32
 	bool "x32 ABI for 64-bit mode"
-	depends on X86_64 && IA32_EMULATION
+	depends on X86_64
 	---help---
 	  Include code to run binaries for the x32 native 32-bit ABI
 	  for 64-bit processors.  An x32 process gets access to the
@@ -2551,7 +2597,6 @@ config X86_X32
 config COMPAT
 	def_bool y
 	depends on IA32_EMULATION || X86_X32
-	select ARCH_WANT_OLD_COMPAT_IPC
 
 if COMPAT
 config COMPAT_FOR_U64_ALIGNMENT
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 0f38418719ab..747860c696e1 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -39,6 +39,16 @@ ifdef CONFIG_X86_NEED_RELOCS
         LDFLAGS_vmlinux := --emit-relocs
 endif
 
+#
+# Prevent GCC from generating any FP code by mistake.
+#
+# This must happen before we try the -mpreferred-stack-boundary, see:
+#
+#    https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383
+#
+KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow
+KBUILD_CFLAGS += $(call cc-option,-mno-avx,)
+
 ifeq ($(CONFIG_X86_32),y)
         BITS := 32
         UTS_MACHINE := i386
@@ -167,9 +177,6 @@ KBUILD_CFLAGS += -pipe
 KBUILD_CFLAGS += -Wno-sign-compare
 #
 KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
-# prevent gcc from generating any FP code by mistake
-KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow
-KBUILD_CFLAGS += $(call cc-option,-mno-avx,)
 
 KBUILD_CFLAGS += $(mflags-y)
 KBUILD_AFLAGS += $(mflags-y)
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 57bbf2fb21f6..0d553e54171b 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -23,7 +23,7 @@ targets		+= fdimage fdimage144 fdimage288 image.iso mtools.conf
 subdir-		:= compressed
 
 setup-y		+= a20.o bioscall.o cmdline.o copy.o cpu.o cpuflags.o cpucheck.o
-setup-y		+= early_serial_console.o edd.o header.o main.o mca.o memory.o
+setup-y		+= early_serial_console.o edd.o header.o main.o memory.o
 setup-y		+= pm.o pmjump.o printf.o regs.o string.o tty.o video.o
 setup-y		+= video-mode.o version.o
 setup-$(CONFIG_X86_APM_BOOT) += apm.o
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index bd49ec61255c..0033e96c3f09 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -307,9 +307,6 @@ void query_edd(void);
 /* header.S */
 void __attribute__((noreturn)) die(void);
 
-/* mca.c */
-int query_mca(void);
-
 /* memory.c */
 int detect_memory(void);
 
diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c
index d7b1f655b3ef..6a9b96b4624d 100644
--- a/arch/x86/boot/compressed/aslr.c
+++ b/arch/x86/boot/compressed/aslr.c
@@ -82,7 +82,7 @@ static unsigned long get_random_long(void)
 
 	if (has_cpuflag(X86_FEATURE_TSC)) {
 		debug_putstr(" RDTSC");
-		rdtscll(raw);
+		raw = rdtsc();
 
 		random ^= raw;
 		use_i8254 = false;
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index 7d69afd8b6fa..ee1b6d346b98 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -1041,7 +1041,6 @@ void setup_graphics(struct boot_params *boot_params)
 struct boot_params *make_boot_params(struct efi_config *c)
 {
 	struct boot_params *boot_params;
-	struct sys_desc_table *sdt;
 	struct apm_bios_info *bi;
 	struct setup_header *hdr;
 	struct efi_info *efi;
@@ -1089,7 +1088,6 @@ struct boot_params *make_boot_params(struct efi_config *c)
 	hdr = &boot_params->hdr;
 	efi = &boot_params->efi_info;
 	bi = &boot_params->apm_bios_info;
-	sdt = &boot_params->sys_desc_table;
 
 	/* Copy the second sector to boot_params */
 	memcpy(&hdr->jump, image->image_base + 512, 512);
@@ -1118,8 +1116,6 @@ struct boot_params *make_boot_params(struct efi_config *c)
 	/* Clear APM BIOS info */
 	memset(bi, 0, sizeof(*bi));
 
-	memset(sdt, 0, sizeof(*sdt));
-
 	status = efi_parse_options(cmdline_ptr);
 	if (status != EFI_SUCCESS)
 		goto fail2;
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index a107b935e22f..79dac1758e7c 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -220,6 +220,23 @@ void __putstr(const char *s)
 	outb(0xff & (pos >> 1), vidport+1);
 }
 
+void __puthex(unsigned long value)
+{
+	/* One-character, NUL-terminated scratch string handed to __putstr(). */
+	char out[2] = "0";
+	int shift = sizeof(value) * 8;
+
+	/* Emit every nibble, most significant first; leading zeroes are kept. */
+	while (shift > 0) {
+		unsigned long nibble;
+
+		shift -= 4;
+		nibble = (value >> shift) & 0xf;
+		out[0] = nibble < 0xA ? '0' + nibble : 'a' + (nibble - 0xA);
+		__putstr(out);
+	}
+}
+
 static void error(char *x)
 {
 	error_putstr("\n\n");
@@ -399,6 +416,13 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap,
 	free_mem_ptr     = heap;	/* Heap */
 	free_mem_end_ptr = heap + BOOT_HEAP_SIZE;
 
+	/* Report initial kernel position details. */
+	debug_putaddr(input_data);
+	debug_putaddr(input_len);
+	debug_putaddr(output);
+	debug_putaddr(output_len);
+	debug_putaddr(run_size);
+
 	/*
 	 * The memory hole needed for the kernel is the larger of either
 	 * the entire decompressed kernel plus relocation table, or the
@@ -424,7 +448,8 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap,
 #endif
 
 	debug_putstr("\nDecompressing Linux... ");
-	decompress(input_data, input_len, NULL, NULL, output, NULL, error);
+	__decompress(input_data, input_len, NULL, NULL, output, output_len,
+			NULL, error);
 	parse_elf(output);
 	/*
 	 * 32-bit always performs relocations. 64-bit relocations are only
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 805d25ca5f1d..3783dc3e10b3 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -34,16 +34,27 @@ extern memptr free_mem_ptr;
 extern memptr free_mem_end_ptr;
 extern struct boot_params *real_mode;		/* Pointer to real-mode data */
 void __putstr(const char *s);
+void __puthex(unsigned long value);
 #define error_putstr(__x)  __putstr(__x)
+#define error_puthex(__x)  __puthex(__x)
 
 #ifdef CONFIG_X86_VERBOSE_BOOTUP
 
 #define debug_putstr(__x)  __putstr(__x)
+#define debug_puthex(__x)  __puthex(__x)
+#define debug_putaddr(__x) do { /* prints "<expr>: 0x<hex>\n" */ \
+		debug_putstr(#__x ": 0x"); \
+		debug_puthex((unsigned long)(__x)); \
+		debug_putstr("\n"); \
+	} while (0)
 
 #else
 
 static inline void debug_putstr(const char *s)
 { }
+static inline void debug_puthex(unsigned long value)
+{ }	/* no-op stub; parameter type matches __puthex() */
+#define debug_putaddr(x) do { } while (0)
 
 #endif
 
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 16ef02596db2..2d6b309c8e9a 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -414,7 +414,7 @@ xloadflags:
 # define XLF23 0
 #endif
 
-#if defined(CONFIG_X86_64) && defined(CONFIG_EFI) && defined(CONFIG_KEXEC)
+#if defined(CONFIG_X86_64) && defined(CONFIG_EFI) && defined(CONFIG_KEXEC_CORE)
 # define XLF4 XLF_EFI_KEXEC
 #else
 # define XLF4 0
diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c
index fd6c9f236996..9bcea386db65 100644
--- a/arch/x86/boot/main.c
+++ b/arch/x86/boot/main.c
@@ -161,9 +161,6 @@ void main(void)
 	/* Set keyboard repeat rate (why?) and query the lock flags */
 	keyboard_init();
 
-	/* Query MCA information */
-	query_mca();
-
 	/* Query Intel SpeedStep (IST) information */
 	query_ist();
 
diff --git a/arch/x86/boot/mca.c b/arch/x86/boot/mca.c
deleted file mode 100644
index a95a531148ef..000000000000
--- a/arch/x86/boot/mca.c
+++ /dev/null
@@ -1,38 +0,0 @@
-/* -*- linux-c -*- ------------------------------------------------------- *
- *
- *   Copyright (C) 1991, 1992 Linus Torvalds
- *   Copyright 2007 rPath, Inc. - All Rights Reserved
- *   Copyright 2009 Intel Corporation; author H. Peter Anvin
- *
- *   This file is part of the Linux kernel, and is made available under
- *   the terms of the GNU General Public License version 2.
- *
- * ----------------------------------------------------------------------- */
-
-/*
- * Get the MCA system description table
- */
-
-#include "boot.h"
-
-int query_mca(void)
-{
-	struct biosregs ireg, oreg;
-	u16 len;
-
-	initregs(&ireg);
-	ireg.ah = 0xc0;
-	intcall(0x15, &ireg, &oreg);
-
-	if (oreg.eflags & X86_EFLAGS_CF)
-		return -1;	/* No MCA present */
-
-	set_fs(oreg.es);
-	len = rdfs16(oreg.bx);
-
-	if (len > sizeof(boot_params.sys_desc_table))
-		len = sizeof(boot_params.sys_desc_table);
-
-	copy_from_fs(&boot_params.sys_desc_table, oreg.bx, len);
-	return 0;
-}
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index aaa1118bf01e..028be48c8839 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -23,6 +23,7 @@ CONFIG_BLK_DEV_INITRD=y
 # CONFIG_COMPAT_BRK is not set
 CONFIG_PROFILING=y
 CONFIG_KPROBES=y
+CONFIG_JUMP_LABEL=y
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
 CONFIG_MODULE_FORCE_UNLOAD=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 315b86106572..cb5b3ab5beec 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -22,6 +22,7 @@ CONFIG_BLK_DEV_INITRD=y
 # CONFIG_COMPAT_BRK is not set
 CONFIG_PROFILING=y
 CONFIG_KPROBES=y
+CONFIG_JUMP_LABEL=y
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
 CONFIG_MODULE_FORCE_UNLOAD=y
@@ -207,7 +208,6 @@ CONFIG_AGP_AMD64=y
 CONFIG_AGP_INTEL=y
 CONFIG_DRM=y
 CONFIG_DRM_I915=y
-CONFIG_DRM_I915_KMS=y
 CONFIG_FB_MODE_HELPERS=y
 CONFIG_FB_TILEBLITTING=y
 CONFIG_FB_EFI=y
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
index 64d7cf1b50e1..440df0c7a2ee 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_glue.c
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -294,6 +294,7 @@ static struct ahash_alg ghash_async_alg = {
 			.cra_name		= "ghash",
 			.cra_driver_name	= "ghash-clmulni",
 			.cra_priority		= 400,
+			.cra_ctxsize		= sizeof(struct ghash_async_ctx),
 			.cra_flags		= CRYPTO_ALG_TYPE_AHASH | CRYPTO_ALG_ASYNC,
 			.cra_blocksize		= GHASH_BLOCK_SIZE,
 			.cra_type		= &crypto_ahash_type,
diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile
index 7a144971db79..bd55dedd7614 100644
--- a/arch/x86/entry/Makefile
+++ b/arch/x86/entry/Makefile
@@ -2,6 +2,7 @@
 # Makefile for the x86 low level entry code
 #
 obj-y				:= entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o
+obj-y				+= common.o
 
 obj-y				+= vdso/
 obj-y				+= vsyscall/
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index f4e6308c4200..3c71dd947c7b 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -135,9 +135,6 @@ For 32-bit we have the following conventions - kernel is built with
 	movq %rbp, 4*8+\offset(%rsp)
 	movq %rbx, 5*8+\offset(%rsp)
 	.endm
-	.macro SAVE_EXTRA_REGS_RBP offset=0
-	movq %rbp, 4*8+\offset(%rsp)
-	.endm
 
 	.macro RESTORE_EXTRA_REGS offset=0
 	movq 0*8+\offset(%rsp), %r15
@@ -193,12 +190,6 @@ For 32-bit we have the following conventions - kernel is built with
 	.macro RESTORE_C_REGS_EXCEPT_RCX_R11
 	RESTORE_C_REGS_HELPER 1,0,0,1,1
 	.endm
-	.macro RESTORE_RSI_RDI
-	RESTORE_C_REGS_HELPER 0,0,0,0,0
-	.endm
-	.macro RESTORE_RSI_RDI_RDX
-	RESTORE_C_REGS_HELPER 0,0,0,0,1
-	.endm
 
 	.macro REMOVE_PT_GPREGS_FROM_STACK addskip=0
 	subq $-(15*8+\addskip), %rsp
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
new file mode 100644
index 000000000000..80dcc9261ca3
--- /dev/null
+++ b/arch/x86/entry/common.c
@@ -0,0 +1,318 @@
+/*
+ * common.c - C code for kernel entry and exit
+ * Copyright (c) 2015 Andrew Lutomirski
+ * GPL v2
+ *
+ * Based on asm and ptrace code by many authors.  The code here originated
+ * in ptrace.c and signal.c.
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/errno.h>
+#include <linux/ptrace.h>
+#include <linux/tracehook.h>
+#include <linux/audit.h>
+#include <linux/seccomp.h>
+#include <linux/signal.h>
+#include <linux/export.h>
+#include <linux/context_tracking.h>
+#include <linux/user-return-notifier.h>
+#include <linux/uprobes.h>
+
+#include <asm/desc.h>
+#include <asm/traps.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/syscalls.h>
+
+#ifdef CONFIG_CONTEXT_TRACKING
+/* Called on entry from user mode with IRQs off (CONFIG_CONTEXT_TRACKING only). */
+__visible void enter_from_user_mode(void)
+{
+	CT_WARN_ON(ct_state() != CONTEXT_USER);	/* sanity check on the tracked state */
+	user_exit();
+}
+#endif
+
+static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
+{	/* Hand regs->orig_ax and four argument registers to audit_syscall_entry(). */
+#ifdef CONFIG_X86_64
+	if (arch == AUDIT_ARCH_X86_64) {	/* arguments read from di, si, dx, r10 */
+		audit_syscall_entry(regs->orig_ax, regs->di,
+				    regs->si, regs->dx, regs->r10);
+	} else
+#endif
+	{	/* any other arch value: arguments read from bx, cx, dx, si */
+		audit_syscall_entry(regs->orig_ax, regs->bx,
+				    regs->cx, regs->dx, regs->si);
+	}
+}
+
+/*
+ * Phase 1 of syscall-entry work.  @regs must be task_pt_regs(current);
+ * @arch is compared against AUDIT_ARCH_X86_64 to pick the argument
+ * registers.  If we resume the syscall, regs->orig_ax must be appropriate.
+ *
+ * NB: We don't have full pt_regs here, but regs->orig_ax and regs->ax
+ * are fully functional.
+ *
+ * For phase 2's benefit, our return value is:
+ * 0:			resume the syscall
+ * 1:			go to phase 2; no seccomp phase 2 needed
+ * anything else:	go to phase 2; pass return value to seccomp
+ */
+unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
+{
+	unsigned long ret = 0;
+	u32 work;
+
+	BUG_ON(regs != task_pt_regs(current));
+
+	work = ACCESS_ONCE(current_thread_info()->flags) &
+		_TIF_WORK_SYSCALL_ENTRY;	/* single snapshot of the entry-work bits */
+
+#ifdef CONFIG_CONTEXT_TRACKING
+	/*
+	 * If TIF_NOHZ is set, we are required to call user_exit() before
+	 * doing anything that could touch RCU.
+	 */
+	if (work & _TIF_NOHZ) {
+		enter_from_user_mode();
+		work &= ~_TIF_NOHZ;	/* handled; drop it from the pending set */
+	}
+#endif
+
+#ifdef CONFIG_SECCOMP
+	/*
+	 * Do seccomp first -- it should minimize exposure of other
+	 * code, and keeping seccomp fast is probably more valuable
+	 * than the rest of this.
+	 */
+	if (work & _TIF_SECCOMP) {
+		struct seccomp_data sd;
+
+		sd.arch = arch;
+		sd.nr = regs->orig_ax;
+		sd.instruction_pointer = regs->ip;
+#ifdef CONFIG_X86_64
+		if (arch == AUDIT_ARCH_X86_64) {
+			sd.args[0] = regs->di;
+			sd.args[1] = regs->si;
+			sd.args[2] = regs->dx;
+			sd.args[3] = regs->r10;
+			sd.args[4] = regs->r8;
+			sd.args[5] = regs->r9;
+		} else
+#endif
+		{
+			sd.args[0] = regs->bx;
+			sd.args[1] = regs->cx;
+			sd.args[2] = regs->dx;
+			sd.args[3] = regs->si;
+			sd.args[4] = regs->di;
+			sd.args[5] = regs->bp;
+		}
+
+		BUILD_BUG_ON(SECCOMP_PHASE1_OK != 0);
+		BUILD_BUG_ON(SECCOMP_PHASE1_SKIP != 1);
+
+		ret = seccomp_phase1(&sd);
+		if (ret == SECCOMP_PHASE1_SKIP) {
+			regs->orig_ax = -1;
+			ret = 0;
+		} else if (ret != SECCOMP_PHASE1_OK) {
+			return ret;  /* Go directly to phase 2 */
+		}
+
+		work &= ~_TIF_SECCOMP;	/* seccomp needs no phase 2 on this path */
+	}
+#endif
+
+	/* Do our best to finish without phase 2. */
+	if (work == 0)
+		return ret;  /* seccomp and/or nohz only (ret == 0 here) */
+
+#ifdef CONFIG_AUDITSYSCALL
+	if (work == _TIF_SYSCALL_AUDIT) {
+		/*
+		 * If there is no more work to be done except auditing,
+		 * then audit in phase 1.  Phase 2 always audits, so, if
+		 * we audit here, then we can't go on to phase 2.
+		 */
+		do_audit_syscall_entry(regs, arch);
+		return 0;
+	}
+#endif
+
+	return 1;  /* Something is enabled that we can't handle in phase 1 */
+}
+
+/* Phase 2.  Returns the syscall nr to run (which should match regs->orig_ax) or -1. */
+long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch,
+				unsigned long phase1_result)
+{
+	long ret = 0;
+	u32 work = ACCESS_ONCE(current_thread_info()->flags) &
+		_TIF_WORK_SYSCALL_ENTRY;
+
+	BUG_ON(regs != task_pt_regs(current));
+
+	/*
+	 * If we stepped into a sysenter/syscall insn, it trapped in
+	 * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
+	 * If user-mode had set TF itself, then it's still clear from
+	 * do_debug() and we need to set it again to restore the user
+	 * state.  If we entered on the slow path, TF was already set.
+	 */
+	if (work & _TIF_SINGLESTEP)
+		regs->flags |= X86_EFLAGS_TF;
+
+#ifdef CONFIG_SECCOMP
+	/*
+	 * Call seccomp_phase2 before running the other hooks so that
+	 * they can see any changes made by a seccomp tracer.
+	 */
+	if (phase1_result > 1 && seccomp_phase2(phase1_result)) {
+		/* seccomp failures shouldn't expose any additional code. */
+		return -1;
+	}
+#endif
+
+	if (unlikely(work & _TIF_SYSCALL_EMU))
+		ret = -1L;
+
+	if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) &&
+	    tracehook_report_syscall_entry(regs))
+		ret = -1L;
+
+	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
+		trace_sys_enter(regs, regs->orig_ax);
+
+	do_audit_syscall_entry(regs, arch);	/* phase 2 audits unconditionally */
+
+	return ret ?: regs->orig_ax;	/* -1 if set above, otherwise regs->orig_ax */
+}
+
+long syscall_trace_enter(struct pt_regs *regs)
+{
+	u32 arch = is_ia32_task() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
+	unsigned long p1 = syscall_trace_enter_phase1(regs, arch);
+
+	/* A nonzero phase 1 result means phase 2 must run and needs that value. */
+	if (p1 != 0)
+		return syscall_trace_enter_phase2(regs, arch, p1);
+	return regs->orig_ax;
+}
+
+static struct thread_info *pt_regs_to_thread_info(struct pt_regs *regs)
+{	/* Result is THREAD_SIZE below (end of *regs + TOP_OF_KERNEL_STACK_PADDING). */
+	unsigned long stack_top = (unsigned long)(regs + 1);
+	stack_top += TOP_OF_KERNEL_STACK_PADDING;
+	return (struct thread_info *)(stack_top - THREAD_SIZE);
+}
+
+/* Called with IRQs disabled.  Runs pending exit work in a loop, then calls user_enter(). */
+__visible void prepare_exit_to_usermode(struct pt_regs *regs)
+{
+	if (WARN_ON(!irqs_disabled()))	/* contract violated: warn, then fix it up */
+		local_irq_disable();
+
+	/*
+	 * In order to return to user mode, we need to have IRQs off with
+	 * none of _TIF_SIGPENDING, _TIF_NOTIFY_RESUME, _TIF_USER_RETURN_NOTIFY,
+	 * _TIF_UPROBE, or _TIF_NEED_RESCHED set.  Several of these flags
+	 * can be set at any time on preemptable kernels if we have IRQs on,
+	 * so we need to loop.  Disabling preemption wouldn't help: doing the
+	 * work to clear some of the flags can sleep.
+	 */
+	while (true) {
+		u32 cached_flags =
+			READ_ONCE(pt_regs_to_thread_info(regs)->flags);
+
+		if (!(cached_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME |
+				      _TIF_UPROBE | _TIF_NEED_RESCHED |
+				      _TIF_USER_RETURN_NOTIFY)))
+			break;
+
+		/* We have work to do; everything below runs with IRQs on. */
+		local_irq_enable();
+
+		if (cached_flags & _TIF_NEED_RESCHED)
+			schedule();
+
+		if (cached_flags & _TIF_UPROBE)
+			uprobe_notify_resume(regs);
+
+		/* deal with pending signal delivery */
+		if (cached_flags & _TIF_SIGPENDING)
+			do_signal(regs);
+
+		if (cached_flags & _TIF_NOTIFY_RESUME) {
+			clear_thread_flag(TIF_NOTIFY_RESUME);
+			tracehook_notify_resume(regs);
+		}
+
+		if (cached_flags & _TIF_USER_RETURN_NOTIFY)
+			fire_user_return_notifiers();
+
+		/* Disable IRQs and retry; the flags are re-read at the top of the loop. */
+		local_irq_disable();
+	}
+
+	user_enter();	/* reached only via the break above, i.e. with IRQs off */
+}
+
+/*
+ * Called with IRQs on and fully valid regs.  Returns with IRQs off in a
+ * state such that we can immediately switch to user mode.
+ */
+__visible void syscall_return_slowpath(struct pt_regs *regs)
+{
+	struct thread_info *ti = pt_regs_to_thread_info(regs);
+	u32 cached_flags = READ_ONCE(ti->flags);	/* one snapshot for the one-time work */
+	bool step;	/* second argument to tracehook_report_syscall_exit() */
+
+	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
+
+	if (WARN(irqs_disabled(), "syscall %ld left IRQs disabled",
+		 regs->orig_ax))
+		local_irq_enable();
+
+	/*
+	 * First do one-time work.  If these work items are enabled, we
+	 * want to run them exactly once per syscall exit with IRQs on.
+	 */
+	if (cached_flags & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT |
+			    _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT)) {
+		audit_syscall_exit(regs);
+
+		if (cached_flags & _TIF_SYSCALL_TRACEPOINT)
+			trace_sys_exit(regs, regs->ax);
+
+		/*
+		 * If TIF_SYSCALL_EMU is set, we only get here because of
+		 * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
+		 * We already reported this syscall instruction in
+		 * syscall_trace_enter().
+		 */
+		step = unlikely(
+			(cached_flags & (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU))
+			== _TIF_SINGLESTEP);
+		if (step || cached_flags & _TIF_SYSCALL_TRACE)
+			tracehook_report_syscall_exit(regs, step);
+	}
+
+#ifdef CONFIG_COMPAT
+	/*
+	 * Compat syscalls set TS_COMPAT.  Make sure we clear it before
+	 * returning to user mode.
+	 */
+	ti->status &= ~TS_COMPAT;
+#endif
+
+	local_irq_disable();	/* prepare_exit_to_usermode() must be called with IRQs off */
+	prepare_exit_to_usermode(regs);
+}
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 21dc60a60b5f..b2909bf8cf70 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -45,16 +45,6 @@
 #include <asm/asm.h>
 #include <asm/smap.h>
 
-/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
-#include <linux/elf-em.h>
-#define AUDIT_ARCH_I386		(EM_386|__AUDIT_ARCH_LE)
-#define __AUDIT_ARCH_LE		0x40000000
-
-#ifndef CONFIG_AUDITSYSCALL
-# define sysenter_audit		syscall_trace_entry
-# define sysexit_audit		syscall_exit_work
-#endif
-
 	.section .entry.text, "ax"
 
 /*
@@ -266,14 +256,10 @@ ret_from_intr:
 
 ENTRY(resume_userspace)
 	LOCKDEP_SYS_EXIT
-	DISABLE_INTERRUPTS(CLBR_ANY)		# make sure we don't miss an interrupt
-						# setting need_resched or sigpending
-						# between sampling and the iret
+	DISABLE_INTERRUPTS(CLBR_ANY)
 	TRACE_IRQS_OFF
-	movl	TI_flags(%ebp), %ecx
-	andl	$_TIF_WORK_MASK, %ecx		# is there any work to be done on
-						# int/exception return?
-	jne	work_pending
+	movl	%esp, %eax
+	call	prepare_exit_to_usermode
 	jmp	restore_all
 END(ret_from_exception)
 
@@ -339,7 +325,7 @@ sysenter_past_esp:
 	GET_THREAD_INFO(%ebp)
 
 	testl	$_TIF_WORK_SYSCALL_ENTRY, TI_flags(%ebp)
-	jnz	sysenter_audit
+	jnz	syscall_trace_entry
 sysenter_do_call:
 	cmpl	$(NR_syscalls), %eax
 	jae	sysenter_badsys
@@ -351,7 +337,7 @@ sysenter_after_call:
 	TRACE_IRQS_OFF
 	movl	TI_flags(%ebp), %ecx
 	testl	$_TIF_ALLWORK_MASK, %ecx
-	jnz	sysexit_audit
+	jnz	syscall_exit_work_irqs_off
 sysenter_exit:
 /* if something modifies registers it must also disable sysexit */
 	movl	PT_EIP(%esp), %edx
@@ -362,40 +348,6 @@ sysenter_exit:
 	PTGS_TO_GS
 	ENABLE_INTERRUPTS_SYSEXIT
 
-#ifdef CONFIG_AUDITSYSCALL
-sysenter_audit:
-	testl	$(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), TI_flags(%ebp)
-	jnz	syscall_trace_entry
-	/* movl	PT_EAX(%esp), %eax already set, syscall number: 1st arg to audit */
-	movl	PT_EBX(%esp), %edx		/* ebx/a0: 2nd arg to audit */
-	/* movl	PT_ECX(%esp), %ecx already set, a1: 3nd arg to audit */
-	pushl	PT_ESI(%esp)			/* a3: 5th arg */
-	pushl	PT_EDX+4(%esp)			/* a2: 4th arg */
-	call	__audit_syscall_entry
-	popl	%ecx				/* get that remapped edx off the stack */
-	popl	%ecx				/* get that remapped esi off the stack */
-	movl	PT_EAX(%esp), %eax		/* reload syscall number */
-	jmp	sysenter_do_call
-
-sysexit_audit:
-	testl	$(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
-	jnz	syscall_exit_work
-	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS(CLBR_ANY)
-	movl	%eax, %edx			/* second arg, syscall return value */
-	cmpl	$-MAX_ERRNO, %eax		/* is it an error ? */
-	setbe %al				/* 1 if so, 0 if not */
-	movzbl %al, %eax			/* zero-extend that */
-	call	__audit_syscall_exit
-	DISABLE_INTERRUPTS(CLBR_ANY)
-	TRACE_IRQS_OFF
-	movl	TI_flags(%ebp), %ecx
-	testl	$(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
-	jnz	syscall_exit_work
-	movl	PT_EAX(%esp), %eax		/* reload syscall return value */
-	jmp	sysenter_exit
-#endif
-
 .pushsection .fixup, "ax"
 2:	movl	$0, PT_FS(%esp)
 	jmp	1b
@@ -421,13 +373,7 @@ syscall_after_call:
 	movl	%eax, PT_EAX(%esp)		# store the return value
 syscall_exit:
 	LOCKDEP_SYS_EXIT
-	DISABLE_INTERRUPTS(CLBR_ANY)		# make sure we don't miss an interrupt
-						# setting need_resched or sigpending
-						# between sampling and the iret
-	TRACE_IRQS_OFF
-	movl	TI_flags(%ebp), %ecx
-	testl	$_TIF_ALLWORK_MASK, %ecx	# current->work
-	jnz	syscall_exit_work
+	jmp	syscall_exit_work
 
 restore_all:
 	TRACE_IRQS_IRET
@@ -504,57 +450,6 @@ ldt_ss:
 #endif
 ENDPROC(entry_INT80_32)
 
-	# perform work that needs to be done immediately before resumption
-	ALIGN
-work_pending:
-	testb	$_TIF_NEED_RESCHED, %cl
-	jz	work_notifysig
-work_resched:
-	call	schedule
-	LOCKDEP_SYS_EXIT
-	DISABLE_INTERRUPTS(CLBR_ANY)		# make sure we don't miss an interrupt
-						# setting need_resched or sigpending
-						# between sampling and the iret
-	TRACE_IRQS_OFF
-	movl	TI_flags(%ebp), %ecx
-	andl	$_TIF_WORK_MASK, %ecx		# is there any work to be done other
-						# than syscall tracing?
-	jz	restore_all
-	testb	$_TIF_NEED_RESCHED, %cl
-	jnz	work_resched
-
-work_notifysig:					# deal with pending signals and
-						# notify-resume requests
-#ifdef CONFIG_VM86
-	testl	$X86_EFLAGS_VM, PT_EFLAGS(%esp)
-	movl	%esp, %eax
-	jnz	work_notifysig_v86		# returning to kernel-space or
-						# vm86-space
-1:
-#else
-	movl	%esp, %eax
-#endif
-	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS(CLBR_NONE)
-	movb	PT_CS(%esp), %bl
-	andb	$SEGMENT_RPL_MASK, %bl
-	cmpb	$USER_RPL, %bl
-	jb	resume_kernel
-	xorl	%edx, %edx
-	call	do_notify_resume
-	jmp	resume_userspace
-
-#ifdef CONFIG_VM86
-	ALIGN
-work_notifysig_v86:
-	pushl	%ecx				# save ti_flags for do_notify_resume
-	call	save_v86_state			# %eax contains pt_regs pointer
-	popl	%ecx
-	movl	%eax, %esp
-	jmp	1b
-#endif
-END(work_pending)
-
 	# perform syscall exit tracing
 	ALIGN
 syscall_trace_entry:
@@ -569,15 +464,14 @@ END(syscall_trace_entry)
 
 	# perform syscall exit tracing
 	ALIGN
-syscall_exit_work:
-	testl	$_TIF_WORK_SYSCALL_EXIT, %ecx
-	jz	work_pending
+syscall_exit_work_irqs_off:
 	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS(CLBR_ANY)		# could let syscall_trace_leave() call
-						# schedule() instead
+	ENABLE_INTERRUPTS(CLBR_ANY)
+
+syscall_exit_work:
 	movl	%esp, %eax
-	call	syscall_trace_leave
-	jmp	resume_userspace
+	call	syscall_return_slowpath
+	jmp	restore_all
 END(syscall_exit_work)
 
 syscall_fault:
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 8cb3e438f21e..d3033183ed70 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -33,7 +33,6 @@
 #include <asm/paravirt.h>
 #include <asm/percpu.h>
 #include <asm/asm.h>
-#include <asm/context_tracking.h>
 #include <asm/smap.h>
 #include <asm/pgtable_types.h>
 #include <linux/err.h>
@@ -229,6 +228,11 @@ entry_SYSCALL_64_fastpath:
 	 */
 	USERGS_SYSRET64
 
+GLOBAL(int_ret_from_sys_call_irqs_off)
+	TRACE_IRQS_ON
+	ENABLE_INTERRUPTS(CLBR_NONE)
+	jmp int_ret_from_sys_call
+
 	/* Do syscall entry tracing */
 tracesys:
 	movq	%rsp, %rdi
@@ -272,69 +276,11 @@ tracesys_phase2:
  * Has correct iret frame.
  */
 GLOBAL(int_ret_from_sys_call)
-	DISABLE_INTERRUPTS(CLBR_NONE)
-int_ret_from_sys_call_irqs_off: /* jumps come here from the irqs-off SYSRET path */
-	TRACE_IRQS_OFF
-	movl	$_TIF_ALLWORK_MASK, %edi
-	/* edi:	mask to check */
-GLOBAL(int_with_check)
-	LOCKDEP_SYS_EXIT_IRQ
-	GET_THREAD_INFO(%rcx)
-	movl	TI_flags(%rcx), %edx
-	andl	%edi, %edx
-	jnz	int_careful
-	andl	$~TS_COMPAT, TI_status(%rcx)
-	jmp	syscall_return
-
-	/*
-	 * Either reschedule or signal or syscall exit tracking needed.
-	 * First do a reschedule test.
-	 * edx:	work, edi: workmask
-	 */
-int_careful:
-	bt	$TIF_NEED_RESCHED, %edx
-	jnc	int_very_careful
-	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS(CLBR_NONE)
-	pushq	%rdi
-	SCHEDULE_USER
-	popq	%rdi
-	DISABLE_INTERRUPTS(CLBR_NONE)
-	TRACE_IRQS_OFF
-	jmp	int_with_check
-
-	/* handle signals and tracing -- both require a full pt_regs */
-int_very_careful:
-	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS(CLBR_NONE)
 	SAVE_EXTRA_REGS
-	/* Check for syscall exit trace */
-	testl	$_TIF_WORK_SYSCALL_EXIT, %edx
-	jz	int_signal
-	pushq	%rdi
-	leaq	8(%rsp), %rdi			/* &ptregs -> arg1 */
-	call	syscall_trace_leave
-	popq	%rdi
-	andl	$~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU), %edi
-	jmp	int_restore_rest
-
-int_signal:
-	testl	$_TIF_DO_NOTIFY_MASK, %edx
-	jz	1f
-	movq	%rsp, %rdi			/* &ptregs -> arg1 */
-	xorl	%esi, %esi			/* oldset -> arg2 */
-	call	do_notify_resume
-1:	movl	$_TIF_WORK_MASK, %edi
-int_restore_rest:
+	movq	%rsp, %rdi
+	call	syscall_return_slowpath	/* returns with IRQs disabled */
 	RESTORE_EXTRA_REGS
-	DISABLE_INTERRUPTS(CLBR_NONE)
-	TRACE_IRQS_OFF
-	jmp	int_with_check
-
-syscall_return:
-	/* The IRETQ could re-enable interrupts: */
-	DISABLE_INTERRUPTS(CLBR_ANY)
-	TRACE_IRQS_IRETQ
+	TRACE_IRQS_IRETQ		/* we're about to change IF */
 
 	/*
 	 * Try to use SYSRET instead of IRET if we're returning to
@@ -555,23 +501,22 @@ END(irq_entries_start)
 /* 0(%rsp): ~(interrupt number) */
 	.macro interrupt func
 	cld
-	/*
-	 * Since nothing in interrupt handling code touches r12...r15 members
-	 * of "struct pt_regs", and since interrupts can nest, we can save
-	 * four stack slots and simultaneously provide
-	 * an unwind-friendly stack layout by saving "truncated" pt_regs
-	 * exactly up to rbp slot, without these members.
-	 */
-	ALLOC_PT_GPREGS_ON_STACK -RBP
-	SAVE_C_REGS -RBP
-	/* this goes to 0(%rsp) for unwinder, not for saving the value: */
-	SAVE_EXTRA_REGS_RBP -RBP
-
-	leaq	-RBP(%rsp), %rdi		/* arg1 for \func (pointer to pt_regs) */
+	ALLOC_PT_GPREGS_ON_STACK
+	SAVE_C_REGS
+	SAVE_EXTRA_REGS
 
-	testb	$3, CS-RBP(%rsp)
+	testb	$3, CS(%rsp)
 	jz	1f
+
+	/*
+	 * IRQ from user mode.  Switch to kernel gsbase and inform context
+	 * tracking that we're in kernel mode.
+	 */
 	SWAPGS
+#ifdef CONFIG_CONTEXT_TRACKING
+	call enter_from_user_mode
+#endif
+
 1:
 	/*
 	 * Save previous stack pointer, optionally switch to interrupt stack.
@@ -580,14 +525,14 @@ END(irq_entries_start)
 	 * a little cheaper to use a separate counter in the PDA (short of
 	 * moving irq_enter into assembly, which would be too much work)
 	 */
-	movq	%rsp, %rsi
+	movq	%rsp, %rdi
 	incl	PER_CPU_VAR(irq_count)
 	cmovzq	PER_CPU_VAR(irq_stack_ptr), %rsp
-	pushq	%rsi
+	pushq	%rdi
 	/* We entered an interrupt context - irqs are off: */
 	TRACE_IRQS_OFF
 
-	call	\func
+	call	\func	/* rdi points to pt_regs */
 	.endm
 
 	/*
@@ -606,34 +551,19 @@ ret_from_intr:
 	decl	PER_CPU_VAR(irq_count)
 
 	/* Restore saved previous stack */
-	popq	%rsi
-	/* return code expects complete pt_regs - adjust rsp accordingly: */
-	leaq	-RBP(%rsi), %rsp
+	popq	%rsp
 
 	testb	$3, CS(%rsp)
 	jz	retint_kernel
-	/* Interrupt came from user space */
-retint_user:
-	GET_THREAD_INFO(%rcx)
 
-	/* %rcx: thread info. Interrupts are off. */
-retint_with_reschedule:
-	movl	$_TIF_WORK_MASK, %edi
-retint_check:
+	/* Interrupt came from user space */
 	LOCKDEP_SYS_EXIT_IRQ
-	movl	TI_flags(%rcx), %edx
-	andl	%edi, %edx
-	jnz	retint_careful
-
-retint_swapgs:					/* return to user-space */
-	/*
-	 * The iretq could re-enable interrupts:
-	 */
-	DISABLE_INTERRUPTS(CLBR_ANY)
+GLOBAL(retint_user)			/* return-to-user path; also the target of ia32_badarg in entry_64_compat.S */
+	mov	%rsp,%rdi			/* arg1: pt_regs, which sits at the top of the stack */
+	call	prepare_exit_to_usermode
 	TRACE_IRQS_IRETQ
-
 	SWAPGS
-	jmp	restore_c_regs_and_iret
+	jmp	restore_regs_and_iret		/* restores the extra regs as well as the C regs */
 
 /* Returning to kernel space */
 retint_kernel:
@@ -657,6 +587,8 @@ retint_kernel:
  * At this label, code paths which return to kernel and to user,
  * which come from interrupts/exception and from syscalls, merge.
  */
+restore_regs_and_iret:			/* restore the extra regs, then fall through to restore_c_regs_and_iret */
+	RESTORE_EXTRA_REGS
 restore_c_regs_and_iret:
 	RESTORE_C_REGS
 	REMOVE_PT_GPREGS_FROM_STACK 8
@@ -707,37 +639,6 @@ native_irq_return_ldt:
 	popq	%rax
 	jmp	native_irq_return_iret
 #endif
-
-	/* edi: workmask, edx: work */
-retint_careful:
-	bt	$TIF_NEED_RESCHED, %edx
-	jnc	retint_signal
-	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS(CLBR_NONE)
-	pushq	%rdi
-	SCHEDULE_USER
-	popq	%rdi
-	GET_THREAD_INFO(%rcx)
-	DISABLE_INTERRUPTS(CLBR_NONE)
-	TRACE_IRQS_OFF
-	jmp	retint_check
-
-retint_signal:
-	testl	$_TIF_DO_NOTIFY_MASK, %edx
-	jz	retint_swapgs
-	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS(CLBR_NONE)
-	SAVE_EXTRA_REGS
-	movq	$-1, ORIG_RAX(%rsp)
-	xorl	%esi, %esi			/* oldset */
-	movq	%rsp, %rdi			/* &pt_regs */
-	call	do_notify_resume
-	RESTORE_EXTRA_REGS
-	DISABLE_INTERRUPTS(CLBR_NONE)
-	TRACE_IRQS_OFF
-	GET_THREAD_INFO(%rcx)
-	jmp	retint_with_reschedule
-
 END(common_interrupt)
 
 /*
@@ -1143,12 +1044,22 @@ ENTRY(error_entry)
 	SAVE_EXTRA_REGS 8
 	xorl	%ebx, %ebx
 	testb	$3, CS+8(%rsp)
-	jz	error_kernelspace
+	jz	.Lerror_kernelspace
 
-	/* We entered from user mode */
+.Lerror_entry_from_usermode_swapgs:
+	/*
+	 * We entered from user mode or we're pretending to have entered
+	 * from user mode due to an IRET fault.
+	 */
 	SWAPGS
 
-error_entry_done:
+.Lerror_entry_from_usermode_after_swapgs:
+#ifdef CONFIG_CONTEXT_TRACKING
+	call enter_from_user_mode
+#endif
+
+.Lerror_entry_done:
+
 	TRACE_IRQS_OFF
 	ret
 
@@ -1158,31 +1069,30 @@ error_entry_done:
 	 * truncated RIP for IRET exceptions returning to compat mode. Check
 	 * for these here too.
 	 */
-error_kernelspace:
+.Lerror_kernelspace:
 	incl	%ebx
 	leaq	native_irq_return_iret(%rip), %rcx
 	cmpq	%rcx, RIP+8(%rsp)
-	je	error_bad_iret
+	je	.Lerror_bad_iret
 	movl	%ecx, %eax			/* zero extend */
 	cmpq	%rax, RIP+8(%rsp)
-	je	bstep_iret
+	je	.Lbstep_iret
 	cmpq	$gs_change, RIP+8(%rsp)
-	jne	error_entry_done
+	jne	.Lerror_entry_done
 
 	/*
 	 * hack: gs_change can fail with user gsbase.  If this happens, fix up
 	 * gsbase and proceed.  We'll fix up the exception and land in
 	 * gs_change's error handler with kernel gsbase.
 	 */
-	SWAPGS
-	jmp	error_entry_done
+	jmp	.Lerror_entry_from_usermode_swapgs
 
-bstep_iret:
+.Lbstep_iret:
 	/* Fix truncated RIP */
 	movq	%rcx, RIP+8(%rsp)
 	/* fall through */
 
-error_bad_iret:
+.Lerror_bad_iret:
 	/*
 	 * We came from an IRET to user mode, so we have user gsbase.
 	 * Switch to kernel gsbase:
@@ -1198,7 +1108,7 @@ error_bad_iret:
 	call	fixup_bad_iret
 	mov	%rax, %rsp
 	decl	%ebx
-	jmp	error_entry_done
+	jmp	.Lerror_entry_from_usermode_after_swapgs
 END(error_entry)
 
 
@@ -1209,7 +1119,6 @@ END(error_entry)
  */
 ENTRY(error_exit)
 	movl	%ebx, %eax
-	RESTORE_EXTRA_REGS
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
 	testl	%eax, %eax
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index a7e257d9cb90..a9360d40fb7f 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -22,8 +22,8 @@
 #define __AUDIT_ARCH_LE		0x40000000
 
 #ifndef CONFIG_AUDITSYSCALL
-# define sysexit_audit		ia32_ret_from_sys_call
-# define sysretl_audit		ia32_ret_from_sys_call
+# define sysexit_audit		ia32_ret_from_sys_call_irqs_off
+# define sysretl_audit		ia32_ret_from_sys_call_irqs_off
 #endif
 
 	.section .entry.text, "ax"
@@ -141,7 +141,8 @@ sysexit_from_sys_call:
 	andl	$~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
 	movl	RIP(%rsp), %ecx		/* User %eip */
 	movq    RAX(%rsp), %rax
-	RESTORE_RSI_RDI
+	movl	RSI(%rsp), %esi
+	movl	RDI(%rsp), %edi
 	xorl	%edx, %edx		/* Do not leak kernel information */
 	xorq	%r8, %r8
 	xorq	%r9, %r9
@@ -209,10 +210,10 @@ sysexit_from_sys_call:
 	.endm
 
 	.macro auditsys_exit exit
-	testl	$(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
-	jnz	ia32_ret_from_sys_call
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
+	testl	$(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+	jnz	ia32_ret_from_sys_call
 	movl	%eax, %esi		/* second arg, syscall return value */
 	cmpl	$-MAX_ERRNO, %eax	/* is it an error ? */
 	jbe	1f
@@ -230,7 +231,7 @@ sysexit_from_sys_call:
 	movq	%rax, R10(%rsp)
 	movq	%rax, R9(%rsp)
 	movq	%rax, R8(%rsp)
-	jmp	int_with_check
+	jmp	int_ret_from_sys_call_irqs_off
 	.endm
 
 sysenter_auditsys:
@@ -365,7 +366,9 @@ cstar_dispatch:
 
 sysretl_from_sys_call:
 	andl	$~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
-	RESTORE_RSI_RDI_RDX
+	movl	RDX(%rsp), %edx
+	movl	RSI(%rsp), %esi
+	movl	RDI(%rsp), %edi
 	movl	RIP(%rsp), %ecx
 	movl	EFLAGS(%rsp), %r11d
 	movq    RAX(%rsp), %rax
@@ -430,8 +433,48 @@ cstar_tracesys:
 END(entry_SYSCALL_compat)
 
 ia32_badarg:
-	ASM_CLAC
-	movq	$-EFAULT, RAX(%rsp)
+	/*
+	 * So far, we've entered kernel mode, set AC, turned on IRQs, and
+	 * saved C regs except r8-r11.  We haven't done any of the other
+	 * standard entry work, though.  We want to bail, but we shouldn't
+	 * treat this as a syscall entry since we don't even know what the
+	 * args are.  Instead, treat this as a non-syscall entry, finish
+	 * the entry work, and immediately exit after setting AX = -EFAULT.
+	 *
+	 * We're really just being polite here.  Killing the task outright
+	 * would be a reasonable action, too.  Given that the only valid
+	 * way to have gotten here is through the vDSO, and we already know
+	 * that the stack pointer is bad, the task isn't going to survive
+	 * for long no matter what we do.
+	 */
+
+	ASM_CLAC			/* undo STAC */
+	movq	$-EFAULT, RAX(%rsp)	/* return -EFAULT if possible */
+
+	/* Fill in the rest of pt_regs */
+	xorl	%eax, %eax		/* zero source; the -EFAULT result already sits in the RAX slot */
+	movq	%rax, R11(%rsp)
+	movq	%rax, R10(%rsp)
+	movq	%rax, R9(%rsp)
+	movq	%rax, R8(%rsp)
+	SAVE_EXTRA_REGS			/* retint_user exits via restore_regs_and_iret, which restores these */
+
+	/* Turn IRQs back off. */
+	DISABLE_INTERRUPTS(CLBR_NONE)
+	TRACE_IRQS_OFF
+
+	/* Now finish entering normal kernel mode. */
+#ifdef CONFIG_CONTEXT_TRACKING
+	call enter_from_user_mode
+#endif
+
+	/* And exit again. */
+	jmp retint_user			/* the interrupt return-to-user path in entry_64.S */
+
+ia32_ret_from_sys_call_irqs_off:	/* IRQs-off entry: turn IRQs on, then fall through to ia32_ret_from_sys_call */
+	TRACE_IRQS_ON
+	ENABLE_INTERRUPTS(CLBR_NONE)
+
 ia32_ret_from_sys_call:
 	xorl	%eax, %eax		/* Do not leak kernel information */
 	movq	%rax, R11(%rsp)
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index ef8187f9d28d..7663c455b9f6 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -365,3 +365,20 @@
 356	i386	memfd_create		sys_memfd_create
 357	i386	bpf			sys_bpf
 358	i386	execveat		sys_execveat			stub32_execveat
+359	i386	socket			sys_socket
+360	i386	socketpair		sys_socketpair
+361	i386	bind			sys_bind
+362	i386	connect			sys_connect
+363	i386	listen			sys_listen
+364	i386	accept4			sys_accept4
+365	i386	getsockopt		sys_getsockopt			compat_sys_getsockopt
+366	i386	setsockopt		sys_setsockopt			compat_sys_setsockopt
+367	i386	getsockname		sys_getsockname
+368	i386	getpeername		sys_getpeername
+369	i386	sendto			sys_sendto
+370	i386	sendmsg			sys_sendmsg			compat_sys_sendmsg
+371	i386	recvfrom		sys_recvfrom			compat_sys_recvfrom
+372	i386	recvmsg			sys_recvmsg			compat_sys_recvmsg
+373	i386	shutdown		sys_shutdown
+374	i386	userfaultfd		sys_userfaultfd
+375	i386	membarrier		sys_membarrier
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 9ef32d5f1b19..278842fdf1f6 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -329,6 +329,8 @@
 320	common	kexec_file_load		sys_kexec_file_load
 321	common	bpf			sys_bpf
 322	64	execveat		stub_execveat
+323	common	userfaultfd		sys_userfaultfd
+324	common	membarrier		sys_membarrier
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index e97032069f88..a3d0767a6b29 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -8,7 +8,7 @@ KASAN_SANITIZE := n
 VDSO64-$(CONFIG_X86_64)		:= y
 VDSOX32-$(CONFIG_X86_X32_ABI)	:= y
 VDSO32-$(CONFIG_X86_32)		:= y
-VDSO32-$(CONFIG_COMPAT)		:= y
+VDSO32-$(CONFIG_IA32_EMULATION)	:= y
 
 # files to link into the vdso
 vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o
@@ -20,7 +20,7 @@ obj-y				+= vma.o
 vdso_img-$(VDSO64-y)		+= 64
 vdso_img-$(VDSOX32-y)		+= x32
 vdso_img-$(VDSO32-y)		+= 32-int80
-vdso_img-$(CONFIG_COMPAT)	+= 32-syscall
+vdso_img-$(CONFIG_IA32_EMULATION)	+= 32-syscall
 vdso_img-$(VDSO32-y)		+= 32-sysenter
 
 obj-$(VDSO32-y)			+= vdso32-setup.o
@@ -126,7 +126,7 @@ $(obj)/vdsox32.so.dbg: $(src)/vdsox32.lds $(vobjx32s) FORCE
 # Build multiple 32-bit vDSO images to choose from at boot time.
 #
 vdso32.so-$(VDSO32-y)		+= int80
-vdso32.so-$(CONFIG_COMPAT)	+= syscall
+vdso32.so-$(CONFIG_IA32_EMULATION)	+= syscall
 vdso32.so-$(VDSO32-y)		+= sysenter
 
 vdso32-images			= $(vdso32.so-y:%=vdso32-%.so)
@@ -175,7 +175,7 @@ quiet_cmd_vdso = VDSO    $@
 		       -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) && \
 		 sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@'
 
-VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) \
+VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=both) \
 	$(call cc-ldoption, -Wl$(comma)--build-id) -Wl,-Bsymbolic $(LTO_CFLAGS)
 GCOV_PROFILE := n
 
diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index 9793322751e0..ca94fa649251 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -175,20 +175,8 @@ static notrace cycle_t vread_pvclock(int *mode)
 
 notrace static cycle_t vread_tsc(void)
 {
-	cycle_t ret;
-	u64 last;
-
-	/*
-	 * Empirically, a fence (of type that depends on the CPU)
-	 * before rdtsc is enough to ensure that rdtsc is ordered
-	 * with respect to loads.  The various CPU manuals are unclear
-	 * as to whether rdtsc can be reordered with later loads,
-	 * but no one has ever seen it happen.
-	 */
-	rdtsc_barrier();
-	ret = (cycle_t)__native_read_tsc();
-
-	last = gtod->cycle_last;
+	cycle_t ret = (cycle_t)rdtsc_ordered();
+	u64 last = gtod->cycle_last;
 
 	if (likely(ret >= last))
 		return ret;
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 1c9f750c3859..434543145d78 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -177,7 +177,7 @@ up_fail:
 	return ret;
 }
 
-#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT)
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
 static int load_vdso32(void)
 {
 	int ret;
@@ -219,8 +219,11 @@ int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
 		return map_vdso(&vdso_image_x32, true);
 	}
 #endif
-
+#ifdef CONFIG_IA32_EMULATION
 	return load_vdso32();
+#else
+	return 0;
+#endif
 }
 #endif
 #else
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index 2dcc6ff6fdcc..b160c0c6baed 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -277,7 +277,7 @@ static const char *gate_vma_name(struct vm_area_struct *vma)
 {
 	return "[vsyscall]";
 }
-static struct vm_operations_struct gate_vma_ops = {
+static const struct vm_operations_struct gate_vma_ops = {
 	.name = gate_vma_name,
 };
 static struct vm_area_struct gate_vma = {
@@ -290,7 +290,7 @@ static struct vm_area_struct gate_vma = {
 
 struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
 {
-#ifdef CONFIG_IA32_EMULATION
+#ifdef CONFIG_COMPAT
 	if (!mm || mm->context.ia32_compat)
 		return NULL;
 #endif
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index ae3a29ae875b..a0a19b7ba22d 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -34,99 +34,6 @@
 #include <asm/sys_ia32.h>
 #include <asm/smap.h>
 
-int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from)
-{
-	int err = 0;
-	bool ia32 = test_thread_flag(TIF_IA32);
-
-	if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t)))
-		return -EFAULT;
-
-	put_user_try {
-		/* If you change siginfo_t structure, please make sure that
-		   this code is fixed accordingly.
-		   It should never copy any pad contained in the structure
-		   to avoid security leaks, but must copy the generic
-		   3 ints plus the relevant union member.  */
-		put_user_ex(from->si_signo, &to->si_signo);
-		put_user_ex(from->si_errno, &to->si_errno);
-		put_user_ex((short)from->si_code, &to->si_code);
-
-		if (from->si_code < 0) {
-			put_user_ex(from->si_pid, &to->si_pid);
-			put_user_ex(from->si_uid, &to->si_uid);
-			put_user_ex(ptr_to_compat(from->si_ptr), &to->si_ptr);
-		} else {
-			/*
-			 * First 32bits of unions are always present:
-			 * si_pid === si_band === si_tid === si_addr(LS half)
-			 */
-			put_user_ex(from->_sifields._pad[0],
-					  &to->_sifields._pad[0]);
-			switch (from->si_code >> 16) {
-			case __SI_FAULT >> 16:
-				break;
-			case __SI_SYS >> 16:
-				put_user_ex(from->si_syscall, &to->si_syscall);
-				put_user_ex(from->si_arch, &to->si_arch);
-				break;
-			case __SI_CHLD >> 16:
-				if (ia32) {
-					put_user_ex(from->si_utime, &to->si_utime);
-					put_user_ex(from->si_stime, &to->si_stime);
-				} else {
-					put_user_ex(from->si_utime, &to->_sifields._sigchld_x32._utime);
-					put_user_ex(from->si_stime, &to->_sifields._sigchld_x32._stime);
-				}
-				put_user_ex(from->si_status, &to->si_status);
-				/* FALL THROUGH */
-			default:
-			case __SI_KILL >> 16:
-				put_user_ex(from->si_uid, &to->si_uid);
-				break;
-			case __SI_POLL >> 16:
-				put_user_ex(from->si_fd, &to->si_fd);
-				break;
-			case __SI_TIMER >> 16:
-				put_user_ex(from->si_overrun, &to->si_overrun);
-				put_user_ex(ptr_to_compat(from->si_ptr),
-					    &to->si_ptr);
-				break;
-				 /* This is not generated by the kernel as of now.  */
-			case __SI_RT >> 16:
-			case __SI_MESGQ >> 16:
-				put_user_ex(from->si_uid, &to->si_uid);
-				put_user_ex(from->si_int, &to->si_int);
-				break;
-			}
-		}
-	} put_user_catch(err);
-
-	return err;
-}
-
-int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
-{
-	int err = 0;
-	u32 ptr32;
-
-	if (!access_ok(VERIFY_READ, from, sizeof(compat_siginfo_t)))
-		return -EFAULT;
-
-	get_user_try {
-		get_user_ex(to->si_signo, &from->si_signo);
-		get_user_ex(to->si_errno, &from->si_errno);
-		get_user_ex(to->si_code, &from->si_code);
-
-		get_user_ex(to->si_pid, &from->si_pid);
-		get_user_ex(to->si_uid, &from->si_uid);
-		get_user_ex(ptr32, &from->si_ptr);
-		to->si_ptr = compat_ptr(ptr32);
-	} get_user_catch(err);
-
-	return err;
-}
-
 /*
  * Do a signal return; undo the signal stack.
  */
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index c8393634ca0c..ebf6d5e5668c 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -313,7 +313,6 @@ struct apic {
 	/* wakeup_secondary_cpu */
 	int (*wakeup_secondary_cpu)(int apicid, unsigned long start_eip);
 
-	bool wait_for_init_deassert;
 	void (*inquire_remote_apic)(int apicid);
 
 	/* apic ops */
@@ -378,7 +377,6 @@ extern struct apic *__apicdrivers[], *__apicdrivers_end[];
  * APIC functionality to boot other CPUs - only used on SMP:
  */
 #ifdef CONFIG_SMP
-extern atomic_t init_deasserted;
 extern int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip);
 #endif
 
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index e9168955c42f..fb52aa644aab 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -182,6 +182,21 @@ static inline int atomic_xchg(atomic_t *v, int new)
 	return xchg(&v->counter, new);
 }
 
+#define ATOMIC_OP(op)	/* emits void atomic_<op>(int i, atomic_t *v) */	\
+static inline void atomic_##op(int i, atomic_t *v)			\
+{									\
+	asm volatile(LOCK_PREFIX #op"l %1,%0"				\
+			: "+m" (v->counter)	/* updated in place in memory */ \
+			: "ir" (i)					\
+			: "memory");					\
+}
+
+ATOMIC_OP(and)		/* atomic_and(): v->counter &= i */
+ATOMIC_OP(or)		/* atomic_or():  v->counter |= i */
+ATOMIC_OP(xor)		/* atomic_xor(): v->counter ^= i */
+
+#undef ATOMIC_OP
+
 /**
  * __atomic_add_unless - add unless the number is already a given value
  * @v: pointer of type atomic_t
@@ -219,16 +234,6 @@ static __always_inline short int atomic_inc_short(short int *v)
 	return *v;
 }
 
-/* These are x86-specific, used by some header files */
-#define atomic_clear_mask(mask, addr)				\
-	asm volatile(LOCK_PREFIX "andl %0,%1"			\
-		     : : "r" (~(mask)), "m" (*(addr)) : "memory")
-
-#define atomic_set_mask(mask, addr)				\
-	asm volatile(LOCK_PREFIX "orl %0,%1"			\
-		     : : "r" ((unsigned)(mask)), "m" (*(addr))	\
-		     : "memory")
-
 #ifdef CONFIG_X86_32
 # include <asm/atomic64_32.h>
 #else
diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
index b154de75c90c..a11c30b77fb5 100644
--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -313,4 +313,18 @@ static inline long long atomic64_dec_if_positive(atomic64_t *v)
 #undef alternative_atomic64
 #undef __alternative_atomic64
 
+#define ATOMIC64_OP(op, c_op)	/* emits void atomic64_<op>() as a cmpxchg retry loop */ \
+static inline void atomic64_##op(long long i, atomic64_t *v)		\
+{									\
+	long long old, c = 0;	/* c: current guess at *v, initially 0 */ \
+	while ((old = atomic64_cmpxchg(v, c, c c_op i)) != c)		\
+		c = old;	/* guess was stale: retry with the value returned */ \
+}
+
+ATOMIC64_OP(and, &)	/* atomic64_and() */
+ATOMIC64_OP(or, |)	/* atomic64_or() */
+ATOMIC64_OP(xor, ^)	/* atomic64_xor() */
+
+#undef ATOMIC64_OP
+
 #endif /* _ASM_X86_ATOMIC64_32_H */
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index b965f9e03f2a..50e33eff58de 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -220,4 +220,19 @@ static inline long atomic64_dec_if_positive(atomic64_t *v)
 	return dec;
 }
 
+#define ATOMIC64_OP(op)	/* emits void atomic64_<op>(long i, atomic64_t *v) */ \
+static inline void atomic64_##op(long i, atomic64_t *v)			\
+{									\
+	asm volatile(LOCK_PREFIX #op"q %1,%0"				\
+			: "+m" (v->counter)	/* updated in place in memory */ \
+			: "er" (i)	/* e: immediate only if it fits sign-extended 32 bits, else register */ \
+			: "memory");					\
+}
+
+ATOMIC64_OP(and)	/* atomic64_and(): v->counter &= i */
+ATOMIC64_OP(or)		/* atomic64_or():  v->counter |= i */
+ATOMIC64_OP(xor)	/* atomic64_xor(): v->counter ^= i */
+
+#undef ATOMIC64_OP
+
 #endif /* _ASM_X86_ATOMIC64_64_H */
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index e51a8f803f55..0681d2532527 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -57,12 +57,12 @@
 do {									\
 	compiletime_assert_atomic_type(*p);				\
 	smp_mb();							\
-	ACCESS_ONCE(*p) = (v);						\
+	WRITE_ONCE(*p, v);						\
 } while (0)
 
 #define smp_load_acquire(p)						\
 ({									\
-	typeof(*p) ___p1 = ACCESS_ONCE(*p);				\
+	typeof(*p) ___p1 = READ_ONCE(*p);				\
 	compiletime_assert_atomic_type(*p);				\
 	smp_mb();							\
 	___p1;								\
@@ -74,12 +74,12 @@ do {									\
 do {									\
 	compiletime_assert_atomic_type(*p);				\
 	barrier();							\
-	ACCESS_ONCE(*p) = (v);						\
+	WRITE_ONCE(*p, v);						\
 } while (0)
 
 #define smp_load_acquire(p)						\
 ({									\
-	typeof(*p) ___p1 = ACCESS_ONCE(*p);				\
+	typeof(*p) ___p1 = READ_ONCE(*p);				\
 	compiletime_assert_atomic_type(*p);				\
 	barrier();							\
 	___p1;								\
@@ -91,15 +91,4 @@ do {									\
 #define smp_mb__before_atomic()	barrier()
 #define smp_mb__after_atomic()	barrier()
 
-/*
- * Stop RDTSC speculation. This is needed when you need to use RDTSC
- * (or get_cycles or vread that possibly accesses the TSC) in a defined
- * code region.
- */
-static __always_inline void rdtsc_barrier(void)
-{
-	alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
-			  "lfence", X86_FEATURE_LFENCE_RDTSC);
-}
-
 #endif /* _ASM_X86_BARRIER_H */
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index 9bf3ea14b9f0..e63aa38e85fb 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -89,6 +89,8 @@ int set_pages_rw(struct page *page, int numpages);
 
 void clflush_cache_range(void *addr, unsigned int size);
 
+#define mmio_flush_range(addr, size) clflush_cache_range(addr, size)
+
 #ifdef CONFIG_DEBUG_RODATA
 void mark_rodata_ro(void);
 extern const int rodata_test_data;
@@ -109,75 +111,4 @@ static inline int rodata_test(void)
 }
 #endif
 
-#ifdef ARCH_HAS_NOCACHE_UACCESS
-
-/**
- * arch_memcpy_to_pmem - copy data to persistent memory
- * @dst: destination buffer for the copy
- * @src: source buffer for the copy
- * @n: length of the copy in bytes
- *
- * Copy data to persistent memory media via non-temporal stores so that
- * a subsequent arch_wmb_pmem() can flush cpu and memory controller
- * write buffers to guarantee durability.
- */
-static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src,
-		size_t n)
-{
-	int unwritten;
-
-	/*
-	 * We are copying between two kernel buffers, if
-	 * __copy_from_user_inatomic_nocache() returns an error (page
-	 * fault) we would have already reported a general protection fault
-	 * before the WARN+BUG.
-	 */
-	unwritten = __copy_from_user_inatomic_nocache((void __force *) dst,
-			(void __user *) src, n);
-	if (WARN(unwritten, "%s: fault copying %p <- %p unwritten: %d\n",
-				__func__, dst, src, unwritten))
-		BUG();
-}
-
-/**
- * arch_wmb_pmem - synchronize writes to persistent memory
- *
- * After a series of arch_memcpy_to_pmem() operations this drains data
- * from cpu write buffers and any platform (memory controller) buffers
- * to ensure that written data is durable on persistent memory media.
- */
-static inline void arch_wmb_pmem(void)
-{
-	/*
-	 * wmb() to 'sfence' all previous writes such that they are
-	 * architecturally visible to 'pcommit'.  Note, that we've
-	 * already arranged for pmem writes to avoid the cache via
-	 * arch_memcpy_to_pmem().
-	 */
-	wmb();
-	pcommit_sfence();
-}
-
-static inline bool __arch_has_wmb_pmem(void)
-{
-#ifdef CONFIG_X86_64
-	/*
-	 * We require that wmb() be an 'sfence', that is only guaranteed on
-	 * 64-bit builds
-	 */
-	return static_cpu_has(X86_FEATURE_PCOMMIT);
-#else
-	return false;
-#endif
-}
-#else /* ARCH_HAS_NOCACHE_UACCESS i.e. ARCH=um */
-extern void arch_memcpy_to_pmem(void __pmem *dst, const void *src, size_t n);
-extern void arch_wmb_pmem(void);
-
-static inline bool __arch_has_wmb_pmem(void)
-{
-	return false;
-}
-#endif
-
 #endif /* _ASM_X86_CACHEFLUSH_H */
diff --git a/arch/x86/include/asm/context_tracking.h b/arch/x86/include/asm/context_tracking.h
deleted file mode 100644
index 1fe49704b146..000000000000
--- a/arch/x86/include/asm/context_tracking.h
+++ /dev/null
@@ -1,10 +0,0 @@
-#ifndef _ASM_X86_CONTEXT_TRACKING_H
-#define _ASM_X86_CONTEXT_TRACKING_H
-
-#ifdef CONFIG_CONTEXT_TRACKING
-# define SCHEDULE_USER call schedule_user
-#else
-# define SCHEDULE_USER call schedule
-#endif
-
-#endif
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 3d6606fb97d0..477fc28050e4 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -119,6 +119,7 @@
 #define X86_FEATURE_TM2		( 4*32+ 8) /* Thermal Monitor 2 */
 #define X86_FEATURE_SSSE3	( 4*32+ 9) /* Supplemental SSE-3 */
 #define X86_FEATURE_CID		( 4*32+10) /* Context ID */
+#define X86_FEATURE_SDBG	( 4*32+11) /* Silicon Debug */
 #define X86_FEATURE_FMA		( 4*32+12) /* Fused multiply-add */
 #define X86_FEATURE_CX16	( 4*32+13) /* CMPXCHG16B */
 #define X86_FEATURE_XTPR	( 4*32+14) /* Send Task Priority Messages */
@@ -176,6 +177,7 @@
 #define X86_FEATURE_PERFCTR_NB  ( 6*32+24) /* NB performance counter extensions */
 #define X86_FEATURE_BPEXT	(6*32+26) /* data breakpoint extension */
 #define X86_FEATURE_PERFCTR_L2	( 6*32+28) /* L2 performance counter extensions */
+#define X86_FEATURE_MWAITX	( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */
 
 /*
  * Auxiliary flags: Linux defined - For features scattered in various
diff --git a/arch/x86/include/asm/delay.h b/arch/x86/include/asm/delay.h
index 9b3b4f2754c7..36a760bda462 100644
--- a/arch/x86/include/asm/delay.h
+++ b/arch/x86/include/asm/delay.h
@@ -4,5 +4,6 @@
 #include <asm-generic/delay.h>
 
 void use_tsc_delay(void);
+void use_mwaitx_delay(void);
 
 #endif /* _ASM_X86_DELAY_H */
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 1f5b7287d1ad..953b7263f844 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -12,7 +12,6 @@
 #include <linux/dma-attrs.h>
 #include <asm/io.h>
 #include <asm/swiotlb.h>
-#include <asm-generic/dma-coherent.h>
 #include <linux/dma-contiguous.h>
 
 #ifdef CONFIG_ISA
@@ -41,24 +40,13 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
 #endif
 }
 
-#include <asm-generic/dma-mapping-common.h>
-
-/* Make sure we keep the same behaviour */
-static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
-	struct dma_map_ops *ops = get_dma_ops(dev);
-	debug_dma_mapping_error(dev, dma_addr);
-	if (ops->mapping_error)
-		return ops->mapping_error(dev, dma_addr);
-
-	return (dma_addr == DMA_ERROR_CODE);
-}
-
-#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
-#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
+bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp);
+#define arch_dma_alloc_attrs arch_dma_alloc_attrs
 
+#define HAVE_ARCH_DMA_SUPPORTED 1
 extern int dma_supported(struct device *hwdev, u64 mask);
-extern int dma_set_mask(struct device *dev, u64 mask);
+
+#include <asm-generic/dma-mapping-common.h>
 
 extern void *dma_generic_alloc_coherent(struct device *dev, size_t size,
 					dma_addr_t *dma_addr, gfp_t flag,
@@ -125,16 +113,4 @@ static inline gfp_t dma_alloc_coherent_gfp_flags(struct device *dev, gfp_t gfp)
        return gfp;
 }
 
-#define dma_alloc_coherent(d,s,h,f)	dma_alloc_attrs(d,s,h,f,NULL)
-
-void *
-dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
-		gfp_t gfp, struct dma_attrs *attrs);
-
-#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
-
-void dma_free_attrs(struct device *dev, size_t size,
-		    void *vaddr, dma_addr_t bus,
-		    struct dma_attrs *attrs);
-
 #endif
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index f161c189c27b..141c561f4664 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -78,7 +78,7 @@ typedef struct user_fxsr_struct elf_fpxregset_t;
 #ifdef CONFIG_X86_64
 extern unsigned int vdso64_enabled;
 #endif
-#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT)
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
 extern unsigned int vdso32_enabled;
 #endif
 
@@ -187,8 +187,8 @@ static inline void elf_common_init(struct thread_struct *t,
 #define	COMPAT_ELF_PLAT_INIT(regs, load_addr)		\
 	elf_common_init(&current->thread, regs, __USER_DS)
 
-void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp);
-#define compat_start_thread start_thread_ia32
+void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp);
+#define compat_start_thread compat_start_thread
 
 void set_personality_ia32(bool);
 #define COMPAT_SET_PERSONALITY(ex)			\
@@ -344,14 +344,9 @@ extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
  */
 static inline int mmap_is_ia32(void)
 {
-#ifdef CONFIG_X86_32
-	return 1;
-#endif
-#ifdef CONFIG_IA32_EMULATION
-	if (test_thread_flag(TIF_ADDR32))
-		return 1;
-#endif
-	return 0;
+	return config_enabled(CONFIG_X86_32) ||		/* 32-bit kernel: always true */
+	       (config_enabled(CONFIG_COMPAT) &&	/* otherwise needs compat support ... */
+		test_thread_flag(TIF_ADDR32));		/* ... and a task flagged TIF_ADDR32 */
 }
 
 /* Do not change the values. See get_align_mask() */
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index f45acad3c4b6..24938852db30 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -3,9 +3,9 @@
 
 #ifdef CONFIG_FUNCTION_TRACER
 #ifdef CC_USING_FENTRY
-# define MCOUNT_ADDR		((long)(__fentry__))
+# define MCOUNT_ADDR		((unsigned long)(__fentry__))
 #else
-# define MCOUNT_ADDR		((long)(mcount))
+# define MCOUNT_ADDR		((unsigned long)(mcount))
 #endif
 #define MCOUNT_INSN_SIZE	5 /* sizeof mcount call */
 
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 6615032e19c8..1e3408e88604 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -182,10 +182,10 @@ extern char irq_entries_start[];
 #define trace_irq_entries_start irq_entries_start
 #endif
 
-#define VECTOR_UNDEFINED	(-1)
-#define VECTOR_RETRIGGERED	(-2)
+#define VECTOR_UNUSED		NULL
+#define VECTOR_RETRIGGERED	((void *)~0UL)
 
-typedef int vector_irq_t[NR_VECTORS];
+typedef struct irq_desc* vector_irq_t[NR_VECTORS];
 DECLARE_PER_CPU(vector_irq_t, vector_irq);
 
 #endif /* !ASSEMBLY_ */
diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h
index d0e8e0141041..28019765442e 100644
--- a/arch/x86/include/asm/ia32.h
+++ b/arch/x86/include/asm/ia32.h
@@ -22,15 +22,6 @@ struct ucontext_ia32 {
 	compat_sigset_t	  uc_sigmask;	/* mask last for extensibility */
 };
 
-struct ucontext_x32 {
-	unsigned int	  uc_flags;
-	unsigned int 	  uc_link;
-	compat_stack_t	  uc_stack;
-	unsigned int	  uc__pad0;     /* needed for alignment */
-	struct sigcontext uc_mcontext;  /* the 64-bit sigcontext type */
-	compat_sigset_t	  uc_sigmask;	/* mask last for extensibility */
-};
-
 /* This matches struct stat64 in glibc2.2, hence the absolutely
  * insane amounts of padding around dev_t's.
  */
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index cc9c61bc1abe..de25aad07853 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -180,6 +180,8 @@ static inline unsigned int isa_virt_to_bus(volatile void *address)
  */
 extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size);
 extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size);
+#define ioremap_uc ioremap_uc
+
 extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
 extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size,
 				unsigned long prot_val);
@@ -248,12 +250,6 @@ static inline void flush_write_buffers(void)
 #endif
 }
 
-static inline void __pmem *arch_memremap_pmem(resource_size_t offset,
-	unsigned long size)
-{
-	return (void __force __pmem *) ioremap_cache(offset, size);
-}
-
 #endif /* __KERNEL__ */
 
 extern void native_io_delay(void);
diff --git a/arch/x86/include/asm/iosf_mbi.h b/arch/x86/include/asm/iosf_mbi.h
index 57995f0596a6..b72ad0faa6c5 100644
--- a/arch/x86/include/asm/iosf_mbi.h
+++ b/arch/x86/include/asm/iosf_mbi.h
@@ -52,20 +52,20 @@
 
 /* Quark available units */
 #define QRK_MBI_UNIT_HBA	0x00
-#define QRK_MBI_UNIT_HB	0x03
+#define QRK_MBI_UNIT_HB		0x03
 #define QRK_MBI_UNIT_RMU	0x04
-#define QRK_MBI_UNIT_MM	0x05
+#define QRK_MBI_UNIT_MM		0x05
 #define QRK_MBI_UNIT_MMESRAM	0x05
 #define QRK_MBI_UNIT_SOC	0x31
 
 /* Quark read/write opcodes */
 #define QRK_MBI_HBA_READ	0x10
 #define QRK_MBI_HBA_WRITE	0x11
-#define QRK_MBI_HB_READ	0x10
+#define QRK_MBI_HB_READ		0x10
 #define QRK_MBI_HB_WRITE	0x11
 #define QRK_MBI_RMU_READ	0x10
 #define QRK_MBI_RMU_WRITE	0x11
-#define QRK_MBI_MM_READ	0x10
+#define QRK_MBI_MM_READ		0x10
 #define QRK_MBI_MM_WRITE	0x11
 #define QRK_MBI_MMESRAM_READ	0x12
 #define QRK_MBI_MMESRAM_WRITE	0x13
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index 8008d06581c7..881b4768644a 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -36,7 +36,9 @@ extern void kvm_set_posted_intr_wakeup_handler(void (*handler)(void));
 
 extern void (*x86_platform_ipi_callback)(void);
 extern void native_init_IRQ(void);
-extern bool handle_irq(unsigned irq, struct pt_regs *regs);
+
+struct irq_desc;
+extern bool handle_irq(struct irq_desc *desc, struct pt_regs *regs);
 
 extern __visible unsigned int do_IRQ(struct pt_regs *regs);
 
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 4c2d2eb2060a..6ca9fd6234e1 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -117,16 +117,6 @@
 
 #define FPU_IRQ				  13
 
-#define	FIRST_VM86_IRQ			   3
-#define LAST_VM86_IRQ			  15
-
-#ifndef __ASSEMBLY__
-static inline int invalid_vm86_irq(int irq)
-{
-	return irq < FIRST_VM86_IRQ || irq > LAST_VM86_IRQ;
-}
-#endif
-
 /*
  * Size the maximum number of interrupts.
  *
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h
index a4c1cf7e93f8..5daeca3d0f9e 100644
--- a/arch/x86/include/asm/jump_label.h
+++ b/arch/x86/include/asm/jump_label.h
@@ -16,15 +16,32 @@
 # define STATIC_KEY_INIT_NOP GENERIC_NOP5_ATOMIC
 #endif
 
-static __always_inline bool arch_static_branch(struct static_key *key)
+static __always_inline bool arch_static_branch(struct static_key *key, bool branch)
 {
 	asm_volatile_goto("1:"
 		".byte " __stringify(STATIC_KEY_INIT_NOP) "\n\t"
 		".pushsection __jump_table,  \"aw\" \n\t"
 		_ASM_ALIGN "\n\t"
-		_ASM_PTR "1b, %l[l_yes], %c0 \n\t"
+		_ASM_PTR "1b, %l[l_yes], %c0 + %c1 \n\t"
 		".popsection \n\t"
-		: :  "i" (key) : : l_yes);
+		: :  "i" (key), "i" (branch) : : l_yes);
+
+	return false;
+l_yes:
+	return true;
+}
+
+static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch)
+{
+	asm_volatile_goto("1:"
+		".byte 0xe9\n\t .long %l[l_yes] - 2f\n\t"
+		"2:\n\t"
+		".pushsection __jump_table,  \"aw\" \n\t"
+		_ASM_ALIGN "\n\t"
+		_ASM_PTR "1b, %l[l_yes], %c0 + %c1 \n\t"
+		".popsection \n\t"
+		: :  "i" (key), "i" (branch) : : l_yes);
+
 	return false;
 l_yes:
 	return true;
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
index 32ce71375b21..b130d59406fb 100644
--- a/arch/x86/include/asm/kdebug.h
+++ b/arch/x86/include/asm/kdebug.h
@@ -29,7 +29,7 @@ extern void show_trace(struct task_struct *t, struct pt_regs *regs,
 extern void __show_regs(struct pt_regs *regs, int all);
 extern unsigned long oops_begin(void);
 extern void oops_end(unsigned long, struct pt_regs *, int signr);
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 extern int in_crash_kexec;
 #else
 /* no crash dump is ever in progress if no crash kernel can be kexec'd */
diff --git a/arch/x86/include/asm/math_emu.h b/arch/x86/include/asm/math_emu.h
index 031f6266f425..0d9b14f60d2c 100644
--- a/arch/x86/include/asm/math_emu.h
+++ b/arch/x86/include/asm/math_emu.h
@@ -2,7 +2,6 @@
 #define _ASM_X86_MATH_EMU_H
 
 #include <asm/ptrace.h>
-#include <asm/vm86.h>
 
 /* This structure matches the layout of the data saved to the stack
    following a device-not-present interrupt, part of it saved
@@ -10,9 +9,6 @@
    */
 struct math_emu_info {
 	long ___orig_eip;
-	union {
-		struct pt_regs *regs;
-		struct kernel_vm86_regs *vm86;
-	};
+	struct pt_regs *regs;
 };
 #endif /* _ASM_X86_MATH_EMU_H */
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 364d27481a52..55234d5e7160 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -9,7 +9,9 @@
  * we put the segment information here.
  */
 typedef struct {
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
 	struct ldt_struct *ldt;
+#endif
 
 #ifdef CONFIG_X86_64
 	/* True if mm supports a task running in 32 bit compatibility mode. */
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 984abfe47edc..379cd3658799 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -33,6 +33,7 @@ static inline void load_mm_cr4(struct mm_struct *mm)
 static inline void load_mm_cr4(struct mm_struct *mm) {}
 #endif
 
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
 /*
  * ldt_structs can be allocated, used, and freed, but they are never
  * modified while live.
@@ -48,8 +49,23 @@ struct ldt_struct {
 	int size;
 };
 
+/*
+ * Used for LDT copy/destruction.
+ */
+int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
+void destroy_context(struct mm_struct *mm);
+#else	/* CONFIG_MODIFY_LDT_SYSCALL */
+static inline int init_new_context(struct task_struct *tsk,
+				   struct mm_struct *mm)
+{
+	return 0;
+}
+static inline void destroy_context(struct mm_struct *mm) {}
+#endif
+
 static inline void load_mm_ldt(struct mm_struct *mm)
 {
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
 	struct ldt_struct *ldt;
 
 	/* lockless_dereference synchronizes with smp_store_release */
@@ -73,17 +89,13 @@ static inline void load_mm_ldt(struct mm_struct *mm)
 		set_ldt(ldt->entries, ldt->size);
 	else
 		clear_LDT();
+#else
+	clear_LDT();
+#endif
 
 	DEBUG_LOCKS_WARN_ON(preemptible());
 }
 
-/*
- * Used for LDT copy/destruction.
- */
-int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
-void destroy_context(struct mm_struct *mm);
-
-
 static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 {
 #ifdef CONFIG_SMP
@@ -114,6 +126,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 		/* Load per-mm CR4 state */
 		load_mm_cr4(next);
 
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
 		/*
 		 * Load the LDT, if the LDT is different.
 		 *
@@ -128,6 +141,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 		 */
 		if (unlikely(prev->context.ldt != next->context.ldt))
 			load_mm_ldt(next);
+#endif
 	}
 #ifdef CONFIG_SMP
 	  else {
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index fcd17c1fc0c6..c1c0a1c14344 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -184,6 +184,12 @@
 #define MSR_PP1_ENERGY_STATUS		0x00000641
 #define MSR_PP1_POLICY			0x00000642
 
+#define MSR_CONFIG_TDP_NOMINAL		0x00000648
+#define MSR_CONFIG_TDP_LEVEL_1		0x00000649
+#define MSR_CONFIG_TDP_LEVEL_2		0x0000064A
+#define MSR_CONFIG_TDP_CONTROL		0x0000064B
+#define MSR_TURBO_ACTIVATION_RATIO	0x0000064C
+
 #define MSR_PKG_WEIGHTED_CORE_C0_RES	0x00000658
 #define MSR_PKG_ANY_CORE_C0_RES		0x00000659
 #define MSR_PKG_ANY_GFXE_C0_RES		0x0000065A
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index e6a707eb5081..77d8b284e4a7 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -47,14 +47,13 @@ static inline unsigned long long native_read_tscp(unsigned int *aux)
  * it means rax *or* rdx.
  */
 #ifdef CONFIG_X86_64
-#define DECLARE_ARGS(val, low, high)	unsigned low, high
-#define EAX_EDX_VAL(val, low, high)	((low) | ((u64)(high) << 32))
-#define EAX_EDX_ARGS(val, low, high)	"a" (low), "d" (high)
+/* Using 64-bit values saves one instruction clearing the high half of low */
+#define DECLARE_ARGS(val, low, high)	unsigned long low, high
+#define EAX_EDX_VAL(val, low, high)	((low) | (high) << 32)
 #define EAX_EDX_RET(val, low, high)	"=a" (low), "=d" (high)
 #else
 #define DECLARE_ARGS(val, low, high)	unsigned long long val
 #define EAX_EDX_VAL(val, low, high)	(val)
-#define EAX_EDX_ARGS(val, low, high)	"A" (val)
 #define EAX_EDX_RET(val, low, high)	"=A" (val)
 #endif
 
@@ -106,12 +105,19 @@ notrace static inline int native_write_msr_safe(unsigned int msr,
 	return err;
 }
 
-extern unsigned long long native_read_tsc(void);
-
 extern int rdmsr_safe_regs(u32 regs[8]);
 extern int wrmsr_safe_regs(u32 regs[8]);
 
-static __always_inline unsigned long long __native_read_tsc(void)
+/**
+ * rdtsc() - returns the current TSC without ordering constraints
+ *
+ * rdtsc() returns the result of RDTSC as a 64-bit integer.  The
+ * only ordering constraint it supplies is the ordering implied by
+ * "asm volatile": it will put the RDTSC in the place you expect.  The
+ * CPU can and will speculatively execute that RDTSC, though, so the
+ * results can be non-monotonic if compared on different CPUs.
+ */
+static __always_inline unsigned long long rdtsc(void)
 {
 	DECLARE_ARGS(val, low, high);
 
@@ -120,6 +126,35 @@ static __always_inline unsigned long long __native_read_tsc(void)
 	return EAX_EDX_VAL(val, low, high);
 }
 
+/**
+ * rdtsc_ordered() - read the current TSC in program order
+ *
+ * rdtsc_ordered() returns the result of RDTSC as a 64-bit integer.
+ * It is ordered like a load to a global in-memory counter.  It should
+ * be impossible to observe non-monotonic rdtsc_ordered() behavior
+ * across multiple CPUs as long as the TSC is synced.
+ */
+static __always_inline unsigned long long rdtsc_ordered(void)
+{
+	/*
+	 * The RDTSC instruction is not ordered relative to memory
+	 * access.  The Intel SDM and the AMD APM are both vague on this
+	 * point, but empirically an RDTSC instruction can be
+	 * speculatively executed before prior loads.  An RDTSC
+	 * immediately after an appropriate barrier appears to be
+	 * ordered as a normal load, that is, it provides the same
+	 * ordering guarantees as reading from a global memory location
+	 * that some other imaginary CPU is updating continuously with a
+	 * time stamp.
+	 */
+	alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
+			  "lfence", X86_FEATURE_LFENCE_RDTSC);
+	return rdtsc();
+}
+
+/* Deprecated, keep it for a cycle for easier merging: */
+#define rdtscll(now)	do { (now) = rdtsc_ordered(); } while (0)
+
 static inline unsigned long long native_read_pmc(int counter)
 {
 	DECLARE_ARGS(val, low, high);
@@ -153,8 +188,10 @@ static inline void wrmsr(unsigned msr, unsigned low, unsigned high)
 #define rdmsrl(msr, val)			\
 	((val) = native_read_msr((msr)))
 
-#define wrmsrl(msr, val)						\
-	native_write_msr((msr), (u32)((u64)(val)), (u32)((u64)(val) >> 32))
+static inline void wrmsrl(unsigned msr, u64 val)
+{
+	native_write_msr(msr, (u32)val, (u32)(val >> 32));
+}
 
 /* wrmsr with exception handling */
 static inline int wrmsr_safe(unsigned msr, unsigned low, unsigned high)
@@ -180,12 +217,6 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
 	return err;
 }
 
-#define rdtscl(low)						\
-	((low) = (u32)__native_read_tsc())
-
-#define rdtscll(val)						\
-	((val) = __native_read_tsc())
-
 #define rdpmc(counter, low, high)			\
 do {							\
 	u64 _l = native_read_pmc((counter));		\
@@ -195,15 +226,6 @@ do {							\
 
 #define rdpmcl(counter, val) ((val) = native_read_pmc(counter))
 
-#define rdtscp(low, high, aux)					\
-do {                                                            \
-	unsigned long long _val = native_read_tscp(&(aux));     \
-	(low) = (u32)_val;                                      \
-	(high) = (u32)(_val >> 32);                             \
-} while (0)
-
-#define rdtscpll(val, aux) (val) = native_read_tscp(&(aux))
-
 #endif	/* !CONFIG_PARAVIRT */
 
 /*
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index 653dfa7662e1..c70689b5e5aa 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -14,6 +14,9 @@
 #define CPUID5_ECX_INTERRUPT_BREAK	0x2
 
 #define MWAIT_ECX_INTERRUPT_BREAK	0x1
+#define MWAITX_ECX_TIMER_ENABLE		BIT(1)
+#define MWAITX_MAX_LOOPS		((u32)-1)
+#define MWAITX_DISABLE_CSTATES		0xf
 
 static inline void __monitor(const void *eax, unsigned long ecx,
 			     unsigned long edx)
@@ -23,6 +26,14 @@ static inline void __monitor(const void *eax, unsigned long ecx,
 		     :: "a" (eax), "c" (ecx), "d"(edx));
 }
 
+static inline void __monitorx(const void *eax, unsigned long ecx,
+			      unsigned long edx)
+{
+	/* "monitorx %eax, %ecx, %edx;" */
+	asm volatile(".byte 0x0f, 0x01, 0xfa;"
+		     :: "a" (eax), "c" (ecx), "d"(edx));
+}
+
 static inline void __mwait(unsigned long eax, unsigned long ecx)
 {
 	/* "mwait %eax, %ecx;" */
@@ -30,6 +41,40 @@ static inline void __mwait(unsigned long eax, unsigned long ecx)
 		     :: "a" (eax), "c" (ecx));
 }
 
+/*
+ * MWAITX allows for a timer expiration to get the core out of a wait state in
+ * addition to the default MWAIT exit condition of a store appearing at a
+ * monitored virtual address.
+ *
+ * Registers:
+ *
+ * MWAITX ECX[1]: enable timer if set
+ * MWAITX EBX[31:0]: max wait time expressed in SW P0 clocks. The software P0
+ * frequency is the same as the TSC frequency.
+ *
+ * Below is a comparison between MWAIT and MWAITX on AMD processors:
+ *
+ *                 MWAIT                           MWAITX
+ * opcode          0f 01 c9           |            0f 01 fb
+ * ECX[0]                  value of RFLAGS.IF seen by instruction
+ * ECX[1]          unused/#GP if set  |            enable timer if set
+ * ECX[31:2]                     unused/#GP if set
+ * EAX                           unused (reserve for hint)
+ * EBX[31:0]       unused             |            max wait time (P0 clocks)
+ *
+ *                 MONITOR                         MONITORX
+ * opcode          0f 01 c8           |            0f 01 fa
+ * EAX                     (logical) address to monitor
+ * ECX                     #GP if not zero
+ */
+static inline void __mwaitx(unsigned long eax, unsigned long ebx,
+			    unsigned long ecx)
+{
+	/* "mwaitx %eax, %ebx, %ecx;" */
+	asm volatile(".byte 0x0f, 0x01, 0xfb;"
+		     :: "a" (eax), "b" (ebx), "c" (ecx));
+}
+
 static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
 {
 	trace_hardirqs_on();
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index d143bfad45d7..10d0596433f8 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -153,7 +153,11 @@ do {						\
 	val = paravirt_read_msr(msr, &_err);	\
 } while (0)
 
-#define wrmsrl(msr, val)	wrmsr(msr, (u32)((u64)(val)), ((u64)(val))>>32)
+static inline void wrmsrl(unsigned msr, u64 val)
+{
+	wrmsr(msr, (u32)val, (u32)(val>>32));
+}
+
 #define wrmsr_safe(msr, a, b)	paravirt_write_msr(msr, a, b)
 
 /* rdmsr with exception handling */
@@ -174,19 +178,6 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
 	return err;
 }
 
-static inline u64 paravirt_read_tsc(void)
-{
-	return PVOP_CALL0(u64, pv_cpu_ops.read_tsc);
-}
-
-#define rdtscl(low)				\
-do {						\
-	u64 _l = paravirt_read_tsc();		\
-	low = (int)_l;				\
-} while (0)
-
-#define rdtscll(val) (val = paravirt_read_tsc())
-
 static inline unsigned long long paravirt_sched_clock(void)
 {
 	return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock);
@@ -215,27 +206,6 @@ do {						\
 
 #define rdpmcl(counter, val) ((val) = paravirt_read_pmc(counter))
 
-static inline unsigned long long paravirt_rdtscp(unsigned int *aux)
-{
-	return PVOP_CALL1(u64, pv_cpu_ops.read_tscp, aux);
-}
-
-#define rdtscp(low, high, aux)				\
-do {							\
-	int __aux;					\
-	unsigned long __val = paravirt_rdtscp(&__aux);	\
-	(low) = (u32)__val;				\
-	(high) = (u32)(__val >> 32);			\
-	(aux) = __aux;					\
-} while (0)
-
-#define rdtscpll(val, aux)				\
-do {							\
-	unsigned long __aux; 				\
-	val = paravirt_rdtscp(&__aux);			\
-	(aux) = __aux;					\
-} while (0)
-
 static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries)
 {
 	PVOP_VCALL2(pv_cpu_ops.alloc_ldt, ldt, entries);
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index a6b8f9fadb06..ce029e4fa7c6 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -156,9 +156,7 @@ struct pv_cpu_ops {
 	u64 (*read_msr)(unsigned int msr, int *err);
 	int (*write_msr)(unsigned int msr, unsigned low, unsigned high);
 
-	u64 (*read_tsc)(void);
 	u64 (*read_pmc)(int counter);
-	unsigned long long (*read_tscp)(unsigned int *aux);
 
 #ifdef CONFIG_X86_32
 	/*
diff --git a/arch/x86/include/asm/pmc_atom.h b/arch/x86/include/asm/pmc_atom.h
index bc0fc0866553..aa8744c77c6d 100644
--- a/arch/x86/include/asm/pmc_atom.h
+++ b/arch/x86/include/asm/pmc_atom.h
@@ -18,6 +18,8 @@
 
 /* ValleyView Power Control Unit PCI Device ID */
 #define	PCI_DEVICE_ID_VLV_PMC	0x0F1C
+/* CherryTrail Power Control Unit PCI Device ID */
+#define	PCI_DEVICE_ID_CHT_PMC	0x229C
 
 /* PMC Memory mapped IO registers */
 #define	PMC_BASE_ADDR_OFFSET	0x44
@@ -29,6 +31,10 @@
 #define	PMC_FUNC_DIS		0x34
 #define	PMC_FUNC_DIS_2		0x38
 
+/* CHT specific bits in FUNC_DIS2 register */
+#define	BIT_FD_GMM		BIT(3)
+#define	BIT_FD_ISH		BIT(4)
+
 /* S0ix wake event control */
 #define	PMC_S0IX_WAKE_EN	0x3C
 
@@ -75,6 +81,21 @@
 #define PMC_PSS_BIT_USB			BIT(16)
 #define PMC_PSS_BIT_USB_SUS		BIT(17)
 
+/* CHT specific bits in PSS register */
+#define	PMC_PSS_BIT_CHT_UFS		BIT(7)
+#define	PMC_PSS_BIT_CHT_UXD		BIT(11)
+#define	PMC_PSS_BIT_CHT_UXD_FD		BIT(12)
+#define	PMC_PSS_BIT_CHT_UX_ENG		BIT(15)
+#define	PMC_PSS_BIT_CHT_USB_SUS		BIT(16)
+#define	PMC_PSS_BIT_CHT_GMM		BIT(17)
+#define	PMC_PSS_BIT_CHT_ISH		BIT(18)
+#define	PMC_PSS_BIT_CHT_DFX_MASTER	BIT(26)
+#define	PMC_PSS_BIT_CHT_DFX_CLUSTER1	BIT(27)
+#define	PMC_PSS_BIT_CHT_DFX_CLUSTER2	BIT(28)
+#define	PMC_PSS_BIT_CHT_DFX_CLUSTER3	BIT(29)
+#define	PMC_PSS_BIT_CHT_DFX_CLUSTER4	BIT(30)
+#define	PMC_PSS_BIT_CHT_DFX_CLUSTER5	BIT(31)
+
 /* These registers reflect D3 status of functions */
 #define	PMC_D3_STS_0		0xA0
 
@@ -117,6 +138,10 @@
 #define	BIT_USH_SS_PHY		BIT(2)
 #define	BIT_DFX			BIT(3)
 
+/* CHT specific bits in PMC_D3_STS_1 register */
+#define	BIT_STS_GMM		BIT(1)
+#define	BIT_STS_ISH		BIT(2)
+
 /* PMC I/O Registers */
 #define	ACPI_BASE_ADDR_OFFSET	0x40
 #define	ACPI_BASE_ADDR_MASK	0xFFFFFE00
@@ -126,4 +151,8 @@
 #define	SLEEP_TYPE_MASK		0xFFFFECFF
 #define	SLEEP_TYPE_S5		0x1C00
 #define	SLEEP_ENABLE		0x2000
+
+extern int pmc_atom_read(int offset, u32 *value);
+extern int pmc_atom_write(int offset, u32 value);
+
 #endif /* PMC_ATOM_H */
diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h
new file mode 100644
index 000000000000..d8ce3ec816ab
--- /dev/null
+++ b/arch/x86/include/asm/pmem.h
@@ -0,0 +1,153 @@
+/*
+ * Copyright(c) 2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#ifndef __ASM_X86_PMEM_H__
+#define __ASM_X86_PMEM_H__
+
+#include <linux/uaccess.h>
+#include <asm/cacheflush.h>
+#include <asm/cpufeature.h>
+#include <asm/special_insns.h>
+
+#ifdef CONFIG_ARCH_HAS_PMEM_API
+/**
+ * arch_memcpy_to_pmem - copy data to persistent memory
+ * @dst: destination buffer for the copy
+ * @src: source buffer for the copy
+ * @n: length of the copy in bytes
+ *
+ * Copy data to persistent memory media via non-temporal stores so that
+ * a subsequent arch_wmb_pmem() can flush cpu and memory controller
+ * write buffers to guarantee durability.
+ */
+static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src,
+		size_t n)
+{
+	int unwritten;
+
+	/*
+	 * We are copying between two kernel buffers; if
+	 * __copy_from_user_inatomic_nocache() returns an error (page
+	 * fault) we would have already reported a general protection fault
+	 * before the WARN+BUG.
+	 */
+	unwritten = __copy_from_user_inatomic_nocache((void __force *) dst,
+			(void __user *) src, n);
+	if (WARN(unwritten, "%s: fault copying %p <- %p unwritten: %d\n",
+				__func__, dst, src, unwritten))
+		BUG();
+}
+
+/**
+ * arch_wmb_pmem - synchronize writes to persistent memory
+ *
+ * After a series of arch_memcpy_to_pmem() operations this drains data
+ * from cpu write buffers and any platform (memory controller) buffers
+ * to ensure that written data is durable on persistent memory media.
+ */
+static inline void arch_wmb_pmem(void)
+{
+	/*
+	 * wmb() to 'sfence' all previous writes such that they are
+	 * architecturally visible to 'pcommit'.  Note, that we've
+	 * already arranged for pmem writes to avoid the cache via
+	 * arch_memcpy_to_pmem().
+	 */
+	wmb();
+	pcommit_sfence();
+}
+
+/**
+ * __arch_wb_cache_pmem - write back a cache range with CLWB
+ * @vaddr:	virtual start address
+ * @size:	number of bytes to write back
+ *
+ * Write back a cache range using the CLWB (cache line write back)
+ * instruction.  This function requires explicit ordering with an
+ * arch_wmb_pmem() call.  This API is internal to the x86 PMEM implementation.
+ */
+static inline void __arch_wb_cache_pmem(void *vaddr, size_t size)
+{
+	u16 x86_clflush_size = boot_cpu_data.x86_clflush_size;
+	unsigned long clflush_mask = x86_clflush_size - 1;
+	void *vend = vaddr + size;
+	void *p;
+
+	for (p = (void *)((unsigned long)vaddr & ~clflush_mask);
+	     p < vend; p += x86_clflush_size)
+		clwb(p);
+}
+
+/*
+ * copy_from_iter_nocache() on x86 only uses non-temporal stores for iovec
+ * iterators, so for other types (bvec & kvec) we must do a cache write-back.
+ */
+static inline bool __iter_needs_pmem_wb(struct iov_iter *i)
+{
+	return iter_is_iovec(i) == false;
+}
+
+/**
+ * arch_copy_from_iter_pmem - copy data from an iterator to PMEM
+ * @addr:	PMEM destination address
+ * @bytes:	number of bytes to copy
+ * @i:		iterator with source data
+ *
+ * Copy data from the iterator 'i' to the PMEM buffer starting at 'addr'.
+ * This function requires explicit ordering with an arch_wmb_pmem() call.
+ */
+static inline size_t arch_copy_from_iter_pmem(void __pmem *addr, size_t bytes,
+		struct iov_iter *i)
+{
+	void *vaddr = (void __force *)addr;
+	size_t len;
+
+	/* TODO: skip the write-back by always using non-temporal stores */
+	len = copy_from_iter_nocache(vaddr, bytes, i);
+
+	if (__iter_needs_pmem_wb(i))
+		__arch_wb_cache_pmem(vaddr, bytes);
+
+	return len;
+}
+
+/**
+ * arch_clear_pmem - zero a PMEM memory range
+ * @addr:	virtual start address
+ * @size:	number of bytes to zero
+ *
+ * Write zeros into the memory range starting at 'addr' for 'size' bytes.
+ * This function requires explicit ordering with an arch_wmb_pmem() call.
+ */
+static inline void arch_clear_pmem(void __pmem *addr, size_t size)
+{
+	void *vaddr = (void __force *)addr;
+
+	/* TODO: implement the zeroing via non-temporal writes */
+	if (size == PAGE_SIZE && ((unsigned long)vaddr & ~PAGE_MASK) == 0)
+		clear_page(vaddr);
+	else
+		memset(vaddr, 0, size);
+
+	__arch_wb_cache_pmem(vaddr, size);
+}
+
+static inline bool __arch_has_wmb_pmem(void)
+{
+	/*
+	 * We require that wmb() be an 'sfence', that is only guaranteed on
+	 * 64-bit builds
+	 */
+	return static_cpu_has(X86_FEATURE_PCOMMIT);
+}
+#endif /* CONFIG_ARCH_HAS_PMEM_API */
+#endif /* __ASM_X86_PMEM_H__ */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 944f1785ed0d..19577dd325fa 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -6,8 +6,8 @@
 /* Forward declaration, a strange C thing */
 struct task_struct;
 struct mm_struct;
+struct vm86;
 
-#include <asm/vm86.h>
 #include <asm/math_emu.h>
 #include <asm/segment.h>
 #include <asm/types.h>
@@ -400,15 +400,9 @@ struct thread_struct {
 	unsigned long		cr2;
 	unsigned long		trap_nr;
 	unsigned long		error_code;
-#ifdef CONFIG_X86_32
+#ifdef CONFIG_VM86
 	/* Virtual 86 mode info */
-	struct vm86_struct __user *vm86_info;
-	unsigned long		screen_bitmap;
-	unsigned long		v86flags;
-	unsigned long		v86mask;
-	unsigned long		saved_sp0;
-	unsigned int		saved_fs;
-	unsigned int		saved_gs;
+	struct vm86		*vm86;
 #endif
 	/* IO permissions: */
 	unsigned long		*io_bitmap_ptr;
@@ -651,14 +645,6 @@ static inline void update_debugctlmsr(unsigned long debugctlmsr)
 
 extern void set_task_blockstep(struct task_struct *task, bool on);
 
-/*
- * from system description table in BIOS. Mostly for MCA use, but
- * others may find it useful:
- */
-extern unsigned int		machine_id;
-extern unsigned int		machine_submodel_id;
-extern unsigned int		BIOS_revision;
-
 /* Boot loader type from the setup header: */
 extern int			bootloader_type;
 extern int			bootloader_version;
@@ -720,7 +706,6 @@ static inline void spin_lock_prefetch(const void *x)
 
 #define INIT_THREAD  {							  \
 	.sp0			= TOP_OF_INIT_STACK,			  \
-	.vm86_info		= NULL,					  \
 	.sysenter_cs		= __KERNEL_CS,				  \
 	.io_bitmap_ptr		= NULL,					  \
 }
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 5fabf1362942..6271281f947d 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -88,7 +88,6 @@ extern long syscall_trace_enter_phase2(struct pt_regs *, u32 arch,
 				       unsigned long phase1_result);
 
 extern long syscall_trace_enter(struct pt_regs *);
-extern void syscall_trace_leave(struct pt_regs *);
 
 static inline unsigned long regs_return_value(struct pt_regs *regs)
 {
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 628954ceede1..7a6bed5c08bc 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -62,7 +62,7 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
 static __always_inline
 u64 pvclock_get_nsec_offset(const struct pvclock_vcpu_time_info *src)
 {
-	u64 delta = __native_read_tsc() - src->tsc_timestamp;
+	u64 delta = rdtsc_ordered() - src->tsc_timestamp;
 	return pvclock_scale_delta(delta, src->tsc_to_system_mul,
 				   src->tsc_shift);
 }
@@ -76,13 +76,7 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
 	u8 ret_flags;
 
 	version = src->version;
-	/* Note: emulated platforms which do not advertise SSE2 support
-	 * result in kvmclock not using the necessary RDTSC barriers.
-	 * Without barriers, it is possible that RDTSC instruction reads from
-	 * the time stamp counter outside rdtsc_barrier protected section
-	 * below, resulting in violation of monotonicity.
-	 */
-	rdtsc_barrier();
+
 	offset = pvclock_get_nsec_offset(src);
 	ret = src->system_time + offset;
 	ret_flags = src->flags;
diff --git a/arch/x86/include/asm/qrwlock.h b/arch/x86/include/asm/qrwlock.h
index ae0e241e228b..c537cbb038a7 100644
--- a/arch/x86/include/asm/qrwlock.h
+++ b/arch/x86/include/asm/qrwlock.h
@@ -2,16 +2,6 @@
 #define _ASM_X86_QRWLOCK_H
 
 #include <asm-generic/qrwlock_types.h>
-
-#ifndef CONFIG_X86_PPRO_FENCE
-#define queue_write_unlock queue_write_unlock
-static inline void queue_write_unlock(struct qrwlock *lock)
-{
-        barrier();
-        ACCESS_ONCE(*(u8 *)&lock->cnts) = 0;
-}
-#endif
-
 #include <asm-generic/qrwlock.h>
 
 #endif /* _ASM_X86_QRWLOCK_H */
diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h
index 9d51fae1cba3..eaba08076030 100644
--- a/arch/x86/include/asm/qspinlock.h
+++ b/arch/x86/include/asm/qspinlock.h
@@ -39,18 +39,27 @@ static inline void queued_spin_unlock(struct qspinlock *lock)
 }
 #endif
 
-#define virt_queued_spin_lock virt_queued_spin_lock
-
-static inline bool virt_queued_spin_lock(struct qspinlock *lock)
+#ifdef CONFIG_PARAVIRT
+#define virt_spin_lock virt_spin_lock
+static inline bool virt_spin_lock(struct qspinlock *lock)
 {
 	if (!static_cpu_has(X86_FEATURE_HYPERVISOR))
 		return false;
 
-	while (atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL) != 0)
-		cpu_relax();
+	/*
+	 * On hypervisors without PARAVIRT_SPINLOCKS support we fall
+	 * back to a Test-and-Set spinlock, because fair locks have
+	 * horrible lock 'holder' preemption issues.
+	 */
+
+	do {
+		while (atomic_read(&lock->val) != 0)
+			cpu_relax();
+	} while (atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL) != 0);
 
 	return true;
 }
+#endif /* CONFIG_PARAVIRT */
 
 #include <asm-generic/qspinlock.h>
 
diff --git a/arch/x86/include/asm/sigframe.h b/arch/x86/include/asm/sigframe.h
index 7c7c27c97daa..1f3175bb994e 100644
--- a/arch/x86/include/asm/sigframe.h
+++ b/arch/x86/include/asm/sigframe.h
@@ -4,6 +4,7 @@
 #include <asm/sigcontext.h>
 #include <asm/siginfo.h>
 #include <asm/ucontext.h>
+#include <linux/compat.h>
 
 #ifdef CONFIG_X86_32
 #define sigframe_ia32		sigframe
@@ -69,6 +70,15 @@ struct rt_sigframe {
 
 #ifdef CONFIG_X86_X32_ABI
 
+struct ucontext_x32 {
+	unsigned int	  uc_flags;
+	unsigned int 	  uc_link;
+	compat_stack_t	  uc_stack;
+	unsigned int	  uc__pad0;     /* needed for alignment */
+	struct sigcontext uc_mcontext;  /* the 64-bit sigcontext type */
+	compat_sigset_t	  uc_sigmask;	/* mask last for extensibility */
+};
+
 struct rt_sigframe_x32 {
 	u64 pretcode;
 	struct ucontext_x32 uc;
diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h
index 31eab867e6d3..c481be78fcf1 100644
--- a/arch/x86/include/asm/signal.h
+++ b/arch/x86/include/asm/signal.h
@@ -30,7 +30,7 @@ typedef sigset_t compat_sigset_t;
 #endif /* __ASSEMBLY__ */
 #include <uapi/asm/signal.h>
 #ifndef __ASSEMBLY__
-extern void do_notify_resume(struct pt_regs *, void *, __u32);
+extern void do_signal(struct pt_regs *regs);
 
 #define __ARCH_HAS_SA_RESTORER
 
diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
index c2e00bb2a136..58505f01962f 100644
--- a/arch/x86/include/asm/stackprotector.h
+++ b/arch/x86/include/asm/stackprotector.h
@@ -72,7 +72,7 @@ static __always_inline void boot_init_stack_canary(void)
 	 * on during the bootup the random pool has true entropy too.
 	 */
 	get_random_bytes(&canary, sizeof(canary));
-	tsc = __native_read_tsc();
+	tsc = rdtsc();
 	canary += tsc + (tsc << 32UL);
 
 	current->stack_canary = canary;
diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
index 592a6a672e07..91dfcafe27a6 100644
--- a/arch/x86/include/asm/syscalls.h
+++ b/arch/x86/include/asm/syscalls.h
@@ -37,6 +37,7 @@ asmlinkage long sys_get_thread_area(struct user_desc __user *);
 asmlinkage unsigned long sys_sigreturn(void);
 
 /* kernel/vm86_32.c */
+struct vm86_struct;
 asmlinkage long sys_vm86old(struct vm86_struct __user *);
 asmlinkage long sys_vm86(unsigned long, unsigned long);
 
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 225ee545e1a0..8afdc3e44247 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -27,14 +27,17 @@
  * Without this offset, that can result in a page fault.  (We are
  * careful that, in this case, the value we read doesn't matter.)
  *
- * In vm86 mode, the hardware frame is much longer still, but we neither
- * access the extra members from NMI context, nor do we write such a
- * frame at sp0 at all.
+ * In vm86 mode, the hardware frame is much longer still, so add 16
+ * bytes to make room for the real-mode segments.
  *
  * x86_64 has a fixed-length stack frame.
  */
 #ifdef CONFIG_X86_32
-# define TOP_OF_KERNEL_STACK_PADDING 8
+# ifdef CONFIG_VM86
+#  define TOP_OF_KERNEL_STACK_PADDING 16
+# else
+#  define TOP_OF_KERNEL_STACK_PADDING 8
+# endif
 #else
 # define TOP_OF_KERNEL_STACK_PADDING 0
 #endif
@@ -140,27 +143,11 @@ struct thread_info {
 	 _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT |	\
 	 _TIF_NOHZ)
 
-/* work to do in syscall_trace_leave() */
-#define _TIF_WORK_SYSCALL_EXIT	\
-	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP |	\
-	 _TIF_SYSCALL_TRACEPOINT | _TIF_NOHZ)
-
-/* work to do on interrupt/exception return */
-#define _TIF_WORK_MASK							\
-	(0x0000FFFF &							\
-	 ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|			\
-	   _TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU))
-
 /* work to do on any return to user space */
 #define _TIF_ALLWORK_MASK						\
 	((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_TRACEPOINT |	\
 	_TIF_NOHZ)
 
-/* Only used for 64 bit */
-#define _TIF_DO_NOTIFY_MASK						\
-	(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME |				\
-	 _TIF_USER_RETURN_NOTIFY | _TIF_UPROBE)
-
 /* flags to check in __switch_to() */
 #define _TIF_WORK_CTXSW							\
 	(_TIF_IO_BITMAP|_TIF_NOTSC|_TIF_BLOCKSTEP)
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index cd791948b286..6df2029405a3 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -261,6 +261,12 @@ static inline void reset_lazy_tlbstate(void)
 
 #endif	/* SMP */
 
+/* Not inlined due to inc_irq_stat not being defined yet */
+#define flush_tlb_local() {		\
+	inc_irq_stat(irq_tlb_count);	\
+	local_flush_tlb();		\
+}
+
 #ifndef CONFIG_PARAVIRT
 #define flush_tlb_others(mask, mm, start, end)	\
 	native_flush_tlb_others(mask, mm, start, end)
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index c5380bea2a36..c3496619740a 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -112,8 +112,8 @@ asmlinkage void smp_threshold_interrupt(void);
 asmlinkage void smp_deferred_error_interrupt(void);
 #endif
 
-extern enum ctx_state ist_enter(struct pt_regs *regs);
-extern void ist_exit(struct pt_regs *regs, enum ctx_state prev_state);
+extern void ist_enter(struct pt_regs *regs);
+extern void ist_exit(struct pt_regs *regs);
 extern void ist_begin_non_atomic(struct pt_regs *regs);
 extern void ist_end_non_atomic(void);
 
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index aad56eb3bbe2..6d7c5479bcea 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -21,28 +21,12 @@ extern void disable_TSC(void);
 
 static inline cycles_t get_cycles(void)
 {
-	unsigned long long ret = 0;
-
 #ifndef CONFIG_X86_TSC
 	if (!cpu_has_tsc)
 		return 0;
 #endif
-	rdtscll(ret);
-
-	return ret;
-}
 
-static __always_inline cycles_t vget_cycles(void)
-{
-	/*
-	 * We only do VDSOs on TSC capable CPUs, so this shouldn't
-	 * access boot_cpu_data (which is not VDSO-safe):
-	 */
-#ifndef CONFIG_X86_TSC
-	if (!cpu_has_tsc)
-		return 0;
-#endif
-	return (cycles_t)__native_read_tsc();
+	return rdtsc();
 }
 
 extern void tsc_init(void);
diff --git a/arch/x86/include/asm/vm86.h b/arch/x86/include/asm/vm86.h
index 1d8de3f3feca..1e491f3af317 100644
--- a/arch/x86/include/asm/vm86.h
+++ b/arch/x86/include/asm/vm86.h
@@ -1,7 +1,6 @@
 #ifndef _ASM_X86_VM86_H
 #define _ASM_X86_VM86_H
 
-
 #include <asm/ptrace.h>
 #include <uapi/asm/vm86.h>
 
@@ -28,43 +27,49 @@ struct kernel_vm86_regs {
 	unsigned short gs, __gsh;
 };
 
-struct kernel_vm86_struct {
-	struct kernel_vm86_regs regs;
-/*
- * the below part remains on the kernel stack while we are in VM86 mode.
- * 'tss.esp0' then contains the address of VM86_TSS_ESP0 below, and when we
- * get forced back from VM86, the CPU and "SAVE_ALL" will restore the above
- * 'struct kernel_vm86_regs' with the then actual values.
- * Therefore, pt_regs in fact points to a complete 'kernel_vm86_struct'
- * in kernelspace, hence we need not reget the data from userspace.
- */
-#define VM86_TSS_ESP0 flags
+struct vm86 {
+	struct vm86plus_struct __user *user_vm86;
+	struct pt_regs regs32;
+	unsigned long veflags;
+	unsigned long veflags_mask;
+	unsigned long saved_sp0;
+
 	unsigned long flags;
 	unsigned long screen_bitmap;
 	unsigned long cpu_type;
 	struct revectored_struct int_revectored;
 	struct revectored_struct int21_revectored;
 	struct vm86plus_info_struct vm86plus;
-	struct pt_regs *regs32;   /* here we save the pointer to the old regs */
-/*
- * The below is not part of the structure, but the stack layout continues
- * this way. In front of 'return-eip' may be some data, depending on
- * compilation, so we don't rely on this and save the pointer to 'oldregs'
- * in 'regs32' above.
- * However, with GCC-2.7.2 and the current CFLAGS you see exactly this:
-
-	long return-eip;        from call to vm86()
-	struct pt_regs oldregs;  user space registers as saved by syscall
- */
 };
 
 #ifdef CONFIG_VM86
 
 void handle_vm86_fault(struct kernel_vm86_regs *, long);
 int handle_vm86_trap(struct kernel_vm86_regs *, long, int);
-struct pt_regs *save_v86_state(struct kernel_vm86_regs *);
+void save_v86_state(struct kernel_vm86_regs *, int);
 
 struct task_struct;
+
+#define free_vm86(t) do {				\
+	struct thread_struct *__t = (t);		\
+	if (__t->vm86 != NULL) {			\
+		kfree(__t->vm86);			\
+		__t->vm86 = NULL;			\
+	}						\
+} while (0)
+
+/*
+ * Support for VM86 programs to request interrupts for
+ * real mode hardware drivers:
+ */
+#define FIRST_VM86_IRQ		 3
+#define LAST_VM86_IRQ		15
+
+static inline int invalid_vm86_irq(int irq)
+{
+	return irq < FIRST_VM86_IRQ || irq > LAST_VM86_IRQ;
+}
+
 void release_vm86_irqs(struct task_struct *);
 
 #else
@@ -77,6 +82,10 @@ static inline int handle_vm86_trap(struct kernel_vm86_regs *a, long b, int c)
 	return 0;
 }
 
+static inline void save_v86_state(struct kernel_vm86_regs *a, int b) { }
+
+#define free_vm86(t) do { } while(0)
+
 #endif /* CONFIG_VM86 */
 
 #endif /* _ASM_X86_VM86_H */
diff --git a/arch/x86/include/asm/xen/events.h b/arch/x86/include/asm/xen/events.h
index 608a79d5a466..e6911caf5bbf 100644
--- a/arch/x86/include/asm/xen/events.h
+++ b/arch/x86/include/asm/xen/events.h
@@ -20,4 +20,15 @@ static inline int xen_irqs_disabled(struct pt_regs *regs)
 /* No need for a barrier -- XCHG is a barrier on x86. */
 #define xchg_xen_ulong(ptr, val) xchg((ptr), (val))
 
+extern int xen_have_vector_callback;
+
+/*
+ * Events delivered via platform PCI interrupts are always
+ * routed to vcpu 0 and hence cannot be rebound.
+ */
+static inline bool xen_support_evtchn_rebind(void)
+{
+	return (!xen_hvm_domain() || xen_have_vector_callback);
+}
+
 #endif /* _ASM_X86_XEN_EVENTS_H */
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index ca08a27b90b3..83aea8055119 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -465,6 +465,12 @@ HYPERVISOR_tmem_op(
 	return _hypercall1(int, tmem_op, op);
 }
 
+static inline int
+HYPERVISOR_xenpmu_op(unsigned int op, void *arg)
+{
+	return _hypercall2(int, xenpmu_op, op, arg);
+}
+
 static inline void
 MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set)
 {
diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h
index 3400dbaec3c3..62ca03ef5c65 100644
--- a/arch/x86/include/asm/xen/interface.h
+++ b/arch/x86/include/asm/xen/interface.h
@@ -3,12 +3,38 @@
  *
  * Guest OS interface to x86 Xen.
  *
- * Copyright (c) 2004, K A Fraser
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2004-2006, K A Fraser
  */
 
 #ifndef _ASM_X86_XEN_INTERFACE_H
 #define _ASM_X86_XEN_INTERFACE_H
 
+/*
+ * XEN_GUEST_HANDLE represents a guest pointer, when passed as a field
+ * in a struct in memory.
+ * XEN_GUEST_HANDLE_PARAM represents a guest pointer, when passed as a
+ * hypercall argument.
+ * XEN_GUEST_HANDLE_PARAM and XEN_GUEST_HANDLE are the same on X86 but
+ * they might not be on other architectures.
+ */
 #ifdef __XEN__
 #define __DEFINE_GUEST_HANDLE(name, type) \
     typedef struct { type *p; } __guest_handle_ ## name
@@ -88,13 +114,16 @@ DEFINE_GUEST_HANDLE(xen_ulong_t);
  * start of the GDT because some stupid OSes export hard-coded selector values
  * in their ABI. These hard-coded values are always near the start of the GDT,
  * so Xen places itself out of the way, at the far end of the GDT.
+ *
+ * NB The LDT is set using the MMUEXT_SET_LDT op of HYPERVISOR_mmuext_op
  */
 #define FIRST_RESERVED_GDT_PAGE  14
 #define FIRST_RESERVED_GDT_BYTE  (FIRST_RESERVED_GDT_PAGE * 4096)
 #define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8)
 
 /*
- * Send an array of these to HYPERVISOR_set_trap_table()
+ * Send an array of these to HYPERVISOR_set_trap_table().
+ * Terminate the array with a sentinel entry, with traps[].address==0.
  * The privilege level specifies which modes may enter a trap via a software
  * interrupt. On x86/64, since rings 1 and 2 are unavailable, we allocate
  * privilege levels as follows:
@@ -118,10 +147,41 @@ struct trap_info {
 DEFINE_GUEST_HANDLE_STRUCT(trap_info);
 
 struct arch_shared_info {
-    unsigned long max_pfn;                  /* max pfn that appears in table */
-    /* Frame containing list of mfns containing list of mfns containing p2m. */
-    unsigned long pfn_to_mfn_frame_list_list;
-    unsigned long nmi_reason;
+	/*
+	 * Number of valid entries in the p2m table(s) anchored at
+	 * pfn_to_mfn_frame_list_list and/or p2m_vaddr.
+	 */
+	unsigned long max_pfn;
+	/*
+	 * Frame containing list of mfns containing list of mfns containing p2m.
+	 * A value of 0 indicates it has not yet been set up, ~0 indicates it
+	 * has been set to invalid e.g. due to the p2m being too large for the
+	 * 3-level p2m tree. In this case the linear mapper p2m list anchored
+	 * at p2m_vaddr is to be used.
+	 */
+	xen_pfn_t pfn_to_mfn_frame_list_list;
+	unsigned long nmi_reason;
+	/*
+	 * Following three fields are valid if p2m_cr3 contains a value
+	 * different from 0.
+	 * p2m_cr3 is the root of the address space where p2m_vaddr is valid.
+	 * p2m_cr3 is in the same format as a cr3 value in the vcpu register
+	 * state and holds the folded machine frame number (via xen_pfn_to_cr3)
+	 * of a L3 or L4 page table.
+	 * p2m_vaddr holds the virtual address of the linear p2m list. All
+	 * entries in the range [0...max_pfn[ are accessible via this pointer.
+	 * p2m_generation will be incremented by the guest before and after each
+	 * change of the mappings of the p2m list. p2m_generation starts at 0
+	 * and a value with the least significant bit set indicates that a
+	 * mapping update is in progress. This allows guest external software
+	 * (e.g. in Dom0) to verify that read mappings are consistent and
+	 * whether they have changed since the last check.
+	 * Modifying a p2m element in the linear p2m list is allowed via an
+	 * atomic write only.
+	 */
+	unsigned long p2m_cr3;		/* cr3 value of the p2m address space */
+	unsigned long p2m_vaddr;	/* virtual address of the p2m list */
+	unsigned long p2m_generation;	/* generation count of p2m mapping */
 };
 #endif	/* !__ASSEMBLY__ */
 
@@ -137,13 +197,31 @@ struct arch_shared_info {
 /*
  * The following is all CPU context. Note that the fpu_ctxt block is filled
  * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used.
+ *
+ * Also note that when calling DOMCTL_setvcpucontext and VCPU_initialise
+ * for HVM and PVH guests, not all information in this structure is updated:
+ *
+ * - For HVM guests, the structures read include: fpu_ctxt (if
+ * VGCF_I387_VALID is set), flags, user_regs, debugreg[*]
+ *
+ * - PVH guests are the same as HVM guests, but additionally use ctrlreg[3] to
+ * set cr3. All other fields not used should be set to 0.
  */
 struct vcpu_guest_context {
     /* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */
     struct { char x[512]; } fpu_ctxt;       /* User-level FPU registers     */
-#define VGCF_I387_VALID (1<<0)
-#define VGCF_HVM_GUEST  (1<<1)
-#define VGCF_IN_KERNEL  (1<<2)
+#define VGCF_I387_VALID                (1<<0)
+#define VGCF_IN_KERNEL                 (1<<2)
+#define _VGCF_i387_valid               0
+#define VGCF_i387_valid                (1<<_VGCF_i387_valid)
+#define _VGCF_in_kernel                2
+#define VGCF_in_kernel                 (1<<_VGCF_in_kernel)
+#define _VGCF_failsafe_disables_events 3
+#define VGCF_failsafe_disables_events  (1<<_VGCF_failsafe_disables_events)
+#define _VGCF_syscall_disables_events  4
+#define VGCF_syscall_disables_events   (1<<_VGCF_syscall_disables_events)
+#define _VGCF_online                   5
+#define VGCF_online                    (1<<_VGCF_online)
     unsigned long flags;                    /* VGCF_* flags                 */
     struct cpu_user_regs user_regs;         /* User-level CPU registers     */
     struct trap_info trap_ctxt[256];        /* Virtual IDT                  */
@@ -172,6 +250,129 @@ struct vcpu_guest_context {
 #endif
 };
 DEFINE_GUEST_HANDLE_STRUCT(vcpu_guest_context);
+
+/* AMD PMU registers and structures */
+struct xen_pmu_amd_ctxt {
+	/*
+	 * Offsets to counter and control MSRs (relative to xen_pmu_arch.c.amd).
+	 * For PV(H) guests these fields are RO.
+	 */
+	uint32_t counters;
+	uint32_t ctrls;
+
+	/* Counter MSRs */
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+	uint64_t regs[];
+#elif defined(__GNUC__)
+	uint64_t regs[0];
+#endif
+};
+
+/* Intel PMU registers and structures */
+struct xen_pmu_cntr_pair {
+	uint64_t counter;
+	uint64_t control;
+};
+
+struct xen_pmu_intel_ctxt {
+	/*
+	 * Offsets to fixed and architectural counter MSRs (relative to
+	 * xen_pmu_arch.c.intel).
+	 * For PV(H) guests these fields are RO.
+	 */
+	uint32_t fixed_counters;
+	uint32_t arch_counters;
+
+	/* PMU registers */
+	uint64_t global_ctrl;
+	uint64_t global_ovf_ctrl;
+	uint64_t global_status;
+	uint64_t fixed_ctrl;
+	uint64_t ds_area;
+	uint64_t pebs_enable;
+	uint64_t debugctl;
+
+	/* Fixed and architectural counter MSRs */
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+	uint64_t regs[];
+#elif defined(__GNUC__)
+	uint64_t regs[0];
+#endif
+};
+
+/* Sampled domain's registers */
+struct xen_pmu_regs {
+	uint64_t ip;
+	uint64_t sp;
+	uint64_t flags;
+	uint16_t cs;
+	uint16_t ss;
+	uint8_t cpl;
+	uint8_t pad[3];
+};
+
+/* PMU flags */
+#define PMU_CACHED	   (1<<0) /* PMU MSRs are cached in the context */
+#define PMU_SAMPLE_USER	   (1<<1) /* Sample is from user or kernel mode */
+#define PMU_SAMPLE_REAL	   (1<<2) /* Sample is from realmode */
+#define PMU_SAMPLE_PV	   (1<<3) /* Sample from a PV guest */
+
+/*
+ * Architecture-specific information describing state of the processor at
+ * the time of PMU interrupt.
+ * Fields of this structure marked as RW for guest should only be written by
+ * the guest when PMU_CACHED bit in pmu_flags is set (which is done by the
+ * hypervisor during PMU interrupt). Hypervisor will read updated data in
+ * XENPMU_flush hypercall and clear PMU_CACHED bit.
+ */
+struct xen_pmu_arch {
+	union {
+		/*
+		 * Processor's registers at the time of interrupt.
+		 * WO for hypervisor, RO for guests.
+		 */
+		struct xen_pmu_regs regs;
+		/*
+		 * Padding for adding new registers to xen_pmu_regs in
+		 * the future
+		 */
+#define XENPMU_REGS_PAD_SZ  64
+		uint8_t pad[XENPMU_REGS_PAD_SZ];
+	} r;
+
+	/* WO for hypervisor, RO for guest */
+	uint64_t pmu_flags;
+
+	/*
+	 * APIC LVTPC register.
+	 * RW for both hypervisor and guest.
+	 * Only APIC_LVT_MASKED bit is loaded by the hypervisor into hardware
+	 * during XENPMU_flush or XENPMU_lvtpc_set.
+	 */
+	union {
+		uint32_t lapic_lvtpc;
+		uint64_t pad;
+	} l;
+
+	/*
+	 * Vendor-specific PMU registers.
+	 * RW for both hypervisor and guest (see exceptions above).
+	 * Guest's updates to this field are verified and then loaded by the
+	 * hypervisor into hardware during XENPMU_flush
+	 */
+	union {
+		struct xen_pmu_amd_ctxt amd;
+		struct xen_pmu_intel_ctxt intel;
+
+		/*
+		 * Padding for contexts (fixed parts only, does not include
+		 * MSR banks that are specified by offsets)
+		 */
+#define XENPMU_CTXT_PAD_SZ  128
+		uint8_t pad[XENPMU_CTXT_PAD_SZ];
+	} c;
+};
+
 #endif	/* !__ASSEMBLY__ */
 
 /*
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index c44a5d53e464..0679e11d2cf7 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -35,9 +35,7 @@ typedef struct xpaddr {
 #define FOREIGN_FRAME(m)	((m) | FOREIGN_FRAME_BIT)
 #define IDENTITY_FRAME(m)	((m) | IDENTITY_FRAME_BIT)
 
-/* Maximum amount of memory we can handle in a domain in pages */
-#define MAX_DOMAIN_PAGES						\
-    ((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE))
+#define P2M_PER_PAGE		(PAGE_SIZE / sizeof(unsigned long))
 
 extern unsigned long *machine_to_phys_mapping;
 extern unsigned long  machine_to_phys_nr;
@@ -48,8 +46,8 @@ extern unsigned long  xen_max_p2m_pfn;
 extern unsigned long get_phys_to_machine(unsigned long pfn);
 extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);
 extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
-extern unsigned long set_phys_range_identity(unsigned long pfn_s,
-					     unsigned long pfn_e);
+extern unsigned long __init set_phys_range_identity(unsigned long pfn_s,
+						    unsigned long pfn_e);
 
 extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
 				   struct gnttab_map_grant_ref *kmap_ops,
@@ -103,6 +101,11 @@ static inline unsigned long pfn_to_mfn(unsigned long pfn)
 {
 	unsigned long mfn;
 
+	/*
+	 * Some x86 code is still using pfn_to_mfn instead of
+	 * pfn_to_gfn. This will have to be removed when we figure
+	 * out which call.
+	 */
 	if (xen_feature(XENFEAT_auto_translated_physmap))
 		return pfn;
 
@@ -149,6 +152,11 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
 {
 	unsigned long pfn;
 
+	/*
+	 * Some x86 code is still using mfn_to_pfn instead of
+	 * gfn_to_pfn. This will have to be removed when we figure
+	 * out which call.
+	 */
 	if (xen_feature(XENFEAT_auto_translated_physmap))
 		return mfn;
 
@@ -178,6 +186,27 @@ static inline xpaddr_t machine_to_phys(xmaddr_t machine)
 	return XPADDR(PFN_PHYS(mfn_to_pfn(PFN_DOWN(machine.maddr))) | offset);
 }
 
+/* Pseudo-physical <-> Guest conversion */
+static inline unsigned long pfn_to_gfn(unsigned long pfn)
+{
+	if (xen_feature(XENFEAT_auto_translated_physmap))
+		return pfn;
+	else
+		return pfn_to_mfn(pfn);
+}
+
+static inline unsigned long gfn_to_pfn(unsigned long gfn)
+{
+	if (xen_feature(XENFEAT_auto_translated_physmap))
+		return gfn;
+	else
+		return mfn_to_pfn(gfn);
+}
+
+/* Pseudo-physical <-> Bus conversion */
+#define pfn_to_bfn(pfn)		pfn_to_gfn(pfn)
+#define bfn_to_pfn(bfn)		gfn_to_pfn(bfn)
+
 /*
  * We detect special mappings in one of two ways:
  *  1. If the MFN is an I/O page then Xen will set the m2p entry
@@ -198,7 +227,7 @@ static inline xpaddr_t machine_to_phys(xmaddr_t machine)
  *      require. In all the cases we care about, the FOREIGN_FRAME bit is
  *      masked (e.g., pfn_to_mfn()) so behaviour there is correct.
  */
-static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
+static inline unsigned long bfn_to_local_pfn(unsigned long mfn)
 {
 	unsigned long pfn;
 
@@ -217,6 +246,10 @@ static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
 #define virt_to_mfn(v)		(pfn_to_mfn(virt_to_pfn(v)))
 #define mfn_to_virt(m)		(__va(mfn_to_pfn(m) << PAGE_SHIFT))
 
+/* VIRT <-> GUEST conversion */
+#define virt_to_gfn(v)		(pfn_to_gfn(virt_to_pfn(v)))
+#define gfn_to_virt(g)		(__va(gfn_to_pfn(g) << PAGE_SHIFT))
+
 static inline unsigned long pte_mfn(pte_t pte)
 {
 	return (pte.pte & PTE_PFN_MASK) >> PAGE_SHIFT;
@@ -264,7 +297,7 @@ void make_lowmem_page_readwrite(void *vaddr);
 
 static inline bool xen_arch_need_swiotlb(struct device *dev,
 					 unsigned long pfn,
-					 unsigned long mfn)
+					 unsigned long bfn)
 {
 	return false;
 }
diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h
index ab456dc233b5..329254373479 100644
--- a/arch/x86/include/uapi/asm/bootparam.h
+++ b/arch/x86/include/uapi/asm/bootparam.h
@@ -120,7 +120,7 @@ struct boot_params {
 	__u8  _pad3[16];				/* 0x070 */
 	__u8  hd0_info[16];	/* obsolete! */		/* 0x080 */
 	__u8  hd1_info[16];	/* obsolete! */		/* 0x090 */
-	struct sys_desc_table sys_desc_table;		/* 0x0a0 */
+	struct sys_desc_table sys_desc_table; /* obsolete! */	/* 0x0a0 */
 	struct olpc_ofw_header olpc_ofw_header;		/* 0x0b0 */
 	__u32 ext_ramdisk_image;			/* 0x0c0 */
 	__u32 ext_ramdisk_size;				/* 0x0c4 */
diff --git a/arch/x86/include/uapi/asm/e820.h b/arch/x86/include/uapi/asm/e820.h
index 0f457e6eab18..9dafe59cf6e2 100644
--- a/arch/x86/include/uapi/asm/e820.h
+++ b/arch/x86/include/uapi/asm/e820.h
@@ -37,7 +37,7 @@
 /*
  * This is a non-standardized way to represent ADR or NVDIMM regions that
  * persist over a reboot.  The kernel will ignore their special capabilities
- * unless the CONFIG_X86_PMEM_LEGACY=y option is set.
+ * unless the CONFIG_X86_PMEM_LEGACY option is set.
  *
  * ( Note that older platforms also used 6 for the same type of memory,
  *   but newer versions switched to 12 as 6 was assigned differently.  Some
diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
index 180a0c3c224d..79887abcb5e1 100644
--- a/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -37,8 +37,6 @@
 #define X86_EFLAGS_VM		_BITUL(X86_EFLAGS_VM_BIT)
 #define X86_EFLAGS_AC_BIT	18 /* Alignment Check/Access Control */
 #define X86_EFLAGS_AC		_BITUL(X86_EFLAGS_AC_BIT)
-#define X86_EFLAGS_AC_BIT	18 /* Alignment Check/Access Control */
-#define X86_EFLAGS_AC		_BITUL(X86_EFLAGS_AC_BIT)
 #define X86_EFLAGS_VIF_BIT	19 /* Virtual Interrupt Flag */
 #define X86_EFLAGS_VIF		_BITUL(X86_EFLAGS_VIF_BIT)
 #define X86_EFLAGS_VIP_BIT	20 /* Virtual Interrupt Pending */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 0f15af41bd80..b1b78ffe01d0 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -23,8 +23,10 @@ KASAN_SANITIZE_dumpstack_$(BITS).o := n
 CFLAGS_irq.o := -I$(src)/../include/asm/trace
 
 obj-y			:= process_$(BITS).o signal.o
+obj-$(CONFIG_COMPAT)	+= signal_compat.o
 obj-y			+= traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
-obj-y			+= time.o ioport.o ldt.o dumpstack.o nmi.o
+obj-y			+= time.o ioport.o dumpstack.o nmi.o
+obj-$(CONFIG_MODIFY_LDT_SYSCALL)	+= ldt.o
 obj-y			+= setup.o x86_init.o i8259.o irqinit.o jump_label.o
 obj-$(CONFIG_IRQ_WORK)  += irq_work.o
 obj-y			+= probe_roms.o
@@ -69,8 +71,8 @@ obj-$(CONFIG_LIVEPATCH)		+= livepatch.o
 obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
 obj-$(CONFIG_FTRACE_SYSCALLS)	+= ftrace.o
 obj-$(CONFIG_X86_TSC)		+= trace_clock.o
-obj-$(CONFIG_KEXEC)		+= machine_kexec_$(BITS).o
-obj-$(CONFIG_KEXEC)		+= relocate_kernel_$(BITS).o crash.o
+obj-$(CONFIG_KEXEC_CORE)	+= machine_kexec_$(BITS).o
+obj-$(CONFIG_KEXEC_CORE)	+= relocate_kernel_$(BITS).o crash.o
 obj-$(CONFIG_KEXEC_FILE)	+= kexec-bzimage64.o
 obj-$(CONFIG_CRASH_DUMP)	+= crash_dump_$(BITS).o
 obj-y				+= kprobes/
@@ -92,7 +94,7 @@ obj-$(CONFIG_KVM_GUEST)		+= kvm.o kvmclock.o
 obj-$(CONFIG_PARAVIRT)		+= paravirt.o paravirt_patch_$(BITS).o
 obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
 obj-$(CONFIG_PARAVIRT_CLOCK)	+= pvclock.o
-obj-$(CONFIG_X86_PMEM_LEGACY)	+= pmem.o
+obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o
 
 obj-$(CONFIG_PCSPKR_PLATFORM)	+= pcspeaker.o
 
@@ -107,8 +109,6 @@ obj-$(CONFIG_EFI)			+= sysfb_efi.o
 
 obj-$(CONFIG_PERF_EVENTS)		+= perf_regs.o
 obj-$(CONFIG_TRACING)			+= tracepoint.o
-obj-$(CONFIG_IOSF_MBI)			+= iosf_mbi.o
-obj-$(CONFIG_PMC_ATOM)			+= pmc_atom.o
 
 ###
 # 64 bit specific files
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index e49ee24da85e..ded848c20e05 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -445,6 +445,7 @@ static void __init acpi_sci_ioapic_setup(u8 bus_irq, u16 polarity, u16 trigger,
 		polarity = acpi_sci_flags & ACPI_MADT_POLARITY_MASK;
 
 	mp_override_legacy_irq(bus_irq, polarity, trigger, gsi);
+	acpi_penalize_sci_irq(bus_irq, trigger, polarity);
 
 	/*
 	 * stash over-ride to indicate we've been here
@@ -710,7 +711,7 @@ static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
 #endif
 }
 
-static int _acpi_map_lsapic(acpi_handle handle, int physid, int *pcpu)
+int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, int *pcpu)
 {
 	int cpu;
 
@@ -726,12 +727,6 @@ static int _acpi_map_lsapic(acpi_handle handle, int physid, int *pcpu)
 	*pcpu = cpu;
 	return 0;
 }
-
-/* wrapper to silence section mismatch warning */
-int __ref acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, int *pcpu)
-{
-	return _acpi_map_lsapic(handle, physid, pcpu);
-}
 EXPORT_SYMBOL(acpi_map_cpu);
 
 int acpi_unmap_cpu(int cpu)
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index ede92c3364d3..222a57076039 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -263,7 +263,7 @@ static int apbt_clocksource_register(void)
 
 	/* Verify whether apbt counter works */
 	t1 = dw_apb_clocksource_read(clocksource_apbt);
-	rdtscll(start);
+	start = rdtsc();
 
 	/*
 	 * We don't know the TSC frequency yet, but waiting for
@@ -273,7 +273,7 @@ static int apbt_clocksource_register(void)
 	 */
 	do {
 		rep_nop();
-		rdtscll(now);
+		now = rdtsc();
 	} while ((now - start) < 200000UL);
 
 	/* APBT is the only always on clocksource, it has to work! */
@@ -390,13 +390,13 @@ unsigned long apbt_quick_calibrate(void)
 	old = dw_apb_clocksource_read(clocksource_apbt);
 	old += loop;
 
-	t1 = __native_read_tsc();
+	t1 = rdtsc();
 
 	do {
 		new = dw_apb_clocksource_read(clocksource_apbt);
 	} while (new < old);
 
-	t2 = __native_read_tsc();
+	t2 = rdtsc();
 
 	shift = 5;
 	if (unlikely(loop >> shift == 0)) {
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index cde732c1b495..3ca3e46aa405 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -457,45 +457,45 @@ static int lapic_next_deadline(unsigned long delta,
 {
 	u64 tsc;
 
-	rdtscll(tsc);
+	tsc = rdtsc();
 	wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR));
 	return 0;
 }
 
-/*
- * Setup the lapic timer in periodic or oneshot mode
- */
-static void lapic_timer_setup(enum clock_event_mode mode,
-			      struct clock_event_device *evt)
+static int lapic_timer_shutdown(struct clock_event_device *evt)
 {
-	unsigned long flags;
 	unsigned int v;
 
 	/* Lapic used as dummy for broadcast ? */
 	if (evt->features & CLOCK_EVT_FEAT_DUMMY)
-		return;
+		return 0;
 
-	local_irq_save(flags);
+	v = apic_read(APIC_LVTT);
+	v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
+	apic_write(APIC_LVTT, v);
+	apic_write(APIC_TMICT, 0);
+	return 0;
+}
 
-	switch (mode) {
-	case CLOCK_EVT_MODE_PERIODIC:
-	case CLOCK_EVT_MODE_ONESHOT:
-		__setup_APIC_LVTT(lapic_timer_frequency,
-				  mode != CLOCK_EVT_MODE_PERIODIC, 1);
-		break;
-	case CLOCK_EVT_MODE_UNUSED:
-	case CLOCK_EVT_MODE_SHUTDOWN:
-		v = apic_read(APIC_LVTT);
-		v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
-		apic_write(APIC_LVTT, v);
-		apic_write(APIC_TMICT, 0);
-		break;
-	case CLOCK_EVT_MODE_RESUME:
-		/* Nothing to do here */
-		break;
-	}
+static inline int
+lapic_timer_set_periodic_oneshot(struct clock_event_device *evt, bool oneshot)
+{
+	/* Lapic used as dummy for broadcast ? */
+	if (evt->features & CLOCK_EVT_FEAT_DUMMY)
+		return 0;
 
-	local_irq_restore(flags);
+	__setup_APIC_LVTT(lapic_timer_frequency, oneshot, 1);
+	return 0;
+}
+
+static int lapic_timer_set_periodic(struct clock_event_device *evt)
+{
+	return lapic_timer_set_periodic_oneshot(evt, false);
+}
+
+static int lapic_timer_set_oneshot(struct clock_event_device *evt)
+{
+	return lapic_timer_set_periodic_oneshot(evt, true);
 }
 
 /*
@@ -513,15 +513,18 @@ static void lapic_timer_broadcast(const struct cpumask *mask)
  * The local apic timer can be used for any function which is CPU local.
  */
 static struct clock_event_device lapic_clockevent = {
-	.name		= "lapic",
-	.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
-			| CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY,
-	.shift		= 32,
-	.set_mode	= lapic_timer_setup,
-	.set_next_event	= lapic_next_event,
-	.broadcast	= lapic_timer_broadcast,
-	.rating		= 100,
-	.irq		= -1,
+	.name			= "lapic",
+	.features		= CLOCK_EVT_FEAT_PERIODIC |
+				  CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_C3STOP
+				  | CLOCK_EVT_FEAT_DUMMY,
+	.shift			= 32,
+	.set_state_shutdown	= lapic_timer_shutdown,
+	.set_state_periodic	= lapic_timer_set_periodic,
+	.set_state_oneshot	= lapic_timer_set_oneshot,
+	.set_next_event		= lapic_next_event,
+	.broadcast		= lapic_timer_broadcast,
+	.rating			= 100,
+	.irq			= -1,
 };
 static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
 
@@ -592,7 +595,7 @@ static void __init lapic_cal_handler(struct clock_event_device *dev)
 	unsigned long pm = acpi_pm_read_early();
 
 	if (cpu_has_tsc)
-		rdtscll(tsc);
+		tsc = rdtsc();
 
 	switch (lapic_cal_loops++) {
 	case 0:
@@ -778,7 +781,7 @@ static int __init calibrate_APIC_clock(void)
 		 * Setup the apic timer manually
 		 */
 		levt->event_handler = lapic_cal_handler;
-		lapic_timer_setup(CLOCK_EVT_MODE_PERIODIC, levt);
+		lapic_timer_set_periodic(levt);
 		lapic_cal_loops = -1;
 
 		/* Let the interrupts run */
@@ -788,7 +791,8 @@ static int __init calibrate_APIC_clock(void)
 			cpu_relax();
 
 		/* Stop the lapic timer */
-		lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, levt);
+		local_irq_disable();
+		lapic_timer_shutdown(levt);
 
 		/* Jiffies delta */
 		deltaj = lapic_cal_j2 - lapic_cal_j1;
@@ -799,8 +803,8 @@ static int __init calibrate_APIC_clock(void)
 			apic_printk(APIC_VERBOSE, "... jiffies result ok\n");
 		else
 			levt->features |= CLOCK_EVT_FEAT_DUMMY;
-	} else
-		local_irq_enable();
+	}
+	local_irq_enable();
 
 	if (levt->features & CLOCK_EVT_FEAT_DUMMY) {
 		pr_warning("APIC timer disabled due to verification failure\n");
@@ -878,7 +882,7 @@ static void local_apic_timer_interrupt(void)
 	if (!evt->event_handler) {
 		pr_warning("Spurious LAPIC timer interrupt on cpu %d\n", cpu);
 		/* Switch it off */
-		lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt);
+		lapic_timer_shutdown(evt);
 		return;
 	}
 
@@ -1209,7 +1213,7 @@ void setup_local_APIC(void)
 	long long max_loops = cpu_khz ? cpu_khz : 1000000;
 
 	if (cpu_has_tsc)
-		rdtscll(tsc);
+		tsc = rdtsc();
 
 	if (disable_apic) {
 		disable_ioapic_support();
@@ -1293,7 +1297,7 @@ void setup_local_APIC(void)
 		}
 		if (queued) {
 			if (cpu_has_tsc && cpu_khz) {
-				rdtscll(ntsc);
+				ntsc = rdtsc();
 				max_loops = (cpu_khz << 10) - (ntsc - tsc);
 			} else
 				max_loops--;
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index de918c410eae..f92ab36979a2 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -191,7 +191,6 @@ static struct apic apic_flat =  {
 	.send_IPI_all			= flat_send_IPI_all,
 	.send_IPI_self			= apic_send_IPI_self,
 
-	.wait_for_init_deassert		= false,
 	.inquire_remote_apic		= default_inquire_remote_apic,
 
 	.read				= native_apic_mem_read,
@@ -299,7 +298,6 @@ static struct apic apic_physflat =  {
 	.send_IPI_all			= physflat_send_IPI_all,
 	.send_IPI_self			= apic_send_IPI_self,
 
-	.wait_for_init_deassert		= false,
 	.inquire_remote_apic		= default_inquire_remote_apic,
 
 	.read				= native_apic_mem_read,
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index b205cdbdbe6a..0d96749cfcac 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -152,7 +152,6 @@ struct apic apic_noop = {
 
 	.wakeup_secondary_cpu		= noop_wakeup_secondary_cpu,
 
-	.wait_for_init_deassert		= false,
 	.inquire_remote_apic		= NULL,
 
 	.read				= noop_apic_read,
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index 017149cded07..b548fd3b764b 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -92,7 +92,6 @@ static int numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip)
 
 	write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v);
 
-	atomic_set(&init_deasserted, 1);
 	return 0;
 }
 
@@ -235,7 +234,6 @@ static const struct apic apic_numachip __refconst = {
 	.send_IPI_self			= numachip_send_IPI_self,
 
 	.wakeup_secondary_cpu		= numachip_wakeup_secondary,
-	.wait_for_init_deassert		= false,
 	.inquire_remote_apic		= NULL, /* REMRD not supported */
 
 	.read				= native_apic_mem_read,
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index c4a8d63f8220..971cf8875939 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -186,7 +186,6 @@ static struct apic apic_bigsmp = {
 	.send_IPI_all			= bigsmp_send_IPI_all,
 	.send_IPI_self			= default_send_IPI_self,
 
-	.wait_for_init_deassert		= true,
 	.inquire_remote_apic		= default_inquire_remote_apic,
 
 	.read				= native_apic_mem_read,
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index 6873ab925d00..045e424fb368 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -28,146 +28,21 @@ u64 hw_nmi_get_sample_period(int watchdog_thresh)
 #endif
 
 #ifdef arch_trigger_all_cpu_backtrace
-/* For reliability, we're prepared to waste bits here. */
-static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
-static cpumask_t printtrace_mask;
-
-#define NMI_BUF_SIZE		4096
-
-struct nmi_seq_buf {
-	unsigned char		buffer[NMI_BUF_SIZE];
-	struct seq_buf		seq;
-};
-
-/* Safe printing in NMI context */
-static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq);
-
-/* "in progress" flag of arch_trigger_all_cpu_backtrace */
-static unsigned long backtrace_flag;
-
-static void print_seq_line(struct nmi_seq_buf *s, int start, int end)
+static void nmi_raise_cpu_backtrace(cpumask_t *mask)
 {
-	const char *buf = s->buffer + start;
-
-	printk("%.*s", (end - start) + 1, buf);
+	apic->send_IPI_mask(mask, NMI_VECTOR);
 }
 
 void arch_trigger_all_cpu_backtrace(bool include_self)
 {
-	struct nmi_seq_buf *s;
-	int len;
-	int cpu;
-	int i;
-	int this_cpu = get_cpu();
-
-	if (test_and_set_bit(0, &backtrace_flag)) {
-		/*
-		 * If there is already a trigger_all_cpu_backtrace() in progress
-		 * (backtrace_flag == 1), don't output double cpu dump infos.
-		 */
-		put_cpu();
-		return;
-	}
-
-	cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
-	if (!include_self)
-		cpumask_clear_cpu(this_cpu, to_cpumask(backtrace_mask));
-
-	cpumask_copy(&printtrace_mask, to_cpumask(backtrace_mask));
-	/*
-	 * Set up per_cpu seq_buf buffers that the NMIs running on the other
-	 * CPUs will write to.
-	 */
-	for_each_cpu(cpu, to_cpumask(backtrace_mask)) {
-		s = &per_cpu(nmi_print_seq, cpu);
-		seq_buf_init(&s->seq, s->buffer, NMI_BUF_SIZE);
-	}
-
-	if (!cpumask_empty(to_cpumask(backtrace_mask))) {
-		pr_info("sending NMI to %s CPUs:\n",
-			(include_self ? "all" : "other"));
-		apic->send_IPI_mask(to_cpumask(backtrace_mask), NMI_VECTOR);
-	}
-
-	/* Wait for up to 10 seconds for all CPUs to do the backtrace */
-	for (i = 0; i < 10 * 1000; i++) {
-		if (cpumask_empty(to_cpumask(backtrace_mask)))
-			break;
-		mdelay(1);
-		touch_softlockup_watchdog();
-	}
-
-	/*
-	 * Now that all the NMIs have triggered, we can dump out their
-	 * back traces safely to the console.
-	 */
-	for_each_cpu(cpu, &printtrace_mask) {
-		int last_i = 0;
-
-		s = &per_cpu(nmi_print_seq, cpu);
-		len = seq_buf_used(&s->seq);
-		if (!len)
-			continue;
-
-		/* Print line by line. */
-		for (i = 0; i < len; i++) {
-			if (s->buffer[i] == '\n') {
-				print_seq_line(s, last_i, i);
-				last_i = i + 1;
-			}
-		}
-		/* Check if there was a partial line. */
-		if (last_i < len) {
-			print_seq_line(s, last_i, len - 1);
-			pr_cont("\n");
-		}
-	}
-
-	clear_bit(0, &backtrace_flag);
-	smp_mb__after_atomic();
-	put_cpu();
-}
-
-/*
- * It is not safe to call printk() directly from NMI handlers.
- * It may be fine if the NMI detected a lock up and we have no choice
- * but to do so, but doing a NMI on all other CPUs to get a back trace
- * can be done with a sysrq-l. We don't want that to lock up, which
- * can happen if the NMI interrupts a printk in progress.
- *
- * Instead, we redirect the vprintk() to this nmi_vprintk() that writes
- * the content into a per cpu seq_buf buffer. Then when the NMIs are
- * all done, we can safely dump the contents of the seq_buf to a printk()
- * from a non NMI context.
- */
-static int nmi_vprintk(const char *fmt, va_list args)
-{
-	struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
-	unsigned int len = seq_buf_used(&s->seq);
-
-	seq_buf_vprintf(&s->seq, fmt, args);
-	return seq_buf_used(&s->seq) - len;
+	nmi_trigger_all_cpu_backtrace(include_self, nmi_raise_cpu_backtrace);
 }
 
 static int
 arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
 {
-	int cpu;
-
-	cpu = smp_processor_id();
-
-	if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
-		printk_func_t printk_func_save = this_cpu_read(printk_func);
-
-		/* Replace printk to write into the NMI seq */
-		this_cpu_write(printk_func, nmi_vprintk);
-		printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
-		show_regs(regs);
-		this_cpu_write(printk_func, printk_func_save);
-
-		cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
+	if (nmi_cpu_backtrace(regs))
 		return NMI_HANDLED;
-	}
 
 	return NMI_DONE;
 }
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 206052e55517..38a76f826530 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2541,7 +2541,7 @@ void __init setup_ioapic_dest(void)
 		 * Honour affinities which have been set in early boot
 		 */
 		if (!irqd_can_balance(idata) || irqd_affinity_was_set(idata))
-			mask = idata->affinity;
+			mask = irq_data_get_affinity_mask(idata);
 		else
 			mask = apic->target_cpus();
 
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index 1a9d735e09c6..5f1feb6854af 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -264,7 +264,7 @@ static inline int hpet_dev_id(struct irq_domain *domain)
 
 static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg)
 {
-	hpet_msi_write(data->handler_data, msg);
+	hpet_msi_write(irq_data_get_irq_handler_data(data), msg);
 }
 
 static struct irq_chip hpet_msi_controller = {
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index bda488680dbc..7694ae6c1199 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -111,7 +111,6 @@ static struct apic apic_default = {
 	.send_IPI_all			= default_send_IPI_all,
 	.send_IPI_self			= default_send_IPI_self,
 
-	.wait_for_init_deassert		= true,
 	.inquire_remote_apic		= default_inquire_remote_apic,
 
 	.read				= native_apic_mem_read,
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 2683f36e4e0a..1bbd0fe2c806 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -169,8 +169,7 @@ next:
 			goto next;
 
 		for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask) {
-			if (per_cpu(vector_irq, new_cpu)[vector] >
-			    VECTOR_UNDEFINED)
+			if (!IS_ERR_OR_NULL(per_cpu(vector_irq, new_cpu)[vector]))
 				goto next;
 		}
 		/* Found one! */
@@ -182,7 +181,7 @@ next:
 			   cpumask_intersects(d->old_domain, cpu_online_mask);
 		}
 		for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask)
-			per_cpu(vector_irq, new_cpu)[vector] = irq;
+			per_cpu(vector_irq, new_cpu)[vector] = irq_to_desc(irq);
 		d->cfg.vector = vector;
 		cpumask_copy(d->domain, vector_cpumask);
 		err = 0;
@@ -224,15 +223,16 @@ static int assign_irq_vector_policy(int irq, int node,
 
 static void clear_irq_vector(int irq, struct apic_chip_data *data)
 {
-	int cpu, vector;
+	struct irq_desc *desc;
 	unsigned long flags;
+	int cpu, vector;
 
 	raw_spin_lock_irqsave(&vector_lock, flags);
 	BUG_ON(!data->cfg.vector);
 
 	vector = data->cfg.vector;
 	for_each_cpu_and(cpu, data->domain, cpu_online_mask)
-		per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
+		per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED;
 
 	data->cfg.vector = 0;
 	cpumask_clear(data->domain);
@@ -242,12 +242,13 @@ static void clear_irq_vector(int irq, struct apic_chip_data *data)
 		return;
 	}
 
+	desc = irq_to_desc(irq);
 	for_each_cpu_and(cpu, data->old_domain, cpu_online_mask) {
 		for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
 		     vector++) {
-			if (per_cpu(vector_irq, cpu)[vector] != irq)
+			if (per_cpu(vector_irq, cpu)[vector] != desc)
 				continue;
-			per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
+			per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED;
 			break;
 		}
 	}
@@ -296,7 +297,7 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq,
 	struct irq_alloc_info *info = arg;
 	struct apic_chip_data *data;
 	struct irq_data *irq_data;
-	int i, err;
+	int i, err, node;
 
 	if (disable_apic)
 		return -ENXIO;
@@ -308,12 +309,13 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq,
 	for (i = 0; i < nr_irqs; i++) {
 		irq_data = irq_domain_get_irq_data(domain, virq + i);
 		BUG_ON(!irq_data);
+		node = irq_data_get_node(irq_data);
 #ifdef	CONFIG_X86_IO_APIC
 		if (virq + i < nr_legacy_irqs() && legacy_irq_data[virq + i])
 			data = legacy_irq_data[virq + i];
 		else
 #endif
-			data = alloc_apic_chip_data(irq_data->node);
+			data = alloc_apic_chip_data(node);
 		if (!data) {
 			err = -ENOMEM;
 			goto error;
@@ -322,8 +324,7 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq,
 		irq_data->chip = &lapic_controller;
 		irq_data->chip_data = data;
 		irq_data->hwirq = virq + i;
-		err = assign_irq_vector_policy(virq + i, irq_data->node, data,
-					       info);
+		err = assign_irq_vector_policy(virq + i, node, data, info);
 		if (err)
 			goto error;
 	}
@@ -403,32 +404,32 @@ int __init arch_early_irq_init(void)
 	return arch_early_ioapic_init();
 }
 
+/* Initialize vector_irq on a new cpu */
 static void __setup_vector_irq(int cpu)
 {
-	/* Initialize vector_irq on a new cpu */
-	int irq, vector;
 	struct apic_chip_data *data;
+	struct irq_desc *desc;
+	int irq, vector;
 
 	/* Mark the inuse vectors */
-	for_each_active_irq(irq) {
-		data = apic_chip_data(irq_get_irq_data(irq));
-		if (!data)
-			continue;
+	for_each_irq_desc(irq, desc) {
+		struct irq_data *idata = irq_desc_get_irq_data(desc);
 
-		if (!cpumask_test_cpu(cpu, data->domain))
+		data = apic_chip_data(idata);
+		if (!data || !cpumask_test_cpu(cpu, data->domain))
 			continue;
 		vector = data->cfg.vector;
-		per_cpu(vector_irq, cpu)[vector] = irq;
+		per_cpu(vector_irq, cpu)[vector] = desc;
 	}
 	/* Mark the free vectors */
 	for (vector = 0; vector < NR_VECTORS; ++vector) {
-		irq = per_cpu(vector_irq, cpu)[vector];
-		if (irq <= VECTOR_UNDEFINED)
+		desc = per_cpu(vector_irq, cpu)[vector];
+		if (IS_ERR_OR_NULL(desc))
 			continue;
 
-		data = apic_chip_data(irq_get_irq_data(irq));
+		data = apic_chip_data(irq_desc_get_irq_data(desc));
 		if (!cpumask_test_cpu(cpu, data->domain))
-			per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
+			per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED;
 	}
 }
 
@@ -448,7 +449,7 @@ void setup_vector_irq(int cpu)
 	 * legacy vector to irq mapping:
 	 */
 	for (irq = 0; irq < nr_legacy_irqs(); irq++)
-		per_cpu(vector_irq, cpu)[ISA_IRQ_VECTOR(irq)] = irq;
+		per_cpu(vector_irq, cpu)[ISA_IRQ_VECTOR(irq)] = irq_to_desc(irq);
 
 	__setup_vector_irq(cpu);
 }
@@ -490,7 +491,8 @@ static int apic_set_affinity(struct irq_data *irq_data,
 	if (err) {
 		struct irq_data *top = irq_get_irq_data(irq);
 
-		if (assign_irq_vector(irq, data, top->affinity))
+		if (assign_irq_vector(irq, data,
+				      irq_data_get_affinity_mask(top)))
 			pr_err("Failed to recover vector for irq %d\n", irq);
 		return err;
 	}
@@ -538,27 +540,30 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
 
 	entering_ack_irq();
 
+	/* Prevent vectors vanishing under us */
+	raw_spin_lock(&vector_lock);
+
 	me = smp_processor_id();
 	for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
-		int irq;
-		unsigned int irr;
-		struct irq_desc *desc;
 		struct apic_chip_data *data;
+		struct irq_desc *desc;
+		unsigned int irr;
 
-		irq = __this_cpu_read(vector_irq[vector]);
-
-		if (irq <= VECTOR_UNDEFINED)
+	retry:
+		desc = __this_cpu_read(vector_irq[vector]);
+		if (IS_ERR_OR_NULL(desc))
 			continue;
 
-		desc = irq_to_desc(irq);
-		if (!desc)
-			continue;
+		if (!raw_spin_trylock(&desc->lock)) {
+			raw_spin_unlock(&vector_lock);
+			cpu_relax();
+			raw_spin_lock(&vector_lock);
+			goto retry;
+		}
 
-		data = apic_chip_data(&desc->irq_data);
+		data = apic_chip_data(irq_desc_get_irq_data(desc));
 		if (!data)
-			continue;
-
-		raw_spin_lock(&desc->lock);
+			goto unlock;
 
 		/*
 		 * Check if the irq migration is in progress. If so, we
@@ -583,11 +588,13 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
 			apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR);
 			goto unlock;
 		}
-		__this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED);
+		__this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
 unlock:
 		raw_spin_unlock(&desc->lock);
 	}
 
+	raw_spin_unlock(&vector_lock);
+
 	exiting_irq();
 }
 
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index ab3219b3fbda..cc8311c4d298 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -182,7 +182,7 @@ update_clusterinfo(struct notifier_block *nfb, unsigned long action, void *hcpu)
 	return notifier_from_errno(err);
 }
 
-static struct notifier_block __refdata x2apic_cpu_notifier = {
+static struct notifier_block x2apic_cpu_notifier = {
 	.notifier_call = update_clusterinfo,
 };
 
@@ -272,7 +272,6 @@ static struct apic apic_x2apic_cluster = {
 	.send_IPI_all			= x2apic_send_IPI_all,
 	.send_IPI_self			= x2apic_send_IPI_self,
 
-	.wait_for_init_deassert		= false,
 	.inquire_remote_apic		= NULL,
 
 	.read				= native_apic_msr_read,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 3ffd925655e0..662e9150ea6f 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -128,7 +128,6 @@ static struct apic apic_x2apic_phys = {
 	.send_IPI_all			= x2apic_send_IPI_all,
 	.send_IPI_self			= x2apic_send_IPI_self,
 
-	.wait_for_init_deassert		= false,
 	.inquire_remote_apic		= NULL,
 
 	.read				= native_apic_msr_read,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index c8d92950bc04..4a139465f1d4 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -248,7 +248,6 @@ static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
 	    APIC_DM_STARTUP;
 	uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
 
-	atomic_set(&init_deasserted, 1);
 	return 0;
 }
 
@@ -414,7 +413,6 @@ static struct apic __refdata apic_x2apic_uv_x = {
 	.send_IPI_self			= uv_send_IPI_self,
 
 	.wakeup_secondary_cpu		= uv_wakeup_secondary,
-	.wait_for_init_deassert		= false,
 	.inquire_remote_apic		= NULL,
 
 	.read				= native_apic_msr_read,
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index 58118e207a69..145863d4d343 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -1,4 +1,4 @@
-#include <linux/module.h>
+#include <linux/init.h>
 #include <linux/sched.h>
 #include <linux/kthread.h>
 #include <linux/workqueue.h>
@@ -163,6 +163,5 @@ static int start_periodic_check_for_corruption(void)
 	schedule_delayed_work(&bios_check_work, 0);
 	return 0;
 }
-
-module_init(start_periodic_check_for_corruption);
+device_initcall(start_periodic_check_for_corruption);
 
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index dd3a4baffe50..4a70fc6d400a 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -11,6 +11,7 @@
 #include <asm/cpu.h>
 #include <asm/smp.h>
 #include <asm/pci-direct.h>
+#include <asm/delay.h>
 
 #ifdef CONFIG_X86_64
 # include <asm/mmconfig.h>
@@ -114,7 +115,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c)
 		const int K6_BUG_LOOP = 1000000;
 		int n;
 		void (*f_vide)(void);
-		unsigned long d, d2;
+		u64 d, d2;
 
 		printk(KERN_INFO "AMD K6 stepping B detected - ");
 
@@ -125,10 +126,10 @@ static void init_amd_k6(struct cpuinfo_x86 *c)
 
 		n = K6_BUG_LOOP;
 		f_vide = vide;
-		rdtscl(d);
+		d = rdtsc();
 		while (n--)
 			f_vide();
-		rdtscl(d2);
+		d2 = rdtsc();
 		d = d2-d;
 
 		if (d > 20*K6_BUG_LOOP)
@@ -506,6 +507,9 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
 		/* A random value per boot for bit slice [12:upper_bit) */
 		va_align.bits = get_random_int() & va_align.mask;
 	}
+
+	if (cpu_has(c, X86_FEATURE_MWAITX))
+		use_mwaitx_delay();
 }
 
 static void early_init_amd(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index cb9e5df42dd2..07ce52c22ec8 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -13,6 +13,7 @@
 #include <linux/kgdb.h>
 #include <linux/smp.h>
 #include <linux/io.h>
+#include <linux/syscore_ops.h>
 
 #include <asm/stackprotector.h>
 #include <asm/perf_event.h>
@@ -1185,10 +1186,10 @@ void syscall_init(void)
 	 * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
 	 */
 	wrmsrl(MSR_STAR,  ((u64)__USER32_CS)<<48  | ((u64)__KERNEL_CS)<<32);
-	wrmsrl(MSR_LSTAR, entry_SYSCALL_64);
+	wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
 
 #ifdef CONFIG_IA32_EMULATION
-	wrmsrl(MSR_CSTAR, entry_SYSCALL_compat);
+	wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
 	/*
 	 * This only works on Intel CPUs.
 	 * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
@@ -1199,7 +1200,7 @@ void syscall_init(void)
 	wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
 	wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
 #else
-	wrmsrl(MSR_CSTAR, ignore_sysret);
+	wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);
 	wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
 	wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
 	wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
@@ -1488,3 +1489,20 @@ inline bool __static_cpu_has_safe(u16 bit)
 	return boot_cpu_has(bit);
 }
 EXPORT_SYMBOL_GPL(__static_cpu_has_safe);
+
+static void bsp_resume(void)
+{
+	if (this_cpu->c_bsp_resume)
+		this_cpu->c_bsp_resume(&boot_cpu_data);
+}
+
+static struct syscore_ops cpu_syscore_ops = {
+	.resume		= bsp_resume,
+};
+
+static int __init init_cpu_syscore(void)
+{
+	register_syscore_ops(&cpu_syscore_ops);
+	return 0;
+}
+core_initcall(init_cpu_syscore);
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index c37dc37e8317..2584265d4745 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -13,6 +13,7 @@ struct cpu_dev {
 	void		(*c_init)(struct cpuinfo_x86 *);
 	void		(*c_identify)(struct cpuinfo_x86 *);
 	void		(*c_detect_tlb)(struct cpuinfo_x86 *);
+	void		(*c_bsp_resume)(struct cpuinfo_x86 *);
 	int		c_x86_vendor;
 #ifdef CONFIG_X86_32
 	/* Optional vendor specific routine to obtain the cache size. */
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 50163fa9034f..98a13db5f4be 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -371,6 +371,36 @@ static void detect_vmx_virtcap(struct cpuinfo_x86 *c)
 	}
 }
 
+static void init_intel_energy_perf(struct cpuinfo_x86 *c)
+{
+	u64 epb;
+
+	/*
+	 * Initialize MSR_IA32_ENERGY_PERF_BIAS if not already initialized.
+	 * (x86_energy_perf_policy(8) is available to change it at run-time.)
+	 */
+	if (!cpu_has(c, X86_FEATURE_EPB))
+		return;
+
+	rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
+	if ((epb & 0xF) != ENERGY_PERF_BIAS_PERFORMANCE)
+		return;
+
+	pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n");
+	pr_warn_once("ENERGY_PERF_BIAS: View and update with x86_energy_perf_policy(8)\n");
+	epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL;
+	wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
+}
+
+static void intel_bsp_resume(struct cpuinfo_x86 *c)
+{
+	/*
+	 * MSR_IA32_ENERGY_PERF_BIAS is lost across suspend/resume,
+	 * so reinitialize it properly like during bootup:
+	 */
+	init_intel_energy_perf(c);
+}
+
 static void init_intel(struct cpuinfo_x86 *c)
 {
 	unsigned int l2 = 0;
@@ -478,21 +508,7 @@ static void init_intel(struct cpuinfo_x86 *c)
 	if (cpu_has(c, X86_FEATURE_VMX))
 		detect_vmx_virtcap(c);
 
-	/*
-	 * Initialize MSR_IA32_ENERGY_PERF_BIAS if BIOS did not.
-	 * x86_energy_perf_policy(8) is available to change it at run-time
-	 */
-	if (cpu_has(c, X86_FEATURE_EPB)) {
-		u64 epb;
-
-		rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
-		if ((epb & 0xF) == ENERGY_PERF_BIAS_PERFORMANCE) {
-			pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n");
-			pr_warn_once("ENERGY_PERF_BIAS: View and update with x86_energy_perf_policy(8)\n");
-			epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL;
-			wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
-		}
-	}
+	init_intel_energy_perf(c);
 }
 
 #ifdef CONFIG_X86_32
@@ -747,6 +763,7 @@ static const struct cpu_dev intel_cpu_dev = {
 	.c_detect_tlb	= intel_detect_tlb,
 	.c_early_init   = early_init_intel,
 	.c_init		= init_intel,
+	.c_bsp_resume	= intel_bsp_resume,
 	.c_x86_vendor	= X86_VENDOR_INTEL,
 };
 
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 0f8f21c8284a..9d014b82a124 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -127,7 +127,7 @@ void mce_setup(struct mce *m)
 {
 	memset(m, 0, sizeof(struct mce));
 	m->cpu = m->extcpu = smp_processor_id();
-	rdtscll(m->tsc);
+	m->tsc = rdtsc();
 	/* We hope get_seconds stays lockless */
 	m->time = get_seconds();
 	m->cpuvendor = boot_cpu_data.x86_vendor;
@@ -974,7 +974,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 {
 	struct mca_config *cfg = &mca_cfg;
 	struct mce m, *final;
-	enum ctx_state prev_state;
 	int i;
 	int worst = 0;
 	int severity;
@@ -1000,7 +999,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	int flags = MF_ACTION_REQUIRED;
 	int lmce = 0;
 
-	prev_state = ist_enter(regs);
+	ist_enter(regs);
 
 	this_cpu_inc(mce_exception_count);
 
@@ -1166,7 +1165,7 @@ out:
 	local_irq_disable();
 	ist_end_non_atomic();
 done:
-	ist_exit(regs, prev_state);
+	ist_exit(regs);
 }
 EXPORT_SYMBOL_GPL(do_machine_check);
 
@@ -1754,7 +1753,7 @@ static void collect_tscs(void *data)
 {
 	unsigned long *cpu_tsc = (unsigned long *)data;
 
-	rdtscll(cpu_tsc[smp_processor_id()]);
+	cpu_tsc[smp_processor_id()] = rdtsc();
 }
 
 static int mce_apei_read_done;
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
index 737b0ad4e61a..12402e10aeff 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -19,10 +19,9 @@ int mce_p5_enabled __read_mostly;
 /* Machine check handler for Pentium class Intel CPUs: */
 static void pentium_machine_check(struct pt_regs *regs, long error_code)
 {
-	enum ctx_state prev_state;
 	u32 loaddr, hi, lotype;
 
-	prev_state = ist_enter(regs);
+	ist_enter(regs);
 
 	rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
 	rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
@@ -39,7 +38,7 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code)
 
 	add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
 
-	ist_exit(regs, prev_state);
+	ist_exit(regs);
 }
 
 /* Set up machine check reporting for processors with Intel style MCE: */
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
index 44f138296fbe..01dd8702880b 100644
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -15,12 +15,12 @@
 /* Machine check handler for WinChip C6: */
 static void winchip_machine_check(struct pt_regs *regs, long error_code)
 {
-	enum ctx_state prev_state = ist_enter(regs);
+	ist_enter(regs);
 
 	printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
 	add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
 
-	ist_exit(regs, prev_state);
+	ist_exit(regs);
 }
 
 /* Set up machine check reporting on the Winchip C6 series */
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 3c986390058a..9e3f3c7dd5d7 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -459,7 +459,7 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
 	return NOTIFY_OK;
 }
 
-static struct notifier_block __refdata mc_cpu_notifier = {
+static struct notifier_block mc_cpu_notifier = {
 	.notifier_call	= mc_cpu_callback,
 };
 
diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c
index 8187b7247d1c..37ea89c11520 100644
--- a/arch/x86/kernel/cpu/microcode/intel_early.c
+++ b/arch/x86/kernel/cpu/microcode/intel_early.c
@@ -390,7 +390,7 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci)
 }
 
 #ifdef DEBUG
-static void __ref show_saved_mc(void)
+static void show_saved_mc(void)
 {
 	int i, j;
 	unsigned int sig, pf, rev, total_size, data_size, date;
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index f794bfa3c138..381c8b9b3a33 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -188,6 +188,7 @@ static void __init ms_hyperv_init_platform(void)
 
 	machine_ops.shutdown = hv_machine_shutdown;
 	machine_ops.crash_shutdown = hv_machine_crash_shutdown;
+	mark_tsc_unstable("running on Hyper-V");
 }
 
 const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index e7ed0d8ebacb..f891b4750f04 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -448,7 +448,6 @@ int mtrr_add(unsigned long base, unsigned long size, unsigned int type,
 	return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type,
 			     increment);
 }
-EXPORT_SYMBOL(mtrr_add);
 
 /**
  * mtrr_del_page - delete a memory type region
@@ -537,7 +536,6 @@ int mtrr_del(int reg, unsigned long base, unsigned long size)
 		return -EINVAL;
 	return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT);
 }
-EXPORT_SYMBOL(mtrr_del);
 
 /**
  * arch_phys_wc_add - add a WC MTRR and handle errors if PAT is unavailable
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index f56cf074d01a..66dd3fe99b82 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -2179,6 +2179,7 @@ static unsigned long get_segment_base(unsigned int segment)
 	int idx = segment >> 3;
 
 	if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) {
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
 		struct ldt_struct *ldt;
 
 		if (idx > LDT_ENTRIES)
@@ -2190,6 +2191,9 @@ static unsigned long get_segment_base(unsigned int segment)
 			return 0;
 
 		desc = &ldt->entries[idx];
+#else
+		return 0;
+#endif
 	} else {
 		if (idx > GDT_ENTRIES)
 			return 0;
@@ -2200,7 +2204,7 @@ static unsigned long get_segment_base(unsigned int segment)
 	return get_desc_base(desc);
 }
 
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_IA32_EMULATION
 
 #include <asm/compat.h>
 
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index f41e4dc78119..3fefebfbdf4b 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -12,7 +12,7 @@
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/export.h>
-#include <linux/watchdog.h>
+#include <linux/nmi.h>
 
 #include <asm/cpufeature.h>
 #include <asm/hardirq.h>
@@ -3630,7 +3630,10 @@ static __init int fixup_ht_bug(void)
 		return 0;
 	}
 
-	watchdog_nmi_disable_all();
+	if (lockup_detector_suspend() != 0) {
+		pr_debug("failed to disable PMU erratum BJ122, BV98, HSD29 workaround\n");
+		return 0;
+	}
 
 	x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED);
 
@@ -3638,7 +3641,7 @@ static __init int fixup_ht_bug(void)
 	x86_pmu.commit_scheduling = NULL;
 	x86_pmu.stop_scheduling = NULL;
 
-	watchdog_nmi_enable_all();
+	lockup_detector_resume();
 
 	get_online_cpus();
 
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 83741a71558f..bd3507da39f0 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -170,7 +170,7 @@ static int cpuid_class_cpu_callback(struct notifier_block *nfb,
 	return notifier_from_errno(err);
 }
 
-static struct notifier_block __refdata cpuid_class_cpu_notifier =
+static struct notifier_block cpuid_class_cpu_notifier =
 {
 	.notifier_call = cpuid_class_cpu_callback,
 };
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index ce95676abd60..4d38416e2a7f 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -110,7 +110,7 @@ static void init_espfix_random(void)
 	 */
 	if (!arch_get_random_long(&rand)) {
 		/* The constant is an arbitrary large prime */
-		rdtscll(rand);
+		rand = rdtsc();
 		rand *= 0xc345c6b72fd16123UL;
 	}
 
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 10757d0a3fcf..88b4da373081 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -226,22 +226,7 @@ static void hpet_reserve_platform_timers(unsigned int id) { }
  */
 static unsigned long hpet_freq;
 
-static void hpet_legacy_set_mode(enum clock_event_mode mode,
-			  struct clock_event_device *evt);
-static int hpet_legacy_next_event(unsigned long delta,
-			   struct clock_event_device *evt);
-
-/*
- * The hpet clock event device
- */
-static struct clock_event_device hpet_clockevent = {
-	.name		= "hpet",
-	.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
-	.set_mode	= hpet_legacy_set_mode,
-	.set_next_event = hpet_legacy_next_event,
-	.irq		= 0,
-	.rating		= 50,
-};
+static struct clock_event_device hpet_clockevent;
 
 static void hpet_stop_counter(void)
 {
@@ -306,64 +291,74 @@ static void hpet_legacy_clockevent_register(void)
 	printk(KERN_DEBUG "hpet clockevent registered\n");
 }
 
-static void hpet_set_mode(enum clock_event_mode mode,
-			  struct clock_event_device *evt, int timer)
+static int hpet_set_periodic(struct clock_event_device *evt, int timer)
 {
 	unsigned int cfg, cmp, now;
 	uint64_t delta;
 
-	switch (mode) {
-	case CLOCK_EVT_MODE_PERIODIC:
-		hpet_stop_counter();
-		delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult;
-		delta >>= evt->shift;
-		now = hpet_readl(HPET_COUNTER);
-		cmp = now + (unsigned int) delta;
-		cfg = hpet_readl(HPET_Tn_CFG(timer));
-		cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC |
-		       HPET_TN_SETVAL | HPET_TN_32BIT;
-		hpet_writel(cfg, HPET_Tn_CFG(timer));
-		hpet_writel(cmp, HPET_Tn_CMP(timer));
-		udelay(1);
-		/*
-		 * HPET on AMD 81xx needs a second write (with HPET_TN_SETVAL
-		 * cleared) to T0_CMP to set the period. The HPET_TN_SETVAL
-		 * bit is automatically cleared after the first write.
-		 * (See AMD-8111 HyperTransport I/O Hub Data Sheet,
-		 * Publication # 24674)
-		 */
-		hpet_writel((unsigned int) delta, HPET_Tn_CMP(timer));
-		hpet_start_counter();
-		hpet_print_config();
-		break;
+	hpet_stop_counter();
+	delta = ((uint64_t)(NSEC_PER_SEC / HZ)) * evt->mult;
+	delta >>= evt->shift;
+	now = hpet_readl(HPET_COUNTER);
+	cmp = now + (unsigned int)delta;
+	cfg = hpet_readl(HPET_Tn_CFG(timer));
+	cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL |
+	       HPET_TN_32BIT;
+	hpet_writel(cfg, HPET_Tn_CFG(timer));
+	hpet_writel(cmp, HPET_Tn_CMP(timer));
+	udelay(1);
+	/*
+	 * HPET on AMD 81xx needs a second write (with HPET_TN_SETVAL
+	 * cleared) to T0_CMP to set the period. The HPET_TN_SETVAL
+	 * bit is automatically cleared after the first write.
+	 * (See AMD-8111 HyperTransport I/O Hub Data Sheet,
+	 * Publication # 24674)
+	 */
+	hpet_writel((unsigned int)delta, HPET_Tn_CMP(timer));
+	hpet_start_counter();
+	hpet_print_config();
 
-	case CLOCK_EVT_MODE_ONESHOT:
-		cfg = hpet_readl(HPET_Tn_CFG(timer));
-		cfg &= ~HPET_TN_PERIODIC;
-		cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
-		hpet_writel(cfg, HPET_Tn_CFG(timer));
-		break;
+	return 0;
+}
 
-	case CLOCK_EVT_MODE_UNUSED:
-	case CLOCK_EVT_MODE_SHUTDOWN:
-		cfg = hpet_readl(HPET_Tn_CFG(timer));
-		cfg &= ~HPET_TN_ENABLE;
-		hpet_writel(cfg, HPET_Tn_CFG(timer));
-		break;
+static int hpet_set_oneshot(struct clock_event_device *evt, int timer)
+{
+	unsigned int cfg;
 
-	case CLOCK_EVT_MODE_RESUME:
-		if (timer == 0) {
-			hpet_enable_legacy_int();
-		} else {
-			struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
-			irq_domain_activate_irq(irq_get_irq_data(hdev->irq));
-			disable_irq(hdev->irq);
-			irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu));
-			enable_irq(hdev->irq);
-		}
-		hpet_print_config();
-		break;
+	cfg = hpet_readl(HPET_Tn_CFG(timer));
+	cfg &= ~HPET_TN_PERIODIC;
+	cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
+	hpet_writel(cfg, HPET_Tn_CFG(timer));
+
+	return 0;
+}
+
+static int hpet_shutdown(struct clock_event_device *evt, int timer)
+{
+	unsigned int cfg;
+
+	cfg = hpet_readl(HPET_Tn_CFG(timer));
+	cfg &= ~HPET_TN_ENABLE;
+	hpet_writel(cfg, HPET_Tn_CFG(timer));
+
+	return 0;
+}
+
+static int hpet_resume(struct clock_event_device *evt, int timer)
+{
+	if (!timer) {
+		hpet_enable_legacy_int();
+	} else {
+		struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
+
+		irq_domain_activate_irq(irq_get_irq_data(hdev->irq));
+		disable_irq(hdev->irq);
+		irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu));
+		enable_irq(hdev->irq);
 	}
+	hpet_print_config();
+
+	return 0;
 }
 
 static int hpet_next_event(unsigned long delta,
@@ -403,10 +398,24 @@ static int hpet_next_event(unsigned long delta,
 	return res < HPET_MIN_CYCLES ? -ETIME : 0;
 }
 
-static void hpet_legacy_set_mode(enum clock_event_mode mode,
-			struct clock_event_device *evt)
+static int hpet_legacy_shutdown(struct clock_event_device *evt)
+{
+	return hpet_shutdown(evt, 0);
+}
+
+static int hpet_legacy_set_oneshot(struct clock_event_device *evt)
+{
+	return hpet_set_oneshot(evt, 0);
+}
+
+static int hpet_legacy_set_periodic(struct clock_event_device *evt)
 {
-	hpet_set_mode(mode, evt, 0);
+	return hpet_set_periodic(evt, 0);
+}
+
+static int hpet_legacy_resume(struct clock_event_device *evt)
+{
+	return hpet_resume(evt, 0);
 }
 
 static int hpet_legacy_next_event(unsigned long delta,
@@ -416,6 +425,22 @@ static int hpet_legacy_next_event(unsigned long delta,
 }
 
 /*
+ * The hpet clock event device
+ */
+static struct clock_event_device hpet_clockevent = {
+	.name			= "hpet",
+	.features		= CLOCK_EVT_FEAT_PERIODIC |
+				  CLOCK_EVT_FEAT_ONESHOT,
+	.set_state_periodic	= hpet_legacy_set_periodic,
+	.set_state_oneshot	= hpet_legacy_set_oneshot,
+	.set_state_shutdown	= hpet_legacy_shutdown,
+	.tick_resume		= hpet_legacy_resume,
+	.set_next_event		= hpet_legacy_next_event,
+	.irq			= 0,
+	.rating			= 50,
+};
+
+/*
  * HPET MSI Support
  */
 #ifdef CONFIG_PCI_MSI
@@ -426,7 +451,7 @@ static struct irq_domain *hpet_domain;
 
 void hpet_msi_unmask(struct irq_data *data)
 {
-	struct hpet_dev *hdev = data->handler_data;
+	struct hpet_dev *hdev = irq_data_get_irq_handler_data(data);
 	unsigned int cfg;
 
 	/* unmask it */
@@ -437,7 +462,7 @@ void hpet_msi_unmask(struct irq_data *data)
 
 void hpet_msi_mask(struct irq_data *data)
 {
-	struct hpet_dev *hdev = data->handler_data;
+	struct hpet_dev *hdev = irq_data_get_irq_handler_data(data);
 	unsigned int cfg;
 
 	/* mask it */
@@ -459,11 +484,32 @@ void hpet_msi_read(struct hpet_dev *hdev, struct msi_msg *msg)
 	msg->address_hi = 0;
 }
 
-static void hpet_msi_set_mode(enum clock_event_mode mode,
-				struct clock_event_device *evt)
+static int hpet_msi_shutdown(struct clock_event_device *evt)
+{
+	struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
+
+	return hpet_shutdown(evt, hdev->num);
+}
+
+static int hpet_msi_set_oneshot(struct clock_event_device *evt)
+{
+	struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
+
+	return hpet_set_oneshot(evt, hdev->num);
+}
+
+static int hpet_msi_set_periodic(struct clock_event_device *evt)
 {
 	struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
-	hpet_set_mode(mode, evt, hdev->num);
+
+	return hpet_set_periodic(evt, hdev->num);
+}
+
+static int hpet_msi_resume(struct clock_event_device *evt)
+{
+	struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
+
+	return hpet_resume(evt, hdev->num);
 }
 
 static int hpet_msi_next_event(unsigned long delta,
@@ -523,10 +569,14 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
 
 	evt->rating = 110;
 	evt->features = CLOCK_EVT_FEAT_ONESHOT;
-	if (hdev->flags & HPET_DEV_PERI_CAP)
+	if (hdev->flags & HPET_DEV_PERI_CAP) {
 		evt->features |= CLOCK_EVT_FEAT_PERIODIC;
+		evt->set_state_periodic = hpet_msi_set_periodic;
+	}
 
-	evt->set_mode = hpet_msi_set_mode;
+	evt->set_state_shutdown = hpet_msi_shutdown;
+	evt->set_state_oneshot = hpet_msi_set_oneshot;
+	evt->tick_resume = hpet_msi_resume;
 	evt->set_next_event = hpet_msi_next_event;
 	evt->cpumask = cpumask_of(hdev->cpu);
 
@@ -735,7 +785,7 @@ static int hpet_clocksource_register(void)
 
 	/* Verify whether hpet counter works */
 	t1 = hpet_readl(HPET_COUNTER);
-	rdtscll(start);
+	start = rdtsc();
 
 	/*
 	 * We don't know the TSC frequency yet, but waiting for
@@ -745,7 +795,7 @@ static int hpet_clocksource_register(void)
 	 */
 	do {
 		rep_nop();
-		rdtscll(now);
+		now = rdtsc();
 	} while ((now - start) < 200000UL);
 
 	if (t1 == hpet_readl(HPET_COUNTER)) {
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index f2b96de3c7c1..efb82f07b29c 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -34,7 +34,7 @@ static int __init init_pit_clocksource(void)
 	  * - when local APIC timer is active (PIT is switched off)
 	  */
 	if (num_possible_cpus() > 1 || is_hpet_enabled() ||
-	    i8253_clockevent.mode != CLOCK_EVT_MODE_PERIODIC)
+	    !clockevent_state_periodic(&i8253_clockevent))
 		return 0;
 
 	return clocksource_i8253_init();
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index c7dfe1be784e..f8062aaf5df9 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -139,10 +139,13 @@ int arch_show_interrupts(struct seq_file *p, int prec)
 	seq_puts(p, "  Machine check polls\n");
 #endif
 #if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN)
-	seq_printf(p, "%*s: ", prec, "HYP");
-	for_each_online_cpu(j)
-		seq_printf(p, "%10u ", irq_stats(j)->irq_hv_callback_count);
-	seq_puts(p, "  Hypervisor callback interrupts\n");
+	if (test_bit(HYPERVISOR_CALLBACK_VECTOR, used_vectors)) {
+		seq_printf(p, "%*s: ", prec, "HYP");
+		for_each_online_cpu(j)
+			seq_printf(p, "%10u ",
+				   irq_stats(j)->irq_hv_callback_count);
+		seq_puts(p, "  Hypervisor callback interrupts\n");
+	}
 #endif
 	seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));
 #if defined(CONFIG_X86_IO_APIC)
@@ -211,24 +214,38 @@ u64 arch_irq_stat(void)
 __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
 {
 	struct pt_regs *old_regs = set_irq_regs(regs);
-
+	struct irq_desc *desc;
 	/* high bit used in ret_from_ code  */
 	unsigned vector = ~regs->orig_ax;
-	unsigned irq;
+
+	/*
+	 * NB: Unlike exception entries, IRQ entries do not reliably
+	 * handle context tracking in the low-level entry code.  This is
+	 * because syscall entries execute briefly with IRQs on before
+	 * updating context tracking state, so we can take an IRQ from
+	 * kernel mode with CONTEXT_USER.  The low-level entry code only
+	 * updates the context if we came from user mode, so we won't
+	 * switch to CONTEXT_KERNEL.  We'll fix that once the syscall
+	 * code is cleaned up enough that we can cleanly defer enabling
+	 * IRQs.
+	 */
 
 	entering_irq();
 
-	irq = __this_cpu_read(vector_irq[vector]);
+	/* entering_irq() tells RCU that we're not quiescent.  Check it. */
+	RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU");
+
+	desc = __this_cpu_read(vector_irq[vector]);
 
-	if (!handle_irq(irq, regs)) {
+	if (!handle_irq(desc, regs)) {
 		ack_APIC_irq();
 
-		if (irq != VECTOR_RETRIGGERED) {
-			pr_emerg_ratelimited("%s: %d.%d No irq handler for vector (irq %d)\n",
+		if (desc != VECTOR_RETRIGGERED) {
+			pr_emerg_ratelimited("%s: %d.%u No irq handler for vector\n",
 					     __func__, smp_processor_id(),
-					     vector, irq);
+					     vector);
 		} else {
-			__this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED);
+			__this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
 		}
 	}
 
@@ -330,10 +347,10 @@ static struct cpumask affinity_new, online_new;
  */
 int check_irq_vectors_for_cpu_disable(void)
 {
-	int irq, cpu;
 	unsigned int this_cpu, vector, this_count, count;
 	struct irq_desc *desc;
 	struct irq_data *data;
+	int cpu;
 
 	this_cpu = smp_processor_id();
 	cpumask_copy(&online_new, cpu_online_mask);
@@ -341,47 +358,43 @@ int check_irq_vectors_for_cpu_disable(void)
 
 	this_count = 0;
 	for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
-		irq = __this_cpu_read(vector_irq[vector]);
-		if (irq >= 0) {
-			desc = irq_to_desc(irq);
-			if (!desc)
-				continue;
-
-			/*
-			 * Protect against concurrent action removal,
-			 * affinity changes etc.
-			 */
-			raw_spin_lock(&desc->lock);
-			data = irq_desc_get_irq_data(desc);
-			cpumask_copy(&affinity_new, data->affinity);
-			cpumask_clear_cpu(this_cpu, &affinity_new);
-
-			/* Do not count inactive or per-cpu irqs. */
-			if (!irq_has_action(irq) || irqd_is_per_cpu(data)) {
-				raw_spin_unlock(&desc->lock);
-				continue;
-			}
+		desc = __this_cpu_read(vector_irq[vector]);
+		if (IS_ERR_OR_NULL(desc))
+			continue;
+		/*
+		 * Protect against concurrent action removal, affinity
+		 * changes etc.
+		 */
+		raw_spin_lock(&desc->lock);
+		data = irq_desc_get_irq_data(desc);
+		cpumask_copy(&affinity_new,
+			     irq_data_get_affinity_mask(data));
+		cpumask_clear_cpu(this_cpu, &affinity_new);
 
+		/* Do not count inactive or per-cpu irqs. */
+		if (!irq_desc_has_action(desc) || irqd_is_per_cpu(data)) {
 			raw_spin_unlock(&desc->lock);
-			/*
-			 * A single irq may be mapped to multiple
-			 * cpu's vector_irq[] (for example IOAPIC cluster
-			 * mode).  In this case we have two
-			 * possibilities:
-			 *
-			 * 1) the resulting affinity mask is empty; that is
-			 * this the down'd cpu is the last cpu in the irq's
-			 * affinity mask, or
-			 *
-			 * 2) the resulting affinity mask is no longer
-			 * a subset of the online cpus but the affinity
-			 * mask is not zero; that is the down'd cpu is the
-			 * last online cpu in a user set affinity mask.
-			 */
-			if (cpumask_empty(&affinity_new) ||
-			    !cpumask_subset(&affinity_new, &online_new))
-				this_count++;
+			continue;
 		}
+
+		raw_spin_unlock(&desc->lock);
+		/*
+		 * A single irq may be mapped to multiple cpu's
+		 * vector_irq[] (for example IOAPIC cluster mode).  In
+		 * this case we have two possibilities:
+		 *
+		 * 1) the resulting affinity mask is empty; that is,
+		 * the down'd cpu is the last cpu in the irq's
+		 * affinity mask, or
+		 *
+		 * 2) the resulting affinity mask is no longer a
+		 * subset of the online cpus but the affinity mask is
+		 * not zero; that is the down'd cpu is the last online
+		 * cpu in a user set affinity mask.
+		 */
+		if (cpumask_empty(&affinity_new) ||
+		    !cpumask_subset(&affinity_new, &online_new))
+			this_count++;
 	}
 
 	count = 0;
@@ -400,8 +413,8 @@ int check_irq_vectors_for_cpu_disable(void)
 		for (vector = FIRST_EXTERNAL_VECTOR;
 		     vector < first_system_vector; vector++) {
 			if (!test_bit(vector, used_vectors) &&
-			    per_cpu(vector_irq, cpu)[vector] < 0)
-					count++;
+			    IS_ERR_OR_NULL(per_cpu(vector_irq, cpu)[vector]))
+				count++;
 		}
 	}
 
@@ -437,7 +450,7 @@ void fixup_irqs(void)
 		raw_spin_lock(&desc->lock);
 
 		data = irq_desc_get_irq_data(desc);
-		affinity = data->affinity;
+		affinity = irq_data_get_affinity_mask(data);
 		if (!irq_has_action(irq) || irqd_is_per_cpu(data) ||
 		    cpumask_subset(affinity, cpu_online_mask)) {
 			raw_spin_unlock(&desc->lock);
@@ -505,14 +518,13 @@ void fixup_irqs(void)
 	for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
 		unsigned int irr;
 
-		if (__this_cpu_read(vector_irq[vector]) <= VECTOR_UNDEFINED)
+		if (IS_ERR_OR_NULL(__this_cpu_read(vector_irq[vector])))
 			continue;
 
 		irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
 		if (irr  & (1 << (vector % 32))) {
-			irq = __this_cpu_read(vector_irq[vector]);
+			desc = __this_cpu_read(vector_irq[vector]);
 
-			desc = irq_to_desc(irq);
 			raw_spin_lock(&desc->lock);
 			data = irq_desc_get_irq_data(desc);
 			chip = irq_data_get_irq_chip(data);
@@ -523,7 +535,7 @@ void fixup_irqs(void)
 			raw_spin_unlock(&desc->lock);
 		}
 		if (__this_cpu_read(vector_irq[vector]) != VECTOR_RETRIGGERED)
-			__this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED);
+			__this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
 	}
 }
 #endif
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index cd74f5978ab9..c80cf6699678 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -148,21 +148,21 @@ void do_softirq_own_stack(void)
 	call_on_stack(__do_softirq, isp);
 }
 
-bool handle_irq(unsigned irq, struct pt_regs *regs)
+bool handle_irq(struct irq_desc *desc, struct pt_regs *regs)
 {
-	struct irq_desc *desc;
+	unsigned int irq;
 	int overflow;
 
 	overflow = check_stack_overflow();
 
-	desc = irq_to_desc(irq);
-	if (unlikely(!desc))
+	if (IS_ERR_OR_NULL(desc))
 		return false;
 
+	irq = irq_desc_get_irq(desc);
 	if (user_mode(regs) || !execute_on_irq_stack(overflow, desc, irq)) {
 		if (unlikely(overflow))
 			print_stack_overflow();
-		desc->handle_irq(irq, desc);
+		generic_handle_irq_desc(irq, desc);
 	}
 
 	return true;
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index bc4604e500a3..ff16ccb918f2 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -68,16 +68,13 @@ static inline void stack_overflow_check(struct pt_regs *regs)
 #endif
 }
 
-bool handle_irq(unsigned irq, struct pt_regs *regs)
+bool handle_irq(struct irq_desc *desc, struct pt_regs *regs)
 {
-	struct irq_desc *desc;
-
 	stack_overflow_check(regs);
 
-	desc = irq_to_desc(irq);
-	if (unlikely(!desc))
+	if (unlikely(IS_ERR_OR_NULL(desc)))
 		return false;
 
-	generic_handle_irq_desc(irq, desc);
+	generic_handle_irq_desc(irq_desc_get_irq(desc), desc);
 	return true;
 }
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index a3a5e158ed69..1423ab1b0312 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -52,7 +52,7 @@ static struct irqaction irq2 = {
 };
 
 DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
-	[0 ... NR_VECTORS - 1] = VECTOR_UNDEFINED,
+	[0 ... NR_VECTORS - 1] = VECTOR_UNUSED,
 };
 
 int vector_used_by_percpu_irq(unsigned int vector)
@@ -60,7 +60,7 @@ int vector_used_by_percpu_irq(unsigned int vector)
 	int cpu;
 
 	for_each_online_cpu(cpu) {
-		if (per_cpu(vector_irq, cpu)[vector] > VECTOR_UNDEFINED)
+		if (!IS_ERR_OR_NULL(per_cpu(vector_irq, cpu)[vector]))
 			return 1;
 	}
 
@@ -94,7 +94,7 @@ void __init init_IRQ(void)
 	 * irq's migrate etc.
 	 */
 	for (i = 0; i < nr_legacy_irqs(); i++)
-		per_cpu(vector_irq, 0)[ISA_IRQ_VECTOR(i)] = i;
+		per_cpu(vector_irq, 0)[ISA_IRQ_VECTOR(i)] = irq_to_desc(i);
 
 	x86_init.irqs.intr_init();
 }
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index 26d5a55a2736..e565e0e4d216 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -45,7 +45,7 @@ static void __jump_label_transform(struct jump_entry *entry,
 	const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP };
 	const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5];
 
-	if (type == JUMP_LABEL_ENABLE) {
+	if (type == JUMP_LABEL_JMP) {
 		if (init) {
 			/*
 			 * Jump label is enabled for the first time.
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index ca83f7ac388b..0f8a6bbaaa44 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -223,9 +223,6 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params,
 	memset(&params->hd0_info, 0, sizeof(params->hd0_info));
 	memset(&params->hd1_info, 0, sizeof(params->hd1_info));
 
-	/* Default sysdesc table */
-	params->sys_desc_table.length = 0;
-
 	if (image->type == KEXEC_TYPE_CRASH) {
 		ret = crash_setup_memmap_entries(image, params);
 		if (ret)
@@ -536,7 +533,9 @@ static int bzImage64_verify_sig(const char *kernel, unsigned long kernel_len)
 	int ret;
 
 	ret = verify_pefile_signature(kernel, kernel_len,
-				      system_trusted_keyring, &trusted);
+				      system_trusted_keyring,
+				      VERIFYING_KEXEC_PE_SIGNATURE,
+				      &trusted);
 	if (ret < 0)
 		return ret;
 	if (!trusted)
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 49487b488061..2c7aafa70702 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -200,7 +200,7 @@ static void kvm_setup_secondary_clock(void)
  * kind of shutdown from our side, we unregister the clock by writting anything
  * that does not have the 'enable' bit set in the msr
  */
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 static void kvm_crash_shutdown(struct pt_regs *regs)
 {
 	native_write_msr(msr_kvm_system_time, 0, 0);
@@ -259,7 +259,7 @@ void __init kvmclock_init(void)
 	x86_platform.save_sched_clock_state = kvm_save_sched_clock_state;
 	x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state;
 	machine_ops.shutdown  = kvm_shutdown;
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 	machine_ops.crash_shutdown  = kvm_crash_shutdown;
 #endif
 	kvm_get_preset_lpj();
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index d05bd2e2ee91..697f90db0e37 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -110,7 +110,7 @@ static void nmi_max_handler(struct irq_work *w)
 		a->handler, whole_msecs, decimal_msecs);
 }
 
-static int nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b)
+static int nmi_handle(unsigned int type, struct pt_regs *regs)
 {
 	struct nmi_desc *desc = nmi_to_desc(type);
 	struct nmiaction *a;
@@ -213,7 +213,7 @@ static void
 pci_serr_error(unsigned char reason, struct pt_regs *regs)
 {
 	/* check to see if anyone registered against these types of errors */
-	if (nmi_handle(NMI_SERR, regs, false))
+	if (nmi_handle(NMI_SERR, regs))
 		return;
 
 	pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n",
@@ -247,7 +247,7 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
 	unsigned long i;
 
 	/* check to see if anyone registered against these types of errors */
-	if (nmi_handle(NMI_IO_CHECK, regs, false))
+	if (nmi_handle(NMI_IO_CHECK, regs))
 		return;
 
 	pr_emerg(
@@ -284,7 +284,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
 	 * as only the first one is ever run (unless it can actually determine
 	 * if it caused the NMI)
 	 */
-	handled = nmi_handle(NMI_UNKNOWN, regs, false);
+	handled = nmi_handle(NMI_UNKNOWN, regs);
 	if (handled) {
 		__this_cpu_add(nmi_stats.unknown, handled);
 		return;
@@ -332,7 +332,7 @@ static void default_do_nmi(struct pt_regs *regs)
 
 	__this_cpu_write(last_nmi_rip, regs->ip);
 
-	handled = nmi_handle(NMI_LOCAL, regs, b2b);
+	handled = nmi_handle(NMI_LOCAL, regs);
 	__this_cpu_add(nmi_stats.normal, handled);
 	if (handled) {
 		/*
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 58bcfb67c01f..f68e48f5f6c2 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -351,9 +351,7 @@ __visible struct pv_cpu_ops pv_cpu_ops = {
 	.wbinvd = native_wbinvd,
 	.read_msr = native_read_msr_safe,
 	.write_msr = native_write_msr_safe,
-	.read_tsc = native_read_tsc,
 	.read_pmc = native_read_pmc,
-	.read_tscp = native_read_tscp,
 	.load_tr_desc = native_load_tr_desc,
 	.set_ldt = native_set_ldt,
 	.load_gdt = native_load_gdt,
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
index e1b013696dde..c89f50a76e97 100644
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -10,7 +10,6 @@ DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax");
 DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3");
 DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
 DEF_NATIVE(pv_cpu_ops, clts, "clts");
-DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc");
 
 #if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
 DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%eax)");
@@ -52,7 +51,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
 		PATCH_SITE(pv_mmu_ops, read_cr3);
 		PATCH_SITE(pv_mmu_ops, write_cr3);
 		PATCH_SITE(pv_cpu_ops, clts);
-		PATCH_SITE(pv_cpu_ops, read_tsc);
 #if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
 		case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
 			if (pv_is_native_spin_unlock()) {
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 353972c1946c..84b8ef82a159 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -58,17 +58,6 @@ EXPORT_SYMBOL(x86_dma_fallback_dev);
 /* Number of entries preallocated for DMA-API debugging */
 #define PREALLOC_DMA_DEBUG_ENTRIES       65536
 
-int dma_set_mask(struct device *dev, u64 mask)
-{
-	if (!dev->dma_mask || !dma_supported(dev, mask))
-		return -EIO;
-
-	*dev->dma_mask = mask;
-
-	return 0;
-}
-EXPORT_SYMBOL(dma_set_mask);
-
 void __init pci_iommu_alloc(void)
 {
 	struct iommu_table_entry *p;
@@ -140,50 +129,27 @@ void dma_generic_free_coherent(struct device *dev, size_t size, void *vaddr,
 		free_pages((unsigned long)vaddr, get_order(size));
 }
 
-void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
-		      gfp_t gfp, struct dma_attrs *attrs)
+bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp)
 {
-	struct dma_map_ops *ops = get_dma_ops(dev);
-	void *memory;
-
-	gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
-
-	if (dma_alloc_from_coherent(dev, size, dma_handle, &memory))
-		return memory;
-
-	if (!dev)
-		dev = &x86_dma_fallback_dev;
-
-	if (!is_device_dma_capable(dev))
-		return NULL;
-
-	if (!ops->alloc)
-		return NULL;
-
-	memory = ops->alloc(dev, size, dma_handle,
-			    dma_alloc_coherent_gfp_flags(dev, gfp), attrs);
-	debug_dma_alloc_coherent(dev, size, *dma_handle, memory);
-
-	return memory;
-}
-EXPORT_SYMBOL(dma_alloc_attrs);
-
-void dma_free_attrs(struct device *dev, size_t size,
-		    void *vaddr, dma_addr_t bus,
-		    struct dma_attrs *attrs)
-{
-	struct dma_map_ops *ops = get_dma_ops(dev);
-
-	WARN_ON(irqs_disabled());       /* for portability */
+	/*
+	 * Substitute the fallback device before anything below is
+	 * handed *dev, so a NULL device is never passed on.
+	 */
+	if (!*dev)
+		*dev = &x86_dma_fallback_dev;
 
-	if (dma_release_from_coherent(dev, get_order(size), vaddr))
-		return;
+	/*
+	 * Clear caller-supplied zone modifiers before, not after,
+	 * computing the allocation flags for this device.
+	 */
+	*gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
+	*gfp = dma_alloc_coherent_gfp_flags(*dev, *gfp);
+	if (!is_device_dma_capable(*dev))
+		return false;
+	return true;
 
-	debug_dma_free_coherent(dev, size, vaddr, bus);
-	if (ops->free)
-		ops->free(dev, size, vaddr, bus, attrs);
 }
-EXPORT_SYMBOL(dma_free_attrs);
+EXPORT_SYMBOL(arch_dma_alloc_attrs);
 
 /*
  * See <Documentation/x86/x86_64/boot-options.txt> for the iommu kernel
diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c
index 64f90f53bb85..4f00b63d7ff3 100644
--- a/arch/x86/kernel/pmem.c
+++ b/arch/x86/kernel/pmem.c
@@ -3,80 +3,26 @@
  * Copyright (c) 2015, Intel Corporation.
  */
 #include <linux/platform_device.h>
-#include <linux/libnvdimm.h>
 #include <linux/module.h>
-#include <asm/e820.h>
-
-static void e820_pmem_release(struct device *dev)
-{
-	struct nvdimm_bus *nvdimm_bus = dev->platform_data;
-
-	if (nvdimm_bus)
-		nvdimm_bus_unregister(nvdimm_bus);
-}
-
-static struct platform_device e820_pmem = {
-	.name = "e820_pmem",
-	.id = -1,
-	.dev = {
-		.release = e820_pmem_release,
-	},
-};
-
-static const struct attribute_group *e820_pmem_attribute_groups[] = {
-	&nvdimm_bus_attribute_group,
-	NULL,
-};
-
-static const struct attribute_group *e820_pmem_region_attribute_groups[] = {
-	&nd_region_attribute_group,
-	&nd_device_attribute_group,
-	NULL,
-};
 
 static __init int register_e820_pmem(void)
 {
-	static struct nvdimm_bus_descriptor nd_desc;
-	struct device *dev = &e820_pmem.dev;
-	struct nvdimm_bus *nvdimm_bus;
-	int rc, i;
-
-	rc = platform_device_register(&e820_pmem);
-	if (rc)
-		return rc;
-
-	nd_desc.attr_groups = e820_pmem_attribute_groups;
-	nd_desc.provider_name = "e820";
-	nvdimm_bus = nvdimm_bus_register(dev, &nd_desc);
-	if (!nvdimm_bus)
-		goto err;
-	dev->platform_data = nvdimm_bus;
-
-	for (i = 0; i < e820.nr_map; i++) {
-		struct e820entry *ei = &e820.map[i];
-		struct resource res = {
-			.flags	= IORESOURCE_MEM,
-			.start	= ei->addr,
-			.end	= ei->addr + ei->size - 1,
-		};
-		struct nd_region_desc ndr_desc;
-
-		if (ei->type != E820_PRAM)
-			continue;
-
-		memset(&ndr_desc, 0, sizeof(ndr_desc));
-		ndr_desc.res = &res;
-		ndr_desc.attr_groups = e820_pmem_region_attribute_groups;
-		ndr_desc.numa_node = NUMA_NO_NODE;
-		if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc))
-			goto err;
-	}
-
-	return 0;
-
- err:
-	dev_err(dev, "failed to register legacy persistent memory ranges\n");
-	platform_device_unregister(&e820_pmem);
-	return -ENXIO;
+	struct platform_device *pdev;
+	int rc;
+
+	/*
+	 * See drivers/nvdimm/e820.c for the implementation, this is
+	 * simply here to trigger the module to load on demand.
+	 */
+	pdev = platform_device_alloc("e820_pmem", -1);
+	if (!pdev)
+		return -ENOMEM;
+
+	/* On failure, drop the reference taken by platform_device_alloc(). */
+	rc = platform_device_add(pdev);
+	if (rc)
+		platform_device_put(pdev);
+
+	return rc;
 }
 device_initcall(register_e820_pmem);
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index d83740ab85b0..6d0e62ae8516 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -30,6 +30,7 @@
 #include <asm/nmi.h>
 #include <asm/tlbflush.h>
 #include <asm/mce.h>
+#include <asm/vm86.h>
 
 /*
  * per-CPU TSS segments. Threads are completely 'soft' on Linux,
@@ -111,6 +112,8 @@ void exit_thread(void)
 		kfree(bp);
 	}
 
+	free_vm86(t);
+
 	fpu__drop(fpu);
 }
 
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index f73c962fe636..c13df2c735f8 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -53,6 +53,7 @@
 #include <asm/syscalls.h>
 #include <asm/debugreg.h>
 #include <asm/switch_to.h>
+#include <asm/vm86.h>
 
 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
 asmlinkage void ret_from_kernel_thread(void) __asm__("ret_from_kernel_thread");
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index f6b916387590..3c1bbcf12924 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -121,6 +121,7 @@ void __show_regs(struct pt_regs *regs, int all)
 void release_thread(struct task_struct *dead_task)
 {
 	if (dead_task->mm) {
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
 		if (dead_task->mm->context.ldt) {
 			pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
 				dead_task->comm,
@@ -128,6 +129,7 @@ void release_thread(struct task_struct *dead_task)
 				dead_task->mm->context.ldt->size);
 			BUG();
 		}
+#endif
 	}
 }
 
@@ -248,8 +250,8 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
 			    __USER_CS, __USER_DS, 0);
 }
 
-#ifdef CONFIG_IA32_EMULATION
-void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
+#ifdef CONFIG_COMPAT
+void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp)
 {
 	start_thread_common(regs, new_ip, new_sp,
 			    test_thread_flag(TIF_X32)
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 9be72bc3613f..558f50edebca 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -37,12 +37,10 @@
 #include <asm/proto.h>
 #include <asm/hw_breakpoint.h>
 #include <asm/traps.h>
+#include <asm/syscall.h>
 
 #include "tls.h"
 
-#define CREATE_TRACE_POINTS
-#include <trace/events/syscalls.h>
-
 enum x86_regset {
 	REGSET_GENERAL,
 	REGSET_FP,
@@ -1123,6 +1121,73 @@ static int genregs32_set(struct task_struct *target,
 	return ret;
 }
 
+static long ia32_arch_ptrace(struct task_struct *child, compat_long_t request,
+			     compat_ulong_t caddr, compat_ulong_t cdata)
+{
+	unsigned long addr = caddr;	/* compat args copied into unsigned longs */
+	unsigned long data = cdata;
+	void __user *datap = compat_ptr(data);
+	int ret;	/* set only by the PEEKUSR/POKEUSR cases */
+	__u32 val;
+
+	switch (request) {
+	case PTRACE_PEEKUSR:	/* Read via getreg32(), store at datap. */
+		ret = getreg32(child, addr, &val);
+		if (ret == 0)
+			ret = put_user(val, (__u32 __user *)datap);
+		break;
+
+	case PTRACE_POKEUSR:	/* Pass addr and data to putreg32(). */
+		ret = putreg32(child, addr, data);
+		break;
+
+	case PTRACE_GETREGS:	/* Get all gp regs from the child. */
+		return copy_regset_to_user(child, &user_x86_32_view,
+					   REGSET_GENERAL,
+					   0, sizeof(struct user_regs_struct32),
+					   datap);
+
+	case PTRACE_SETREGS:	/* Set all gp regs in the child. */
+		return copy_regset_from_user(child, &user_x86_32_view,
+					     REGSET_GENERAL, 0,
+					     sizeof(struct user_regs_struct32),
+					     datap);
+
+	case PTRACE_GETFPREGS:	/* Get the child FPU state. */
+		return copy_regset_to_user(child, &user_x86_32_view,
+					   REGSET_FP, 0,
+					   sizeof(struct user_i387_ia32_struct),
+					   datap);
+
+	case PTRACE_SETFPREGS:	/* Set the child FPU state. */
+		return copy_regset_from_user(
+			child, &user_x86_32_view, REGSET_FP,
+			0, sizeof(struct user_i387_ia32_struct), datap);
+
+	case PTRACE_GETFPXREGS:	/* Get the child extended FPU state. */
+		return copy_regset_to_user(child, &user_x86_32_view,
+					   REGSET_XFP, 0,
+					   sizeof(struct user32_fxsr_struct),
+					   datap);
+
+	case PTRACE_SETFPXREGS:	/* Set the child extended FPU state. */
+		return copy_regset_from_user(child, &user_x86_32_view,
+					     REGSET_XFP, 0,
+					     sizeof(struct user32_fxsr_struct),
+					     datap);
+
+	case PTRACE_GET_THREAD_AREA:
+	case PTRACE_SET_THREAD_AREA:	/* Both are handed to arch_ptrace(). */
+		return arch_ptrace(child, request, addr, data);
+
+	default:	/* Anything else goes to compat_ptrace_request(). */
+		return compat_ptrace_request(child, request, addr, data);
+	}
+
+	return ret;
+}
+#endif /* CONFIG_IA32_EMULATION */
+
 #ifdef CONFIG_X86_X32_ABI
 static long x32_arch_ptrace(struct task_struct *child,
 			    compat_long_t request, compat_ulong_t caddr,
@@ -1211,78 +1276,21 @@ static long x32_arch_ptrace(struct task_struct *child,
 }
 #endif
 
+#ifdef CONFIG_COMPAT
 long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
 			compat_ulong_t caddr, compat_ulong_t cdata)
 {
-	unsigned long addr = caddr;
-	unsigned long data = cdata;
-	void __user *datap = compat_ptr(data);
-	int ret;
-	__u32 val;
-
 #ifdef CONFIG_X86_X32_ABI
 	if (!is_ia32_task())
 		return x32_arch_ptrace(child, request, caddr, cdata);
 #endif
-
-	switch (request) {
-	case PTRACE_PEEKUSR:
-		ret = getreg32(child, addr, &val);
-		if (ret == 0)
-			ret = put_user(val, (__u32 __user *)datap);
-		break;
-
-	case PTRACE_POKEUSR:
-		ret = putreg32(child, addr, data);
-		break;
-
-	case PTRACE_GETREGS:	/* Get all gp regs from the child. */
-		return copy_regset_to_user(child, &user_x86_32_view,
-					   REGSET_GENERAL,
-					   0, sizeof(struct user_regs_struct32),
-					   datap);
-
-	case PTRACE_SETREGS:	/* Set all gp regs in the child. */
-		return copy_regset_from_user(child, &user_x86_32_view,
-					     REGSET_GENERAL, 0,
-					     sizeof(struct user_regs_struct32),
-					     datap);
-
-	case PTRACE_GETFPREGS:	/* Get the child FPU state. */
-		return copy_regset_to_user(child, &user_x86_32_view,
-					   REGSET_FP, 0,
-					   sizeof(struct user_i387_ia32_struct),
-					   datap);
-
-	case PTRACE_SETFPREGS:	/* Set the child FPU state. */
-		return copy_regset_from_user(
-			child, &user_x86_32_view, REGSET_FP,
-			0, sizeof(struct user_i387_ia32_struct), datap);
-
-	case PTRACE_GETFPXREGS:	/* Get the child extended FPU state. */
-		return copy_regset_to_user(child, &user_x86_32_view,
-					   REGSET_XFP, 0,
-					   sizeof(struct user32_fxsr_struct),
-					   datap);
-
-	case PTRACE_SETFPXREGS:	/* Set the child extended FPU state. */
-		return copy_regset_from_user(child, &user_x86_32_view,
-					     REGSET_XFP, 0,
-					     sizeof(struct user32_fxsr_struct),
-					     datap);
-
-	case PTRACE_GET_THREAD_AREA:
-	case PTRACE_SET_THREAD_AREA:
-		return arch_ptrace(child, request, addr, data);
-
-	default:
-		return compat_ptrace_request(child, request, addr, data);
-	}
-
-	return ret;
+#ifdef CONFIG_IA32_EMULATION
+	return ia32_arch_ptrace(child, request, caddr, cdata);
+#else
+	return 0;
+#endif
 }
-
-#endif	/* CONFIG_IA32_EMULATION */
+#endif	/* CONFIG_COMPAT */
 
 #ifdef CONFIG_X86_64
 
@@ -1434,201 +1442,3 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
 	/* Send us the fake SIGTRAP */
 	force_sig_info(SIGTRAP, &info, tsk);
 }
-
-static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
-{
-#ifdef CONFIG_X86_64
-	if (arch == AUDIT_ARCH_X86_64) {
-		audit_syscall_entry(regs->orig_ax, regs->di,
-				    regs->si, regs->dx, regs->r10);
-	} else
-#endif
-	{
-		audit_syscall_entry(regs->orig_ax, regs->bx,
-				    regs->cx, regs->dx, regs->si);
-	}
-}
-
-/*
- * We can return 0 to resume the syscall or anything else to go to phase
- * 2.  If we resume the syscall, we need to put something appropriate in
- * regs->orig_ax.
- *
- * NB: We don't have full pt_regs here, but regs->orig_ax and regs->ax
- * are fully functional.
- *
- * For phase 2's benefit, our return value is:
- * 0:			resume the syscall
- * 1:			go to phase 2; no seccomp phase 2 needed
- * anything else:	go to phase 2; pass return value to seccomp
- */
-unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
-{
-	unsigned long ret = 0;
-	u32 work;
-
-	BUG_ON(regs != task_pt_regs(current));
-
-	work = ACCESS_ONCE(current_thread_info()->flags) &
-		_TIF_WORK_SYSCALL_ENTRY;
-
-	/*
-	 * If TIF_NOHZ is set, we are required to call user_exit() before
-	 * doing anything that could touch RCU.
-	 */
-	if (work & _TIF_NOHZ) {
-		user_exit();
-		work &= ~_TIF_NOHZ;
-	}
-
-#ifdef CONFIG_SECCOMP
-	/*
-	 * Do seccomp first -- it should minimize exposure of other
-	 * code, and keeping seccomp fast is probably more valuable
-	 * than the rest of this.
-	 */
-	if (work & _TIF_SECCOMP) {
-		struct seccomp_data sd;
-
-		sd.arch = arch;
-		sd.nr = regs->orig_ax;
-		sd.instruction_pointer = regs->ip;
-#ifdef CONFIG_X86_64
-		if (arch == AUDIT_ARCH_X86_64) {
-			sd.args[0] = regs->di;
-			sd.args[1] = regs->si;
-			sd.args[2] = regs->dx;
-			sd.args[3] = regs->r10;
-			sd.args[4] = regs->r8;
-			sd.args[5] = regs->r9;
-		} else
-#endif
-		{
-			sd.args[0] = regs->bx;
-			sd.args[1] = regs->cx;
-			sd.args[2] = regs->dx;
-			sd.args[3] = regs->si;
-			sd.args[4] = regs->di;
-			sd.args[5] = regs->bp;
-		}
-
-		BUILD_BUG_ON(SECCOMP_PHASE1_OK != 0);
-		BUILD_BUG_ON(SECCOMP_PHASE1_SKIP != 1);
-
-		ret = seccomp_phase1(&sd);
-		if (ret == SECCOMP_PHASE1_SKIP) {
-			regs->orig_ax = -1;
-			ret = 0;
-		} else if (ret != SECCOMP_PHASE1_OK) {
-			return ret;  /* Go directly to phase 2 */
-		}
-
-		work &= ~_TIF_SECCOMP;
-	}
-#endif
-
-	/* Do our best to finish without phase 2. */
-	if (work == 0)
-		return ret;  /* seccomp and/or nohz only (ret == 0 here) */
-
-#ifdef CONFIG_AUDITSYSCALL
-	if (work == _TIF_SYSCALL_AUDIT) {
-		/*
-		 * If there is no more work to be done except auditing,
-		 * then audit in phase 1.  Phase 2 always audits, so, if
-		 * we audit here, then we can't go on to phase 2.
-		 */
-		do_audit_syscall_entry(regs, arch);
-		return 0;
-	}
-#endif
-
-	return 1;  /* Something is enabled that we can't handle in phase 1 */
-}
-
-/* Returns the syscall nr to run (which should match regs->orig_ax). */
-long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch,
-				unsigned long phase1_result)
-{
-	long ret = 0;
-	u32 work = ACCESS_ONCE(current_thread_info()->flags) &
-		_TIF_WORK_SYSCALL_ENTRY;
-
-	BUG_ON(regs != task_pt_regs(current));
-
-	/*
-	 * If we stepped into a sysenter/syscall insn, it trapped in
-	 * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
-	 * If user-mode had set TF itself, then it's still clear from
-	 * do_debug() and we need to set it again to restore the user
-	 * state.  If we entered on the slow path, TF was already set.
-	 */
-	if (work & _TIF_SINGLESTEP)
-		regs->flags |= X86_EFLAGS_TF;
-
-#ifdef CONFIG_SECCOMP
-	/*
-	 * Call seccomp_phase2 before running the other hooks so that
-	 * they can see any changes made by a seccomp tracer.
-	 */
-	if (phase1_result > 1 && seccomp_phase2(phase1_result)) {
-		/* seccomp failures shouldn't expose any additional code. */
-		return -1;
-	}
-#endif
-
-	if (unlikely(work & _TIF_SYSCALL_EMU))
-		ret = -1L;
-
-	if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) &&
-	    tracehook_report_syscall_entry(regs))
-		ret = -1L;
-
-	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
-		trace_sys_enter(regs, regs->orig_ax);
-
-	do_audit_syscall_entry(regs, arch);
-
-	return ret ?: regs->orig_ax;
-}
-
-long syscall_trace_enter(struct pt_regs *regs)
-{
-	u32 arch = is_ia32_task() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
-	unsigned long phase1_result = syscall_trace_enter_phase1(regs, arch);
-
-	if (phase1_result == 0)
-		return regs->orig_ax;
-	else
-		return syscall_trace_enter_phase2(regs, arch, phase1_result);
-}
-
-void syscall_trace_leave(struct pt_regs *regs)
-{
-	bool step;
-
-	/*
-	 * We may come here right after calling schedule_user()
-	 * or do_notify_resume(), in which case we can be in RCU
-	 * user mode.
-	 */
-	user_exit();
-
-	audit_syscall_exit(regs);
-
-	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
-		trace_sys_exit(regs, regs->ax);
-
-	/*
-	 * If TIF_SYSCALL_EMU is set, we only get here because of
-	 * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
-	 * We already reported this syscall instruction in
-	 * syscall_trace_enter().
-	 */
-	step = unlikely(test_thread_flag(TIF_SINGLESTEP)) &&
-			!test_thread_flag(TIF_SYSCALL_EMU);
-	if (step || test_thread_flag(TIF_SYSCALL_TRACE))
-		tracehook_report_syscall_exit(regs, step);
-
-	user_enter();
-}
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 86db4bcd7ce5..02693dd9a079 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -673,7 +673,7 @@ struct machine_ops machine_ops = {
 	.emergency_restart = native_machine_emergency_restart,
 	.restart = native_machine_restart,
 	.halt = native_machine_halt,
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 	.crash_shutdown = native_machine_crash_shutdown,
 #endif
 };
@@ -703,7 +703,7 @@ void machine_halt(void)
 	machine_ops.halt();
 }
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 void machine_crash_shutdown(struct pt_regs *regs)
 {
 	machine_ops.crash_shutdown(regs);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 80f874bf999e..fdb7f2a2d328 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -317,15 +317,12 @@ static u64 __init get_ramdisk_size(void)
 	return ramdisk_size;
 }
 
-#define MAX_MAP_CHUNK	(NR_FIX_BTMAPS << PAGE_SHIFT)
 static void __init relocate_initrd(void)
 {
 	/* Assume only end is not page aligned */
 	u64 ramdisk_image = get_ramdisk_image();
 	u64 ramdisk_size  = get_ramdisk_size();
 	u64 area_size     = PAGE_ALIGN(ramdisk_size);
-	unsigned long slop, clen, mapaddr;
-	char *p, *q;
 
 	/* We need to move the initrd down into directly mapped mem */
 	relocated_ramdisk = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
@@ -343,25 +340,8 @@ static void __init relocate_initrd(void)
 	printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n",
 	       relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1);
 
-	q = (char *)initrd_start;
-
-	/* Copy the initrd */
-	while (ramdisk_size) {
-		slop = ramdisk_image & ~PAGE_MASK;
-		clen = ramdisk_size;
-		if (clen > MAX_MAP_CHUNK-slop)
-			clen = MAX_MAP_CHUNK-slop;
-		mapaddr = ramdisk_image & PAGE_MASK;
-		p = early_memremap(mapaddr, clen+slop);
-		memcpy(q, p+slop, clen);
-		early_memunmap(p, clen+slop);
-		q += clen;
-		ramdisk_image += clen;
-		ramdisk_size  -= clen;
-	}
+	copy_from_early_mem((void *)initrd_start, ramdisk_image, ramdisk_size);
 
-	ramdisk_image = get_ramdisk_image();
-	ramdisk_size  = get_ramdisk_size();
 	printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to"
 		" [mem %#010llx-%#010llx]\n",
 		ramdisk_image, ramdisk_image + ramdisk_size - 1,
@@ -498,7 +478,7 @@ static void __init memblock_x86_reserve_range_setup_data(void)
  * --------- Crashkernel reservation ------------------------------
  */
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 
 /*
  * Keep the crash kernel below this limit.  On 32 bits earlier kernels
@@ -916,11 +896,6 @@ void __init setup_arch(char **cmdline_p)
 #ifdef CONFIG_X86_32
 	apm_info.bios = boot_params.apm_bios_info;
 	ist_info = boot_params.ist_info;
-	if (boot_params.sys_desc_table.length != 0) {
-		machine_id = boot_params.sys_desc_table.table[0];
-		machine_submodel_id = boot_params.sys_desc_table.table[1];
-		BIOS_revision = boot_params.sys_desc_table.table[2];
-	}
 #endif
 	saved_video_mode = boot_params.hdr.vid_mode;
 	bootloader_type = boot_params.hdr.type_of_loader;
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 71820c42b6ce..da52e6bb5c7f 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -31,11 +31,11 @@
 #include <asm/vdso.h>
 #include <asm/mce.h>
 #include <asm/sighandling.h>
+#include <asm/vm86.h>
 
 #ifdef CONFIG_X86_64
 #include <asm/proto.h>
 #include <asm/ia32_unistd.h>
-#include <asm/sys_ia32.h>
 #endif /* CONFIG_X86_64 */
 
 #include <asm/syscall.h>
@@ -632,6 +632,9 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
 	bool stepping, failed;
 	struct fpu *fpu = &current->thread.fpu;
 
+	if (v8086_mode(regs))
+		save_v86_state((struct kernel_vm86_regs *) regs, VM86_SIGNAL);
+
 	/* Are we from a system call? */
 	if (syscall_get_nr(current, regs) >= 0) {
 		/* If so, check system call restarting.. */
@@ -697,7 +700,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
  * want to handle. Thus you cannot kill init even with a SIGKILL even by
  * mistake.
  */
-static void do_signal(struct pt_regs *regs)
+void do_signal(struct pt_regs *regs)
 {
 	struct ksignal ksig;
 
@@ -732,32 +735,6 @@ static void do_signal(struct pt_regs *regs)
 	restore_saved_sigmask();
 }
 
-/*
- * notification of userspace execution resumption
- * - triggered by the TIF_WORK_MASK flags
- */
-__visible void
-do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
-{
-	user_exit();
-
-	if (thread_info_flags & _TIF_UPROBE)
-		uprobe_notify_resume(regs);
-
-	/* deal with pending signal delivery */
-	if (thread_info_flags & _TIF_SIGPENDING)
-		do_signal(regs);
-
-	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
-		clear_thread_flag(TIF_NOTIFY_RESUME);
-		tracehook_notify_resume(regs);
-	}
-	if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)
-		fire_user_return_notifiers();
-
-	user_enter();
-}
-
 void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
 {
 	struct task_struct *me = current;
diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c
new file mode 100644
index 000000000000..dc3c0b1c816f
--- /dev/null
+++ b/arch/x86/kernel/signal_compat.c
@@ -0,0 +1,95 @@
+#include <linux/compat.h>
+#include <linux/uaccess.h>
+
+int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from)
+{
+	int err = 0;	/* stays 0 unless put_user_catch() below changes it */
+	bool ia32 = test_thread_flag(TIF_IA32);	/* picks the __SI_CHLD layout */
+
+	if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t)))
+		return -EFAULT;	/* destination range failed access_ok() */
+
+	put_user_try {
+		/* If you change siginfo_t structure, please make sure that
+		   this code is fixed accordingly.
+		   It should never copy any pad contained in the structure
+		   to avoid security leaks, but must copy the generic
+		   3 ints plus the relevant union member.  */
+		put_user_ex(from->si_signo, &to->si_signo);
+		put_user_ex(from->si_errno, &to->si_errno);
+		put_user_ex((short)from->si_code, &to->si_code);
+
+		if (from->si_code < 0) {	/* negative code: pid, uid, si_ptr */
+			put_user_ex(from->si_pid, &to->si_pid);
+			put_user_ex(from->si_uid, &to->si_uid);
+			put_user_ex(ptr_to_compat(from->si_ptr), &to->si_ptr);
+		} else {
+			/*
+			 * First 32bits of unions are always present:
+			 * si_pid === si_band === si_tid === si_addr(LS half)
+			 */
+			put_user_ex(from->_sifields._pad[0],
+					  &to->_sifields._pad[0]);
+			switch (from->si_code >> 16) {
+			case __SI_FAULT >> 16:	/* only _pad[0] above is copied */
+				break;
+			case __SI_SYS >> 16:
+				put_user_ex(from->si_syscall, &to->si_syscall);
+				put_user_ex(from->si_arch, &to->si_arch);
+				break;
+			case __SI_CHLD >> 16:
+				if (ia32) {
+					put_user_ex(from->si_utime, &to->si_utime);
+					put_user_ex(from->si_stime, &to->si_stime);
+				} else {	/* TIF_IA32 clear: _sigchld_x32 fields */
+					put_user_ex(from->si_utime, &to->_sifields._sigchld_x32._utime);
+					put_user_ex(from->si_stime, &to->_sifields._sigchld_x32._stime);
+				}
+				put_user_ex(from->si_status, &to->si_status);
+				/* FALL THROUGH */
+			default:
+			case __SI_KILL >> 16:
+				put_user_ex(from->si_uid, &to->si_uid);
+				break;
+			case __SI_POLL >> 16:
+				put_user_ex(from->si_fd, &to->si_fd);
+				break;
+			case __SI_TIMER >> 16:
+				put_user_ex(from->si_overrun, &to->si_overrun);
+				put_user_ex(ptr_to_compat(from->si_ptr),
+					    &to->si_ptr);
+				break;
+				 /* This is not generated by the kernel as of now.  */
+			case __SI_RT >> 16:
+			case __SI_MESGQ >> 16:
+				put_user_ex(from->si_uid, &to->si_uid);
+				put_user_ex(from->si_int, &to->si_int);
+				break;
+			}
+		}
+	} put_user_catch(err);
+
+	return err;
+}
+
+int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
+{
+	int err = 0;	/* stays 0 unless get_user_catch() below changes it */
+	u32 ptr32;	/* si_ptr as read from the compat struct */
+
+	if (!access_ok(VERIFY_READ, from, sizeof(compat_siginfo_t)))
+		return -EFAULT;	/* source range failed access_ok() */
+
+	get_user_try {
+		get_user_ex(to->si_signo, &from->si_signo);
+		get_user_ex(to->si_errno, &from->si_errno);
+		get_user_ex(to->si_code, &from->si_code);
+
+		/* pid, uid and ptr are read whatever si_code holds */
+		get_user_ex(to->si_pid, &from->si_pid);
+		get_user_ex(to->si_uid, &from->si_uid);
+		get_user_ex(ptr32, &from->si_ptr);
+		to->si_ptr = compat_ptr(ptr32);
+	} get_user_catch(err);
+
+	return err;
+}
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index b1f3ed9c7a9e..e0c198e5f920 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -97,8 +97,6 @@ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
 DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
 EXPORT_PER_CPU_SYMBOL(cpu_info);
 
-atomic_t init_deasserted;
-
 static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
 {
 	unsigned long flags;
@@ -146,16 +144,11 @@ static void smp_callin(void)
 
 	/*
 	 * If waken up by an INIT in an 82489DX configuration
-	 * we may get here before an INIT-deassert IPI reaches
-	 * our local APIC.  We have to wait for the IPI or we'll
-	 * lock up on an APIC access.
-	 *
-	 * Since CPU0 is not wakened up by INIT, it doesn't wait for the IPI.
+	 * cpu_callout_mask guarantees we don't get here before
+	 * an INIT_deassert IPI reaches our local APIC, so it is
+	 * now safe to touch our local APIC.
 	 */
 	cpuid = smp_processor_id();
-	if (apic->wait_for_init_deassert && cpuid)
-		while (!atomic_read(&init_deasserted))
-			cpu_relax();
 
 	/*
 	 * (This works even if the APIC is not enabled.)
@@ -620,7 +613,6 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
 	send_status = safe_apic_wait_icr_idle();
 
 	mb();
-	atomic_set(&init_deasserted, 1);
 
 	/*
 	 * Should we send STARTUP IPIs ?
@@ -665,7 +657,8 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
 		/*
 		 * Give the other CPU some time to accept the IPI.
 		 */
-		udelay(300);
+		if (init_udelay)
+			udelay(300);
 
 		pr_debug("Startup point 1\n");
 
@@ -675,7 +668,8 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
 		/*
 		 * Give the other CPU some time to accept the IPI.
 		 */
-		udelay(200);
+		if (init_udelay)
+			udelay(200);
 
 		if (maxlvt > 3)		/* Due to the Pentium erratum 3AP.  */
 			apic_write(APIC_ESR, 0);
@@ -859,8 +853,6 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
 	 * the targeted processor.
 	 */
 
-	atomic_set(&init_deasserted, 0);
-
 	if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
 
 		pr_debug("Setting warm reset code and vector.\n");
@@ -898,7 +890,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
 
 	if (!boot_error) {
 		/*
-		 * Wait 10s total for a response from AP
+		 * Wait 10s total for first sign of life from AP
 		 */
 		boot_error = -1;
 		timeout = jiffies + 10*HZ;
@@ -911,7 +903,6 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
 				boot_error = 0;
 				break;
 			}
-			udelay(100);
 			schedule();
 		}
 	}
@@ -927,7 +918,6 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
 			 * for the MTRR work(triggered by the AP coming online)
 			 * to be completed in the stop machine context.
 			 */
-			udelay(100);
 			schedule();
 		}
 	}
@@ -1358,7 +1348,7 @@ static void remove_siblinginfo(int cpu)
 	cpumask_clear_cpu(cpu, cpu_sibling_setup_mask);
 }
 
-static void __ref remove_cpu_from_maps(int cpu)
+static void remove_cpu_from_maps(int cpu)
 {
 	set_cpu_online(cpu, false);
 	cpumask_clear_cpu(cpu, cpu_callout_mask);
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 0ccb53a9fcd9..c9a073866ca7 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -18,6 +18,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re
 		return addr;
 	}
 
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
 	/*
 	 * We'll assume that the code segments in the GDT
 	 * are all zero-based. That is largely true: the
@@ -45,6 +46,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re
 		}
 		mutex_unlock(&child->mm->context.lock);
 	}
+#endif
 
 	return addr;
 }
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
index 649b010da00b..12cbe2b88c0f 100644
--- a/arch/x86/kernel/topology.c
+++ b/arch/x86/kernel/topology.c
@@ -57,7 +57,7 @@ __setup("cpu0_hotplug", enable_cpu0_hotplug);
  *
  * This is only called for debugging CPU offline/online feature.
  */
-int __ref _debug_hotplug_cpu(int cpu, int action)
+int _debug_hotplug_cpu(int cpu, int action)
 {
 	struct device *dev = get_cpu_device(cpu);
 	int ret;
@@ -104,7 +104,7 @@ static int __init debug_hotplug_cpu(void)
 late_initcall_sync(debug_hotplug_cpu);
 #endif /* CONFIG_DEBUG_HOTPLUG_CPU0 */
 
-int __ref arch_register_cpu(int num)
+int arch_register_cpu(int num)
 {
 	struct cpuinfo_x86 *c = &cpu_data(num);
 
diff --git a/arch/x86/kernel/trace_clock.c b/arch/x86/kernel/trace_clock.c
index 25b993729f9b..80bb24d9b880 100644
--- a/arch/x86/kernel/trace_clock.c
+++ b/arch/x86/kernel/trace_clock.c
@@ -12,10 +12,5 @@
  */
 u64 notrace trace_clock_x86_tsc(void)
 {
-	u64 ret;
-
-	rdtsc_barrier();
-	rdtscll(ret);
-
-	return ret;
+	return rdtsc_ordered();
 }
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index c5a5231d1d11..346eec73f7db 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -62,6 +62,7 @@
 #include <asm/fpu/xstate.h>
 #include <asm/trace/mpx.h>
 #include <asm/mpx.h>
+#include <asm/vm86.h>
 
 #ifdef CONFIG_X86_64
 #include <asm/x86_init.h>
@@ -108,13 +109,10 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
 	preempt_count_dec();
 }
 
-enum ctx_state ist_enter(struct pt_regs *regs)
+void ist_enter(struct pt_regs *regs)
 {
-	enum ctx_state prev_state;
-
 	if (user_mode(regs)) {
-		/* Other than that, we're just an exception. */
-		prev_state = exception_enter();
+		RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
 	} else {
 		/*
 		 * We might have interrupted pretty much anything.  In
@@ -123,32 +121,25 @@ enum ctx_state ist_enter(struct pt_regs *regs)
 		 * but we need to notify RCU.
 		 */
 		rcu_nmi_enter();
-		prev_state = CONTEXT_KERNEL;  /* the value is irrelevant. */
 	}
 
 	/*
-	 * We are atomic because we're on the IST stack (or we're on x86_32,
-	 * in which case we still shouldn't schedule).
-	 *
-	 * This must be after exception_enter(), because exception_enter()
-	 * won't do anything if in_interrupt() returns true.
+	 * We are atomic because we're on the IST stack; or we're on
+	 * x86_32, in which case we still shouldn't schedule; or we're
+	 * on x86_64 and entered from user mode, in which case we're
+	 * still atomic unless ist_begin_non_atomic is called.
 	 */
 	preempt_count_add(HARDIRQ_OFFSET);
 
 	/* This code is a bit fragile.  Test it. */
 	RCU_LOCKDEP_WARN(!rcu_is_watching(), "ist_enter didn't work");
-
-	return prev_state;
 }
 
-void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
+void ist_exit(struct pt_regs *regs)
 {
-	/* Must be before exception_exit. */
 	preempt_count_sub(HARDIRQ_OFFSET);
 
-	if (user_mode(regs))
-		return exception_exit(prev_state);
-	else
+	if (!user_mode(regs))
 		rcu_nmi_exit();
 }
 
@@ -162,7 +153,7 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
  * a double fault, it can be safe to schedule.  ist_begin_non_atomic()
  * begins a non-atomic section within an ist_enter()/ist_exit() region.
  * Callers are responsible for enabling interrupts themselves inside
- * the non-atomic section, and callers must call is_end_non_atomic()
+ * the non-atomic section, and callers must call ist_end_non_atomic()
  * before ist_exit().
  */
 void ist_begin_non_atomic(struct pt_regs *regs)
@@ -289,17 +280,16 @@ NOKPROBE_SYMBOL(do_trap);
 static void do_error_trap(struct pt_regs *regs, long error_code, char *str,
 			  unsigned long trapnr, int signr)
 {
-	enum ctx_state prev_state = exception_enter();
 	siginfo_t info;
 
+	RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
+
 	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) !=
 			NOTIFY_STOP) {
 		conditional_sti(regs);
 		do_trap(trapnr, signr, str, regs, error_code,
 			fill_trap_info(regs, signr, trapnr, &info));
 	}
-
-	exception_exit(prev_state);
 }
 
 #define DO_ERROR(trapnr, signr, str, name)				\
@@ -351,7 +341,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
 	}
 #endif
 
-	ist_enter(regs);  /* Discard prev_state because we won't return. */
+	ist_enter(regs);
 	notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);
 
 	tsk->thread.error_code = error_code;
@@ -371,14 +361,13 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
 
 dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
 {
-	enum ctx_state prev_state;
 	const struct bndcsr *bndcsr;
 	siginfo_t *info;
 
-	prev_state = exception_enter();
+	RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
 	if (notify_die(DIE_TRAP, "bounds", regs, error_code,
 			X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP)
-		goto exit;
+		return;
 	conditional_sti(regs);
 
 	if (!user_mode(regs))
@@ -435,9 +424,8 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
 		die("bounds", regs, error_code);
 	}
 
-exit:
-	exception_exit(prev_state);
 	return;
+
 exit_trap:
 	/*
 	 * This path out is for all the cases where we could not
@@ -447,35 +435,33 @@ exit_trap:
 	 * time..
 	 */
 	do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, NULL);
-	exception_exit(prev_state);
 }
 
 dotraplinkage void
 do_general_protection(struct pt_regs *regs, long error_code)
 {
 	struct task_struct *tsk;
-	enum ctx_state prev_state;
 
-	prev_state = exception_enter();
+	RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
 	conditional_sti(regs);
 
 	if (v8086_mode(regs)) {
 		local_irq_enable();
 		handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
-		goto exit;
+		return;
 	}
 
 	tsk = current;
 	if (!user_mode(regs)) {
 		if (fixup_exception(regs))
-			goto exit;
+			return;
 
 		tsk->thread.error_code = error_code;
 		tsk->thread.trap_nr = X86_TRAP_GP;
 		if (notify_die(DIE_GPF, "general protection fault", regs, error_code,
 			       X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP)
 			die("general protection fault", regs, error_code);
-		goto exit;
+		return;
 	}
 
 	tsk->thread.error_code = error_code;
@@ -491,16 +477,12 @@ do_general_protection(struct pt_regs *regs, long error_code)
 	}
 
 	force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk);
-exit:
-	exception_exit(prev_state);
 }
 NOKPROBE_SYMBOL(do_general_protection);
 
 /* May run on IST stack. */
 dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
 {
-	enum ctx_state prev_state;
-
 #ifdef CONFIG_DYNAMIC_FTRACE
 	/*
 	 * ftrace must be first, everything else may cause a recursive crash.
@@ -513,7 +495,8 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
 	if (poke_int3_handler(regs))
 		return;
 
-	prev_state = ist_enter(regs);
+	ist_enter(regs);
+	RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
 #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
 	if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
 				SIGTRAP) == NOTIFY_STOP)
@@ -539,7 +522,7 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
 	preempt_conditional_cli(regs);
 	debug_stack_usage_dec();
 exit:
-	ist_exit(regs, prev_state);
+	ist_exit(regs);
 }
 NOKPROBE_SYMBOL(do_int3);
 
@@ -615,12 +598,11 @@ NOKPROBE_SYMBOL(fixup_bad_iret);
 dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
 {
 	struct task_struct *tsk = current;
-	enum ctx_state prev_state;
 	int user_icebp = 0;
 	unsigned long dr6;
 	int si_code;
 
-	prev_state = ist_enter(regs);
+	ist_enter(regs);
 
 	get_debugreg(dr6, 6);
 
@@ -695,7 +677,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
 	debug_stack_usage_dec();
 
 exit:
-	ist_exit(regs, prev_state);
+	ist_exit(regs);
 }
 NOKPROBE_SYMBOL(do_debug);
 
@@ -747,21 +729,15 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
 
 dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
 {
-	enum ctx_state prev_state;
-
-	prev_state = exception_enter();
+	RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
 	math_error(regs, error_code, X86_TRAP_MF);
-	exception_exit(prev_state);
 }
 
 dotraplinkage void
 do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
 {
-	enum ctx_state prev_state;
-
-	prev_state = exception_enter();
+	RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
 	math_error(regs, error_code, X86_TRAP_XF);
-	exception_exit(prev_state);
 }
 
 dotraplinkage void
@@ -773,9 +749,7 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
 dotraplinkage void
 do_device_not_available(struct pt_regs *regs, long error_code)
 {
-	enum ctx_state prev_state;
-
-	prev_state = exception_enter();
+	RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
 	BUG_ON(use_eager_fpu());
 
 #ifdef CONFIG_MATH_EMULATION
@@ -786,7 +760,6 @@ do_device_not_available(struct pt_regs *regs, long error_code)
 
 		info.regs = regs;
 		math_emulate(&info);
-		exception_exit(prev_state);
 		return;
 	}
 #endif
@@ -794,7 +767,6 @@ do_device_not_available(struct pt_regs *regs, long error_code)
 #ifdef CONFIG_X86_32
 	conditional_sti(regs);
 #endif
-	exception_exit(prev_state);
 }
 NOKPROBE_SYMBOL(do_device_not_available);
 
@@ -802,9 +774,8 @@ NOKPROBE_SYMBOL(do_device_not_available);
 dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
 {
 	siginfo_t info;
-	enum ctx_state prev_state;
 
-	prev_state = exception_enter();
+	RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
 	local_irq_enable();
 
 	info.si_signo = SIGILL;
@@ -816,7 +787,6 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
 		do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code,
 			&info);
 	}
-	exception_exit(prev_state);
 }
 #endif
 
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 88e9a38c71a5..c8d52cb4cb6e 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -38,7 +38,7 @@ static int __read_mostly tsc_unstable;
    erroneous rdtsc usage on !cpu_has_tsc processors */
 static int __read_mostly tsc_disabled = -1;
 
-static struct static_key __use_tsc = STATIC_KEY_INIT;
+static DEFINE_STATIC_KEY_FALSE(__use_tsc);
 
 int tsc_clocksource_reliable;
 
@@ -248,7 +248,7 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
 
 	data = cyc2ns_write_begin(cpu);
 
-	rdtscll(tsc_now);
+	tsc_now = rdtsc();
 	ns_now = cycles_2_ns(tsc_now);
 
 	/*
@@ -274,7 +274,12 @@ done:
  */
 u64 native_sched_clock(void)
 {
-	u64 tsc_now;
+	if (static_branch_likely(&__use_tsc)) {
+		u64 tsc_now = rdtsc();
+
+		/* return the value in ns */
+		return cycles_2_ns(tsc_now);
+	}
 
 	/*
 	 * Fall back to jiffies if there's no TSC available:
@@ -284,16 +289,9 @@ u64 native_sched_clock(void)
 	 *   very important for it to be as fast as the platform
 	 *   can achieve it. )
 	 */
-	if (!static_key_false(&__use_tsc)) {
-		/* No locking but a rare wrong value is not a big deal: */
-		return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
-	}
-
-	/* read the Time Stamp Counter: */
-	rdtscll(tsc_now);
 
-	/* return the value in ns */
-	return cycles_2_ns(tsc_now);
+	/* No locking but a rare wrong value is not a big deal: */
+	return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
 }
 
 /*
@@ -316,12 +314,6 @@ unsigned long long
 sched_clock(void) __attribute__((alias("native_sched_clock")));
 #endif
 
-unsigned long long native_read_tsc(void)
-{
-	return __native_read_tsc();
-}
-EXPORT_SYMBOL(native_read_tsc);
-
 int check_tsc_unstable(void)
 {
 	return tsc_unstable;
@@ -984,7 +976,7 @@ static struct clocksource clocksource_tsc;
  */
 static cycle_t read_tsc(struct clocksource *cs)
 {
-	return (cycle_t)get_cycles();
+	return (cycle_t)rdtsc_ordered();
 }
 
 /*
@@ -1218,7 +1210,7 @@ void __init tsc_init(void)
 	/* now allow native_sched_clock() to use rdtsc */
 
 	tsc_disabled = 0;
-	static_key_slow_inc(&__use_tsc);
+	static_branch_enable(&__use_tsc);
 
 	if (!no_sched_irq_time)
 		enable_sched_clock_irqtime();
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index dd8d0791dfb5..78083bf23ed1 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -39,16 +39,15 @@ static cycles_t max_warp;
 static int nr_warps;
 
 /*
- * TSC-warp measurement loop running on both CPUs:
+ * TSC-warp measurement loop running on both CPUs.  This is not called
+ * if there is no TSC.
  */
 static void check_tsc_warp(unsigned int timeout)
 {
 	cycles_t start, now, prev, end;
 	int i;
 
-	rdtsc_barrier();
-	start = get_cycles();
-	rdtsc_barrier();
+	start = rdtsc_ordered();
 	/*
 	 * The measurement runs for 'timeout' msecs:
 	 */
@@ -63,9 +62,7 @@ static void check_tsc_warp(unsigned int timeout)
 		 */
 		arch_spin_lock(&sync_lock);
 		prev = last_tsc;
-		rdtsc_barrier();
-		now = get_cycles();
-		rdtsc_barrier();
+		now = rdtsc_ordered();
 		last_tsc = now;
 		arch_spin_unlock(&sync_lock);
 
@@ -126,7 +123,7 @@ void check_tsc_sync_source(int cpu)
 
 	/*
 	 * No need to check if we already know that the TSC is not
-	 * synchronized:
+	 * synchronized or if we have no TSC.
 	 */
 	if (unsynchronized_tsc())
 		return;
@@ -190,6 +187,7 @@ void check_tsc_sync_target(void)
 {
 	int cpus = 2;
 
+	/* Also aborts if there is no TSC. */
 	if (unsynchronized_tsc() || tsc_clocksource_reliable)
 		return;
 
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index fc9db6ef2a95..abd8b856bd2b 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -44,11 +44,14 @@
 #include <linux/ptrace.h>
 #include <linux/audit.h>
 #include <linux/stddef.h>
+#include <linux/slab.h>
 
 #include <asm/uaccess.h>
 #include <asm/io.h>
 #include <asm/tlbflush.h>
 #include <asm/irq.h>
+#include <asm/traps.h>
+#include <asm/vm86.h>
 
 /*
  * Known problems:
@@ -66,10 +69,6 @@
  */
 
 
-#define KVM86	((struct kernel_vm86_struct *)regs)
-#define VMPI	KVM86->vm86plus
-
-
 /*
  * 8- and 16-bit register defines..
  */
@@ -81,8 +80,8 @@
 /*
  * virtual flags (16 and 32-bit versions)
  */
-#define VFLAGS	(*(unsigned short *)&(current->thread.v86flags))
-#define VEFLAGS	(current->thread.v86flags)
+#define VFLAGS	(*(unsigned short *)&(current->thread.vm86->veflags))
+#define VEFLAGS	(current->thread.vm86->veflags)
 
 #define set_flags(X, new, mask) \
 ((X) = ((X) & ~(mask)) | ((new) & (mask)))
@@ -90,46 +89,13 @@
 #define SAFE_MASK	(0xDD5)
 #define RETURN_MASK	(0xDFF)
 
-/* convert kernel_vm86_regs to vm86_regs */
-static int copy_vm86_regs_to_user(struct vm86_regs __user *user,
-				  const struct kernel_vm86_regs *regs)
-{
-	int ret = 0;
-
-	/*
-	 * kernel_vm86_regs is missing gs, so copy everything up to
-	 * (but not including) orig_eax, and then rest including orig_eax.
-	 */
-	ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.orig_ax));
-	ret += copy_to_user(&user->orig_eax, &regs->pt.orig_ax,
-			    sizeof(struct kernel_vm86_regs) -
-			    offsetof(struct kernel_vm86_regs, pt.orig_ax));
-
-	return ret;
-}
-
-/* convert vm86_regs to kernel_vm86_regs */
-static int copy_vm86_regs_from_user(struct kernel_vm86_regs *regs,
-				    const struct vm86_regs __user *user,
-				    unsigned extra)
-{
-	int ret = 0;
-
-	/* copy ax-fs inclusive */
-	ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.orig_ax));
-	/* copy orig_ax-__gsh+extra */
-	ret += copy_from_user(&regs->pt.orig_ax, &user->orig_eax,
-			      sizeof(struct kernel_vm86_regs) -
-			      offsetof(struct kernel_vm86_regs, pt.orig_ax) +
-			      extra);
-	return ret;
-}
-
-struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
+void save_v86_state(struct kernel_vm86_regs *regs, int retval)
 {
 	struct tss_struct *tss;
-	struct pt_regs *ret;
-	unsigned long tmp;
+	struct task_struct *tsk = current;
+	struct vm86plus_struct __user *user;
+	struct vm86 *vm86 = current->thread.vm86;
+	long err = 0;
 
 	/*
 	 * This gets called from entry.S with interrupts disabled, but
@@ -138,31 +104,57 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
 	 */
 	local_irq_enable();
 
-	if (!current->thread.vm86_info) {
-		pr_alert("no vm86_info: BAD\n");
+	if (!vm86 || !vm86->user_vm86) {
+		pr_alert("no user_vm86: BAD\n");
 		do_exit(SIGSEGV);
 	}
-	set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | current->thread.v86mask);
-	tmp = copy_vm86_regs_to_user(&current->thread.vm86_info->regs, regs);
-	tmp += put_user(current->thread.screen_bitmap, &current->thread.vm86_info->screen_bitmap);
-	if (tmp) {
-		pr_alert("could not access userspace vm86_info\n");
+	set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | vm86->veflags_mask);
+	user = vm86->user_vm86;
+
+	if (!access_ok(VERIFY_WRITE, user, vm86->vm86plus.is_vm86pus ?
+		       sizeof(struct vm86plus_struct) :
+		       sizeof(struct vm86_struct))) {
+		pr_alert("could not access userspace vm86 info\n");
+		do_exit(SIGSEGV);
+	}
+
+	put_user_try {
+		put_user_ex(regs->pt.bx, &user->regs.ebx);
+		put_user_ex(regs->pt.cx, &user->regs.ecx);
+		put_user_ex(regs->pt.dx, &user->regs.edx);
+		put_user_ex(regs->pt.si, &user->regs.esi);
+		put_user_ex(regs->pt.di, &user->regs.edi);
+		put_user_ex(regs->pt.bp, &user->regs.ebp);
+		put_user_ex(regs->pt.ax, &user->regs.eax);
+		put_user_ex(regs->pt.ip, &user->regs.eip);
+		put_user_ex(regs->pt.cs, &user->regs.cs);
+		put_user_ex(regs->pt.flags, &user->regs.eflags);
+		put_user_ex(regs->pt.sp, &user->regs.esp);
+		put_user_ex(regs->pt.ss, &user->regs.ss);
+		put_user_ex(regs->es, &user->regs.es);
+		put_user_ex(regs->ds, &user->regs.ds);
+		put_user_ex(regs->fs, &user->regs.fs);
+		put_user_ex(regs->gs, &user->regs.gs);
+
+		put_user_ex(vm86->screen_bitmap, &user->screen_bitmap);
+	} put_user_catch(err);
+	if (err) {
+		pr_alert("could not access userspace vm86 info\n");
 		do_exit(SIGSEGV);
 	}
 
 	tss = &per_cpu(cpu_tss, get_cpu());
-	current->thread.sp0 = current->thread.saved_sp0;
-	current->thread.sysenter_cs = __KERNEL_CS;
-	load_sp0(tss, &current->thread);
-	current->thread.saved_sp0 = 0;
+	tsk->thread.sp0 = vm86->saved_sp0;
+	tsk->thread.sysenter_cs = __KERNEL_CS;
+	load_sp0(tss, &tsk->thread);
+	vm86->saved_sp0 = 0;
 	put_cpu();
 
-	ret = KVM86->regs32;
+	memcpy(&regs->pt, &vm86->regs32, sizeof(struct pt_regs));
 
-	ret->fs = current->thread.saved_fs;
-	set_user_gs(ret, current->thread.saved_gs);
+	lazy_load_gs(vm86->regs32.gs);
 
-	return ret;
+	regs->pt.ax = retval;
 }
 
 static void mark_screen_rdonly(struct mm_struct *mm)
@@ -200,45 +192,16 @@ out:
 
 
 static int do_vm86_irq_handling(int subfunction, int irqnumber);
-static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk);
+static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus);
 
-SYSCALL_DEFINE1(vm86old, struct vm86_struct __user *, v86)
+SYSCALL_DEFINE1(vm86old, struct vm86_struct __user *, user_vm86)
 {
-	struct kernel_vm86_struct info; /* declare this _on top_,
-					 * this avoids wasting of stack space.
-					 * This remains on the stack until we
-					 * return to 32 bit user space.
-					 */
-	struct task_struct *tsk = current;
-	int tmp;
-
-	if (tsk->thread.saved_sp0)
-		return -EPERM;
-	tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
-				       offsetof(struct kernel_vm86_struct, vm86plus) -
-				       sizeof(info.regs));
-	if (tmp)
-		return -EFAULT;
-	memset(&info.vm86plus, 0, (int)&info.regs32 - (int)&info.vm86plus);
-	info.regs32 = current_pt_regs();
-	tsk->thread.vm86_info = v86;
-	do_sys_vm86(&info, tsk);
-	return 0;	/* we never return here */
+	return do_sys_vm86((struct vm86plus_struct __user *) user_vm86, false);
 }
 
 
 SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg)
 {
-	struct kernel_vm86_struct info; /* declare this _on top_,
-					 * this avoids wasting of stack space.
-					 * This remains on the stack until we
-					 * return to 32 bit user space.
-					 */
-	struct task_struct *tsk;
-	int tmp;
-	struct vm86plus_struct __user *v86;
-
-	tsk = current;
 	switch (cmd) {
 	case VM86_REQUEST_IRQ:
 	case VM86_FREE_IRQ:
@@ -256,114 +219,133 @@ SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg)
 	}
 
 	/* we come here only for functions VM86_ENTER, VM86_ENTER_NO_BYPASS */
-	if (tsk->thread.saved_sp0)
-		return -EPERM;
-	v86 = (struct vm86plus_struct __user *)arg;
-	tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
-				       offsetof(struct kernel_vm86_struct, regs32) -
-				       sizeof(info.regs));
-	if (tmp)
-		return -EFAULT;
-	info.regs32 = current_pt_regs();
-	info.vm86plus.is_vm86pus = 1;
-	tsk->thread.vm86_info = (struct vm86_struct __user *)v86;
-	do_sys_vm86(&info, tsk);
-	return 0;	/* we never return here */
+	return do_sys_vm86((struct vm86plus_struct __user *) arg, true);
 }
 
 
-static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk)
+static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
 {
 	struct tss_struct *tss;
-/*
- * make sure the vm86() system call doesn't try to do anything silly
- */
-	info->regs.pt.ds = 0;
-	info->regs.pt.es = 0;
-	info->regs.pt.fs = 0;
-#ifndef CONFIG_X86_32_LAZY_GS
-	info->regs.pt.gs = 0;
-#endif
+	struct task_struct *tsk = current;
+	struct vm86 *vm86 = tsk->thread.vm86;
+	struct kernel_vm86_regs vm86regs;
+	struct pt_regs *regs = current_pt_regs();
+	unsigned long err = 0;
+
+	if (!vm86) {
+		if (!(vm86 = kzalloc(sizeof(*vm86), GFP_KERNEL)))
+			return -ENOMEM;
+		tsk->thread.vm86 = vm86;
+	}
+	if (vm86->saved_sp0)
+		return -EPERM;
+
+	if (!access_ok(VERIFY_READ, user_vm86, plus ?
+		       sizeof(struct vm86plus_struct) :
+		       sizeof(struct vm86_struct)))
+		return -EFAULT;
+
+	memset(&vm86regs, 0, sizeof(vm86regs));
+	get_user_try {
+		unsigned short seg;
+		get_user_ex(vm86regs.pt.bx, &user_vm86->regs.ebx);
+		get_user_ex(vm86regs.pt.cx, &user_vm86->regs.ecx);
+		get_user_ex(vm86regs.pt.dx, &user_vm86->regs.edx);
+		get_user_ex(vm86regs.pt.si, &user_vm86->regs.esi);
+		get_user_ex(vm86regs.pt.di, &user_vm86->regs.edi);
+		get_user_ex(vm86regs.pt.bp, &user_vm86->regs.ebp);
+		get_user_ex(vm86regs.pt.ax, &user_vm86->regs.eax);
+		get_user_ex(vm86regs.pt.ip, &user_vm86->regs.eip);
+		get_user_ex(seg, &user_vm86->regs.cs);
+		vm86regs.pt.cs = seg;
+		get_user_ex(vm86regs.pt.flags, &user_vm86->regs.eflags);
+		get_user_ex(vm86regs.pt.sp, &user_vm86->regs.esp);
+		get_user_ex(seg, &user_vm86->regs.ss);
+		vm86regs.pt.ss = seg;
+		get_user_ex(vm86regs.es, &user_vm86->regs.es);
+		get_user_ex(vm86regs.ds, &user_vm86->regs.ds);
+		get_user_ex(vm86regs.fs, &user_vm86->regs.fs);
+		get_user_ex(vm86regs.gs, &user_vm86->regs.gs);
+
+		get_user_ex(vm86->flags, &user_vm86->flags);
+		get_user_ex(vm86->screen_bitmap, &user_vm86->screen_bitmap);
+		get_user_ex(vm86->cpu_type, &user_vm86->cpu_type);
+	} get_user_catch(err);
+	if (err)
+		return err;
+
+	if (copy_from_user(&vm86->int_revectored,
+			   &user_vm86->int_revectored,
+			   sizeof(struct revectored_struct)))
+		return -EFAULT;
+	if (copy_from_user(&vm86->int21_revectored,
+			   &user_vm86->int21_revectored,
+			   sizeof(struct revectored_struct)))
+		return -EFAULT;
+	if (plus) {
+		if (copy_from_user(&vm86->vm86plus, &user_vm86->vm86plus,
+				   sizeof(struct vm86plus_info_struct)))
+			return -EFAULT;
+		vm86->vm86plus.is_vm86pus = 1;
+	} else
+		memset(&vm86->vm86plus, 0,
+		       sizeof(struct vm86plus_info_struct));
+
+	memcpy(&vm86->regs32, regs, sizeof(struct pt_regs));
+	vm86->user_vm86 = user_vm86;
 
 /*
  * The flags register is also special: we cannot trust that the user
  * has set it up safely, so this makes sure interrupt etc flags are
  * inherited from protected mode.
  */
-	VEFLAGS = info->regs.pt.flags;
-	info->regs.pt.flags &= SAFE_MASK;
-	info->regs.pt.flags |= info->regs32->flags & ~SAFE_MASK;
-	info->regs.pt.flags |= X86_VM_MASK;
+	VEFLAGS = vm86regs.pt.flags;
+	vm86regs.pt.flags &= SAFE_MASK;
+	vm86regs.pt.flags |= regs->flags & ~SAFE_MASK;
+	vm86regs.pt.flags |= X86_VM_MASK;
+
+	vm86regs.pt.orig_ax = regs->orig_ax;
 
-	switch (info->cpu_type) {
+	switch (vm86->cpu_type) {
 	case CPU_286:
-		tsk->thread.v86mask = 0;
+		vm86->veflags_mask = 0;
 		break;
 	case CPU_386:
-		tsk->thread.v86mask = X86_EFLAGS_NT | X86_EFLAGS_IOPL;
+		vm86->veflags_mask = X86_EFLAGS_NT | X86_EFLAGS_IOPL;
 		break;
 	case CPU_486:
-		tsk->thread.v86mask = X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL;
+		vm86->veflags_mask = X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL;
 		break;
 	default:
-		tsk->thread.v86mask = X86_EFLAGS_ID | X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL;
+		vm86->veflags_mask = X86_EFLAGS_ID | X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL;
 		break;
 	}
 
 /*
- * Save old state, set default return value (%ax) to 0 (VM86_SIGNAL)
+ * Save old state
  */
-	info->regs32->ax = VM86_SIGNAL;
-	tsk->thread.saved_sp0 = tsk->thread.sp0;
-	tsk->thread.saved_fs = info->regs32->fs;
-	tsk->thread.saved_gs = get_user_gs(info->regs32);
+	vm86->saved_sp0 = tsk->thread.sp0;
+	lazy_save_gs(vm86->regs32.gs);
 
 	tss = &per_cpu(cpu_tss, get_cpu());
-	tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0;
+	/* make room for real-mode segments */
+	tsk->thread.sp0 += 16;
 	if (cpu_has_sep)
 		tsk->thread.sysenter_cs = 0;
 	load_sp0(tss, &tsk->thread);
 	put_cpu();
 
-	tsk->thread.screen_bitmap = info->screen_bitmap;
-	if (info->flags & VM86_SCREEN_BITMAP)
+	if (vm86->flags & VM86_SCREEN_BITMAP)
 		mark_screen_rdonly(tsk->mm);
 
-	/*call __audit_syscall_exit since we do not exit via the normal paths */
-#ifdef CONFIG_AUDITSYSCALL
-	if (unlikely(current->audit_context))
-		__audit_syscall_exit(1, 0);
-#endif
-
-	__asm__ __volatile__(
-		"movl %0,%%esp\n\t"
-		"movl %1,%%ebp\n\t"
-#ifdef CONFIG_X86_32_LAZY_GS
-		"mov  %2, %%gs\n\t"
-#endif
-		"jmp resume_userspace"
-		: /* no outputs */
-		:"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0));
-	/* we never return here */
-}
-
-static inline void return_to_32bit(struct kernel_vm86_regs *regs16, int retval)
-{
-	struct pt_regs *regs32;
-
-	regs32 = save_v86_state(regs16);
-	regs32->ax = retval;
-	__asm__ __volatile__("movl %0,%%esp\n\t"
-		"movl %1,%%ebp\n\t"
-		"jmp resume_userspace"
-		: : "r" (regs32), "r" (current_thread_info()));
+	memcpy((struct kernel_vm86_regs *)regs, &vm86regs, sizeof(vm86regs));
+	force_iret();
+	return regs->ax;
 }
 
 static inline void set_IF(struct kernel_vm86_regs *regs)
 {
 	VEFLAGS |= X86_EFLAGS_VIF;
-	if (VEFLAGS & X86_EFLAGS_VIP)
-		return_to_32bit(regs, VM86_STI);
 }
 
 static inline void clear_IF(struct kernel_vm86_regs *regs)
@@ -395,7 +377,7 @@ static inline void clear_AC(struct kernel_vm86_regs *regs)
 
 static inline void set_vflags_long(unsigned long flags, struct kernel_vm86_regs *regs)
 {
-	set_flags(VEFLAGS, flags, current->thread.v86mask);
+	set_flags(VEFLAGS, flags, current->thread.vm86->veflags_mask);
 	set_flags(regs->pt.flags, flags, SAFE_MASK);
 	if (flags & X86_EFLAGS_IF)
 		set_IF(regs);
@@ -405,7 +387,7 @@ static inline void set_vflags_long(unsigned long flags, struct kernel_vm86_regs
 
 static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs *regs)
 {
-	set_flags(VFLAGS, flags, current->thread.v86mask);
+	set_flags(VFLAGS, flags, current->thread.vm86->veflags_mask);
 	set_flags(regs->pt.flags, flags, SAFE_MASK);
 	if (flags & X86_EFLAGS_IF)
 		set_IF(regs);
@@ -420,7 +402,7 @@ static inline unsigned long get_vflags(struct kernel_vm86_regs *regs)
 	if (VEFLAGS & X86_EFLAGS_VIF)
 		flags |= X86_EFLAGS_IF;
 	flags |= X86_EFLAGS_IOPL;
-	return flags | (VEFLAGS & current->thread.v86mask);
+	return flags | (VEFLAGS & current->thread.vm86->veflags_mask);
 }
 
 static inline int is_revectored(int nr, struct revectored_struct *bitmap)
@@ -518,12 +500,13 @@ static void do_int(struct kernel_vm86_regs *regs, int i,
 {
 	unsigned long __user *intr_ptr;
 	unsigned long segoffs;
+	struct vm86 *vm86 = current->thread.vm86;
 
 	if (regs->pt.cs == BIOSSEG)
 		goto cannot_handle;
-	if (is_revectored(i, &KVM86->int_revectored))
+	if (is_revectored(i, &vm86->int_revectored))
 		goto cannot_handle;
-	if (i == 0x21 && is_revectored(AH(regs), &KVM86->int21_revectored))
+	if (i == 0x21 && is_revectored(AH(regs), &vm86->int21_revectored))
 		goto cannot_handle;
 	intr_ptr = (unsigned long __user *) (i << 2);
 	if (get_user(segoffs, intr_ptr))
@@ -542,18 +525,16 @@ static void do_int(struct kernel_vm86_regs *regs, int i,
 	return;
 
 cannot_handle:
-	return_to_32bit(regs, VM86_INTx + (i << 8));
+	save_v86_state(regs, VM86_INTx + (i << 8));
 }
 
 int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno)
 {
-	if (VMPI.is_vm86pus) {
+	struct vm86 *vm86 = current->thread.vm86;
+
+	if (vm86->vm86plus.is_vm86pus) {
 		if ((trapno == 3) || (trapno == 1)) {
-			KVM86->regs32->ax = VM86_TRAP + (trapno << 8);
-			/* setting this flag forces the code in entry_32.S to
-			   the path where we call save_v86_state() and change
-			   the stack pointer to KVM86->regs32 */
-			set_thread_flag(TIF_NOTIFY_RESUME);
+			save_v86_state(regs, VM86_TRAP + (trapno << 8));
 			return 0;
 		}
 		do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs));
@@ -574,16 +555,11 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
 	unsigned char __user *ssp;
 	unsigned short ip, sp, orig_flags;
 	int data32, pref_done;
+	struct vm86plus_info_struct *vmpi = &current->thread.vm86->vm86plus;
 
 #define CHECK_IF_IN_TRAP \
-	if (VMPI.vm86dbg_active && VMPI.vm86dbg_TFpendig) \
+	if (vmpi->vm86dbg_active && vmpi->vm86dbg_TFpendig) \
 		newflags |= X86_EFLAGS_TF
-#define VM86_FAULT_RETURN do { \
-	if (VMPI.force_return_for_pic  && (VEFLAGS & (X86_EFLAGS_IF | X86_EFLAGS_VIF))) \
-		return_to_32bit(regs, VM86_PICRETURN); \
-	if (orig_flags & X86_EFLAGS_TF) \
-		handle_vm86_trap(regs, 0, 1); \
-	return; } while (0)
 
 	orig_flags = *(unsigned short *)&regs->pt.flags;
 
@@ -622,7 +598,7 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
 			SP(regs) -= 2;
 		}
 		IP(regs) = ip;
-		VM86_FAULT_RETURN;
+		goto vm86_fault_return;
 
 	/* popf */
 	case 0x9d:
@@ -642,16 +618,18 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
 		else
 			set_vflags_short(newflags, regs);
 
-		VM86_FAULT_RETURN;
+		goto check_vip;
 		}
 
 	/* int xx */
 	case 0xcd: {
 		int intno = popb(csp, ip, simulate_sigsegv);
 		IP(regs) = ip;
-		if (VMPI.vm86dbg_active) {
-			if ((1 << (intno & 7)) & VMPI.vm86dbg_intxxtab[intno >> 3])
-				return_to_32bit(regs, VM86_INTx + (intno << 8));
+		if (vmpi->vm86dbg_active) {
+			if ((1 << (intno & 7)) & vmpi->vm86dbg_intxxtab[intno >> 3]) {
+				save_v86_state(regs, VM86_INTx + (intno << 8));
+				return;
+			}
 		}
 		do_int(regs, intno, ssp, sp);
 		return;
@@ -682,14 +660,14 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
 		} else {
 			set_vflags_short(newflags, regs);
 		}
-		VM86_FAULT_RETURN;
+		goto check_vip;
 		}
 
 	/* cli */
 	case 0xfa:
 		IP(regs) = ip;
 		clear_IF(regs);
-		VM86_FAULT_RETURN;
+		goto vm86_fault_return;
 
 	/* sti */
 	/*
@@ -701,14 +679,29 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
 	case 0xfb:
 		IP(regs) = ip;
 		set_IF(regs);
-		VM86_FAULT_RETURN;
+		goto check_vip;
 
 	default:
-		return_to_32bit(regs, VM86_UNKNOWN);
+		save_v86_state(regs, VM86_UNKNOWN);
 	}
 
 	return;
 
+check_vip:
+	if (VEFLAGS & X86_EFLAGS_VIP) {
+		save_v86_state(regs, VM86_STI);
+		return;
+	}
+
+vm86_fault_return:
+	if (vmpi->force_return_for_pic  && (VEFLAGS & (X86_EFLAGS_IF | X86_EFLAGS_VIF))) {
+		save_v86_state(regs, VM86_PICRETURN);
+		return;
+	}
+	if (orig_flags & X86_EFLAGS_TF)
+		handle_vm86_trap(regs, 0, X86_TRAP_DB);
+	return;
+
 simulate_sigsegv:
 	/* FIXME: After a long discussion with Stas we finally
 	 *        agreed, that this is wrong. Here we should
@@ -720,7 +713,7 @@ simulate_sigsegv:
 	 *        should be a mixture of the two, but how do we
 	 *        get the information? [KD]
 	 */
-	return_to_32bit(regs, VM86_UNKNOWN);
+	save_v86_state(regs, VM86_UNKNOWN);
 }
 
 /* ---------------- vm86 special IRQ passing stuff ----------------- */
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 00bf300fd846..74e4bf11f562 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -364,7 +364,7 @@ INIT_PER_CPU(irq_stack_union);
 
 #endif /* CONFIG_X86_32 */
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 #include <asm/kexec.h>
 
 . = ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index e7a4fde5d631..b372a7557c16 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -650,6 +650,7 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt,
 	u16 sel;
 
 	la = seg_base(ctxt, addr.seg) + addr.ea;
+	*linear = la;
 	*max_size = 0;
 	switch (mode) {
 	case X86EMUL_MODE_PROT64:
@@ -693,7 +694,6 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt,
 	}
 	if (insn_aligned(ctxt, size) && ((la & (size - 1)) != 0))
 		return emulate_gp(ctxt, 0);
-	*linear = la;
 	return X86EMUL_CONTINUE;
 bad:
 	if (addr.seg == VCPU_SREG_SS)
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 9a3e342e3cda..8d9013c5e1ee 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1172,7 +1172,7 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu)
 
 	tsc_deadline = apic->lapic_timer.expired_tscdeadline;
 	apic->lapic_timer.expired_tscdeadline = 0;
-	guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
+	guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, rdtsc());
 	trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline);
 
 	/* __delay is delay_tsc whenever the hardware has TSC, thus always.  */
@@ -1240,7 +1240,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
 		local_irq_save(flags);
 
 		now = apic->lapic_timer.timer.base->get_time();
-		guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
+		guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, rdtsc());
 		if (likely(tscdeadline > guest_tsc)) {
 			ns = (tscdeadline - guest_tsc) * 1000000ULL;
 			do_div(ns, this_tsc_khz);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index fb16a8ea3dee..69088a1ba509 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3309,13 +3309,14 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
 
 	walk_shadow_page_lockless_begin(vcpu);
 
-	for (shadow_walk_init(&iterator, vcpu, addr), root = iterator.level;
+	for (shadow_walk_init(&iterator, vcpu, addr),
+		 leaf = root = iterator.level;
 	     shadow_walk_okay(&iterator);
 	     __shadow_walk_next(&iterator, spte)) {
-		leaf = iterator.level;
 		spte = mmu_spte_get_lockless(iterator.sptep);
 
 		sptes[leaf - 1] = spte;
+		leaf--;
 
 		if (!is_shadow_present_pte(spte))
 			break;
@@ -3329,7 +3330,7 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
 	if (reserved) {
 		pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n",
 		       __func__, addr);
-		while (root >= leaf) {
+		while (root > leaf) {
 			pr_err("------ spte 0x%llx level %d.\n",
 			       sptes[root - 1], root);
 			root--;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 74d825716f4f..fdb8cb63a6c0 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1139,7 +1139,7 @@ static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
 {
 	u64 tsc;
 
-	tsc = svm_scale_tsc(vcpu, native_read_tsc());
+	tsc = svm_scale_tsc(vcpu, rdtsc());
 
 	return target_tsc - tsc;
 }
@@ -3174,7 +3174,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	switch (msr_info->index) {
 	case MSR_IA32_TSC: {
 		msr_info->data = svm->vmcb->control.tsc_offset +
-			svm_scale_tsc(vcpu, native_read_tsc());
+			svm_scale_tsc(vcpu, rdtsc());
 
 		break;
 	}
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index da1590ea43fc..d01986832afc 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1264,7 +1264,7 @@ static void vmcs_load(struct vmcs *vmcs)
 		       vmcs, phys_addr);
 }
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 /*
  * This bitmap is used to indicate whether the vmclear
  * operation is enabled on all cpus. All disabled by
@@ -1302,7 +1302,7 @@ static void crash_vmclear_local_loaded_vmcss(void)
 #else
 static inline void crash_enable_local_vmclear(int cpu) { }
 static inline void crash_disable_local_vmclear(int cpu) { }
-#endif /* CONFIG_KEXEC */
+#endif /* CONFIG_KEXEC_CORE */
 
 static void __loaded_vmcs_clear(void *arg)
 {
@@ -2236,7 +2236,7 @@ static u64 guest_read_tsc(void)
 {
 	u64 host_tsc, tsc_offset;
 
-	rdtscll(host_tsc);
+	host_tsc = rdtsc();
 	tsc_offset = vmcs_read64(TSC_OFFSET);
 	return host_tsc + tsc_offset;
 }
@@ -2317,7 +2317,7 @@ static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool ho
 
 static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
 {
-	return target_tsc - native_read_tsc();
+	return target_tsc - rdtsc();
 }
 
 static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu)
@@ -3150,7 +3150,7 @@ static struct vmcs *alloc_vmcs_cpu(int cpu)
 	struct page *pages;
 	struct vmcs *vmcs;
 
-	pages = alloc_pages_exact_node(node, GFP_KERNEL, vmcs_config.order);
+	pages = __alloc_pages_node(node, GFP_KERNEL, vmcs_config.order);
 	if (!pages)
 		return NULL;
 	vmcs = page_address(pages);
@@ -10411,7 +10411,7 @@ static int __init vmx_init(void)
 	if (r)
 		return r;
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 	rcu_assign_pointer(crash_vmclear_loaded_vmcss,
 			   crash_vmclear_local_loaded_vmcss);
 #endif
@@ -10421,7 +10421,7 @@ static int __init vmx_init(void)
 
 static void __exit vmx_exit(void)
 {
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 	RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
 	synchronize_rcu();
 #endif
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4bbc2a1676c9..a60bdbccff51 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1441,20 +1441,8 @@ EXPORT_SYMBOL_GPL(kvm_write_tsc);
 
 static cycle_t read_tsc(void)
 {
-	cycle_t ret;
-	u64 last;
-
-	/*
-	 * Empirically, a fence (of type that depends on the CPU)
-	 * before rdtsc is enough to ensure that rdtsc is ordered
-	 * with respect to loads.  The various CPU manuals are unclear
-	 * as to whether rdtsc can be reordered with later loads,
-	 * but no one has ever seen it happen.
-	 */
-	rdtsc_barrier();
-	ret = (cycle_t)vget_cycles();
-
-	last = pvclock_gtod_data.clock.cycle_last;
+	cycle_t ret = (cycle_t)rdtsc_ordered();
+	u64 last = pvclock_gtod_data.clock.cycle_last;
 
 	if (likely(ret >= last))
 		return ret;
@@ -1643,7 +1631,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 		return 1;
 	}
 	if (!use_master_clock) {
-		host_tsc = native_read_tsc();
+		host_tsc = rdtsc();
 		kernel_ns = get_kernel_ns();
 	}
 
@@ -2620,7 +2608,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
 	if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
 		s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
-				native_read_tsc() - vcpu->arch.last_host_tsc;
+				rdtsc() - vcpu->arch.last_host_tsc;
 		if (tsc_delta < 0)
 			mark_tsc_unstable("KVM discovered backwards TSC");
 		if (check_tsc_unstable()) {
@@ -2648,7 +2636,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
 	kvm_x86_ops->vcpu_put(vcpu);
 	kvm_put_guest_fpu(vcpu);
-	vcpu->arch.last_host_tsc = native_read_tsc();
+	vcpu->arch.last_host_tsc = rdtsc();
 }
 
 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
@@ -5955,6 +5943,7 @@ static void process_smi_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n)
 	put_smstate(u32, buf, offset, process_smi_get_segment_flags(&seg));
 }
 
+#ifdef CONFIG_X86_64
 static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
 {
 	struct kvm_segment seg;
@@ -5970,6 +5959,7 @@ static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
 	put_smstate(u32, buf, offset + 4, seg.limit);
 	put_smstate(u64, buf, offset + 8, seg.base);
 }
+#endif
 
 static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf)
 {
@@ -6387,7 +6377,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		hw_breakpoint_restore();
 
 	vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu,
-							   native_read_tsc());
+							   rdtsc());
 
 	vcpu->mode = OUTSIDE_GUEST_MODE;
 	smp_wmb();
@@ -7196,7 +7186,7 @@ int kvm_arch_hardware_enable(void)
 	if (ret != 0)
 		return ret;
 
-	local_tsc = native_read_tsc();
+	local_tsc = rdtsc();
 	stable = !check_tsc_unstable();
 	list_for_each_entry(kvm, &vm_list, vm_list) {
 		kvm_for_each_vcpu(i, vcpu, kvm) {
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index f2dc08c003eb..161804de124a 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -835,16 +835,46 @@ static struct irq_chip lguest_irq_controller = {
 	.irq_unmask	= enable_lguest_irq,
 };
 
+/*
+ * Interrupt descriptors are allocated as-needed, but low-numbered ones are
+ * reserved by the generic x86 code.  So we ignore irq_alloc_desc_at if it
+ * tells us the irq is already used: other errors (ie. ENOMEM) we take
+ * seriously.
+ */
+static int lguest_setup_irq(unsigned int irq)
+{
+	int rc;
+
+	/* -ve means error (we tolerate -EEXIST), else a vector number. */
+	rc = irq_alloc_desc_at(irq, 0);
+	if (rc < 0 && rc != -EEXIST)
+		return rc;
+
+	/*
+	 * Register the interrupt with the Linux infrastructure as driven
+	 * by our level-based lguest interrupt controller.
+	 */
+	irq_set_chip_and_handler_name(irq, &lguest_irq_controller,
+				      handle_level_irq, "level");
+
+	/* Some systems map "vectors" to interrupts weirdly.  Not us! */
+	__this_cpu_write(vector_irq[FIRST_EXTERNAL_VECTOR + irq],
+			 irq_to_desc(irq));
+
+	return 0;
+}
+
 static int lguest_enable_irq(struct pci_dev *dev)
 {
+	int err;
 	u8 line = 0;
 
 	/* We literally use the PCI interrupt line as the irq number. */
 	pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &line);
-	irq_set_chip_and_handler_name(line, &lguest_irq_controller,
-				      handle_level_irq, "level");
-	dev->irq = line;
-	return 0;
+	err = lguest_setup_irq(line);
+	if (!err)
+		dev->irq = line;
+	return err;
 }
 
 /* We don't do hotplug PCI, so this shouldn't be called. */
@@ -855,17 +885,13 @@ static void lguest_disable_irq(struct pci_dev *dev)
 
 /*
  * This sets up the Interrupt Descriptor Table (IDT) entry for each hardware
- * interrupt (except 128, which is used for system calls), and then tells the
- * Linux infrastructure that each interrupt is controlled by our level-based
- * lguest interrupt controller.
+ * interrupt (except 128, which is used for system calls).
  */
 static void __init lguest_init_IRQ(void)
 {
 	unsigned int i;
 
 	for (i = FIRST_EXTERNAL_VECTOR; i < FIRST_SYSTEM_VECTOR; i++) {
-		/* Some systems map "vectors" to interrupts weirdly.  Not us! */
-		__this_cpu_write(vector_irq[i], i - FIRST_EXTERNAL_VECTOR);
 		if (i != IA32_SYSCALL_VECTOR)
 			set_intr_gate(i, irq_entries_start +
 					8 * (i - FIRST_EXTERNAL_VECTOR));
@@ -879,26 +905,6 @@ static void __init lguest_init_IRQ(void)
 }
 
 /*
- * Interrupt descriptors are allocated as-needed, but low-numbered ones are
- * reserved by the generic x86 code.  So we ignore irq_alloc_desc_at if it
- * tells us the irq is already used: other errors (ie. ENOMEM) we take
- * seriously.
- */
-int lguest_setup_irq(unsigned int irq)
-{
-	int err;
-
-	/* Returns -ve error or vector number. */
-	err = irq_alloc_desc_at(irq, 0);
-	if (err < 0 && err != -EEXIST)
-		return err;
-
-	irq_set_chip_and_handler_name(irq, &lguest_irq_controller,
-				      handle_level_irq, "level");
-	return 0;
-}
-
-/*
  * Time.
  *
  * It would be far better for everyone if the Guest had its own clock, but
@@ -985,23 +991,11 @@ static int lguest_clockevent_set_next_event(unsigned long delta,
 	return 0;
 }
 
-static void lguest_clockevent_set_mode(enum clock_event_mode mode,
-                                      struct clock_event_device *evt)
+static int lguest_clockevent_shutdown(struct clock_event_device *evt)
 {
-	switch (mode) {
-	case CLOCK_EVT_MODE_UNUSED:
-	case CLOCK_EVT_MODE_SHUTDOWN:
-		/* A 0 argument shuts the clock down. */
-		hcall(LHCALL_SET_CLOCKEVENT, 0, 0, 0, 0);
-		break;
-	case CLOCK_EVT_MODE_ONESHOT:
-		/* This is what we expect. */
-		break;
-	case CLOCK_EVT_MODE_PERIODIC:
-		BUG();
-	case CLOCK_EVT_MODE_RESUME:
-		break;
-	}
+	/* A 0 argument shuts the clock down. */
+	hcall(LHCALL_SET_CLOCKEVENT, 0, 0, 0, 0);
+	return 0;
 }
 
 /* This describes our primitive timer chip. */
@@ -1009,7 +1003,7 @@ static struct clock_event_device lguest_clockevent = {
 	.name                   = "lguest",
 	.features               = CLOCK_EVT_FEAT_ONESHOT,
 	.set_next_event         = lguest_clockevent_set_next_event,
-	.set_mode               = lguest_clockevent_set_mode,
+	.set_state_shutdown	= lguest_clockevent_shutdown,
 	.rating                 = INT_MAX,
 	.mult                   = 1,
 	.shift                  = 0,
@@ -1040,7 +1034,8 @@ static void lguest_time_irq(unsigned int irq, struct irq_desc *desc)
 static void lguest_time_init(void)
 {
 	/* Set up the timer interrupt (0) to go to our simple timer routine */
-	lguest_setup_irq(0);
+	if (lguest_setup_irq(0) != 0)
+		panic("Could not set up timer irq");
 	irq_set_handler(0, lguest_time_irq);
 
 	clocksource_register_hz(&lguest_clock, NSEC_PER_SEC);
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index 39d6a3db0b96..e912b2f6d36e 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -20,6 +20,7 @@
 #include <asm/processor.h>
 #include <asm/delay.h>
 #include <asm/timer.h>
+#include <asm/mwait.h>
 
 #ifdef CONFIG_SMP
 # include <asm/smp.h>
@@ -49,16 +50,14 @@ static void delay_loop(unsigned long loops)
 /* TSC based delay: */
 static void delay_tsc(unsigned long __loops)
 {
-	u32 bclock, now, loops = __loops;
+	u64 bclock, now, loops = __loops;
 	int cpu;
 
 	preempt_disable();
 	cpu = smp_processor_id();
-	rdtsc_barrier();
-	rdtscl(bclock);
+	bclock = rdtsc_ordered();
 	for (;;) {
-		rdtsc_barrier();
-		rdtscl(now);
+		now = rdtsc_ordered();
 		if ((now - bclock) >= loops)
 			break;
 
@@ -79,14 +78,51 @@ static void delay_tsc(unsigned long __loops)
 		if (unlikely(cpu != smp_processor_id())) {
 			loops -= (now - bclock);
 			cpu = smp_processor_id();
-			rdtsc_barrier();
-			rdtscl(bclock);
+			bclock = rdtsc_ordered();
 		}
 	}
 	preempt_enable();
 }
 
 /*
+ * On some AMD platforms, MWAITX has a configurable 32-bit timer that
+ * counts at the TSC frequency. The input value is the number of loops
+ * for the timer to count; MWAITX exits when the timer expires.
+ */
+static void delay_mwaitx(unsigned long __loops)
+{
+	u64 start, end, delay, loops = __loops;
+
+	/* A timer value of 0 would make MWAITX wait indefinitely. */
+	if (!loops)
+		return;
+
+	start = rdtsc_ordered();
+
+	for (;;) {
+		delay = min_t(u64, MWAITX_MAX_LOOPS, loops);
+
+		/*
+		 * Use cpu_tss as a cacheline-aligned, seldom accessed per-cpu
+		 * monitor target; raw_cpu_ptr() as we may be preemptible here.
+		 */
+		__monitorx(raw_cpu_ptr(&cpu_tss), 0, 0);
+
+		/*
+		 * AMD, like Intel, supports the EAX hint; EAX=0xf means do not
+		 * enter any deep C-state, which minimizes wakeup latency.
+		 */
+		__mwaitx(MWAITX_DISABLE_CSTATES, delay, MWAITX_ECX_TIMER_ENABLE);
+
+		end = rdtsc_ordered();
+		if (loops <= end - start)
+			break;
+		loops -= end - start;
+		start = end;
+	}
+}
+
+/*
  * Since we calibrate only once at boot, this
  * function should be set once at boot and not changed
  */
@@ -94,13 +130,19 @@ static void (*delay_fn)(unsigned long) = delay_loop;
 
 void use_tsc_delay(void)
 {
-	delay_fn = delay_tsc;
+	if (delay_fn == delay_loop)
+		delay_fn = delay_tsc;
+}
+
+void use_mwaitx_delay(void)
+{
+	delay_fn = delay_mwaitx;
 }
 
 int read_current_timer(unsigned long *timer_val)
 {
 	if (delay_fn == delay_tsc) {
-		rdtscll(*timer_val);
+		*timer_val = rdtsc();
 		return 0;
 	}
 	return -1;
diff --git a/arch/x86/math-emu/get_address.c b/arch/x86/math-emu/get_address.c
index 8300db71c2a6..8db26591d91a 100644
--- a/arch/x86/math-emu/get_address.c
+++ b/arch/x86/math-emu/get_address.c
@@ -20,6 +20,7 @@
 #include <linux/stddef.h>
 
 #include <asm/uaccess.h>
+#include <asm/vm86.h>
 
 #include "fpu_system.h"
 #include "exception.h"
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 9dc909841739..eef44d9a3f77 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -20,6 +20,7 @@
 #include <asm/kmemcheck.h>		/* kmemcheck_*(), ...		*/
 #include <asm/fixmap.h>			/* VSYSCALL_ADDR		*/
 #include <asm/vsyscall.h>		/* emulate_vsyscall		*/
+#include <asm/vm86.h>			/* struct vm86			*/
 
 #define CREATE_TRACE_POINTS
 #include <asm/trace/exceptions.h>
@@ -301,14 +302,16 @@ static inline void
 check_v8086_mode(struct pt_regs *regs, unsigned long address,
 		 struct task_struct *tsk)
 {
+#ifdef CONFIG_VM86
 	unsigned long bit;
 
-	if (!v8086_mode(regs))
+	if (!v8086_mode(regs) || !tsk->thread.vm86)
 		return;
 
 	bit = (address - 0xA0000) >> PAGE_SHIFT;
 	if (bit < 32)
-		tsk->thread.screen_bitmap |= 1 << bit;
+		tsk->thread.vm86->screen_bitmap |= 1 << bit;
+#endif
 }
 
 static bool low_pfn(unsigned long pfn)
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 8533b46e6bee..1d8a83df153a 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -30,8 +30,11 @@
 /*
  * Tables translating between page_cache_type_t and pte encoding.
  *
- * Minimal supported modes are defined statically, they are modified
- * during bootup if more supported cache modes are available.
+ * The default values are defined statically as minimal supported mode;
+ * WC and WT fall back to UC-.  pat_init() updates these values to support
+ * more cache modes, WC and WT, when it is safe to do so.  See pat_init()
+ * for the details.  Note, __early_ioremap() used during early boot-time
+ * takes pgprot_t (pte encoding) and does not use these tables.
  *
  *   Index into __cachemode2pte_tbl[] is the cachemode.
  *
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 8340e45c891a..7562f42914b4 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -137,6 +137,7 @@ page_table_range_init_count(unsigned long start, unsigned long end)
 
 	vaddr = start;
 	pgd_idx = pgd_index(vaddr);
+	pmd_idx = pmd_index(vaddr);
 
 	for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd_idx++) {
 		for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
@@ -822,11 +823,11 @@ void __init mem_init(void)
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
-int arch_add_memory(int nid, u64 start, u64 size)
+int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
 {
 	struct pglist_data *pgdata = NODE_DATA(nid);
 	struct zone *zone = pgdata->node_zones +
-		zone_for_memory(nid, start, size, ZONE_HIGHMEM);
+		zone_for_memory(nid, start, size, ZONE_HIGHMEM, for_device);
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
 
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 3fba623e3ba5..30564e2752d3 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -687,11 +687,11 @@ static void  update_end_of_memory_vars(u64 start, u64 size)
  * Memory is added always to NORMAL zone. This means you will never get
  * additional DMA/DMA32 memory.
  */
-int arch_add_memory(int nid, u64 start, u64 size)
+int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
 {
 	struct pglist_data *pgdat = NODE_DATA(nid);
 	struct zone *zone = pgdat->node_zones +
-		zone_for_memory(nid, start, size, ZONE_NORMAL);
+		zone_for_memory(nid, start, size, ZONE_NORMAL, for_device);
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
 	int ret;
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
index db1b0bc5017c..134948b0926f 100644
--- a/arch/x86/mm/mpx.c
+++ b/arch/x86/mm/mpx.c
@@ -42,58 +42,21 @@ static inline unsigned long mpx_bt_size_bytes(struct mm_struct *mm)
  */
 static unsigned long mpx_mmap(unsigned long len)
 {
-	unsigned long ret;
-	unsigned long addr, pgoff;
 	struct mm_struct *mm = current->mm;
-	vm_flags_t vm_flags;
-	struct vm_area_struct *vma;
+	unsigned long addr, populate;
 
 	/* Only bounds table can be allocated here */
 	if (len != mpx_bt_size_bytes(mm))
 		return -EINVAL;
 
 	down_write(&mm->mmap_sem);
-
-	/* Too many mappings? */
-	if (mm->map_count > sysctl_max_map_count) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	/* Obtain the address to map to. we verify (or select) it and ensure
-	 * that it represents a valid section of the address space.
-	 */
-	addr = get_unmapped_area(NULL, 0, len, 0, MAP_ANONYMOUS | MAP_PRIVATE);
-	if (addr & ~PAGE_MASK) {
-		ret = addr;
-		goto out;
-	}
-
-	vm_flags = VM_READ | VM_WRITE | VM_MPX |
-			mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
-
-	/* Set pgoff according to addr for anon_vma */
-	pgoff = addr >> PAGE_SHIFT;
-
-	ret = mmap_region(NULL, addr, len, vm_flags, pgoff);
-	if (IS_ERR_VALUE(ret))
-		goto out;
-
-	vma = find_vma(mm, ret);
-	if (!vma) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	if (vm_flags & VM_LOCKED) {
-		up_write(&mm->mmap_sem);
-		mm_populate(ret, len);
-		return ret;
-	}
-
-out:
+	addr = do_mmap(NULL, 0, len, PROT_READ | PROT_WRITE,
+			MAP_ANONYMOUS | MAP_PRIVATE, VM_MPX, 0, &populate);
 	up_write(&mm->mmap_sem);
-	return ret;
+	if (populate)
+		mm_populate(addr, populate);
+
+	return addr;
 }
 
 enum reg_type {
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 4053bb58bf92..c3b3f653ed0c 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -246,8 +246,10 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
 		bi->start = max(bi->start, low);
 		bi->end = min(bi->end, high);
 
-		/* and there's no empty block */
-		if (bi->start >= bi->end)
+		/* and there's no empty or non-existent block */
+		if (bi->start >= bi->end ||
+		    !memblock_overlaps_region(&memblock.memory,
+			bi->start, bi->end - bi->start))
 			numa_remove_memblk_from(i--, mi);
 	}
 
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index 8ff686aa7e8c..5f169d5d76a8 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -8,6 +8,7 @@
 #include <linux/kthread.h>
 #include <linux/random.h>
 #include <linux/kernel.h>
+#include <linux/init.h>
 #include <linux/mm.h>
 #include <linux/vmalloc.h>
 
@@ -256,5 +257,4 @@ static int start_pageattr_test(void)
 
 	return 0;
 }
-
-module_init(start_pageattr_test);
+device_initcall(start_pageattr_test);
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 727158cb3b3c..2c44c0792301 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -4,7 +4,6 @@
  */
 #include <linux/highmem.h>
 #include <linux/bootmem.h>
-#include <linux/module.h>
 #include <linux/sched.h>
 #include <linux/mm.h>
 #include <linux/interrupt.h>
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 90b924acd982..8ddb5d0d66fb 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -140,6 +140,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
 	info.flush_end = end;
 
 	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
+	trace_tlb_flush(TLB_REMOTE_SEND_IPI, end - start);
 	if (is_uv_system()) {
 		unsigned int cpu;
 
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index be2e7a2b10d7..70efcd0940f9 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -246,7 +246,7 @@ static void emit_prologue(u8 **pprog)
  *     goto out;
  *   if (++tail_call_cnt > MAX_TAIL_CALL_CNT)
  *     goto out;
- *   prog = array->prog[index];
+ *   prog = array->ptrs[index];
  *   if (prog == NULL)
  *     goto out;
  *   goto *(prog->bpf_func + prologue_size);
@@ -284,9 +284,9 @@ static void emit_bpf_tail_call(u8 **pprog)
 	EMIT3(0x83, 0xC0, 0x01);                  /* add eax, 1 */
 	EMIT2_off32(0x89, 0x85, -STACKSIZE + 36); /* mov dword ptr [rbp - 516], eax */
 
-	/* prog = array->prog[index]; */
+	/* prog = array->ptrs[index]; */
 	EMIT4_off32(0x48, 0x8D, 0x84, 0xD6,       /* lea rax, [rsi + rdx * 8 + offsetof(...)] */
-		    offsetof(struct bpf_array, prog));
+		    offsetof(struct bpf_array, ptrs));
 	EMIT3(0x48, 0x8B, 0x00);                  /* mov rax, qword ptr [rax] */
 
 	/* if (prog == NULL)
@@ -315,6 +315,26 @@ static void emit_bpf_tail_call(u8 **pprog)
 	*pprog = prog;
 }
 
+
+static void emit_load_skb_data_hlen(u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+
+	/* r9d = skb->len - skb->data_len (headlen), r10 = skb->data.
+	 * off32 forms only: the helper-call path assumes a fixed length.
+	 */
+	/* mov %r9d, off32(%rdi) */
+	EMIT3_off32(0x44, 0x8b, 0x8f, offsetof(struct sk_buff, len));
+
+	/* sub %r9d, off32(%rdi) */
+	EMIT3_off32(0x44, 0x2b, 0x8f, offsetof(struct sk_buff, data_len));
+
+	/* mov %r10, off32(%rdi) */
+	EMIT3_off32(0x4c, 0x8b, 0x97, offsetof(struct sk_buff, data));
+	*pprog = prog;
+}
+
 static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 		  int oldproglen, struct jit_context *ctx)
 {
@@ -329,36 +349,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 
 	emit_prologue(&prog);
 
-	if (seen_ld_abs) {
-		/* r9d : skb->len - skb->data_len (headlen)
-		 * r10 : skb->data
-		 */
-		if (is_imm8(offsetof(struct sk_buff, len)))
-			/* mov %r9d, off8(%rdi) */
-			EMIT4(0x44, 0x8b, 0x4f,
-			      offsetof(struct sk_buff, len));
-		else
-			/* mov %r9d, off32(%rdi) */
-			EMIT3_off32(0x44, 0x8b, 0x8f,
-				    offsetof(struct sk_buff, len));
-
-		if (is_imm8(offsetof(struct sk_buff, data_len)))
-			/* sub %r9d, off8(%rdi) */
-			EMIT4(0x44, 0x2b, 0x4f,
-			      offsetof(struct sk_buff, data_len));
-		else
-			EMIT3_off32(0x44, 0x2b, 0x8f,
-				    offsetof(struct sk_buff, data_len));
-
-		if (is_imm8(offsetof(struct sk_buff, data)))
-			/* mov %r10, off8(%rdi) */
-			EMIT4(0x4c, 0x8b, 0x57,
-			      offsetof(struct sk_buff, data));
-		else
-			/* mov %r10, off32(%rdi) */
-			EMIT3_off32(0x4c, 0x8b, 0x97,
-				    offsetof(struct sk_buff, data));
-	}
+	if (seen_ld_abs)
+		emit_load_skb_data_hlen(&prog);
 
 	for (i = 0; i < insn_cnt; i++, insn++) {
 		const s32 imm32 = insn->imm;
@@ -367,6 +359,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 		u8 b1 = 0, b2 = 0, b3 = 0;
 		s64 jmp_offset;
 		u8 jmp_cond;
+		bool reload_skb_data;
 		int ilen;
 		u8 *func;
 
@@ -818,12 +811,18 @@ xadd:			if (is_imm8(insn->off))
 			func = (u8 *) __bpf_call_base + imm32;
 			jmp_offset = func - (image + addrs[i]);
 			if (seen_ld_abs) {
-				EMIT2(0x41, 0x52); /* push %r10 */
-				EMIT2(0x41, 0x51); /* push %r9 */
-				/* need to adjust jmp offset, since
-				 * pop %r9, pop %r10 take 4 bytes after call insn
-				 */
-				jmp_offset += 4;
+				reload_skb_data = bpf_helper_changes_skb_data(func);
+				if (reload_skb_data) {
+					EMIT1(0x57); /* push %rdi */
+					jmp_offset += 22; /* pop, mov, sub, mov */
+				} else {
+					EMIT2(0x41, 0x52); /* push %r10 */
+					EMIT2(0x41, 0x51); /* push %r9 */
+					/* need to adjust jmp offset, since
+					 * pop %r9, pop %r10 take 4 bytes after call insn
+					 */
+					jmp_offset += 4;
+				}
 			}
 			if (!imm32 || !is_simm32(jmp_offset)) {
 				pr_err("unsupported bpf func %d addr %p image %p\n",
@@ -832,8 +831,13 @@ xadd:			if (is_imm8(insn->off))
 			}
 			EMIT1_off32(0xE8, jmp_offset);
 			if (seen_ld_abs) {
-				EMIT2(0x41, 0x59); /* pop %r9 */
-				EMIT2(0x41, 0x5A); /* pop %r10 */
+				if (reload_skb_data) {
+					EMIT1(0x5F); /* pop %rdi */
+					emit_load_skb_data_hlen(&prog);
+				} else {
+					EMIT2(0x41, 0x59); /* pop %r9 */
+					EMIT2(0x41, 0x5A); /* pop %r10 */
+				}
 			}
 			break;
 
@@ -1099,7 +1103,7 @@ void bpf_int_jit_compile(struct bpf_prog *prog)
 	}
 
 	if (bpf_jit_enable > 1)
-		bpf_jit_dump(prog->len, proglen, 0, image);
+		bpf_jit_dump(prog->len, proglen, pass + 1, image);
 
 	if (image) {
 		bpf_flush_icache(header, image + proglen);
diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c
index 22aaefb4f1ca..0d24e7c10145 100644
--- a/arch/x86/pci/intel_mid_pci.c
+++ b/arch/x86/pci/intel_mid_pci.c
@@ -35,6 +35,9 @@
 
 #define PCIE_CAP_OFFSET	0x100
 
+/* Quirks for the listed devices */
+#define PCI_DEVICE_ID_INTEL_MRFL_MMC	0x1190
+
 /* Fixed BAR fields */
 #define PCIE_VNDR_CAP_ID_FIXED_BAR 0x00	/* Fixed BAR (TBD) */
 #define PCI_FIXED_BAR_0_SIZE	0x04
@@ -210,22 +213,41 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev)
 {
 	struct irq_alloc_info info;
 	int polarity;
+	int ret;
 
 	if (pci_has_managed_irq(dev))
 		return 0;
 
-	if (intel_mid_identify_cpu() == INTEL_MID_CPU_CHIP_TANGIER)
-		polarity = 0; /* active high */
-	else
-		polarity = 1; /* active low */
+	switch (intel_mid_identify_cpu()) {
+	case INTEL_MID_CPU_CHIP_TANGIER:
+		polarity = IOAPIC_POL_HIGH;
+
+		/* Special treatment for IRQ0 */
+		if (dev->irq == 0) {
+			/*
+			 * TNG has IRQ0 assigned to eMMC controller. But there
+			 * are also other devices with bogus PCI configuration
+			 * that have IRQ0 assigned. This check ensures that
+			 * eMMC gets it.
+			 */
+			if (dev->device != PCI_DEVICE_ID_INTEL_MRFL_MMC)
+				return -EBUSY;
+		}
+		break;
+	default:
+		polarity = IOAPIC_POL_LOW;
+		break;
+	}
+
 	ioapic_set_alloc_attr(&info, dev_to_node(&dev->dev), 1, polarity);
 
 	/*
 	 * MRST only have IOAPIC, the PCI irq lines are 1:1 mapped to
 	 * IOAPIC RTE entries, so we just enable RTE for the device.
 	 */
-	if (mp_map_gsi_to_irq(dev->irq, IOAPIC_MAP_ALLOC, &info) < 0)
-		return -EBUSY;
+	ret = mp_map_gsi_to_irq(dev->irq, IOAPIC_MAP_ALLOC, &info);
+	if (ret < 0)
+		return ret;
 
 	dev->irq_managed = 1;
 
@@ -244,7 +266,7 @@ static void intel_mid_pci_irq_disable(struct pci_dev *dev)
 	}
 }
 
-struct pci_ops intel_mid_pci_ops = {
+static struct pci_ops intel_mid_pci_ops = {
 	.read = pci_read,
 	.write = pci_write,
 };
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index d22f4b5bbc04..ff31ab464213 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -179,7 +179,7 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 	if (ret)
 		goto error;
 	i = 0;
-	list_for_each_entry(msidesc, &dev->msi_list, list) {
+	for_each_pci_msi_entry(msidesc, dev) {
 		irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i],
 					       (type == PCI_CAP_ID_MSI) ? nvec : 1,
 					       (type == PCI_CAP_ID_MSIX) ?
@@ -230,7 +230,7 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 	if (type == PCI_CAP_ID_MSI && nvec > 1)
 		return 1;
 
-	list_for_each_entry(msidesc, &dev->msi_list, list) {
+	for_each_pci_msi_entry(msidesc, dev) {
 		__pci_read_msi_msg(msidesc, &msg);
 		pirq = MSI_ADDR_EXT_DEST_ID(msg.address_hi) |
 			((msg.address_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xff);
@@ -274,7 +274,7 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 	int ret = 0;
 	struct msi_desc *msidesc;
 
-	list_for_each_entry(msidesc, &dev->msi_list, list) {
+	for_each_pci_msi_entry(msidesc, dev) {
 		struct physdev_map_pirq map_irq;
 		domid_t domid;
 
@@ -386,7 +386,7 @@ static void xen_teardown_msi_irqs(struct pci_dev *dev)
 {
 	struct msi_desc *msidesc;
 
-	msidesc = list_entry(dev->msi_list.next, struct msi_desc, list);
+	msidesc = first_pci_msi_entry(dev);
 	if (msidesc->msi_attrib.is_msix)
 		xen_pci_frontend_disable_msix(dev);
 	else
diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile
index f1a6c8e86ddd..184842ef332e 100644
--- a/arch/x86/platform/Makefile
+++ b/arch/x86/platform/Makefile
@@ -5,6 +5,7 @@ obj-y	+= efi/
 obj-y	+= geode/
 obj-y	+= goldfish/
 obj-y	+= iris/
+obj-y	+= intel/
 obj-y	+= intel-mid/
 obj-y	+= intel-quark/
 obj-y	+= olpc/
diff --git a/arch/x86/platform/atom/Makefile b/arch/x86/platform/atom/Makefile
index 0a3a40cbc794..40983f5b0858 100644
--- a/arch/x86/platform/atom/Makefile
+++ b/arch/x86/platform/atom/Makefile
@@ -1 +1,2 @@
-obj-$(CONFIG_PUNIT_ATOM_DEBUG) += punit_atom_debug.o
+obj-$(CONFIG_PMC_ATOM)		+= pmc_atom.o
+obj-$(CONFIG_PUNIT_ATOM_DEBUG)	+= punit_atom_debug.o
diff --git a/arch/x86/kernel/pmc_atom.c b/arch/x86/platform/atom/pmc_atom.c
index d66a4fe6caee..964ff4fc61f9 100644
--- a/arch/x86/kernel/pmc_atom.c
+++ b/arch/x86/platform/atom/pmc_atom.c
@@ -15,7 +15,6 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
-#include <linux/module.h>
 #include <linux/init.h>
 #include <linux/pci.h>
 #include <linux/device.h>
@@ -25,80 +24,149 @@
 
 #include <asm/pmc_atom.h>
 
+struct pmc_bit_map {
+	const char *name;
+	u32 bit_mask;
+};
+
+struct pmc_reg_map {
+	const struct pmc_bit_map *d3_sts_0;
+	const struct pmc_bit_map *d3_sts_1;
+	const struct pmc_bit_map *func_dis;
+	const struct pmc_bit_map *func_dis_2;
+	const struct pmc_bit_map *pss;
+};
+
 struct pmc_dev {
 	u32 base_addr;
 	void __iomem *regmap;
+	const struct pmc_reg_map *map;
 #ifdef CONFIG_DEBUG_FS
 	struct dentry *dbgfs_dir;
 #endif /* CONFIG_DEBUG_FS */
+	bool init;
 };
 
 static struct pmc_dev pmc_device;
 static u32 acpi_base_addr;
 
-struct pmc_bit_map {
-	const char *name;
-	u32 bit_mask;
+static const struct pmc_bit_map d3_sts_0_map[] = {
+	{"LPSS1_F0_DMA",	BIT_LPSS1_F0_DMA},
+	{"LPSS1_F1_PWM1",	BIT_LPSS1_F1_PWM1},
+	{"LPSS1_F2_PWM2",	BIT_LPSS1_F2_PWM2},
+	{"LPSS1_F3_HSUART1",	BIT_LPSS1_F3_HSUART1},
+	{"LPSS1_F4_HSUART2",	BIT_LPSS1_F4_HSUART2},
+	{"LPSS1_F5_SPI",	BIT_LPSS1_F5_SPI},
+	{"LPSS1_F6_Reserved",	BIT_LPSS1_F6_XXX},
+	{"LPSS1_F7_Reserved",	BIT_LPSS1_F7_XXX},
+	{"SCC_EMMC",		BIT_SCC_EMMC},
+	{"SCC_SDIO",		BIT_SCC_SDIO},
+	{"SCC_SDCARD",		BIT_SCC_SDCARD},
+	{"SCC_MIPI",		BIT_SCC_MIPI},
+	{"HDA",			BIT_HDA},
+	{"LPE",			BIT_LPE},
+	{"OTG",			BIT_OTG},
+	{"USH",			BIT_USH},
+	{"GBE",			BIT_GBE},
+	{"SATA",		BIT_SATA},
+	{"USB_EHCI",		BIT_USB_EHCI},
+	{"SEC",			BIT_SEC},
+	{"PCIE_PORT0",		BIT_PCIE_PORT0},
+	{"PCIE_PORT1",		BIT_PCIE_PORT1},
+	{"PCIE_PORT2",		BIT_PCIE_PORT2},
+	{"PCIE_PORT3",		BIT_PCIE_PORT3},
+	{"LPSS2_F0_DMA",	BIT_LPSS2_F0_DMA},
+	{"LPSS2_F1_I2C1",	BIT_LPSS2_F1_I2C1},
+	{"LPSS2_F2_I2C2",	BIT_LPSS2_F2_I2C2},
+	{"LPSS2_F3_I2C3",	BIT_LPSS2_F3_I2C3},
+	{"LPSS2_F4_I2C4",	BIT_LPSS2_F4_I2C4},
+	{"LPSS2_F5_I2C5",	BIT_LPSS2_F5_I2C5},
+	{"LPSS2_F6_I2C6",	BIT_LPSS2_F6_I2C6},
+	{"LPSS2_F7_I2C7",	BIT_LPSS2_F7_I2C7},
+	{},	/* sentinel: a NULL name ends the table walk */
+};
+
+static const struct pmc_bit_map byt_d3_sts_1_map[] = {
+	{"SMB",			BIT_SMB},
+	{"OTG_SS_PHY",		BIT_OTG_SS_PHY},
+	{"USH_SS_PHY",		BIT_USH_SS_PHY},
+	{"DFX",			BIT_DFX},
+	{},	/* sentinel: a NULL name ends the table walk */
 };
 
-static const struct pmc_bit_map dev_map[] = {
-	{"0  - LPSS1_F0_DMA",		BIT_LPSS1_F0_DMA},
-	{"1  - LPSS1_F1_PWM1",		BIT_LPSS1_F1_PWM1},
-	{"2  - LPSS1_F2_PWM2",		BIT_LPSS1_F2_PWM2},
-	{"3  - LPSS1_F3_HSUART1",	BIT_LPSS1_F3_HSUART1},
-	{"4  - LPSS1_F4_HSUART2",	BIT_LPSS1_F4_HSUART2},
-	{"5  - LPSS1_F5_SPI",		BIT_LPSS1_F5_SPI},
-	{"6  - LPSS1_F6_Reserved",	BIT_LPSS1_F6_XXX},
-	{"7  - LPSS1_F7_Reserved",	BIT_LPSS1_F7_XXX},
-	{"8  - SCC_EMMC",		BIT_SCC_EMMC},
-	{"9  - SCC_SDIO",		BIT_SCC_SDIO},
-	{"10 - SCC_SDCARD",		BIT_SCC_SDCARD},
-	{"11 - SCC_MIPI",		BIT_SCC_MIPI},
-	{"12 - HDA",			BIT_HDA},
-	{"13 - LPE",			BIT_LPE},
-	{"14 - OTG",			BIT_OTG},
-	{"15 - USH",			BIT_USH},
-	{"16 - GBE",			BIT_GBE},
-	{"17 - SATA",			BIT_SATA},
-	{"18 - USB_EHCI",		BIT_USB_EHCI},
-	{"19 - SEC",			BIT_SEC},
-	{"20 - PCIE_PORT0",		BIT_PCIE_PORT0},
-	{"21 - PCIE_PORT1",		BIT_PCIE_PORT1},
-	{"22 - PCIE_PORT2",		BIT_PCIE_PORT2},
-	{"23 - PCIE_PORT3",		BIT_PCIE_PORT3},
-	{"24 - LPSS2_F0_DMA",		BIT_LPSS2_F0_DMA},
-	{"25 - LPSS2_F1_I2C1",		BIT_LPSS2_F1_I2C1},
-	{"26 - LPSS2_F2_I2C2",		BIT_LPSS2_F2_I2C2},
-	{"27 - LPSS2_F3_I2C3",		BIT_LPSS2_F3_I2C3},
-	{"28 - LPSS2_F3_I2C4",		BIT_LPSS2_F4_I2C4},
-	{"29 - LPSS2_F5_I2C5",		BIT_LPSS2_F5_I2C5},
-	{"30 - LPSS2_F6_I2C6",		BIT_LPSS2_F6_I2C6},
-	{"31 - LPSS2_F7_I2C7",		BIT_LPSS2_F7_I2C7},
-	{"32 - SMB",			BIT_SMB},
-	{"33 - OTG_SS_PHY",		BIT_OTG_SS_PHY},
-	{"34 - USH_SS_PHY",		BIT_USH_SS_PHY},
-	{"35 - DFX",			BIT_DFX},
+static const struct pmc_bit_map cht_d3_sts_1_map[] = {
+	{"SMB",			BIT_SMB},
+	{"GMM",			BIT_STS_GMM},
+	{"ISH",			BIT_STS_ISH},
+	{},	/* sentinel: a NULL name ends the table walk */
 };
 
-static const struct pmc_bit_map pss_map[] = {
-	{"0  - GBE",			PMC_PSS_BIT_GBE},
-	{"1  - SATA",			PMC_PSS_BIT_SATA},
-	{"2  - HDA",			PMC_PSS_BIT_HDA},
-	{"3  - SEC",			PMC_PSS_BIT_SEC},
-	{"4  - PCIE",			PMC_PSS_BIT_PCIE},
-	{"5  - LPSS",			PMC_PSS_BIT_LPSS},
-	{"6  - LPE",			PMC_PSS_BIT_LPE},
-	{"7  - DFX",			PMC_PSS_BIT_DFX},
-	{"8  - USH_CTRL",		PMC_PSS_BIT_USH_CTRL},
-	{"9  - USH_SUS",		PMC_PSS_BIT_USH_SUS},
-	{"10 - USH_VCCS",		PMC_PSS_BIT_USH_VCCS},
-	{"11 - USH_VCCA",		PMC_PSS_BIT_USH_VCCA},
-	{"12 - OTG_CTRL",		PMC_PSS_BIT_OTG_CTRL},
-	{"13 - OTG_VCCS",		PMC_PSS_BIT_OTG_VCCS},
-	{"14 - OTG_VCCA_CLK",		PMC_PSS_BIT_OTG_VCCA_CLK},
-	{"15 - OTG_VCCA",		PMC_PSS_BIT_OTG_VCCA},
-	{"16 - USB",			PMC_PSS_BIT_USB},
-	{"17 - USB_SUS",		PMC_PSS_BIT_USB_SUS},
+static const struct pmc_bit_map cht_func_dis_2_map[] = {
+	{"SMB",			BIT_SMB},
+	{"GMM",			BIT_FD_GMM},
+	{"ISH",			BIT_FD_ISH},
+	{},	/* sentinel: a NULL name ends the table walk */
+};
+
+static const struct pmc_bit_map byt_pss_map[] = {
+	{"GBE",			PMC_PSS_BIT_GBE},
+	{"SATA",		PMC_PSS_BIT_SATA},
+	{"HDA",			PMC_PSS_BIT_HDA},
+	{"SEC",			PMC_PSS_BIT_SEC},
+	{"PCIE",		PMC_PSS_BIT_PCIE},
+	{"LPSS",		PMC_PSS_BIT_LPSS},
+	{"LPE",			PMC_PSS_BIT_LPE},
+	{"DFX",			PMC_PSS_BIT_DFX},
+	{"USH_CTRL",		PMC_PSS_BIT_USH_CTRL},
+	{"USH_SUS",		PMC_PSS_BIT_USH_SUS},
+	{"USH_VCCS",		PMC_PSS_BIT_USH_VCCS},
+	{"USH_VCCA",		PMC_PSS_BIT_USH_VCCA},
+	{"OTG_CTRL",		PMC_PSS_BIT_OTG_CTRL},
+	{"OTG_VCCS",		PMC_PSS_BIT_OTG_VCCS},
+	{"OTG_VCCA_CLK",	PMC_PSS_BIT_OTG_VCCA_CLK},
+	{"OTG_VCCA",		PMC_PSS_BIT_OTG_VCCA},
+	{"USB",			PMC_PSS_BIT_USB},
+	{"USB_SUS",		PMC_PSS_BIT_USB_SUS},
+	{},
+};
+
+static const struct pmc_bit_map cht_pss_map[] = {
+	{"SATA",		PMC_PSS_BIT_SATA},
+	{"HDA",			PMC_PSS_BIT_HDA},
+	{"SEC",			PMC_PSS_BIT_SEC},
+	{"PCIE",		PMC_PSS_BIT_PCIE},
+	{"LPSS",		PMC_PSS_BIT_LPSS},
+	{"LPE",			PMC_PSS_BIT_LPE},
+	{"UFS",			PMC_PSS_BIT_CHT_UFS},
+	{"UXD",			PMC_PSS_BIT_CHT_UXD},
+	{"UXD_FD",		PMC_PSS_BIT_CHT_UXD_FD},
+	{"UX_ENG",		PMC_PSS_BIT_CHT_UX_ENG},
+	{"USB_SUS",		PMC_PSS_BIT_CHT_USB_SUS},
+	{"GMM",			PMC_PSS_BIT_CHT_GMM},
+	{"ISH",			PMC_PSS_BIT_CHT_ISH},
+	{"DFX_MASTER",		PMC_PSS_BIT_CHT_DFX_MASTER},
+	{"DFX_CLUSTER1",	PMC_PSS_BIT_CHT_DFX_CLUSTER1},
+	{"DFX_CLUSTER2",	PMC_PSS_BIT_CHT_DFX_CLUSTER2},
+	{"DFX_CLUSTER3",	PMC_PSS_BIT_CHT_DFX_CLUSTER3},
+	{"DFX_CLUSTER4",	PMC_PSS_BIT_CHT_DFX_CLUSTER4},
+	{"DFX_CLUSTER5",	PMC_PSS_BIT_CHT_DFX_CLUSTER5},
+	{},
+};
+
+static const struct pmc_reg_map byt_reg_map = {
+	.d3_sts_0	= d3_sts_0_map,
+	.d3_sts_1	= byt_d3_sts_1_map,
+	.func_dis	= d3_sts_0_map,
+	.func_dis_2	= byt_d3_sts_1_map,
+	.pss		= byt_pss_map,
+};
+
+static const struct pmc_reg_map cht_reg_map = {
+	.d3_sts_0	= d3_sts_0_map,
+	.d3_sts_1	= cht_d3_sts_1_map,
+	.func_dis	= d3_sts_0_map,
+	.func_dis_2	= cht_func_dis_2_map,
+	.pss		= cht_pss_map,
 };
 
 static inline u32 pmc_reg_read(struct pmc_dev *pmc, int reg_offset)
@@ -111,6 +179,30 @@ static inline void pmc_reg_write(struct pmc_dev *pmc, int reg_offset, u32 val)
 	writel(val, pmc->regmap + reg_offset);
 }
 
+int pmc_atom_read(int offset, u32 *value)
+{
+	/* Refuse access unless the device is flagged as initialised. */
+	if (!pmc_device.init)
+		return -ENODEV;
+
+	*value = pmc_reg_read(&pmc_device, offset);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pmc_atom_read);
+
+int pmc_atom_write(int offset, u32 value)
+{
+	/* Refuse access unless the device is flagged as initialised. */
+	if (!pmc_device.init)
+		return -ENODEV;
+
+	pmc_reg_write(&pmc_device, offset, value);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pmc_atom_write);
+
 static void pmc_power_off(void)
 {
 	u16	pm1_cnt_port;
@@ -142,37 +234,39 @@ static void pmc_hw_reg_setup(struct pmc_dev *pmc)
 }
 
 #ifdef CONFIG_DEBUG_FS
+static void pmc_dev_state_print(struct seq_file *s, int reg_index,
+				u32 sts, const struct pmc_bit_map *sts_map,
+				u32 fd, const struct pmc_bit_map *fd_map)
+{
+	int base = PMC_REG_BIT_WIDTH * reg_index;
+	int i;
+
+	/* Both maps are walked in lockstep until sts_map's NULL name. */
+	for (i = 0; sts_map[i].name; i++)
+		seq_printf(s, "Dev: %-2d - %-32s\tState: %s [%s]\n",
+			base + i, sts_map[i].name,
+			(fd & fd_map[i].bit_mask) ? "Disabled" : "Enabled ",
+			(sts & sts_map[i].bit_mask) ? "D3" : "D0");
+}
+
 static int pmc_dev_state_show(struct seq_file *s, void *unused)
 {
 	struct pmc_dev *pmc = s->private;
-	u32 func_dis, func_dis_2, func_dis_index;
-	u32 d3_sts_0, d3_sts_1, d3_sts_index;
-	int dev_num, dev_index, reg_index;
+	const struct pmc_reg_map *m = pmc->map;
+	u32 func_dis, func_dis_2;
+	u32 d3_sts_0, d3_sts_1;
 
 	func_dis = pmc_reg_read(pmc, PMC_FUNC_DIS);
 	func_dis_2 = pmc_reg_read(pmc, PMC_FUNC_DIS_2);
 	d3_sts_0 = pmc_reg_read(pmc, PMC_D3_STS_0);
 	d3_sts_1 = pmc_reg_read(pmc, PMC_D3_STS_1);
 
-	dev_num = ARRAY_SIZE(dev_map);
-
-	for (dev_index = 0; dev_index < dev_num; dev_index++) {
-		reg_index = dev_index / PMC_REG_BIT_WIDTH;
-		if (reg_index) {
-			func_dis_index = func_dis_2;
-			d3_sts_index = d3_sts_1;
-		} else {
-			func_dis_index = func_dis;
-			d3_sts_index = d3_sts_0;
-		}
-
-		seq_printf(s, "Dev: %-32s\tState: %s [%s]\n",
-			dev_map[dev_index].name,
-			dev_map[dev_index].bit_mask & func_dis_index ?
-			"Disabled" : "Enabled ",
-			dev_map[dev_index].bit_mask & d3_sts_index ?
-			"D3" : "D0");
-	}
+	/* Low part */
+	pmc_dev_state_print(s, 0, d3_sts_0, m->d3_sts_0, func_dis, m->func_dis);
+
+	/* High part */
+	pmc_dev_state_print(s, 1, d3_sts_1, m->d3_sts_1, func_dis_2, m->func_dis_2);
+
 	return 0;
 }
 
@@ -191,13 +285,14 @@ static const struct file_operations pmc_dev_state_ops = {
 static int pmc_pss_state_show(struct seq_file *s, void *unused)
 {
 	struct pmc_dev *pmc = s->private;
+	const struct pmc_bit_map *map = pmc->map->pss;
 	u32 pss = pmc_reg_read(pmc, PMC_PSS);
-	int pss_index;
+	int index;
 
-	for (pss_index = 0; pss_index < ARRAY_SIZE(pss_map); pss_index++) {
-		seq_printf(s, "Island: %-32s\tState: %s\n",
-			pss_map[pss_index].name,
-			pss_map[pss_index].bit_mask & pss ? "Off" : "On");
+	for (index = 0; map[index].name; index++) {
+		seq_printf(s, "Island: %-2d - %-32s\tState: %s\n",
+			index, map[index].name,
+			map[index].bit_mask & pss ? "Off" : "On");
 	}
 	return 0;
 }
@@ -250,7 +345,7 @@ static void pmc_dbgfs_unregister(struct pmc_dev *pmc)
 	debugfs_remove_recursive(pmc->dbgfs_dir);
 }
 
-static int pmc_dbgfs_register(struct pmc_dev *pmc, struct pci_dev *pdev)
+static int pmc_dbgfs_register(struct pmc_dev *pmc)
 {
 	struct dentry *dir, *f;
 
@@ -262,24 +357,18 @@ static int pmc_dbgfs_register(struct pmc_dev *pmc, struct pci_dev *pdev)
 
 	f = debugfs_create_file("dev_state", S_IFREG | S_IRUGO,
 				dir, pmc, &pmc_dev_state_ops);
-	if (!f) {
-		dev_err(&pdev->dev, "dev_state register failed\n");
+	if (!f)
 		goto err;
-	}
 
 	f = debugfs_create_file("pss_state", S_IFREG | S_IRUGO,
 				dir, pmc, &pmc_pss_state_ops);
-	if (!f) {
-		dev_err(&pdev->dev, "pss_state register failed\n");
+	if (!f)
 		goto err;
-	}
 
 	f = debugfs_create_file("sleep_state", S_IFREG | S_IRUGO,
 				dir, pmc, &pmc_sleep_tmr_ops);
-	if (!f) {
-		dev_err(&pdev->dev, "sleep_state register failed\n");
+	if (!f)
 		goto err;
-	}
 
 	return 0;
 err:
@@ -287,15 +376,16 @@ err:
 	return -ENODEV;
 }
 #else
-static int pmc_dbgfs_register(struct pmc_dev *pmc, struct pci_dev *pdev)
+static int pmc_dbgfs_register(struct pmc_dev *pmc)
 {
 	return 0;
 }
 #endif /* CONFIG_DEBUG_FS */
 
-static int pmc_setup_dev(struct pci_dev *pdev)
+static int pmc_setup_dev(struct pci_dev *pdev, const struct pci_device_id *ent)
 {
 	struct pmc_dev *pmc = &pmc_device;
+	const struct pmc_reg_map *map = (struct pmc_reg_map *)ent->driver_data;
 	int ret;
 
 	/* Obtain ACPI base address */
@@ -315,32 +405,30 @@ static int pmc_setup_dev(struct pci_dev *pdev)
 		return -ENOMEM;
 	}
 
+	pmc->map = map;
+
 	/* PMC hardware registers setup */
 	pmc_hw_reg_setup(pmc);
 
-	ret = pmc_dbgfs_register(pmc, pdev);
-	if (ret) {
-		iounmap(pmc->regmap);
-	}
+	ret = pmc_dbgfs_register(pmc);
+	if (ret)
+		dev_warn(&pdev->dev, "debugfs register failed\n");
 
+	pmc->init = true;
 	return ret;
 }
 
 /*
  * Data for PCI driver interface
  *
- * This data only exists for exporting the supported
- * PCI ids via MODULE_DEVICE_TABLE.  We do not actually
- * register a pci_driver, because lpc_ich will register
- * a driver on the same PCI id.
+ * used by pci_match_id() call below.
  */
 static const struct pci_device_id pmc_pci_ids[] = {
-	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_VLV_PMC) },
+	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_VLV_PMC), (kernel_ulong_t)&byt_reg_map },
+	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_CHT_PMC), (kernel_ulong_t)&cht_reg_map },
 	{ 0, },
 };
 
-MODULE_DEVICE_TABLE(pci, pmc_pci_ids);
-
 static int __init pmc_atom_init(void)
 {
 	struct pci_dev *pdev = NULL;
@@ -357,15 +445,16 @@ static int __init pmc_atom_init(void)
 	for_each_pci_dev(pdev) {
 		ent = pci_match_id(pmc_pci_ids, pdev);
 		if (ent)
-			return pmc_setup_dev(pdev);
+			return pmc_setup_dev(pdev, ent);
 	}
 	/* Device not found. */
 	return -ENODEV;
 }
 
-module_init(pmc_atom_init);
-/* no module_exit, this driver shouldn't be unloaded */
+device_initcall(pmc_atom_init);
 
+/*
 MODULE_AUTHOR("Aubrey Li <aubrey.li@linux.intel.com>");
 MODULE_DESCRIPTION("Intel Atom SOC Power Management Controller Interface");
 MODULE_LICENSE("GPL v2");
+*/
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index e4308fe6afe8..1db84c0758b7 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -650,7 +650,7 @@ static void __init get_systab_virt_addr(efi_memory_desc_t *md)
 
 static void __init save_runtime_map(void)
 {
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 	efi_memory_desc_t *md;
 	void *tmp, *p, *q = NULL;
 	int count = 0;
@@ -748,7 +748,7 @@ static void * __init efi_map_regions(int *count, int *pg_shift)
 
 static void __init kexec_enter_virtual_mode(void)
 {
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 	efi_memory_desc_t *md;
 	void *p;
 
diff --git a/arch/x86/platform/intel/Makefile b/arch/x86/platform/intel/Makefile
new file mode 100644
index 000000000000..b878032fbc82
--- /dev/null
+++ b/arch/x86/platform/intel/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_IOSF_MBI)			+= iosf_mbi.o
diff --git a/arch/x86/kernel/iosf_mbi.c b/arch/x86/platform/intel/iosf_mbi.c
index 82f8d02f0df2..edf2c54bf131 100644
--- a/arch/x86/kernel/iosf_mbi.c
+++ b/arch/x86/platform/intel/iosf_mbi.c
@@ -30,7 +30,9 @@
 #define PCI_DEVICE_ID_BAYTRAIL		0x0F00
 #define PCI_DEVICE_ID_BRASWELL		0x2280
 #define PCI_DEVICE_ID_QUARK_X1000	0x0958
+#define PCI_DEVICE_ID_TANGIER		0x1170
 
+static struct pci_dev *mbi_pdev;
 static DEFINE_SPINLOCK(iosf_mbi_lock);
 
 static inline u32 iosf_mbi_form_mcr(u8 op, u8 port, u8 offset)
@@ -38,8 +40,6 @@ static inline u32 iosf_mbi_form_mcr(u8 op, u8 port, u8 offset)
 	return (op << 24) | (port << 16) | (offset << 8) | MBI_ENABLE;
 }
 
-static struct pci_dev *mbi_pdev;	/* one mbi device */
-
 static int iosf_mbi_pci_read_mdr(u32 mcrx, u32 mcr, u32 *mdr)
 {
 	int result;
@@ -104,7 +104,7 @@ int iosf_mbi_read(u8 port, u8 opcode, u32 offset, u32 *mdr)
 	unsigned long flags;
 	int ret;
 
-	/*Access to the GFX unit is handled by GPU code */
+	/* Access to the GFX unit is handled by GPU code */
 	if (port == BT_MBI_UNIT_GFX) {
 		WARN_ON(1);
 		return -EPERM;
@@ -127,7 +127,7 @@ int iosf_mbi_write(u8 port, u8 opcode, u32 offset, u32 mdr)
 	unsigned long flags;
 	int ret;
 
-	/*Access to the GFX unit is handled by GPU code */
+	/* Access to the GFX unit is handled by GPU code */
 	if (port == BT_MBI_UNIT_GFX) {
 		WARN_ON(1);
 		return -EPERM;
@@ -151,7 +151,7 @@ int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask)
 	unsigned long flags;
 	int ret;
 
-	/*Access to the GFX unit is handled by GPU code */
+	/* Access to the GFX unit is handled by GPU code */
 	if (port == BT_MBI_UNIT_GFX) {
 		WARN_ON(1);
 		return -EPERM;
@@ -240,17 +240,17 @@ static void iosf_sideband_debug_init(void)
 
 	/* mdr */
 	d = debugfs_create_x32("mdr", 0660, iosf_dbg, &dbg_mdr);
-	if (IS_ERR_OR_NULL(d))
+	if (!d)
 		goto cleanup;
 
 	/* mcrx */
-	debugfs_create_x32("mcrx", 0660, iosf_dbg, &dbg_mcrx);
-	if (IS_ERR_OR_NULL(d))
+	d = debugfs_create_x32("mcrx", 0660, iosf_dbg, &dbg_mcrx);
+	if (!d)
 		goto cleanup;
 
 	/* mcr - initiates mailbox tranaction */
-	debugfs_create_file("mcr", 0660, iosf_dbg, &dbg_mcr, &iosf_mcr_fops);
-	if (IS_ERR_OR_NULL(d))
+	d = debugfs_create_file("mcr", 0660, iosf_dbg, &dbg_mcr, &iosf_mcr_fops);
+	if (!d)
 		goto cleanup;
 
 	return;
@@ -292,6 +292,7 @@ static const struct pci_device_id iosf_mbi_pci_ids[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_BAYTRAIL) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_BRASWELL) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_QUARK_X1000) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_TANGIER) },
 	{ 0, },
 };
 MODULE_DEVICE_TABLE(pci, iosf_mbi_pci_ids);
@@ -314,10 +315,8 @@ static void __exit iosf_mbi_exit(void)
 	iosf_debugfs_remove();
 
 	pci_unregister_driver(&iosf_mbi_pci_driver);
-	if (mbi_pdev) {
-		pci_dev_put(mbi_pdev);
-		mbi_pdev = NULL;
-	}
+	pci_dev_put(mbi_pdev);
+	mbi_pdev = NULL;
 }
 
 module_init(iosf_mbi_init);
diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c
index 8570abe68be1..e1c24631afbb 100644
--- a/arch/x86/platform/uv/uv_irq.c
+++ b/arch/x86/platform/uv/uv_irq.c
@@ -89,7 +89,7 @@ static int uv_domain_alloc(struct irq_domain *domain, unsigned int virq,
 		return -EINVAL;
 
 	chip_data = kmalloc_node(sizeof(*chip_data), GFP_KERNEL,
-				 irq_data->node);
+				 irq_data_get_node(irq_data));
 	if (!chip_data)
 		return -ENOMEM;
 
diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c
index 020c101c255f..5c9f63fa6abf 100644
--- a/arch/x86/platform/uv/uv_nmi.c
+++ b/arch/x86/platform/uv/uv_nmi.c
@@ -492,7 +492,7 @@ static void uv_nmi_touch_watchdogs(void)
 	touch_nmi_watchdog();
 }
 
-#if defined(CONFIG_KEXEC)
+#if defined(CONFIG_KEXEC_CORE)
 static atomic_t uv_nmi_kexec_failed;
 static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
 {
@@ -519,13 +519,13 @@ static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
 	uv_nmi_sync_exit(0);
 }
 
-#else /* !CONFIG_KEXEC */
+#else /* !CONFIG_KEXEC_CORE */
 static inline void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
 {
 	if (master)
 		pr_err("UV: NMI kdump: KEXEC not supported in this kernel\n");
 }
-#endif /* !CONFIG_KEXEC */
+#endif /* !CONFIG_KEXEC_CORE */
 
 #ifdef CONFIG_KGDB
 #ifdef CONFIG_KGDB_KDB
diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c
index a244237f3cfa..2b158a9fa1d7 100644
--- a/arch/x86/platform/uv/uv_time.c
+++ b/arch/x86/platform/uv/uv_time.c
@@ -32,8 +32,7 @@
 
 static cycle_t uv_read_rtc(struct clocksource *cs);
 static int uv_rtc_next_event(unsigned long, struct clock_event_device *);
-static void uv_rtc_timer_setup(enum clock_event_mode,
-				struct clock_event_device *);
+static int uv_rtc_shutdown(struct clock_event_device *evt);
 
 static struct clocksource clocksource_uv = {
 	.name		= RTC_NAME,
@@ -44,14 +43,14 @@ static struct clocksource clocksource_uv = {
 };
 
 static struct clock_event_device clock_event_device_uv = {
-	.name		= RTC_NAME,
-	.features	= CLOCK_EVT_FEAT_ONESHOT,
-	.shift		= 20,
-	.rating		= 400,
-	.irq		= -1,
-	.set_next_event	= uv_rtc_next_event,
-	.set_mode	= uv_rtc_timer_setup,
-	.event_handler	= NULL,
+	.name			= RTC_NAME,
+	.features		= CLOCK_EVT_FEAT_ONESHOT,
+	.shift			= 20,
+	.rating			= 400,
+	.irq			= -1,
+	.set_next_event		= uv_rtc_next_event,
+	.set_state_shutdown	= uv_rtc_shutdown,
+	.event_handler		= NULL,
 };
 
 static DEFINE_PER_CPU(struct clock_event_device, cpu_ced);
@@ -321,24 +320,14 @@ static int uv_rtc_next_event(unsigned long delta,
 }
 
 /*
- * Setup the RTC timer in oneshot mode
+ * Shutdown the RTC timer
  */
-static void uv_rtc_timer_setup(enum clock_event_mode mode,
-			       struct clock_event_device *evt)
+static int uv_rtc_shutdown(struct clock_event_device *evt)
 {
 	int ced_cpu = cpumask_first(evt->cpumask);
 
-	switch (mode) {
-	case CLOCK_EVT_MODE_PERIODIC:
-	case CLOCK_EVT_MODE_ONESHOT:
-	case CLOCK_EVT_MODE_RESUME:
-		/* Nothing to do here yet */
-		break;
-	case CLOCK_EVT_MODE_UNUSED:
-	case CLOCK_EVT_MODE_SHUTDOWN:
-		uv_rtc_unset_timer(ced_cpu, 1);
-		break;
-	}
+	uv_rtc_unset_timer(ced_cpu, 1);
+	return 0;
 }
 
 static void uv_rtc_interrupt(void)
diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h
index b9531d343134..755481f14d90 100644
--- a/arch/x86/um/asm/barrier.h
+++ b/arch/x86/um/asm/barrier.h
@@ -45,17 +45,4 @@
 #define read_barrier_depends()		do { } while (0)
 #define smp_read_barrier_depends()	do { } while (0)
 
-/*
- * Stop RDTSC speculation. This is needed when you need to use RDTSC
- * (or get_cycles or vread that possibly accesses the TSC) in a defined
- * code region.
- *
- * (Could use an alternative three way for this if there was one.)
- */
-static inline void rdtsc_barrier(void)
-{
-	alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
-			  "lfence", X86_FEATURE_LFENCE_RDTSC);
-}
-
 #endif
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 484145368a24..c7b15f3e2cf3 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -7,6 +7,7 @@ config XEN
 	depends on PARAVIRT
 	select PARAVIRT_CLOCK
 	select XEN_HAVE_PVMMU
+	select XEN_HAVE_VPMU
 	depends on X86_64 || (X86_32 && X86_PAE)
 	depends on X86_LOCAL_APIC && X86_TSC
 	help
@@ -23,14 +24,18 @@ config XEN_PVHVM
 	def_bool y
 	depends on XEN && PCI && X86_LOCAL_APIC
 
-config XEN_MAX_DOMAIN_MEMORY
-       int
-       default 500 if X86_64
-       default 64 if X86_32
-       depends on XEN
-       help
-         This only affects the sizing of some bss arrays, the unused
-         portions of which are freed.
+config XEN_512GB
+	bool "Limit Xen pv-domain memory to 512GB"
+	depends on XEN && X86_64
+	default y
+	help
+	  Limit paravirtualized user domains to 512GB of RAM.
+
+	  The Xen tools and crash dump analysis tools might not support
+	  pv-domains with more than 512 GB of RAM. This option sets whether
+	  the kernel by default limits itself to 512 GB or may use more.
+	  It is always possible to override the default by specifying the
+	  boot parameter "xen_512gb_limit".
 
 config XEN_SAVE_RESTORE
        bool
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 4b6e29ac0968..e47e52787d32 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -13,7 +13,7 @@ CFLAGS_mmu.o			:= $(nostackp)
 obj-y		:= enlighten.o setup.o multicalls.o mmu.o irq.o \
 			time.o xen-asm.o xen-asm_$(BITS).o \
 			grant-table.o suspend.o platform-pci-unplug.o \
-			p2m.o apic.o
+			p2m.o apic.o pmu.o
 
 obj-$(CONFIG_EVENT_TRACING) += trace.o
 
diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c
index 70e060ad879a..acda713ab5be 100644
--- a/arch/x86/xen/apic.c
+++ b/arch/x86/xen/apic.c
@@ -7,6 +7,7 @@
 #include <xen/xen.h>
 #include <xen/interface/physdev.h>
 #include "xen-ops.h"
+#include "pmu.h"
 #include "smp.h"
 
 static unsigned int xen_io_apic_read(unsigned apic, unsigned reg)
@@ -72,6 +73,11 @@ static u32 xen_apic_read(u32 reg)
 
 static void xen_apic_write(u32 reg, u32 val)
 {
+	if (reg == APIC_LVTPC) {
+		(void)pmu_apic_update(reg);
+		return;
+	}
+
 	/* Warn to see if there's any stray references */
 	WARN(1,"register: %x, value: %x\n", reg, val);
 }
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 11d6fb4e8483..30d12afe52ed 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -84,6 +84,7 @@
 #include "mmu.h"
 #include "smp.h"
 #include "multicalls.h"
+#include "pmu.h"
 
 EXPORT_SYMBOL_GPL(hypercall_page);
 
@@ -1010,8 +1011,7 @@ static void xen_write_cr0(unsigned long cr0)
 
 static void xen_write_cr4(unsigned long cr4)
 {
-	cr4 &= ~X86_CR4_PGE;
-	cr4 &= ~X86_CR4_PSE;
+	cr4 &= ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PCE);
 
 	native_write_cr4(cr4);
 }
@@ -1030,6 +1030,9 @@ static u64 xen_read_msr_safe(unsigned int msr, int *err)
 {
 	u64 val;
 
+	if (pmu_msr_read(msr, &val, err))
+		return val;
+
 	val = native_read_msr_safe(msr, err);
 	switch (msr) {
 	case MSR_IA32_APICBASE:
@@ -1076,7 +1079,8 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
 		   Xen console noise. */
 
 	default:
-		ret = native_write_msr_safe(msr, low, high);
+		if (!pmu_msr_write(msr, low, high, &ret))
+			ret = native_write_msr_safe(msr, low, high);
 	}
 
 	return ret;
@@ -1215,10 +1219,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
 	.read_msr = xen_read_msr_safe,
 	.write_msr = xen_write_msr_safe,
 
-	.read_tsc = native_read_tsc,
-	.read_pmc = native_read_pmc,
-
-	.read_tscp = native_read_tscp,
+	.read_pmc = xen_read_pmc,
 
 	.iret = xen_iret,
 #ifdef CONFIG_X86_64
@@ -1267,6 +1268,10 @@ static const struct pv_apic_ops xen_apic_ops __initconst = {
 static void xen_reboot(int reason)
 {
 	struct sched_shutdown r = { .reason = reason };
+	int cpu;
+
+	for_each_online_cpu(cpu)
+		xen_pmu_finish(cpu);
 
 	if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r))
 		BUG();
@@ -1610,7 +1615,9 @@ asmlinkage __visible void __init xen_start_kernel(void)
 	early_boot_irqs_disabled = true;
 
 	xen_raw_console_write("mapping kernel into physical memory\n");
-	xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base, xen_start_info->nr_pages);
+	xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base,
+				   xen_start_info->nr_pages);
+	xen_reserve_special_pages();
 
 	/*
 	 * Modify the cache mode translation tables to match Xen's PAT
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index dd151b2045b0..9c479fe40459 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -116,6 +116,7 @@ static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
 DEFINE_PER_CPU(unsigned long, xen_cr3);	 /* cr3 stored as physaddr */
 DEFINE_PER_CPU(unsigned long, xen_current_cr3);	 /* actual vcpu cr3 */
 
+static phys_addr_t xen_pt_base, xen_pt_size __initdata;
 
 /*
  * Just beyond the highest usermode address.  STACK_TOP_MAX has a
@@ -1093,6 +1094,16 @@ static void xen_exit_mmap(struct mm_struct *mm)
 
 static void xen_post_allocator_init(void);
 
+static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
+{
+	struct mmuext_op op;
+
+	op.cmd = cmd;
+	op.arg1.mfn = pfn_to_mfn(pfn);
+	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
+		BUG();
+}
+
 #ifdef CONFIG_X86_64
 static void __init xen_cleanhighmap(unsigned long vaddr,
 				    unsigned long vaddr_end)
@@ -1114,6 +1125,83 @@ static void __init xen_cleanhighmap(unsigned long vaddr,
 	xen_mc_flush();
 }
 
+/*
+ * Make a page range writeable and free it.
+ */
+static void __init xen_free_ro_pages(unsigned long paddr, unsigned long size)
+{
+	void *vaddr = __va(paddr);
+	void *vaddr_end = vaddr + size;
+
+	for (; vaddr < vaddr_end; vaddr += PAGE_SIZE)
+		make_lowmem_page_readwrite(vaddr);
+
+	memblock_free(paddr, size);
+}
+
+static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin)
+{
+	unsigned long pa = __pa(pgtbl) & PHYSICAL_PAGE_MASK;
+
+	if (unpin)
+		pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(pa));
+	ClearPagePinned(virt_to_page(__va(pa)));
+	xen_free_ro_pages(pa, PAGE_SIZE);
+}
+
+/*
+ * Since it is well isolated we can (and since it is perhaps large we should)
+ * also free the page tables mapping the initial P->M table.
+ */
+static void __init xen_cleanmfnmap(unsigned long vaddr)
+{
+	unsigned long va = vaddr & PMD_MASK;
+	unsigned long pa;
+	pgd_t *pgd = pgd_offset_k(va);
+	pud_t *pud_page = pud_offset(pgd, 0);
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+	unsigned int i;
+	bool unpin;
+
+	unpin = (vaddr == 2 * PGDIR_SIZE);
+	set_pgd(pgd, __pgd(0));
+	do {
+		pud = pud_page + pud_index(va);
+		if (pud_none(*pud)) {
+			va += PUD_SIZE;
+		} else if (pud_large(*pud)) {
+			pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
+			xen_free_ro_pages(pa, PUD_SIZE);
+			va += PUD_SIZE;
+		} else {
+			pmd = pmd_offset(pud, va);
+			if (pmd_large(*pmd)) {
+				pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
+				xen_free_ro_pages(pa, PMD_SIZE);
+			} else if (!pmd_none(*pmd)) {
+				pte = pte_offset_kernel(pmd, va);
+				set_pmd(pmd, __pmd(0));
+				for (i = 0; i < PTRS_PER_PTE; ++i) {
+					if (pte_none(pte[i]))
+						break;
+					pa = pte_pfn(pte[i]) << PAGE_SHIFT;
+					xen_free_ro_pages(pa, PAGE_SIZE);
+				}
+				xen_cleanmfnmap_free_pgtbl(pte, unpin);
+			}
+			va += PMD_SIZE;
+			if (pmd_index(va))
+				continue;
+			set_pud(pud, __pud(0));
+			xen_cleanmfnmap_free_pgtbl(pmd, unpin);
+		}
+
+	} while (pud_index(va) || pmd_index(va));
+	xen_cleanmfnmap_free_pgtbl(pud_page, unpin);
+}
+
 static void __init xen_pagetable_p2m_free(void)
 {
 	unsigned long size;
@@ -1128,18 +1216,31 @@ static void __init xen_pagetable_p2m_free(void)
 	/* using __ka address and sticking INVALID_P2M_ENTRY! */
 	memset((void *)xen_start_info->mfn_list, 0xff, size);
 
-	/* We should be in __ka space. */
-	BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map);
 	addr = xen_start_info->mfn_list;
-	/* We roundup to the PMD, which means that if anybody at this stage is
-	 * using the __ka address of xen_start_info or xen_start_info->shared_info
-	 * they are in going to crash. Fortunatly we have already revectored
-	 * in xen_setup_kernel_pagetable and in xen_setup_shared_info. */
+	/*
+	 * We could be in __ka space.
+	 * We roundup to the PMD, which means that if anybody at this stage is
+	 * using the __ka address of xen_start_info or
+	 * xen_start_info->shared_info they are going to crash. Fortunately
+	 * we have already revectored in xen_setup_kernel_pagetable and in
+	 * xen_setup_shared_info.
+	 */
 	size = roundup(size, PMD_SIZE);
-	xen_cleanhighmap(addr, addr + size);
 
-	size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
-	memblock_free(__pa(xen_start_info->mfn_list), size);
+	if (addr >= __START_KERNEL_map) {
+		xen_cleanhighmap(addr, addr + size);
+		size = PAGE_ALIGN(xen_start_info->nr_pages *
+				  sizeof(unsigned long));
+		memblock_free(__pa(addr), size);
+	} else {
+		xen_cleanmfnmap(addr);
+	}
+}
+
+static void __init xen_pagetable_cleanhighmap(void)
+{
+	unsigned long size;
+	unsigned long addr;
 
 	/* At this stage, cleanup_highmap has already cleaned __ka space
 	 * from _brk_limit way up to the max_pfn_mapped (which is the end of
@@ -1172,6 +1273,8 @@ static void __init xen_pagetable_p2m_setup(void)
 
 #ifdef CONFIG_X86_64
 	xen_pagetable_p2m_free();
+
+	xen_pagetable_cleanhighmap();
 #endif
 	/* And revector! Bye bye old array */
 	xen_start_info->mfn_list = (unsigned long)xen_p2m_addr;
@@ -1461,6 +1564,24 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
 #else /* CONFIG_X86_64 */
 static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
 {
+	unsigned long pfn;
+
+	if (xen_feature(XENFEAT_writable_page_tables) ||
+	    xen_feature(XENFEAT_auto_translated_physmap) ||
+	    xen_start_info->mfn_list >= __START_KERNEL_map)
+		return pte;
+
+	/*
+	 * Pages belonging to the initial p2m list mapped outside the default
+	 * address range must be mapped read-only. This region contains the
+	 * page tables for mapping the p2m list, too, and page tables MUST be
+	 * mapped read-only.
+	 */
+	pfn = pte_pfn(pte);
+	if (pfn >= xen_start_info->first_p2m_pfn &&
+	    pfn < xen_start_info->first_p2m_pfn + xen_start_info->nr_p2m_frames)
+		pte = __pte_ma(pte_val_ma(pte) & ~_PAGE_RW);
+
 	return pte;
 }
 #endif /* CONFIG_X86_64 */
@@ -1489,15 +1610,6 @@ static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
 	native_set_pte(ptep, pte);
 }
 
-static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
-{
-	struct mmuext_op op;
-	op.cmd = cmd;
-	op.arg1.mfn = pfn_to_mfn(pfn);
-	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
-		BUG();
-}
-
 /* Early in boot, while setting up the initial pagetable, assume
    everything is pinned. */
 static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
@@ -1815,7 +1927,10 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
 	 * mappings. Considering that on Xen after the kernel mappings we
 	 * have the mappings of some pages that don't exist in pfn space, we
 	 * set max_pfn_mapped to the last real pfn mapped. */
-	max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
+	if (xen_start_info->mfn_list < __START_KERNEL_map)
+		max_pfn_mapped = xen_start_info->first_p2m_pfn;
+	else
+		max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
 
 	pt_base = PFN_DOWN(__pa(xen_start_info->pt_base));
 	pt_end = pt_base + xen_start_info->nr_pt_frames;
@@ -1855,6 +1970,11 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
 	/* Graft it onto L4[511][510] */
 	copy_page(level2_kernel_pgt, l2);
 
+	/* Copy the initial P->M table mappings if necessary. */
+	i = pgd_index(xen_start_info->mfn_list);
+	if (i && i < pgd_index(__START_KERNEL_map))
+		init_level4_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i];
+
 	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
 		/* Make pagetable pieces RO */
 		set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
@@ -1894,10 +2014,192 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
 		check_pt_base(&pt_base, &pt_end, addr[i]);
 
 	/* Our (by three pages) smaller Xen pagetable that we are using */
-	memblock_reserve(PFN_PHYS(pt_base), (pt_end - pt_base) * PAGE_SIZE);
+	xen_pt_base = PFN_PHYS(pt_base);
+	xen_pt_size = (pt_end - pt_base) * PAGE_SIZE;
+	memblock_reserve(xen_pt_base, xen_pt_size);
+
 	/* Revector the xen_start_info */
 	xen_start_info = (struct start_info *)__va(__pa(xen_start_info));
 }
+
+/*
+ * Read a value from a physical address.
+ */
+static unsigned long __init xen_read_phys_ulong(phys_addr_t addr)
+{
+	unsigned long *vaddr;
+	unsigned long val;
+
+	vaddr = early_memremap_ro(addr, sizeof(val));
+	val = *vaddr;
+	early_memunmap(vaddr, sizeof(val));
+	return val;
+}
+
+/*
+ * Translate a virtual address to a physical one without relying on mapped
+ * page tables.
+ */
+static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
+{
+	phys_addr_t pa;
+	pgd_t pgd;
+	pud_t pud;
+	pmd_t pmd;
+	pte_t pte;
+
+	pa = read_cr3();
+	pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) *
+						       sizeof(pgd)));
+	if (!pgd_present(pgd))
+		return 0;
+
+	pa = pgd_val(pgd) & PTE_PFN_MASK;
+	pud = native_make_pud(xen_read_phys_ulong(pa + pud_index(vaddr) *
+						       sizeof(pud)));
+	if (!pud_present(pud))
+		return 0;
+	pa = pud_pfn(pud) << PAGE_SHIFT;
+	if (pud_large(pud))
+		return pa + (vaddr & ~PUD_MASK);
+
+	pmd = native_make_pmd(xen_read_phys_ulong(pa + pmd_index(vaddr) *
+						       sizeof(pmd)));
+	if (!pmd_present(pmd))
+		return 0;
+	pa = pmd_pfn(pmd) << PAGE_SHIFT;
+	if (pmd_large(pmd))
+		return pa + (vaddr & ~PMD_MASK);
+
+	pte = native_make_pte(xen_read_phys_ulong(pa + pte_index(vaddr) *
+						       sizeof(pte)));
+	if (!pte_present(pte))
+		return 0;
+	pa = pte_pfn(pte) << PAGE_SHIFT;
+
+	return pa | (vaddr & ~PAGE_MASK);
+}
+
+/*
+ * Find a new area for the hypervisor supplied p2m list and relocate the p2m to
+ * this area.
+ */
+void __init xen_relocate_p2m(void)
+{
+	phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys;
+	unsigned long p2m_pfn, p2m_pfn_end, n_frames, pfn, pfn_end;
+	int n_pte, n_pt, n_pmd, n_pud, idx_pte, idx_pt, idx_pmd, idx_pud;
+	pte_t *pt;
+	pmd_t *pmd;
+	pud_t *pud;
+	pgd_t *pgd;
+	unsigned long *new_p2m;
+
+	size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
+	n_pte = roundup(size, PAGE_SIZE) >> PAGE_SHIFT;
+	n_pt = roundup(size, PMD_SIZE) >> PMD_SHIFT;
+	n_pmd = roundup(size, PUD_SIZE) >> PUD_SHIFT;
+	n_pud = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT;
+	n_frames = n_pte + n_pt + n_pmd + n_pud;
+
+	new_area = xen_find_free_area(PFN_PHYS(n_frames));
+	if (!new_area) {
+		xen_raw_console_write("Can't find new memory area for p2m needed due to E820 map conflict\n");
+		BUG();
+	}
+
+	/*
+	 * Setup the page tables for addressing the new p2m list.
+	 * We have asked the hypervisor to map the p2m list at the user address
+	 * PGDIR_SIZE. It may have done so, or it may have used a kernel space
+	 * address depending on the Xen version.
+	 * To avoid any possible virtual address collision, just use
+	 * 2 * PGDIR_SIZE for the new area.
+	 */
+	pud_phys = new_area;
+	pmd_phys = pud_phys + PFN_PHYS(n_pud);
+	pt_phys = pmd_phys + PFN_PHYS(n_pmd);
+	p2m_pfn = PFN_DOWN(pt_phys) + n_pt;
+
+	pgd = __va(read_cr3());
+	new_p2m = (unsigned long *)(2 * PGDIR_SIZE);
+	for (idx_pud = 0; idx_pud < n_pud; idx_pud++) {
+		pud = early_memremap(pud_phys, PAGE_SIZE);
+		clear_page(pud);
+		for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD);
+		     idx_pmd++) {
+			pmd = early_memremap(pmd_phys, PAGE_SIZE);
+			clear_page(pmd);
+			for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD);
+			     idx_pt++) {
+				pt = early_memremap(pt_phys, PAGE_SIZE);
+				clear_page(pt);
+				for (idx_pte = 0;
+				     idx_pte < min(n_pte, PTRS_PER_PTE);
+				     idx_pte++) {
+					set_pte(pt + idx_pte,
+						pfn_pte(p2m_pfn, PAGE_KERNEL));
+					p2m_pfn++;
+				}
+				n_pte -= PTRS_PER_PTE;
+				early_memunmap(pt, PAGE_SIZE);
+				make_lowmem_page_readonly(__va(pt_phys));
+				pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE,
+						  PFN_DOWN(pt_phys));
+				set_pmd(pmd + idx_pt,
+					__pmd(_PAGE_TABLE | pt_phys));
+				pt_phys += PAGE_SIZE;
+			}
+			n_pt -= PTRS_PER_PMD;
+			early_memunmap(pmd, PAGE_SIZE);
+			make_lowmem_page_readonly(__va(pmd_phys));
+			pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE,
+					  PFN_DOWN(pmd_phys));
+			set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys));
+			pmd_phys += PAGE_SIZE;
+		}
+		n_pmd -= PTRS_PER_PUD;
+		early_memunmap(pud, PAGE_SIZE);
+		make_lowmem_page_readonly(__va(pud_phys));
+		pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys));
+		set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys));
+		pud_phys += PAGE_SIZE;
+	}
+
+	/* Now copy the old p2m info to the new area. */
+	memcpy(new_p2m, xen_p2m_addr, size);
+	xen_p2m_addr = new_p2m;
+
+	/* Release the old p2m list and set new list info. */
+	p2m_pfn = PFN_DOWN(xen_early_virt_to_phys(xen_start_info->mfn_list));
+	BUG_ON(!p2m_pfn);
+	p2m_pfn_end = p2m_pfn + PFN_DOWN(size);
+
+	if (xen_start_info->mfn_list < __START_KERNEL_map) {
+		pfn = xen_start_info->first_p2m_pfn;
+		pfn_end = xen_start_info->first_p2m_pfn +
+			  xen_start_info->nr_p2m_frames;
+		set_pgd(pgd + 1, __pgd(0));
+	} else {
+		pfn = p2m_pfn;
+		pfn_end = p2m_pfn_end;
+	}
+
+	memblock_free(PFN_PHYS(pfn), PAGE_SIZE * (pfn_end - pfn));
+	while (pfn < pfn_end) {
+		if (pfn == p2m_pfn) {
+			pfn = p2m_pfn_end;
+			continue;
+		}
+		make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+		pfn++;
+	}
+
+	xen_start_info->mfn_list = (unsigned long)xen_p2m_addr;
+	xen_start_info->first_p2m_pfn =  PFN_DOWN(new_area);
+	xen_start_info->nr_p2m_frames = n_frames;
+}
+
 #else	/* !CONFIG_X86_64 */
 static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);
 static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD);
@@ -1938,18 +2240,41 @@ static void __init xen_write_cr3_init(unsigned long cr3)
 	pv_mmu_ops.write_cr3 = &xen_write_cr3;
 }
 
+/*
+ * For 32 bit domains xen_start_info->pt_base is the pgd address, which might
+ * not be the first page table in the page table pool.
+ * Iterate through the initial page tables to find the real page table base.
+ */
+static phys_addr_t xen_find_pt_base(pmd_t *pmd)
+{
+	phys_addr_t pt_base, paddr;
+	unsigned pmdidx;
+
+	pt_base = min(__pa(xen_start_info->pt_base), __pa(pmd));
+
+	for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++)
+		if (pmd_present(pmd[pmdidx]) && !pmd_large(pmd[pmdidx])) {
+			paddr = m2p(pmd[pmdidx].pmd);
+			pt_base = min(pt_base, paddr);
+		}
+
+	return pt_base;
+}
+
 void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
 {
 	pmd_t *kernel_pmd;
 
+	kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
+
+	xen_pt_base = xen_find_pt_base(kernel_pmd);
+	xen_pt_size = xen_start_info->nr_pt_frames * PAGE_SIZE;
+
 	initial_kernel_pmd =
 		extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
 
-	max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
-				  xen_start_info->nr_pt_frames * PAGE_SIZE +
-				  512*1024);
+	max_pfn_mapped = PFN_DOWN(xen_pt_base + xen_pt_size + 512 * 1024);
 
-	kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
 	copy_page(initial_kernel_pmd, kernel_pmd);
 
 	xen_map_identity_early(initial_kernel_pmd, max_pfn);
@@ -1968,11 +2293,33 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
 			  PFN_DOWN(__pa(initial_page_table)));
 	xen_write_cr3(__pa(initial_page_table));
 
-	memblock_reserve(__pa(xen_start_info->pt_base),
-			 xen_start_info->nr_pt_frames * PAGE_SIZE);
+	memblock_reserve(xen_pt_base, xen_pt_size);
 }
 #endif	/* CONFIG_X86_64 */
 
+void __init xen_reserve_special_pages(void)
+{
+	phys_addr_t paddr;
+
+	memblock_reserve(__pa(xen_start_info), PAGE_SIZE);
+	if (xen_start_info->store_mfn) {
+		paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->store_mfn));
+		memblock_reserve(paddr, PAGE_SIZE);
+	}
+	if (!xen_initial_domain()) {
+		paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->console.domU.mfn));
+		memblock_reserve(paddr, PAGE_SIZE);
+	}
+}
+
+void __init xen_pt_check_e820(void)
+{
+	if (xen_is_e820_reserved(xen_pt_base, xen_pt_size)) {
+		xen_raw_console_write("Xen hypervisor allocated page table memory conflicts with E820 map\n");
+		BUG();
+	}
+}
+
 static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss;
 
 static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
@@ -2465,9 +2812,9 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
 	return 0;
 }
 
-static int do_remap_mfn(struct vm_area_struct *vma,
+static int do_remap_gfn(struct vm_area_struct *vma,
 			unsigned long addr,
-			xen_pfn_t *mfn, int nr,
+			xen_pfn_t *gfn, int nr,
 			int *err_ptr, pgprot_t prot,
 			unsigned domid,
 			struct page **pages)
@@ -2483,14 +2830,14 @@ static int do_remap_mfn(struct vm_area_struct *vma,
 	if (xen_feature(XENFEAT_auto_translated_physmap)) {
 #ifdef CONFIG_XEN_PVH
 		/* We need to update the local page tables and the xen HAP */
-		return xen_xlate_remap_gfn_array(vma, addr, mfn, nr, err_ptr,
+		return xen_xlate_remap_gfn_array(vma, addr, gfn, nr, err_ptr,
 						 prot, domid, pages);
 #else
 		return -EINVAL;
 #endif
         }
 
-	rmd.mfn = mfn;
+	rmd.mfn = gfn;
 	rmd.prot = prot;
 	/* We use the err_ptr to indicate if there we are doing a contigious
 	 * mapping or a discontigious mapping. */
@@ -2518,8 +2865,8 @@ static int do_remap_mfn(struct vm_area_struct *vma,
 						    batch_left, &done, domid);
 
 			/*
-			 * @err_ptr may be the same buffer as @mfn, so
-			 * only clear it after each chunk of @mfn is
+			 * @err_ptr may be the same buffer as @gfn, so
+			 * only clear it after each chunk of @gfn is
 			 * used.
 			 */
 			if (err_ptr) {
@@ -2549,19 +2896,19 @@ out:
 	return err < 0 ? err : mapped;
 }
 
-int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
+int xen_remap_domain_gfn_range(struct vm_area_struct *vma,
 			       unsigned long addr,
-			       xen_pfn_t mfn, int nr,
+			       xen_pfn_t gfn, int nr,
 			       pgprot_t prot, unsigned domid,
 			       struct page **pages)
 {
-	return do_remap_mfn(vma, addr, &mfn, nr, NULL, prot, domid, pages);
+	return do_remap_gfn(vma, addr, &gfn, nr, NULL, prot, domid, pages);
 }
-EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
+EXPORT_SYMBOL_GPL(xen_remap_domain_gfn_range);
 
-int xen_remap_domain_mfn_array(struct vm_area_struct *vma,
+int xen_remap_domain_gfn_array(struct vm_area_struct *vma,
 			       unsigned long addr,
-			       xen_pfn_t *mfn, int nr,
+			       xen_pfn_t *gfn, int nr,
 			       int *err_ptr, pgprot_t prot,
 			       unsigned domid, struct page **pages)
 {
@@ -2570,13 +2917,13 @@ int xen_remap_domain_mfn_array(struct vm_area_struct *vma,
 	 * cause of "wrong memory was mapped in".
 	 */
 	BUG_ON(err_ptr == NULL);
-	return do_remap_mfn(vma, addr, mfn, nr, err_ptr, prot, domid, pages);
+	return do_remap_gfn(vma, addr, gfn, nr, err_ptr, prot, domid, pages);
 }
-EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_array);
+EXPORT_SYMBOL_GPL(xen_remap_domain_gfn_array);
 
 
 /* Returns: 0 success */
-int xen_unmap_domain_mfn_range(struct vm_area_struct *vma,
+int xen_unmap_domain_gfn_range(struct vm_area_struct *vma,
 			       int numpgs, struct page **pages)
 {
 	if (!pages || !xen_feature(XENFEAT_auto_translated_physmap))
@@ -2588,4 +2935,4 @@ int xen_unmap_domain_mfn_range(struct vm_area_struct *vma,
 	return -EINVAL;
 #endif
 }
-EXPORT_SYMBOL_GPL(xen_unmap_domain_mfn_range);
+EXPORT_SYMBOL_GPL(xen_unmap_domain_gfn_range);
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 8b7f18e200aa..bfc08b13044b 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -79,10 +79,14 @@
 #include <xen/balloon.h>
 #include <xen/grant_table.h>
 
-#include "p2m.h"
 #include "multicalls.h"
 #include "xen-ops.h"
 
+#define P2M_MID_PER_PAGE	(PAGE_SIZE / sizeof(unsigned long *))
+#define P2M_TOP_PER_PAGE	(PAGE_SIZE / sizeof(unsigned long **))
+
+#define MAX_P2M_PFN	(P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
+
 #define PMDS_PER_MID_PAGE	(P2M_MID_PER_PAGE / PTRS_PER_PTE)
 
 unsigned long *xen_p2m_addr __read_mostly;
@@ -199,7 +203,8 @@ void __ref xen_build_mfn_list_list(void)
 	unsigned int level, topidx, mididx;
 	unsigned long *mid_mfn_p;
 
-	if (xen_feature(XENFEAT_auto_translated_physmap))
+	if (xen_feature(XENFEAT_auto_translated_physmap) ||
+	    xen_start_info->flags & SIF_VIRT_P2M_4TOOLS)
 		return;
 
 	/* Pre-initialize p2m_top_mfn to be completely missing */
@@ -260,9 +265,16 @@ void xen_setup_mfn_list_list(void)
 
 	BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
 
-	HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
-		virt_to_mfn(p2m_top_mfn);
+	if (xen_start_info->flags & SIF_VIRT_P2M_4TOOLS)
+		HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = ~0UL;
+	else
+		HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
+			virt_to_mfn(p2m_top_mfn);
 	HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
+	HYPERVISOR_shared_info->arch.p2m_generation = 0;
+	HYPERVISOR_shared_info->arch.p2m_vaddr = (unsigned long)xen_p2m_addr;
+	HYPERVISOR_shared_info->arch.p2m_cr3 =
+		xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
 }
 
 /* Set up p2m_top to point to the domain-builder provided p2m pages */
@@ -478,8 +490,12 @@ static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *pte_pg)
 
 		ptechk = lookup_address(vaddr, &level);
 		if (ptechk == pte_pg) {
+			HYPERVISOR_shared_info->arch.p2m_generation++;
+			wmb(); /* Tools are synchronizing via p2m_generation. */
 			set_pmd(pmdp,
 				__pmd(__pa(pte_newpg[i]) | _KERNPG_TABLE));
+			wmb(); /* Tools are synchronizing via p2m_generation. */
+			HYPERVISOR_shared_info->arch.p2m_generation++;
 			pte_newpg[i] = NULL;
 		}
 
@@ -505,7 +521,7 @@ static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *pte_pg)
  */
 static bool alloc_p2m(unsigned long pfn)
 {
-	unsigned topidx, mididx;
+	unsigned topidx;
 	unsigned long *top_mfn_p, *mid_mfn;
 	pte_t *ptep, *pte_pg;
 	unsigned int level;
@@ -513,9 +529,6 @@ static bool alloc_p2m(unsigned long pfn)
 	unsigned long addr = (unsigned long)(xen_p2m_addr + pfn);
 	unsigned long p2m_pfn;
 
-	topidx = p2m_top_index(pfn);
-	mididx = p2m_mid_index(pfn);
-
 	ptep = lookup_address(addr, &level);
 	BUG_ON(!ptep || level != PG_LEVEL_4K);
 	pte_pg = (pte_t *)((unsigned long)ptep & ~(PAGE_SIZE - 1));
@@ -527,7 +540,8 @@ static bool alloc_p2m(unsigned long pfn)
 			return false;
 	}
 
-	if (p2m_top_mfn) {
+	if (p2m_top_mfn && pfn < MAX_P2M_PFN) {
+		topidx = p2m_top_index(pfn);
 		top_mfn_p = &p2m_top_mfn[topidx];
 		mid_mfn = ACCESS_ONCE(p2m_top_mfn_p[topidx]);
 
@@ -577,10 +591,14 @@ static bool alloc_p2m(unsigned long pfn)
 		spin_lock_irqsave(&p2m_update_lock, flags);
 
 		if (pte_pfn(*ptep) == p2m_pfn) {
+			HYPERVISOR_shared_info->arch.p2m_generation++;
+			wmb(); /* Tools are synchronizing via p2m_generation. */
 			set_pte(ptep,
 				pfn_pte(PFN_DOWN(__pa(p2m)), PAGE_KERNEL));
+			wmb(); /* Tools are synchronizing via p2m_generation. */
+			HYPERVISOR_shared_info->arch.p2m_generation++;
 			if (mid_mfn)
-				mid_mfn[mididx] = virt_to_mfn(p2m);
+				mid_mfn[p2m_mid_index(pfn)] = virt_to_mfn(p2m);
 			p2m = NULL;
 		}
 
@@ -630,6 +648,11 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
 		return true;
 	}
 
+	/*
+	 * The interface requires atomic updates on p2m elements.
+	 * xen_safe_write_ulong() is using __put_user which does an atomic
+	 * store via asm().
+	 */
 	if (likely(!xen_safe_write_ulong(xen_p2m_addr + pfn, mfn)))
 		return true;
 
diff --git a/arch/x86/xen/p2m.h b/arch/x86/xen/p2m.h
deleted file mode 100644
index ad8aee24ab72..000000000000
--- a/arch/x86/xen/p2m.h
+++ /dev/null
@@ -1,15 +0,0 @@
-#ifndef _XEN_P2M_H
-#define _XEN_P2M_H
-
-#define P2M_PER_PAGE        (PAGE_SIZE / sizeof(unsigned long))
-#define P2M_MID_PER_PAGE    (PAGE_SIZE / sizeof(unsigned long *))
-#define P2M_TOP_PER_PAGE    (PAGE_SIZE / sizeof(unsigned long **))
-
-#define MAX_P2M_PFN         (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
-
-#define MAX_REMAP_RANGES    10
-
-extern unsigned long __init set_phys_range_identity(unsigned long pfn_s,
-                                      unsigned long pfn_e);
-
-#endif  /* _XEN_P2M_H */
diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c
index a8261716d58d..9586ff32810c 100644
--- a/arch/x86/xen/platform-pci-unplug.c
+++ b/arch/x86/xen/platform-pci-unplug.c
@@ -68,7 +68,7 @@ static int check_platform_magic(void)
 	return 0;
 }
 
-bool xen_has_pv_devices()
+bool xen_has_pv_devices(void)
 {
 	if (!xen_domain())
 		return false;
diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c
new file mode 100644
index 000000000000..724a08740a04
--- /dev/null
+++ b/arch/x86/xen/pmu.c
@@ -0,0 +1,570 @@
+#include <linux/types.h>
+#include <linux/interrupt.h>
+
+#include <asm/xen/hypercall.h>
+#include <xen/page.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/vcpu.h>
+#include <xen/interface/xenpmu.h>
+
+#include "xen-ops.h"
+#include "pmu.h"
+
+/* x86_pmu.handle_irq definition */
+#include "../kernel/cpu/perf_event.h"
+
+#define XENPMU_IRQ_PROCESSING    1
+struct xenpmu {
+	/* Shared page between hypervisor and domain */
+	struct xen_pmu_data *xenpmu_data;
+	/* State bits; XENPMU_IRQ_PROCESSING is set by xen_pmu_irq_handler() */
+	uint8_t flags;
+};
+static DEFINE_PER_CPU(struct xenpmu, xenpmu_shared);
+#define get_xenpmu_data()    (this_cpu_ptr(&xenpmu_shared)->xenpmu_data)
+#define get_xenpmu_flags()   (this_cpu_ptr(&xenpmu_shared)->flags)
+
+/* Macro for computing address of a PMU MSR bank */
+#define field_offset(ctxt, field) ((void *)((uintptr_t)ctxt + \
+					    (uintptr_t)ctxt->field))
+
+/* AMD PMU */
+#define F15H_NUM_COUNTERS   6
+#define F10H_NUM_COUNTERS   4
+
+static __read_mostly uint32_t amd_counters_base;
+static __read_mostly uint32_t amd_ctrls_base;
+static __read_mostly int amd_msr_step;
+static __read_mostly int k7_counters_mirrored;
+static __read_mostly int amd_num_counters;
+
+/* Intel PMU */
+#define MSR_TYPE_COUNTER            0
+#define MSR_TYPE_CTRL               1
+#define MSR_TYPE_GLOBAL             2
+#define MSR_TYPE_ARCH_COUNTER       3
+#define MSR_TYPE_ARCH_CTRL          4
+
+/* Number of general pmu registers (CPUID.EAX[0xa].EAX[8..15]) */
+#define PMU_GENERAL_NR_SHIFT        8
+#define PMU_GENERAL_NR_BITS         8
+#define PMU_GENERAL_NR_MASK         (((1 << PMU_GENERAL_NR_BITS) - 1) \
+				     << PMU_GENERAL_NR_SHIFT)
+
+/* Number of fixed pmu registers (CPUID.EDX[0xa].EDX[0..4]) */
+#define PMU_FIXED_NR_SHIFT          0
+#define PMU_FIXED_NR_BITS           5
+#define PMU_FIXED_NR_MASK           (((1 << PMU_FIXED_NR_BITS) - 1) \
+				     << PMU_FIXED_NR_SHIFT)
+
+/* Alias registers (0x4c1) for full-width writes to PMCs */
+#define MSR_PMC_ALIAS_MASK          (~(MSR_IA32_PERFCTR0 ^ MSR_IA32_PMC0))
+
+#define INTEL_PMC_TYPE_SHIFT        30
+
+static __read_mostly int intel_num_arch_counters, intel_num_fixed_counters;
+
+
+/*
+ * Cache the vendor-specific PMU layout: AMD counter/control MSR bases and
+ * stride, or the Intel counter counts reported by CPUID leaf 0xa.
+ */
+static void xen_pmu_arch_init(void)
+{
+	uint32_t eax, ebx, ecx, edx;
+
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
+		/* EAX[8..15]: general counters, EDX[0..4]: fixed counters. */
+		cpuid(0xa, &eax, &ebx, &ecx, &edx);
+
+		intel_num_arch_counters =
+			(eax & PMU_GENERAL_NR_MASK) >> PMU_GENERAL_NR_SHIFT;
+		intel_num_fixed_counters =
+			(edx & PMU_FIXED_NR_MASK) >> PMU_FIXED_NR_SHIFT;
+		return;
+	}
+
+	if (boot_cpu_data.x86 == 0x15) {
+		/* Family 15h: F15H_NUM_COUNTERS counters, MSR stride of 2. */
+		amd_num_counters = F15H_NUM_COUNTERS;
+		amd_counters_base = MSR_F15H_PERF_CTR;
+		amd_ctrls_base = MSR_F15H_PERF_CTL;
+		amd_msr_step = 2;
+		k7_counters_mirrored = 1;
+	} else {
+		/* Every other family uses the K7 register bank, stride 1. */
+		amd_num_counters = F10H_NUM_COUNTERS;
+		amd_counters_base = MSR_K7_PERFCTR0;
+		amd_ctrls_base = MSR_K7_EVNTSEL0;
+		amd_msr_step = 1;
+		k7_counters_mirrored = 0;
+	}
+}
+
+/* Map a legacy K7 PMU MSR to its Fam15h alias; other MSRs pass through. */
+static inline uint32_t get_fam15h_addr(u32 addr)
+{
+	switch (addr) {
+	case MSR_K7_PERFCTR0:
+	case MSR_K7_PERFCTR1:
+	case MSR_K7_PERFCTR2:
+	case MSR_K7_PERFCTR3:
+		/* Fam15h interleaves CTL/CTR (stride 2): double the index. */
+		return MSR_F15H_PERF_CTR + 2 * (addr - MSR_K7_PERFCTR0);
+	case MSR_K7_EVNTSEL0:
+	case MSR_K7_EVNTSEL1:
+	case MSR_K7_EVNTSEL2:
+	case MSR_K7_EVNTSEL3:
+		return MSR_F15H_PERF_CTL + 2 * (addr - MSR_K7_EVNTSEL0);
+	default:
+		return addr;
+	}
+}
+
+/* True if @msr lies in the Fam15h or the legacy K7 PMU MSR window. */
+static inline bool is_amd_pmu_msr(unsigned int msr)
+{
+	bool in_f15h = msr >= MSR_F15H_PERF_CTL &&
+		       msr < MSR_F15H_PERF_CTR + (amd_num_counters * 2);
+	bool in_k7 = msr >= MSR_K7_EVNTSEL0 &&
+		     msr < MSR_K7_PERFCTR0 + amd_num_counters;
+
+	return in_f15h || in_k7;
+}
+
+/*
+ * Classify an Intel PMU MSR.  Returns true and sets *type (plus *index for
+ * per-counter registers) when @msr_index is a PMU MSR, false otherwise.
+ */
+static int is_intel_pmu_msr(u32 msr_index, int *type, int *index)
+{
+	u32 pmc;
+
+	switch (msr_index) {
+	case MSR_CORE_PERF_FIXED_CTR_CTRL:
+	case MSR_IA32_DS_AREA:
+	case MSR_IA32_PEBS_ENABLE:
+		*type = MSR_TYPE_CTRL;
+		return true;
+
+	case MSR_CORE_PERF_GLOBAL_CTRL:
+	case MSR_CORE_PERF_GLOBAL_STATUS:
+	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+		*type = MSR_TYPE_GLOBAL;
+		return true;
+	}
+
+	if (msr_index >= MSR_CORE_PERF_FIXED_CTR0 &&
+	    msr_index < MSR_CORE_PERF_FIXED_CTR0 + intel_num_fixed_counters) {
+		*index = msr_index - MSR_CORE_PERF_FIXED_CTR0;
+		*type = MSR_TYPE_COUNTER;
+		return true;
+	}
+
+	if (msr_index >= MSR_P6_EVNTSEL0 &&
+	    msr_index < MSR_P6_EVNTSEL0 + intel_num_arch_counters) {
+		*index = msr_index - MSR_P6_EVNTSEL0;
+		*type = MSR_TYPE_ARCH_CTRL;
+		return true;
+	}
+
+	/* Fold the full-width alias range onto the PERFCTR numbering. */
+	pmc = msr_index & MSR_PMC_ALIAS_MASK;
+	if (pmc < MSR_IA32_PERFCTR0 ||
+	    pmc >= MSR_IA32_PERFCTR0 + intel_num_arch_counters)
+		return false;
+	*type = MSR_TYPE_ARCH_COUNTER;
+	*index = pmc - MSR_IA32_PERFCTR0;
+	return true;
+}
+
+/*
+ * Emulate an access to an Intel PMU MSR using the context cached in the
+ * shared page.  Returns false when there is no shared page, when no PMU
+ * interrupt is being processed, or when the MSR has no cached copy; the
+ * callers then fall back to the hardware MSR.  On a read *val receives
+ * the cached value, on a write *val is stored into the cache.
+ */
+static bool xen_intel_pmu_emulate(unsigned int msr, u64 *val, int type,
+				  int index, bool is_read)
+{
+	struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+	uint8_t xenpmu_flags = get_xenpmu_flags();
+	struct xen_pmu_intel_ctxt *ctxt;
+	struct xen_pmu_cntr_pair *pairs;
+	uint64_t *fixed;
+	uint64_t *reg;
+
+	/*
+	 * The cache is only used while XENPMU_IRQ_PROCESSING is set.
+	 */
+	if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING))
+		return false;
+
+	ctxt = &xenpmu_data->pmu.c.intel;
+
+	/*
+	 * Global registers are members of the context itself; per-counter
+	 * registers sit in banks located through field_offset().
+	 */
+	if (msr == MSR_CORE_PERF_GLOBAL_OVF_CTRL) {
+		reg = &ctxt->global_ovf_ctrl;
+	} else if (msr == MSR_CORE_PERF_GLOBAL_STATUS) {
+		reg = &ctxt->global_status;
+	} else if (msr == MSR_CORE_PERF_GLOBAL_CTRL) {
+		reg = &ctxt->global_ctrl;
+	} else if (msr == MSR_CORE_PERF_FIXED_CTR_CTRL) {
+		reg = &ctxt->fixed_ctrl;
+	} else if (type == MSR_TYPE_COUNTER) {
+		fixed = field_offset(ctxt, fixed_counters);
+		reg = &fixed[index];
+	} else if (type == MSR_TYPE_ARCH_COUNTER) {
+		pairs = field_offset(ctxt, arch_counters);
+		reg = &pairs[index].counter;
+	} else if (type == MSR_TYPE_ARCH_CTRL) {
+		pairs = field_offset(ctxt, arch_counters);
+		reg = &pairs[index].control;
+	} else {
+		return false;
+	}
+
+	if (!reg)
+		return false;
+
+	if (is_read) {
+		*val = *reg;
+		return true;
+	}
+
+	*reg = *val;
+	/* Bits written to GLOBAL_OVF_CTRL are cleared in GLOBAL_STATUS too. */
+	if (msr == MSR_CORE_PERF_GLOBAL_OVF_CTRL)
+		ctxt->global_status &= (~(*val));
+
+	return true;
+}
+
+/* AMD PMU MSR emulation on the cached context; false means use hardware. */
+static bool xen_amd_pmu_emulate(unsigned int msr, u64 *val, bool is_read)
+{
+	struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+	uint8_t xenpmu_flags = get_xenpmu_flags();
+	struct xen_pmu_amd_ctxt *ctxt;
+	uint64_t *bank, *reg = NULL;
+	int i;
+
+	if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING))
+		return false;
+
+	if (k7_counters_mirrored &&
+	    msr >= MSR_K7_EVNTSEL0 && msr <= MSR_K7_PERFCTR3)
+		msr = get_fam15h_addr(msr);
+
+	ctxt = &xenpmu_data->pmu.c.amd;
+	for (i = 0; i < amd_num_counters; i++) {
+		int off = i * amd_msr_step;
+
+		if (msr == amd_ctrls_base + off) {
+			bank = field_offset(ctxt, ctrls);
+			reg = &bank[i];
+			break;
+		} else if (msr == amd_counters_base + off) {
+			bank = field_offset(ctxt, counters);
+			reg = &bank[i];
+			break;
+		}
+	}
+
+	if (!reg)
+		return false;
+
+	if (is_read)
+		*val = *reg;
+	else
+		*reg = *val;
+	return true;
+}
+
+/* Read a PMU MSR; returns false (leaving *val alone) if @msr is not one. */
+bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err)
+{
+	int type, index;
+	bool emulated;
+
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
+		if (!is_amd_pmu_msr(msr))
+			return false;
+		emulated = xen_amd_pmu_emulate(msr, val, true);
+	} else {
+		if (!is_intel_pmu_msr(msr, &type, &index))
+			return false;
+		emulated = xen_intel_pmu_emulate(msr, val, type, index, true);
+	}
+
+	if (!emulated)	/* nothing cached: read the real MSR */
+		*val = native_read_msr_safe(msr, err);
+	return true;
+}
+
+/* Write a PMU MSR (@high:@low); returns false if @msr is not a PMU MSR. */
+bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err)
+{
+	uint64_t val = ((uint64_t)high << 32) | low;
+	int type, index;
+	bool emulated;
+
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
+		if (!is_amd_pmu_msr(msr))
+			return false;
+		emulated = xen_amd_pmu_emulate(msr, &val, false);
+	} else {
+		if (!is_intel_pmu_msr(msr, &type, &index))
+			return false;
+		emulated = xen_intel_pmu_emulate(msr, &val, type, index, false);
+	}
+
+	/* Not absorbed by the cached context: write the real MSR. */
+	if (!emulated)
+		*err = native_write_msr_safe(msr, low, high);
+	return true;
+}
+
+/* Read AMD counter @counter: cached copy during a PMU irq, else the MSR. */
+static unsigned long long xen_amd_read_pmc(int counter)
+{
+	struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+	uint8_t xenpmu_flags = get_xenpmu_flags();
+	struct xen_pmu_amd_ctxt *ctxt;
+	uint64_t *cached;
+	int err;
+
+	if (xenpmu_data && (xenpmu_flags & XENPMU_IRQ_PROCESSING)) {
+		ctxt = &xenpmu_data->pmu.c.amd;
+		cached = field_offset(ctxt, counters);
+		return cached[counter];
+	}
+
+	/* No cached context in use: read the hardware counter (err unused). */
+	return native_read_msr_safe(amd_counters_base +
+				    (counter * amd_msr_step), &err);
+}
+
+/*
+ * Read Intel counter @counter.  Bit INTEL_PMC_TYPE_SHIFT selects a fixed
+ * counter (index in the low 16 bits), otherwise an architectural one.
+ */
+static unsigned long long xen_intel_read_pmc(int counter)
+{
+	struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+	uint8_t xenpmu_flags = get_xenpmu_flags();
+	bool fixed = counter & (1 << INTEL_PMC_TYPE_SHIFT);
+	struct xen_pmu_cntr_pair *pairs;
+	struct xen_pmu_intel_ctxt *ctxt;
+	uint64_t *fixed_regs;
+	uint32_t msr;
+	int err;
+
+	if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING)) {
+		msr = fixed ? MSR_CORE_PERF_FIXED_CTR0 + (counter & 0xffff) :
+			      MSR_IA32_PERFCTR0 + counter;
+		return native_read_msr_safe(msr, &err);
+	}
+
+	ctxt = &xenpmu_data->pmu.c.intel;
+	if (fixed) {
+		fixed_regs = field_offset(ctxt, fixed_counters);
+		return fixed_regs[counter & 0xffff];
+	}
+	pairs = field_offset(ctxt, arch_counters);
+	return pairs[counter].counter;
+}
+
+/* Read performance counter @counter through the vendor-specific helper. */
+unsigned long long xen_read_pmc(int counter)
+{
+	bool amd = boot_cpu_data.x86_vendor == X86_VENDOR_AMD;
+
+	return amd ? xen_amd_read_pmc(counter) : xen_intel_read_pmc(counter);
+}
+
+/*
+ * Store the new LVTPC value in the shared page and, unless a PMU interrupt
+ * is being processed, ask Xen to apply it via XENPMU_lvtpc_set.
+ */
+int pmu_apic_update(uint32_t val)
+{
+	struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+
+	if (!xenpmu_data) {
+		pr_warn_once("%s: pmudata not initialized\n", __func__);
+		return -EINVAL;
+	}
+
+	xenpmu_data->pmu.l.lapic_lvtpc = val;
+	if (get_xenpmu_flags() & XENPMU_IRQ_PROCESSING)
+		return 0;
+
+	return HYPERVISOR_xenpmu_op(XENPMU_lvtpc_set, NULL);
+}
+
+/* perf callbacks */
+/* perf callback: does the current sample belong to a guest? */
+static int xen_is_in_guest(void)
+{
+	const struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+
+	if (!xenpmu_data) {
+		pr_warn_once("%s: pmudata not initialized\n", __func__);
+		return 0;
+	}
+
+	/* Only the initial domain reports samples with an id below DOMID_SELF. */
+	return xen_initial_domain() &&
+	       xenpmu_data->domain_id < DOMID_SELF;
+}
+
+/* perf callback: non-zero when the sample was taken in user mode. */
+static int xen_is_user_mode(void)
+{
+	const struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+
+	if (!xenpmu_data) {
+		pr_warn_once("%s: pmudata not initialized\n", __func__);
+		return 0;
+	}
+	/* PV samples carry an explicit user flag; otherwise use the CPL. */
+	if (!(xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_PV))
+		return !!(xenpmu_data->pmu.r.regs.cpl & 3);
+	return (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_USER);
+}
+
+/* perf callback: instruction pointer recorded in the shared PMU page. */
+static unsigned long xen_get_guest_ip(void)
+{
+	const struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+
+	if (xenpmu_data)
+		return xenpmu_data->pmu.r.regs.ip;
+
+	pr_warn_once("%s: pmudata not initialized\n", __func__);
+	return 0;
+}
+
+static struct perf_guest_info_callbacks xen_guest_cbs = {
+	.is_in_guest            = xen_is_in_guest,
+	.is_user_mode           = xen_is_user_mode,
+	.get_guest_ip           = xen_get_guest_ip,
+};
+
+/*
+ * Convert registers from Xen's format to Linux': copy ip/cs/sp and force
+ * the low two bits of cs according to whether the sample is a user one.
+ */
+static void xen_convert_regs(const struct xen_pmu_regs *xen_regs,
+			     struct pt_regs *regs, uint64_t pmu_flags)
+{
+	bool user = (pmu_flags & PMU_SAMPLE_PV) ?
+		    !!(pmu_flags & PMU_SAMPLE_USER) : !!xen_regs->cpl;
+
+	regs->ip = xen_regs->ip;
+	regs->sp = xen_regs->sp;
+	regs->cs = xen_regs->cs;
+
+	/* Low two bits of cs: all set for user samples, cleared otherwise. */
+	if (user)
+		regs->cs |= 3;
+	else
+		regs->cs &= ~3;
+}
+
+/* PMU interrupt: run the perf handler on the cached context, then flush it. */
+irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id)
+{
+	int err, ret = IRQ_NONE;
+	struct pt_regs regs = {0};	/* only ip/cs/sp get filled in below */
+	const struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+	uint8_t xenpmu_flags = get_xenpmu_flags();
+
+	if (!xenpmu_data) {
+		pr_warn_once("%s: pmudata not initialized\n", __func__);
+		return ret;
+	}
+
+	this_cpu_ptr(&xenpmu_shared)->flags =
+		xenpmu_flags | XENPMU_IRQ_PROCESSING;
+	xen_convert_regs(&xenpmu_data->pmu.r.regs, &regs,
+			 xenpmu_data->pmu.pmu_flags);
+	if (x86_pmu.handle_irq(&regs))
+		ret = IRQ_HANDLED;
+
+	/* Write out cached context to HW */
+	err = HYPERVISOR_xenpmu_op(XENPMU_flush, NULL);
+	this_cpu_ptr(&xenpmu_shared)->flags = xenpmu_flags;
+	if (err) {
+		pr_warn_once("%s: failed hypercall, err: %d\n", __func__, err);
+		return IRQ_NONE;
+	}
+	return ret;
+}
+
+bool is_xen_pmu(int cpu)
+{
+	return per_cpu(xenpmu_shared, cpu).xenpmu_data != NULL; /* @cpu's slot */
+}
+
+/*
+ * Allocate the shared PMU page for @cpu and register it with Xen through
+ * XENPMU_init.  HVM domains are skipped.  CPU 0 additionally registers the
+ * perf guest callbacks and caches the vendor PMU layout.
+ */
+void xen_pmu_init(int cpu)
+{
+	struct xen_pmu_data *xenpmu_data;
+	struct xen_pmu_params xp;
+	int err;
+
+	BUILD_BUG_ON(sizeof(struct xen_pmu_data) > PAGE_SIZE);
+
+	if (xen_hvm_domain())
+		return;
+
+	xenpmu_data = (struct xen_pmu_data *)get_zeroed_page(GFP_KERNEL);
+	if (!xenpmu_data) {
+		pr_err("VPMU init: No memory\n");
+		return;
+	}
+
+	xp.val = pfn_to_mfn(virt_to_pfn(xenpmu_data));
+	xp.vcpu = cpu;
+	xp.version.maj = XENPMU_VER_MAJ;
+	xp.version.min = XENPMU_VER_MIN;
+	err = HYPERVISOR_xenpmu_op(XENPMU_init, &xp);
+	if (err) {
+		pr_warn_once("Could not initialize VPMU for cpu %d, error %d\n",
+			cpu, err);
+		free_pages((unsigned long)xenpmu_data, 0);
+		return;
+	}
+
+	per_cpu(xenpmu_shared, cpu).xenpmu_data = xenpmu_data;
+	per_cpu(xenpmu_shared, cpu).flags = 0;
+
+	if (cpu == 0) {
+		perf_register_guest_info_callbacks(&xen_guest_cbs);
+		xen_pmu_arch_init();
+	}
+}
+
+/* Tear down @cpu's VPMU: issue XENPMU_finish and free the shared page. */
+void xen_pmu_finish(int cpu)
+{
+	struct xenpmu *slot = &per_cpu(xenpmu_shared, cpu);
+	struct xen_pmu_params xp;
+
+	if (xen_hvm_domain())
+		return;
+
+	xp.version.maj = XENPMU_VER_MAJ;
+	xp.version.min = XENPMU_VER_MIN;
+	xp.vcpu = cpu;
+	(void)HYPERVISOR_xenpmu_op(XENPMU_finish, &xp);
+	free_pages((unsigned long)slot->xenpmu_data, 0);
+	slot->xenpmu_data = NULL;
+}
diff --git a/arch/x86/xen/pmu.h b/arch/x86/xen/pmu.h
new file mode 100644
index 000000000000..af5f0ad94078
--- /dev/null
+++ b/arch/x86/xen/pmu.h
@@ -0,0 +1,15 @@
+#ifndef __XEN_PMU_H
+#define __XEN_PMU_H
+
+#include <xen/interface/xenpmu.h>
+
+irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id);
+void xen_pmu_init(int cpu);
+void xen_pmu_finish(int cpu);
+bool is_xen_pmu(int cpu);
+bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err);
+bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err);
+int pmu_apic_update(uint32_t reg);
+unsigned long long xen_read_pmc(int counter);
+
+#endif /* __XEN_PMU_H */
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 55f388ef481a..f5ef6746d47a 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -27,17 +27,23 @@
 #include <xen/interface/memory.h>
 #include <xen/interface/physdev.h>
 #include <xen/features.h>
+#include <xen/hvc-console.h>
 #include "xen-ops.h"
 #include "vdso.h"
-#include "p2m.h"
 #include "mmu.h"
 
+#define GB(x) ((uint64_t)(x) * 1024 * 1024 * 1024)
+
 /* Amount of extra memory space we add to the e820 ranges */
 struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
 
 /* Number of pages released from the initial allocation. */
 unsigned long xen_released_pages;
 
+/* E820 map used during setting up memory. */
+static struct e820entry xen_e820_map[E820MAX] __initdata;
+static u32 xen_e820_map_entries __initdata;
+
 /*
  * Buffer used to remap identity mapped pages. We only need the virtual space.
  * The physical page behind this address is remapped as needed to different
@@ -64,62 +70,89 @@ static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY;
  */
 #define EXTRA_MEM_RATIO		(10)
 
-static void __init xen_add_extra_mem(phys_addr_t start, phys_addr_t size)
+static bool xen_512gb_limit __initdata = IS_ENABLED(CONFIG_XEN_512GB);
+
+/*
+ * Apply "xen_512gb_limit[=<bool>]" from the start_info command line.
+ */
+static void __init xen_parse_512gb(void)
+{
+	bool val = true;	/* a bare "xen_512gb_limit" enables the limit */
+	char *arg;
+
+	if (!strstr(xen_start_info->cmd_line, "xen_512gb_limit"))
+		return;
+
+	arg = strstr(xen_start_info->cmd_line, "xen_512gb_limit=");
+	if (arg && strtobool(arg + strlen("xen_512gb_limit="), &val))
+		return;	/* unparsable value: leave the setting untouched */
+
+	xen_512gb_limit = val;
+}
+
+static void __init xen_add_extra_mem(unsigned long start_pfn,
+				     unsigned long n_pfns)
 {
 	int i;
 
+	/*
+	 * No need to check for zero size, should happen rarely and will only
+	 * write a new entry regarded to be unused due to zero size.
+	 */
 	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
 		/* Add new region. */
-		if (xen_extra_mem[i].size == 0) {
-			xen_extra_mem[i].start = start;
-			xen_extra_mem[i].size  = size;
+		if (xen_extra_mem[i].n_pfns == 0) {
+			xen_extra_mem[i].start_pfn = start_pfn;
+			xen_extra_mem[i].n_pfns = n_pfns;
 			break;
 		}
 		/* Append to existing region. */
-		if (xen_extra_mem[i].start + xen_extra_mem[i].size == start) {
-			xen_extra_mem[i].size += size;
+		if (xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns ==
+		    start_pfn) {
+			xen_extra_mem[i].n_pfns += n_pfns;
 			break;
 		}
 	}
 	if (i == XEN_EXTRA_MEM_MAX_REGIONS)
 		printk(KERN_WARNING "Warning: not enough extra memory regions\n");
 
-	memblock_reserve(start, size);
+	memblock_reserve(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
 }
 
-static void __init xen_del_extra_mem(phys_addr_t start, phys_addr_t size)
+static void __init xen_del_extra_mem(unsigned long start_pfn,
+				     unsigned long n_pfns)
 {
 	int i;
-	phys_addr_t start_r, size_r;
+	unsigned long start_r, size_r;
 
 	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
-		start_r = xen_extra_mem[i].start;
-		size_r = xen_extra_mem[i].size;
+		start_r = xen_extra_mem[i].start_pfn;
+		size_r = xen_extra_mem[i].n_pfns;
 
 		/* Start of region. */
-		if (start_r == start) {
-			BUG_ON(size > size_r);
-			xen_extra_mem[i].start += size;
-			xen_extra_mem[i].size -= size;
+		if (start_r == start_pfn) {
+			BUG_ON(n_pfns > size_r);
+			xen_extra_mem[i].start_pfn += n_pfns;
+			xen_extra_mem[i].n_pfns -= n_pfns;
 			break;
 		}
 		/* End of region. */
-		if (start_r + size_r == start + size) {
-			BUG_ON(size > size_r);
-			xen_extra_mem[i].size -= size;
+		if (start_r + size_r == start_pfn + n_pfns) {
+			BUG_ON(n_pfns > size_r);
+			xen_extra_mem[i].n_pfns -= n_pfns;
 			break;
 		}
 		/* Mid of region. */
-		if (start > start_r && start < start_r + size_r) {
-			BUG_ON(start + size > start_r + size_r);
-			xen_extra_mem[i].size = start - start_r;
+		if (start_pfn > start_r && start_pfn < start_r + size_r) {
+			BUG_ON(start_pfn + n_pfns > start_r + size_r);
+			xen_extra_mem[i].n_pfns = start_pfn - start_r;
 			/* Calling memblock_reserve() again is okay. */
-			xen_add_extra_mem(start + size, start_r + size_r -
-					  (start + size));
+			xen_add_extra_mem(start_pfn + n_pfns, start_r + size_r -
+					  (start_pfn + n_pfns));
 			break;
 		}
 	}
-	memblock_free(start, size);
+	memblock_free(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
 }
 
 /*
@@ -130,11 +163,10 @@ static void __init xen_del_extra_mem(phys_addr_t start, phys_addr_t size)
 unsigned long __ref xen_chk_extra_mem(unsigned long pfn)
 {
 	int i;
-	phys_addr_t addr = PFN_PHYS(pfn);
 
 	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
-		if (addr >= xen_extra_mem[i].start &&
-		    addr < xen_extra_mem[i].start + xen_extra_mem[i].size)
+		if (pfn >= xen_extra_mem[i].start_pfn &&
+		    pfn < xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns)
 			return INVALID_P2M_ENTRY;
 	}
 
@@ -150,10 +182,10 @@ void __init xen_inv_extra_mem(void)
 	int i;
 
 	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
-		if (!xen_extra_mem[i].size)
+		if (!xen_extra_mem[i].n_pfns)
 			continue;
-		pfn_s = PFN_DOWN(xen_extra_mem[i].start);
-		pfn_e = PFN_UP(xen_extra_mem[i].start + xen_extra_mem[i].size);
+		pfn_s = xen_extra_mem[i].start_pfn;
+		pfn_e = pfn_s + xen_extra_mem[i].n_pfns;
 		for (pfn = pfn_s; pfn < pfn_e; pfn++)
 			set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
 	}
@@ -164,15 +196,13 @@ void __init xen_inv_extra_mem(void)
  * This function updates min_pfn with the pfn found and returns
  * the size of that range or zero if not found.
  */
-static unsigned long __init xen_find_pfn_range(
-	const struct e820entry *list, size_t map_size,
-	unsigned long *min_pfn)
+static unsigned long __init xen_find_pfn_range(unsigned long *min_pfn)
 {
-	const struct e820entry *entry;
+	const struct e820entry *entry = xen_e820_map;
 	unsigned int i;
 	unsigned long done = 0;
 
-	for (i = 0, entry = list; i < map_size; i++, entry++) {
+	for (i = 0; i < xen_e820_map_entries; i++, entry++) {
 		unsigned long s_pfn;
 		unsigned long e_pfn;
 
@@ -221,7 +251,7 @@ static int __init xen_free_mfn(unsigned long mfn)
  * as a fallback if the remapping fails.
  */
 static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn,
-	unsigned long end_pfn, unsigned long nr_pages, unsigned long *released)
+			unsigned long end_pfn, unsigned long nr_pages)
 {
 	unsigned long pfn, end;
 	int ret;
@@ -241,7 +271,7 @@ static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn,
 		WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret);
 
 		if (ret == 1) {
-			(*released)++;
+			xen_released_pages++;
 			if (!__set_phys_to_machine(pfn, INVALID_P2M_ENTRY))
 				break;
 		} else
@@ -356,9 +386,8 @@ static void __init xen_do_set_identity_and_remap_chunk(
  * to Xen and not remapped.
  */
 static unsigned long __init xen_set_identity_and_remap_chunk(
-        const struct e820entry *list, size_t map_size, unsigned long start_pfn,
-	unsigned long end_pfn, unsigned long nr_pages, unsigned long remap_pfn,
-	unsigned long *released, unsigned long *remapped)
+	unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages,
+	unsigned long remap_pfn)
 {
 	unsigned long pfn;
 	unsigned long i = 0;
@@ -379,12 +408,11 @@ static unsigned long __init xen_set_identity_and_remap_chunk(
 		if (cur_pfn + size > nr_pages)
 			size = nr_pages - cur_pfn;
 
-		remap_range_size = xen_find_pfn_range(list, map_size,
-						      &remap_pfn);
+		remap_range_size = xen_find_pfn_range(&remap_pfn);
 		if (!remap_range_size) {
 			pr_warning("Unable to find available pfn range, not remapping identity pages\n");
 			xen_set_identity_and_release_chunk(cur_pfn,
-				cur_pfn + left, nr_pages, released);
+						cur_pfn + left, nr_pages);
 			break;
 		}
 		/* Adjust size to fit in current e820 RAM region */
@@ -396,7 +424,6 @@ static unsigned long __init xen_set_identity_and_remap_chunk(
 		/* Update variables to reflect new mappings. */
 		i += size;
 		remap_pfn += size;
-		*remapped += size;
 	}
 
 	/*
@@ -411,15 +438,11 @@ static unsigned long __init xen_set_identity_and_remap_chunk(
 	return remap_pfn;
 }
 
-static void __init xen_set_identity_and_remap(
-	const struct e820entry *list, size_t map_size, unsigned long nr_pages,
-	unsigned long *released, unsigned long *remapped)
+static void __init xen_set_identity_and_remap(unsigned long nr_pages)
 {
 	phys_addr_t start = 0;
 	unsigned long last_pfn = nr_pages;
-	const struct e820entry *entry;
-	unsigned long num_released = 0;
-	unsigned long num_remapped = 0;
+	const struct e820entry *entry = xen_e820_map;
 	int i;
 
 	/*
@@ -433,9 +456,9 @@ static void __init xen_set_identity_and_remap(
 	 * example) the DMI tables in a reserved region that begins on
 	 * a non-page boundary.
 	 */
-	for (i = 0, entry = list; i < map_size; i++, entry++) {
+	for (i = 0; i < xen_e820_map_entries; i++, entry++) {
 		phys_addr_t end = entry->addr + entry->size;
-		if (entry->type == E820_RAM || i == map_size - 1) {
+		if (entry->type == E820_RAM || i == xen_e820_map_entries - 1) {
 			unsigned long start_pfn = PFN_DOWN(start);
 			unsigned long end_pfn = PFN_UP(end);
 
@@ -444,17 +467,13 @@ static void __init xen_set_identity_and_remap(
 
 			if (start_pfn < end_pfn)
 				last_pfn = xen_set_identity_and_remap_chunk(
-						list, map_size, start_pfn,
-						end_pfn, nr_pages, last_pfn,
-						&num_released, &num_remapped);
+						start_pfn, end_pfn, nr_pages,
+						last_pfn);
 			start = end;
 		}
 	}
 
-	*released = num_released;
-	*remapped = num_remapped;
-
-	pr_info("Released %ld page(s)\n", num_released);
+	pr_info("Released %ld page(s)\n", xen_released_pages);
 }
 
 /*
@@ -494,7 +513,7 @@ void __init xen_remap_memory(void)
 		} else if (pfn_s + len == xen_remap_buf.target_pfn) {
 			len += xen_remap_buf.size;
 		} else {
-			xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len));
+			xen_del_extra_mem(pfn_s, len);
 			pfn_s = xen_remap_buf.target_pfn;
 			len = xen_remap_buf.size;
 		}
@@ -504,19 +523,36 @@ void __init xen_remap_memory(void)
 	}
 
 	if (pfn_s != ~0UL && len)
-		xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len));
+		xen_del_extra_mem(pfn_s, len);
 
 	set_pte_mfn(buf, mfn_save, PAGE_KERNEL);
 
 	pr_info("Remapped %ld page(s)\n", remapped);
 }
 
+static unsigned long __init xen_get_pages_limit(void)
+{
+	unsigned long limit;	/* returned limit, counted in pages */
+
+#ifdef CONFIG_X86_32
+	limit = GB(64) / PAGE_SIZE;	/* X86_32 build: 64GB worth of pages */
+#else
+	limit = MAXMEM / PAGE_SIZE;	/* otherwise: MAXMEM worth of pages */
+	if (!xen_initial_domain() && xen_512gb_limit)
+		limit = GB(512) / PAGE_SIZE;	/* non-initial domain, limit set */
+#endif
+	return limit;
+}
+
 static unsigned long __init xen_get_max_pages(void)
 {
-	unsigned long max_pages = MAX_DOMAIN_PAGES;
+	unsigned long max_pages, limit;
 	domid_t domid = DOMID_SELF;
 	int ret;
 
+	limit = xen_get_pages_limit();
+	max_pages = limit;
+
 	/*
 	 * For the initial domain we use the maximum reservation as
 	 * the maximum page.
@@ -532,7 +568,7 @@ static unsigned long __init xen_get_max_pages(void)
 			max_pages = ret;
 	}
 
-	return min(max_pages, MAX_DOMAIN_PAGES);
+	return min(max_pages, limit);
 }
 
 static void __init xen_align_and_add_e820_region(phys_addr_t start,
@@ -549,39 +585,188 @@ static void __init xen_align_and_add_e820_region(phys_addr_t start,
 	e820_add_region(start, end - start, type);
 }
 
-static void __init xen_ignore_unusable(struct e820entry *list, size_t map_size)
+static void __init xen_ignore_unusable(void)
 {
-	struct e820entry *entry;
+	struct e820entry *entry = xen_e820_map;
 	unsigned int i;
 
-	for (i = 0, entry = list; i < map_size; i++, entry++) {
+	for (i = 0; i < xen_e820_map_entries; i++, entry++) {
 		if (entry->type == E820_UNUSABLE)
 			entry->type = E820_RAM;
 	}
 }
 
+static unsigned long __init xen_count_remap_pages(unsigned long max_pfn)
+{
+	unsigned long extra = 0;	/* pages counted so far (return value) */
+	unsigned long start_pfn, end_pfn;
+	const struct e820entry *entry = xen_e820_map;
+	int i;
+
+	end_pfn = 0;
+	for (i = 0; i < xen_e820_map_entries; i++, entry++) {
+		start_pfn = PFN_DOWN(entry->addr);
+		/* Regions may meet mid-page: keep end_pfn <= start_pfn. */
+		end_pfn = min(end_pfn, start_pfn);
+
+		if (start_pfn >= max_pfn)
+			return extra + max_pfn - end_pfn; /* gap up to max_pfn */
+
+		/* Count the hole between the previous region and this one. */
+		extra += start_pfn - end_pfn;
+
+		end_pfn = PFN_UP(entry->addr + entry->size);
+		end_pfn = min(end_pfn, max_pfn);	/* ignore pages past max_pfn */
+
+		if (entry->type != E820_RAM)
+			extra += end_pfn - start_pfn;	/* non-RAM region */
+	}
+
+	return extra;
+}
+
+bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size)
+{
+	struct e820entry *entry;
+	unsigned mapcnt;
+	phys_addr_t end;
+
+	if (!size)
+		return false;	/* an empty range is never reported reserved */
+
+	end = start + size;	/* first address past the range */
+	entry = xen_e820_map;
+
+	for (mapcnt = 0; mapcnt < xen_e820_map_entries; mapcnt++) {
+		if (entry->type == E820_RAM && entry->addr <= start &&
+		    (entry->addr + entry->size) >= end)
+			return false;	/* fully inside one RAM entry */
+
+		entry++;
+	}
+
+	return true;	/* no single RAM entry covers the whole range */
+}
+
+/*
+ * Find a free area in physical memory that is not yet reserved in memblock
+ * and that lies completely inside a RAM region of the E820 map.
+ * Used to relocate pre-allocated areas like the initrd or the p2m list when
+ * they conflict with the E820 map that is going to be used.
+ * Returns 0 if no area is found.  Otherwise returns the physical address of
+ * the area, which has already been reserved via memblock_reserve().
+ */
+phys_addr_t __init xen_find_free_area(phys_addr_t size)
+{
+	unsigned mapcnt;
+	phys_addr_t addr, start;
+	struct e820entry *entry = xen_e820_map;
+
+	for (mapcnt = 0; mapcnt < xen_e820_map_entries; mapcnt++, entry++) {
+		if (entry->type != E820_RAM || entry->size < size)
+			continue;	/* not RAM, or too small */
+		start = entry->addr;
+		for (addr = start; addr < start + size; addr += PAGE_SIZE) {
+			if (!memblock_is_reserved(addr))
+				continue;
+			start = addr + PAGE_SIZE; /* retry past reserved page */
+			if (start + size > entry->addr + entry->size)
+				break;	/* no room left in this entry */
+		}
+		if (addr >= start + size) {
+			memblock_reserve(start, size);
+			return start;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * memcpy() on physical addresses; dest and src need not be page-aligned.
+ */
+static void __init xen_phys_memcpy(phys_addr_t dest, phys_addr_t src,
+				   phys_addr_t n)
+{
+	phys_addr_t dest_off, src_off, dest_len, src_len, len;
+	void *from, *to;
+
+	while (n) {
+		dest_off = dest & ~PAGE_MASK;	/* offset within the page */
+		src_off = src & ~PAGE_MASK;
+		dest_len = n;	/* capped so offset + length fits the mapping */
+		if (dest_len > (NR_FIX_BTMAPS << PAGE_SHIFT) - dest_off)
+			dest_len = (NR_FIX_BTMAPS << PAGE_SHIFT) - dest_off;
+		src_len = n;
+		if (src_len > (NR_FIX_BTMAPS << PAGE_SHIFT) - src_off)
+			src_len = (NR_FIX_BTMAPS << PAGE_SHIFT) - src_off;
+		len = min(dest_len, src_len);	/* bytes copied this round */
+		to = early_memremap(dest - dest_off, dest_len + dest_off);
+		from = early_memremap(src - src_off, src_len + src_off);
+		memcpy(to + dest_off, from + src_off, len); /* maps start at page */
+		early_memunmap(to, dest_len + dest_off);
+		early_memunmap(from, src_len + src_off);
+		n -= len;
+		dest += len;
+		src += len;
+	}
+}
+
+/*
+ * Reserve Xen mfn_list; on E820 conflict relocate it (BUG on 32 bit).
+ */
+static void __init xen_reserve_xen_mfnlist(void)
+{
+	phys_addr_t start, size;
+
+	if (xen_start_info->mfn_list >= __START_KERNEL_map) {
+		start = __pa(xen_start_info->mfn_list);
+		size = PFN_ALIGN(xen_start_info->nr_pages *
+				 sizeof(unsigned long));
+	} else {
+		start = PFN_PHYS(xen_start_info->first_p2m_pfn);
+		size = PFN_PHYS(xen_start_info->nr_p2m_frames);
+	}
+
+	if (!xen_is_e820_reserved(start, size)) {	/* no conflict */
+		memblock_reserve(start, size);
+		return;
+	}
+
+#ifdef CONFIG_X86_32
+	/*
+	 * Relocating the p2m on 32 bit system to an arbitrary virtual address
+	 * is not supported, so just give up.
+	 */
+	xen_raw_console_write("Xen hypervisor allocated p2m list conflicts with E820 map\n");
+	BUG();
+#else
+	xen_relocate_p2m();
+#endif
+}
+
 /**
  * machine_specific_memory_setup - Hook for machine specific memory setup.
  **/
 char * __init xen_memory_setup(void)
 {
-	static struct e820entry map[E820MAX] __initdata;
-
-	unsigned long max_pfn = xen_start_info->nr_pages;
-	phys_addr_t mem_end;
+	unsigned long max_pfn, pfn_s, n_pfns;
+	phys_addr_t mem_end, addr, size, chunk_size;
+	u32 type;
 	int rc;
 	struct xen_memory_map memmap;
 	unsigned long max_pages;
 	unsigned long extra_pages = 0;
-	unsigned long remapped_pages;
 	int i;
 	int op;
 
-	max_pfn = min(MAX_DOMAIN_PAGES, max_pfn);
+	xen_parse_512gb();
+	max_pfn = xen_get_pages_limit();
+	max_pfn = min(max_pfn, xen_start_info->nr_pages);
 	mem_end = PFN_PHYS(max_pfn);
 
 	memmap.nr_entries = E820MAX;
-	set_xen_guest_handle(memmap.buffer, map);
+	set_xen_guest_handle(memmap.buffer, xen_e820_map);
 
 	op = xen_initial_domain() ?
 		XENMEM_machine_memory_map :
@@ -590,15 +775,16 @@ char * __init xen_memory_setup(void)
 	if (rc == -ENOSYS) {
 		BUG_ON(xen_initial_domain());
 		memmap.nr_entries = 1;
-		map[0].addr = 0ULL;
-		map[0].size = mem_end;
+		xen_e820_map[0].addr = 0ULL;
+		xen_e820_map[0].size = mem_end;
 		/* 8MB slack (to balance backend allocations). */
-		map[0].size += 8ULL << 20;
-		map[0].type = E820_RAM;
+		xen_e820_map[0].size += 8ULL << 20;
+		xen_e820_map[0].type = E820_RAM;
 		rc = 0;
 	}
 	BUG_ON(rc);
 	BUG_ON(memmap.nr_entries == 0);
+	xen_e820_map_entries = memmap.nr_entries;
 
 	/*
 	 * Xen won't allow a 1:1 mapping to be created to UNUSABLE
@@ -609,24 +795,19 @@ char * __init xen_memory_setup(void)
 	 * a patch in the future.
 	 */
 	if (xen_initial_domain())
-		xen_ignore_unusable(map, memmap.nr_entries);
+		xen_ignore_unusable();
 
 	/* Make sure the Xen-supplied memory map is well-ordered. */
-	sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries);
+	sanitize_e820_map(xen_e820_map, xen_e820_map_entries,
+			  &xen_e820_map_entries);
 
 	max_pages = xen_get_max_pages();
-	if (max_pages > max_pfn)
-		extra_pages += max_pages - max_pfn;
 
-	/*
-	 * Set identity map on non-RAM pages and prepare remapping the
-	 * underlying RAM.
-	 */
-	xen_set_identity_and_remap(map, memmap.nr_entries, max_pfn,
-				   &xen_released_pages, &remapped_pages);
+	/* How many extra pages do we need due to remapping? */
+	max_pages += xen_count_remap_pages(max_pfn);
 
-	extra_pages += xen_released_pages;
-	extra_pages += remapped_pages;
+	if (max_pages > max_pfn)
+		extra_pages += max_pages - max_pfn;
 
 	/*
 	 * Clamp the amount of extra memory to a EXTRA_MEM_RATIO
@@ -635,46 +816,54 @@ char * __init xen_memory_setup(void)
 	 * is limited to the max size of lowmem, so that it doesn't
 	 * get completely filled.
 	 *
+	 * Make sure we have no memory above max_pages, as this area
+	 * isn't handled by the p2m management.
+	 *
 	 * In principle there could be a problem in lowmem systems if
 	 * the initial memory is also very large with respect to
 	 * lowmem, but we won't try to deal with that here.
 	 */
-	extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
-			  extra_pages);
+	extra_pages = min3(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
+			   extra_pages, max_pages - max_pfn);
 	i = 0;
-	while (i < memmap.nr_entries) {
-		phys_addr_t addr = map[i].addr;
-		phys_addr_t size = map[i].size;
-		u32 type = map[i].type;
+	addr = xen_e820_map[0].addr;
+	size = xen_e820_map[0].size;
+	while (i < xen_e820_map_entries) {
+		chunk_size = size;
+		type = xen_e820_map[i].type;
 
 		if (type == E820_RAM) {
 			if (addr < mem_end) {
-				size = min(size, mem_end - addr);
+				chunk_size = min(size, mem_end - addr);
 			} else if (extra_pages) {
-				size = min(size, PFN_PHYS(extra_pages));
-				extra_pages -= PFN_DOWN(size);
-				xen_add_extra_mem(addr, size);
-				xen_max_p2m_pfn = PFN_DOWN(addr + size);
+				chunk_size = min(size, PFN_PHYS(extra_pages));
+				pfn_s = PFN_UP(addr);
+				n_pfns = PFN_DOWN(addr + chunk_size) - pfn_s;
+				extra_pages -= n_pfns;
+				xen_add_extra_mem(pfn_s, n_pfns);
+				xen_max_p2m_pfn = pfn_s + n_pfns;
 			} else
 				type = E820_UNUSABLE;
 		}
 
-		xen_align_and_add_e820_region(addr, size, type);
+		xen_align_and_add_e820_region(addr, chunk_size, type);
 
-		map[i].addr += size;
-		map[i].size -= size;
-		if (map[i].size == 0)
+		addr += chunk_size;
+		size -= chunk_size;
+		if (size == 0) {
 			i++;
+			if (i < xen_e820_map_entries) {
+				addr = xen_e820_map[i].addr;
+				size = xen_e820_map[i].size;
+			}
+		}
 	}
 
 	/*
 	 * Set the rest as identity mapped, in case PCI BARs are
 	 * located here.
-	 *
-	 * PFNs above MAX_P2M_PFN are considered identity mapped as
-	 * well.
 	 */
-	set_phys_range_identity(map[i-1].addr / PAGE_SIZE, ~0ul);
+	set_phys_range_identity(addr / PAGE_SIZE, ~0ul);
 
 	/*
 	 * In domU, the ISA region is normal, usable memory, but we
@@ -684,34 +873,53 @@ char * __init xen_memory_setup(void)
 	e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
 			E820_RESERVED);
 
+	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+
 	/*
-	 * Reserve Xen bits:
-	 *  - mfn_list
-	 *  - xen_start_info
-	 * See comment above "struct start_info" in <xen/interface/xen.h>
-	 * We tried to make the the memblock_reserve more selective so
-	 * that it would be clear what region is reserved. Sadly we ran
-	 * in the problem wherein on a 64-bit hypervisor with a 32-bit
-	 * initial domain, the pt_base has the cr3 value which is not
-	 * neccessarily where the pagetable starts! As Jan put it: "
-	 * Actually, the adjustment turns out to be correct: The page
-	 * tables for a 32-on-64 dom0 get allocated in the order "first L1",
-	 * "first L2", "first L3", so the offset to the page table base is
-	 * indeed 2. When reading xen/include/public/xen.h's comment
-	 * very strictly, this is not a violation (since there nothing is said
-	 * that the first thing in the page table space is pointed to by
-	 * pt_base; I admit that this seems to be implied though, namely
-	 * do I think that it is implied that the page table space is the
-	 * range [pt_base, pt_base + nt_pt_frames), whereas that
-	 * range here indeed is [pt_base - 2, pt_base - 2 + nt_pt_frames),
-	 * which - without a priori knowledge - the kernel would have
-	 * difficulty to figure out)." - so lets just fall back to the
-	 * easy way and reserve the whole region.
+	 * Check whether the kernel itself conflicts with the target E820 map.
+	 * Failing now is better than running into weird problems later due
+	 * to relocating (and even reusing) pages with kernel text or data.
 	 */
-	memblock_reserve(__pa(xen_start_info->mfn_list),
-			 xen_start_info->pt_base - xen_start_info->mfn_list);
+	if (xen_is_e820_reserved(__pa_symbol(_text),
+			__pa_symbol(__bss_stop) - __pa_symbol(_text))) {
+		xen_raw_console_write("Xen hypervisor allocated kernel memory conflicts with E820 map\n");
+		BUG();
+	}
 
-	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+	/*
+	 * Check for a conflict of the hypervisor supplied page tables with
+	 * the target E820 map.
+	 */
+	xen_pt_check_e820();
+
+	xen_reserve_xen_mfnlist();
+
+	/* Check for a conflict of the initrd with the target E820 map. */
+	if (xen_is_e820_reserved(boot_params.hdr.ramdisk_image,
+				 boot_params.hdr.ramdisk_size)) {
+		phys_addr_t new_area, start, size;
+
+		new_area = xen_find_free_area(boot_params.hdr.ramdisk_size);
+		if (!new_area) {
+			xen_raw_console_write("Can't find new memory area for initrd needed due to E820 map conflict\n");
+			BUG();
+		}
+
+		start = boot_params.hdr.ramdisk_image;
+		size = boot_params.hdr.ramdisk_size;
+		xen_phys_memcpy(new_area, start, size);
+		pr_info("initrd moved from [mem %#010llx-%#010llx] to [mem %#010llx-%#010llx]\n",
+			start, start + size, new_area, new_area + size);
+		memblock_free(start, size);
+		boot_params.hdr.ramdisk_image = new_area;
+		boot_params.ext_ramdisk_image = new_area >> 32;
+	}
+
+	/*
+	 * Set identity map on non-RAM pages and prepare remapping the
+	 * underlying RAM.
+	 */
+	xen_set_identity_and_remap(max_pfn);
 
 	return "Xen";
 }
@@ -721,26 +929,30 @@ char * __init xen_memory_setup(void)
  */
 char * __init xen_auto_xlated_memory_setup(void)
 {
-	static struct e820entry map[E820MAX] __initdata;
-
 	struct xen_memory_map memmap;
 	int i;
 	int rc;
 
 	memmap.nr_entries = E820MAX;
-	set_xen_guest_handle(memmap.buffer, map);
+	set_xen_guest_handle(memmap.buffer, xen_e820_map);
 
 	rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
 	if (rc < 0)
 		panic("No memory map (%d)\n", rc);
 
-	sanitize_e820_map(map, ARRAY_SIZE(map), &memmap.nr_entries);
+	xen_e820_map_entries = memmap.nr_entries;
+
+	sanitize_e820_map(xen_e820_map, ARRAY_SIZE(xen_e820_map),
+			  &xen_e820_map_entries);
 
-	for (i = 0; i < memmap.nr_entries; i++)
-		e820_add_region(map[i].addr, map[i].size, map[i].type);
+	for (i = 0; i < xen_e820_map_entries; i++)
+		e820_add_region(xen_e820_map[i].addr, xen_e820_map[i].size,
+				xen_e820_map[i].type);
 
-	memblock_reserve(__pa(xen_start_info->mfn_list),
-			 xen_start_info->pt_base - xen_start_info->mfn_list);
+	/* Remove p2m info, it is not needed. */
+	xen_start_info->mfn_list = 0;
+	xen_start_info->first_p2m_pfn = 0;
+	xen_start_info->nr_p2m_frames = 0;
 
 	return "Xen";
 }
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 86484384492e..3f4ebf0261f2 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -26,6 +26,7 @@
 
 #include <xen/interface/xen.h>
 #include <xen/interface/vcpu.h>
+#include <xen/interface/xenpmu.h>
 
 #include <asm/xen/interface.h>
 #include <asm/xen/hypercall.h>
@@ -38,6 +39,7 @@
 #include "xen-ops.h"
 #include "mmu.h"
 #include "smp.h"
+#include "pmu.h"
 
 cpumask_var_t xen_cpu_initialized_map;
 
@@ -50,6 +52,7 @@ static DEFINE_PER_CPU(struct xen_common_irq, xen_callfunc_irq) = { .irq = -1 };
 static DEFINE_PER_CPU(struct xen_common_irq, xen_callfuncsingle_irq) = { .irq = -1 };
 static DEFINE_PER_CPU(struct xen_common_irq, xen_irq_work) = { .irq = -1 };
 static DEFINE_PER_CPU(struct xen_common_irq, xen_debug_irq) = { .irq = -1 };
+static DEFINE_PER_CPU(struct xen_common_irq, xen_pmu_irq) = { .irq = -1 };
 
 static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
 static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
@@ -148,11 +151,18 @@ static void xen_smp_intr_free(unsigned int cpu)
 		kfree(per_cpu(xen_irq_work, cpu).name);
 		per_cpu(xen_irq_work, cpu).name = NULL;
 	}
+
+	if (per_cpu(xen_pmu_irq, cpu).irq >= 0) {
+		unbind_from_irqhandler(per_cpu(xen_pmu_irq, cpu).irq, NULL);
+		per_cpu(xen_pmu_irq, cpu).irq = -1;
+		kfree(per_cpu(xen_pmu_irq, cpu).name);
+		per_cpu(xen_pmu_irq, cpu).name = NULL;
+	}
 };
 static int xen_smp_intr_init(unsigned int cpu)
 {
 	int rc;
-	char *resched_name, *callfunc_name, *debug_name;
+	char *resched_name, *callfunc_name, *debug_name, *pmu_name;
 
 	resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu);
 	rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR,
@@ -218,6 +228,18 @@ static int xen_smp_intr_init(unsigned int cpu)
 	per_cpu(xen_irq_work, cpu).irq = rc;
 	per_cpu(xen_irq_work, cpu).name = callfunc_name;
 
+	if (is_xen_pmu(cpu)) {
+		pmu_name = kasprintf(GFP_KERNEL, "pmu%d", cpu);
+		rc = bind_virq_to_irqhandler(VIRQ_XENPMU, cpu,
+					     xen_pmu_irq_handler,
+					     IRQF_PERCPU|IRQF_NOBALANCING,
+					     pmu_name, NULL);
+		if (rc < 0)
+			goto fail;
+		per_cpu(xen_pmu_irq, cpu).irq = rc;
+		per_cpu(xen_pmu_irq, cpu).name = pmu_name;
+	}
+
 	return 0;
 
  fail:
@@ -335,6 +357,8 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
 	}
 	set_cpu_sibling_map(0);
 
+	xen_pmu_init(0);
+
 	if (xen_smp_intr_init(0))
 		BUG();
 
@@ -429,7 +453,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 	}
 #endif
 	ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
-	ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
+	ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_gfn(swapper_pg_dir));
 	if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt))
 		BUG();
 
@@ -462,6 +486,8 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle)
 	if (rc)
 		return rc;
 
+	xen_pmu_init(cpu);
+
 	rc = xen_smp_intr_init(cpu);
 	if (rc)
 		return rc;
@@ -503,6 +529,7 @@ static void xen_cpu_die(unsigned int cpu)
 		xen_smp_intr_free(cpu);
 		xen_uninit_lock_cpu(cpu);
 		xen_teardown_timer(cpu);
+		xen_pmu_finish(cpu);
 	}
 }
 
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 53b4c0811f4f..feddabdab448 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -11,6 +11,7 @@
 
 #include "xen-ops.h"
 #include "mmu.h"
+#include "pmu.h"
 
 static void xen_pv_pre_suspend(void)
 {
@@ -67,16 +68,26 @@ static void xen_pv_post_suspend(int suspend_cancelled)
 
 void xen_arch_pre_suspend(void)
 {
-    if (xen_pv_domain())
-        xen_pv_pre_suspend();
+	int cpu;
+
+	for_each_online_cpu(cpu)
+		xen_pmu_finish(cpu);
+
+	if (xen_pv_domain())
+		xen_pv_pre_suspend();
 }
 
 void xen_arch_post_suspend(int cancelled)
 {
-    if (xen_pv_domain())
-        xen_pv_post_suspend(cancelled);
-    else
-        xen_hvm_post_suspend(cancelled);
+	int cpu;
+
+	if (xen_pv_domain())
+		xen_pv_post_suspend(cancelled);
+	else
+		xen_hvm_post_suspend(cancelled);
+
+	for_each_online_cpu(cpu)
+		xen_pmu_init(cpu);
 }
 
 static void xen_vcpu_notify_restore(void *data)
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 55da33b1d51c..f1ba6a092854 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -274,30 +274,18 @@ static s64 get_abs_timeout(unsigned long delta)
 	return xen_clocksource_read() + delta;
 }
 
-static void xen_timerop_set_mode(enum clock_event_mode mode,
-				 struct clock_event_device *evt)
+static int xen_timerop_shutdown(struct clock_event_device *evt)
 {
-	switch (mode) {
-	case CLOCK_EVT_MODE_PERIODIC:
-		/* unsupported */
-		WARN_ON(1);
-		break;
-
-	case CLOCK_EVT_MODE_ONESHOT:
-	case CLOCK_EVT_MODE_RESUME:
-		break;
-
-	case CLOCK_EVT_MODE_UNUSED:
-	case CLOCK_EVT_MODE_SHUTDOWN:
-		HYPERVISOR_set_timer_op(0);  /* cancel timeout */
-		break;
-	}
+	/* cancel timeout */
+	HYPERVISOR_set_timer_op(0);
+
+	return 0;
 }
 
 static int xen_timerop_set_next_event(unsigned long delta,
 				      struct clock_event_device *evt)
 {
-	WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
+	WARN_ON(!clockevent_state_oneshot(evt));
 
 	if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
 		BUG();
@@ -310,46 +298,39 @@ static int xen_timerop_set_next_event(unsigned long delta,
 }
 
 static const struct clock_event_device xen_timerop_clockevent = {
-	.name = "xen",
-	.features = CLOCK_EVT_FEAT_ONESHOT,
+	.name			= "xen",
+	.features		= CLOCK_EVT_FEAT_ONESHOT,
 
-	.max_delta_ns = 0xffffffff,
-	.min_delta_ns = TIMER_SLOP,
+	.max_delta_ns		= 0xffffffff,
+	.min_delta_ns		= TIMER_SLOP,
 
-	.mult = 1,
-	.shift = 0,
-	.rating = 500,
+	.mult			= 1,
+	.shift			= 0,
+	.rating			= 500,
 
-	.set_mode = xen_timerop_set_mode,
-	.set_next_event = xen_timerop_set_next_event,
+	.set_state_shutdown	= xen_timerop_shutdown,
+	.set_next_event		= xen_timerop_set_next_event,
 };
 
+static int xen_vcpuop_shutdown(struct clock_event_device *evt)
+{
+	int cpu = smp_processor_id();
 
+	if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) ||
+	    HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
+		BUG();
+
+	return 0;
+}
 
-static void xen_vcpuop_set_mode(enum clock_event_mode mode,
-				struct clock_event_device *evt)
+static int xen_vcpuop_set_oneshot(struct clock_event_device *evt)
 {
 	int cpu = smp_processor_id();
 
-	switch (mode) {
-	case CLOCK_EVT_MODE_PERIODIC:
-		WARN_ON(1);	/* unsupported */
-		break;
-
-	case CLOCK_EVT_MODE_ONESHOT:
-		if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
-			BUG();
-		break;
+	if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
+		BUG();
 
-	case CLOCK_EVT_MODE_UNUSED:
-	case CLOCK_EVT_MODE_SHUTDOWN:
-		if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) ||
-		    HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
-			BUG();
-		break;
-	case CLOCK_EVT_MODE_RESUME:
-		break;
-	}
+	return 0;
 }
 
 static int xen_vcpuop_set_next_event(unsigned long delta,
@@ -359,7 +340,7 @@ static int xen_vcpuop_set_next_event(unsigned long delta,
 	struct vcpu_set_singleshot_timer single;
 	int ret;
 
-	WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
+	WARN_ON(!clockevent_state_oneshot(evt));
 
 	single.timeout_abs_ns = get_abs_timeout(delta);
 	single.flags = VCPU_SSHOTTMR_future;
@@ -382,7 +363,8 @@ static const struct clock_event_device xen_vcpuop_clockevent = {
 	.shift = 0,
 	.rating = 500,
 
-	.set_mode = xen_vcpuop_set_mode,
+	.set_state_shutdown = xen_vcpuop_shutdown,
+	.set_state_oneshot = xen_vcpuop_set_oneshot,
 	.set_next_event = xen_vcpuop_set_next_event,
 };
 
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 8afdfccf6086..b65f59a358a2 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -104,6 +104,8 @@ ENTRY(hypercall_page)
 	ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE,      _ASM_PTR __PAGE_OFFSET)
 #else
 	ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE,      _ASM_PTR __START_KERNEL_map)
+	/* Map the p2m table to a 512GB-aligned user address. */
+	ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M,       .quad PGDIR_SIZE)
 #endif
 	ELFNOTE(Xen, XEN_ELFNOTE_ENTRY,          _ASM_PTR startup_xen)
 	ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 2292721b1d10..1399423f3418 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -35,13 +35,20 @@ void xen_build_mfn_list_list(void);
 void xen_setup_machphys_mapping(void);
 void xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
 void xen_reserve_top(void);
+void __init xen_reserve_special_pages(void);
+void __init xen_pt_check_e820(void);
 
 void xen_mm_pin_all(void);
 void xen_mm_unpin_all(void);
+#ifdef CONFIG_X86_64
+void __init xen_relocate_p2m(void);
+#endif
 
+bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size);
 unsigned long __ref xen_chk_extra_mem(unsigned long pfn);
 void __init xen_inv_extra_mem(void);
 void __init xen_remap_memory(void);
+phys_addr_t __init xen_find_free_area(phys_addr_t size);
 char * __init xen_memory_setup(void);
 char * xen_auto_xlated_memory_setup(void);
 void __init xen_arch_setup(void);