79 files changed, 4854 insertions, 2834 deletions
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index 0dfee812811a..2f7672545fe9 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -18,6 +18,19 @@ config GENERIC_TIME
 	bool
 	default y
 
+config CLOCKSOURCE_WATCHDOG
+	bool
+	default y
+
+config GENERIC_CLOCKEVENTS
+	bool
+	default y
+
+config GENERIC_CLOCKEVENTS_BROADCAST
+	bool
+	default y
+	depends on X86_LOCAL_APIC
+
 config LOCKDEP_SUPPORT
 	bool
 	default y
@@ -38,6 +51,10 @@ config MMU
 	bool
 	default y
 
+config ZONE_DMA
+	bool
+	default y
+
 config SBUS
 	bool
 
@@ -70,6 +87,8 @@ source "init/Kconfig"
 
 menu "Processor type and features"
 
+source "kernel/time/Kconfig"
+
 config SMP
 	bool "Symmetric multi-processing support"
 	---help---
@@ -199,6 +218,15 @@ config PARAVIRT
 	  However, when run without a hypervisor the kernel is
 	  theoretically slower.  If in doubt, say N.
 
+config VMI
+	bool "VMI Paravirt-ops support"
+	depends on PARAVIRT && !NO_HZ
+	default y
+	help
+	  VMI provides a paravirtualized interface to multiple hypervisors
+	  include VMware ESX server and Xen by connecting to a ROM module
+	  provided by the hypervisor.
+
 config ACPI_SRAT
 	bool
 	default y
@@ -1259,3 +1287,12 @@ config X86_TRAMPOLINE
 config KTIME_SCALAR
 	bool
 	default y
+
+config NO_IDLE_HZ
+	bool
+	depends on PARAVIRT
+	default y
+	help
+	  Switches the regular HZ timer off when the system is going idle.
+	  This helps a hypervisor detect that the Linux system is idle,
+	  reducing the overhead of idle systems.
diff --git a/arch/i386/Kconfig.cpu b/arch/i386/Kconfig.cpu
index 2aecfba4ac4f..b99c0e2a4e63 100644
--- a/arch/i386/Kconfig.cpu
+++ b/arch/i386/Kconfig.cpu
@@ -226,11 +226,6 @@ config X86_CMPXCHG
 	depends on !M386
 	default y
 
-config X86_XADD
-	bool
-	depends on !M386
-	default y
-
 config X86_L1_CACHE_SHIFT
 	int
 	default "7" if MPENTIUM4 || X86_GENERIC
diff --git a/arch/i386/Kconfig.debug b/arch/i386/Kconfig.debug
index f68cc6f215f8..458bc1611933 100644
--- a/arch/i386/Kconfig.debug
+++ b/arch/i386/Kconfig.debug
@@ -87,7 +87,7 @@ config DOUBLEFAULT
 
 config DEBUG_PARAVIRT
 	bool "Enable some paravirtualization debugging"
-	default y
+	default n
 	depends on PARAVIRT && DEBUG_KERNEL
 	help
 	  Currently deliberately clobbers regs which are allowed to be
diff --git a/arch/i386/boot/compressed/relocs.c b/arch/i386/boot/compressed/relocs.c
index 881951ca03e1..ce4fda261aaf 100644
--- a/arch/i386/boot/compressed/relocs.c
+++ b/arch/i386/boot/compressed/relocs.c
@@ -11,6 +11,7 @@
 #include <endian.h>
 
 #define MAX_SHDRS 100
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
 static Elf32_Ehdr ehdr;
 static Elf32_Shdr shdr[MAX_SHDRS];
 static Elf32_Sym  *symtab[MAX_SHDRS];
@@ -71,7 +72,7 @@ static const char *sym_type(unsigned type)
 #undef SYM_TYPE
 	};
 	const char *name = "unknown sym type name";
-	if (type < sizeof(type_name)/sizeof(type_name[0])) {
+	if (type < ARRAY_SIZE(type_name)) {
 		name = type_name[type];
 	}
 	return name;
@@ -87,7 +88,7 @@ static const char *sym_bind(unsigned bind)
 #undef SYM_BIND
 	};
 	const char *name = "unknown sym bind name";
-	if (bind < sizeof(bind_name)/sizeof(bind_name[0])) {
+	if (bind < ARRAY_SIZE(bind_name)) {
 		name = bind_name[bind];
 	}
 	return name;
@@ -104,7 +105,7 @@ static const char *sym_visibility(unsigned visibility)
 #undef SYM_VISIBILITY
 	};
 	const char *name = "unknown sym visibility name";
-	if (visibility < sizeof(visibility_name)/sizeof(visibility_name[0])) {
+	if (visibility < ARRAY_SIZE(visibility_name)) {
 		name = visibility_name[visibility];
 	}
 	return name;
@@ -128,7 +129,7 @@ static const char *rel_type(unsigned type)
 #undef REL_TYPE
 	};
 	const char *name = "unknown type rel type name";
-	if (type < sizeof(type_name)/sizeof(type_name[0])) {
+	if (type < ARRAY_SIZE(type_name)) {
 		name = type_name[type];
 	}
 	return name;
diff --git a/arch/i386/defconfig b/arch/i386/defconfig
index 5d80edfc61b7..5ae1e0bc8fd7 100644
--- a/arch/i386/defconfig
+++ b/arch/i386/defconfig
@@ -1,7 +1,7 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.20-rc3
-# Fri Jan  5 11:54:46 2007
+# Linux kernel version: 2.6.20-git8
+# Tue Feb 13 11:25:18 2007
 #
 CONFIG_X86_32=y
 CONFIG_GENERIC_TIME=y
@@ -10,6 +10,7 @@ CONFIG_STACKTRACE_SUPPORT=y
 CONFIG_SEMAPHORE_SLEEPERS=y
 CONFIG_X86=y
 CONFIG_MMU=y
+CONFIG_ZONE_DMA=y
 CONFIG_GENERIC_ISA_DMA=y
 CONFIG_GENERIC_IOMAP=y
 CONFIG_GENERIC_BUG=y
@@ -139,7 +140,6 @@ CONFIG_MPENTIUMIII=y
 # CONFIG_MVIAC3_2 is not set
 CONFIG_X86_GENERIC=y
 CONFIG_X86_CMPXCHG=y
-CONFIG_X86_XADD=y
 CONFIG_X86_L1_CACHE_SHIFT=7
 CONFIG_RWSEM_XCHGADD_ALGORITHM=y
 # CONFIG_ARCH_HAS_ILOG2_U32 is not set
@@ -198,6 +198,7 @@ CONFIG_FLAT_NODE_MEM_MAP=y
 # CONFIG_SPARSEMEM_STATIC is not set
 CONFIG_SPLIT_PTLOCK_CPUS=4
 CONFIG_RESOURCES_64BIT=y
+CONFIG_ZONE_DMA_FLAG=1
 # CONFIG_HIGHPTE is not set
 # CONFIG_MATH_EMULATION is not set
 CONFIG_MTRR=y
@@ -211,6 +212,7 @@ CONFIG_HZ_250=y
 CONFIG_HZ=250
 # CONFIG_KEXEC is not set
 # CONFIG_CRASH_DUMP is not set
+CONFIG_PHYSICAL_START=0x100000
 # CONFIG_RELOCATABLE is not set
 CONFIG_PHYSICAL_ALIGN=0x100000
 # CONFIG_HOTPLUG_CPU is not set
@@ -229,13 +231,14 @@ CONFIG_PM_SYSFS_DEPRECATED=y
 # ACPI (Advanced Configuration and Power Interface) Support
 #
 CONFIG_ACPI=y
+CONFIG_ACPI_PROCFS=y
 CONFIG_ACPI_AC=y
 CONFIG_ACPI_BATTERY=y
 CONFIG_ACPI_BUTTON=y
-# CONFIG_ACPI_VIDEO is not set
 # CONFIG_ACPI_HOTKEY is not set
 CONFIG_ACPI_FAN=y
 # CONFIG_ACPI_DOCK is not set
+# CONFIG_ACPI_BAY is not set
 CONFIG_ACPI_PROCESSOR=y
 CONFIG_ACPI_THERMAL=y
 # CONFIG_ACPI_ASUS is not set
@@ -306,7 +309,6 @@ CONFIG_PCI_DIRECT=y
 CONFIG_PCI_MMCONFIG=y
 # CONFIG_PCIEPORTBUS is not set
 CONFIG_PCI_MSI=y
-# CONFIG_PCI_MULTITHREAD_PROBE is not set
 # CONFIG_PCI_DEBUG is not set
 # CONFIG_HT_IRQ is not set
 CONFIG_ISA_DMA_API=y
@@ -347,6 +349,7 @@ CONFIG_UNIX=y
 CONFIG_XFRM=y
 # CONFIG_XFRM_USER is not set
 # CONFIG_XFRM_SUB_POLICY is not set
+# CONFIG_XFRM_MIGRATE is not set
 # CONFIG_NET_KEY is not set
 CONFIG_INET=y
 CONFIG_IP_MULTICAST=y
@@ -446,6 +449,7 @@ CONFIG_STANDALONE=y
 CONFIG_PREVENT_FIRMWARE_BUILD=y
 CONFIG_FW_LOADER=y
 # CONFIG_DEBUG_DRIVER is not set
+# CONFIG_DEBUG_DEVRES is not set
 # CONFIG_SYS_HYPERVISOR is not set
 
 #
@@ -514,6 +518,7 @@ CONFIG_BLK_DEV_IDECD=y
 # CONFIG_BLK_DEV_IDETAPE is not set
 # CONFIG_BLK_DEV_IDEFLOPPY is not set
 # CONFIG_BLK_DEV_IDESCSI is not set
+CONFIG_BLK_DEV_IDEACPI=y
 # CONFIG_IDE_TASK_IOCTL is not set
 
 #
@@ -546,6 +551,7 @@ CONFIG_BLK_DEV_AMD74XX=y
 # CONFIG_BLK_DEV_JMICRON is not set
 # CONFIG_BLK_DEV_SC1200 is not set
 CONFIG_BLK_DEV_PIIX=y
+# CONFIG_BLK_DEV_IT8213 is not set
 # CONFIG_BLK_DEV_IT821X is not set
 # CONFIG_BLK_DEV_NS87415 is not set
 # CONFIG_BLK_DEV_PDC202XX_OLD is not set
@@ -556,6 +562,7 @@ CONFIG_BLK_DEV_PIIX=y
 # CONFIG_BLK_DEV_SLC90E66 is not set
 # CONFIG_BLK_DEV_TRM290 is not set
 # CONFIG_BLK_DEV_VIA82CXXX is not set
+# CONFIG_BLK_DEV_TC86C001 is not set
 # CONFIG_IDE_ARM is not set
 CONFIG_BLK_DEV_IDEDMA=y
 # CONFIG_IDEDMA_IVB is not set
@@ -654,6 +661,7 @@ CONFIG_AIC79XX_DEBUG_MASK=0
 # Serial ATA (prod) and Parallel ATA (experimental) drivers
 #
 CONFIG_ATA=y
+# CONFIG_ATA_NONSTANDARD is not set
 CONFIG_SATA_AHCI=y
 CONFIG_SATA_SVW=y
 CONFIG_ATA_PIIX=y
@@ -669,6 +677,7 @@ CONFIG_SATA_SIL=y
 # CONFIG_SATA_ULI is not set
 CONFIG_SATA_VIA=y
 # CONFIG_SATA_VITESSE is not set
+# CONFIG_SATA_INIC162X is not set
 CONFIG_SATA_INTEL_COMBINED=y
 # CONFIG_PATA_ALI is not set
 # CONFIG_PATA_AMD is not set
@@ -686,6 +695,7 @@ CONFIG_SATA_INTEL_COMBINED=y
 # CONFIG_PATA_HPT3X2N is not set
 # CONFIG_PATA_HPT3X3 is not set
 # CONFIG_PATA_IT821X is not set
+# CONFIG_PATA_IT8213 is not set
 # CONFIG_PATA_JMICRON is not set
 # CONFIG_PATA_TRIFLEX is not set
 # CONFIG_PATA_MARVELL is not set
@@ -738,9 +748,7 @@ CONFIG_IEEE1394=y
 # Subsystem Options
 #
 # CONFIG_IEEE1394_VERBOSEDEBUG is not set
-# CONFIG_IEEE1394_OUI_DB is not set
 # CONFIG_IEEE1394_EXTRA_CONFIG_ROMS is not set
-# CONFIG_IEEE1394_EXPORT_FULL_API is not set
 
 #
 # Device Drivers
@@ -766,6 +774,11 @@ CONFIG_IEEE1394_RAWIO=y
 # CONFIG_I2O is not set
 
 #
+# Macintosh device drivers
+#
+# CONFIG_MAC_EMUMOUSEBTN is not set
+
+#
 # Network device support
 #
 CONFIG_NETDEVICES=y
@@ -832,6 +845,7 @@ CONFIG_8139TOO=y
 # CONFIG_SUNDANCE is not set
 # CONFIG_TLAN is not set
 # CONFIG_VIA_RHINE is not set
+# CONFIG_SC92031 is not set
 
 #
 # Ethernet (1000 Mbit)
@@ -854,11 +868,13 @@ CONFIG_SKY2=y
 CONFIG_TIGON3=y
 CONFIG_BNX2=y
 # CONFIG_QLA3XXX is not set
+# CONFIG_ATL1 is not set
 
 #
 # Ethernet (10000 Mbit)
 #
 # CONFIG_CHELSIO_T1 is not set
+# CONFIG_CHELSIO_T3 is not set
 # CONFIG_IXGB is not set
 # CONFIG_S2IO is not set
 # CONFIG_MYRI10GE is not set
@@ -1089,6 +1105,7 @@ CONFIG_SOUND=y
 # Open Sound System
 #
 CONFIG_SOUND_PRIME=y
+CONFIG_OBSOLETE_OSS=y
 # CONFIG_SOUND_BT878 is not set
 # CONFIG_SOUND_ES1371 is not set
 CONFIG_SOUND_ICH=y
@@ -1102,6 +1119,7 @@ CONFIG_SOUND_ICH=y
 # HID Devices
 #
 CONFIG_HID=y
+# CONFIG_HID_DEBUG is not set
 
 #
 # USB support
@@ -1116,10 +1134,8 @@ CONFIG_USB=y
 # Miscellaneous USB options
 #
 CONFIG_USB_DEVICEFS=y
-# CONFIG_USB_BANDWIDTH is not set
 # CONFIG_USB_DYNAMIC_MINORS is not set
 # CONFIG_USB_SUSPEND is not set
-# CONFIG_USB_MULTITHREAD_PROBE is not set
 # CONFIG_USB_OTG is not set
 
 #
@@ -1129,9 +1145,11 @@ CONFIG_USB_EHCI_HCD=y
 # CONFIG_USB_EHCI_SPLIT_ISO is not set
 # CONFIG_USB_EHCI_ROOT_HUB_TT is not set
 # CONFIG_USB_EHCI_TT_NEWSCHED is not set
+# CONFIG_USB_EHCI_BIG_ENDIAN_MMIO is not set
 # CONFIG_USB_ISP116X_HCD is not set
 CONFIG_USB_OHCI_HCD=y
-# CONFIG_USB_OHCI_BIG_ENDIAN is not set
+# CONFIG_USB_OHCI_BIG_ENDIAN_DESC is not set
+# CONFIG_USB_OHCI_BIG_ENDIAN_MMIO is not set
 CONFIG_USB_OHCI_LITTLE_ENDIAN=y
 CONFIG_USB_UHCI_HCD=y
 # CONFIG_USB_SL811_HCD is not set
@@ -1182,6 +1200,7 @@ CONFIG_USB_HID=y
 # CONFIG_USB_ATI_REMOTE2 is not set
 # CONFIG_USB_KEYSPAN_REMOTE is not set
 # CONFIG_USB_APPLETOUCH is not set
+# CONFIG_USB_GTCO is not set
 
 #
 # USB Imaging devices
@@ -1287,6 +1306,10 @@ CONFIG_USB_MON=y
 #
 
 #
+# Auxiliary Display support
+#
+
+#
 # Virtualization
 #
 # CONFIG_KVM is not set
@@ -1479,6 +1502,7 @@ CONFIG_UNUSED_SYMBOLS=y
 # CONFIG_DEBUG_FS is not set
 # CONFIG_HEADERS_CHECK is not set
 CONFIG_DEBUG_KERNEL=y
+# CONFIG_DEBUG_SHIRQ is not set
 CONFIG_LOG_BUF_SHIFT=18
 CONFIG_DETECT_SOFTLOCKUP=y
 # CONFIG_SCHEDSTATS is not set
@@ -1487,7 +1511,6 @@ CONFIG_DETECT_SOFTLOCKUP=y
 # CONFIG_RT_MUTEX_TESTER is not set
 # CONFIG_DEBUG_SPINLOCK is not set
 # CONFIG_DEBUG_MUTEXES is not set
-# CONFIG_DEBUG_RWSEMS is not set
 # CONFIG_DEBUG_LOCK_ALLOC is not set
 # CONFIG_PROVE_LOCKING is not set
 # CONFIG_DEBUG_SPINLOCK_SLEEP is not set
@@ -1532,7 +1555,8 @@ CONFIG_CRC32=y
 # CONFIG_LIBCRC32C is not set
 CONFIG_ZLIB_INFLATE=y
 CONFIG_PLIST=y
-CONFIG_IOMAP_COPY=y
+CONFIG_HAS_IOMEM=y
+CONFIG_HAS_IOPORT=y
 CONFIG_GENERIC_HARDIRQS=y
 CONFIG_GENERIC_IRQ_PROBE=y
 CONFIG_GENERIC_PENDING_IRQ=y
diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile
index 1e8988e558c5..4ae3dcf1d2f0 100644
--- a/arch/i386/kernel/Makefile
+++ b/arch/i386/kernel/Makefile
@@ -18,7 +18,7 @@ obj-$(CONFIG_X86_MSR)		+= msr.o
 obj-$(CONFIG_X86_CPUID)		+= cpuid.o
 obj-$(CONFIG_MICROCODE)		+= microcode.o
 obj-$(CONFIG_APM)		+= apm.o
-obj-$(CONFIG_X86_SMP)		+= smp.o smpboot.o
+obj-$(CONFIG_X86_SMP)		+= smp.o smpboot.o tsc_sync.o
 obj-$(CONFIG_X86_TRAMPOLINE)	+= trampoline.o
 obj-$(CONFIG_X86_MPPARSE)	+= mpparse.o
 obj-$(CONFIG_X86_LOCAL_APIC)	+= apic.o nmi.o
@@ -32,7 +32,6 @@ obj-$(CONFIG_KPROBES)		+= kprobes.o
 obj-$(CONFIG_MODULES)		+= module.o
 obj-y				+= sysenter.o vsyscall.o
 obj-$(CONFIG_ACPI_SRAT) 	+= srat.o
-obj-$(CONFIG_HPET_TIMER) 	+= time_hpet.o
 obj-$(CONFIG_EFI) 		+= efi.o efi_stub.o
 obj-$(CONFIG_DOUBLEFAULT) 	+= doublefault.o
 obj-$(CONFIG_VM86)		+= vm86.o
@@ -40,8 +39,9 @@ obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
 obj-$(CONFIG_HPET_TIMER) 	+= hpet.o
 obj-$(CONFIG_K8_NB)		+= k8.o
 
-# Make sure this is linked after any other paravirt_ops structs: see head.S
+obj-$(CONFIG_VMI)		+= vmi.o vmitime.o
 obj-$(CONFIG_PARAVIRT)		+= paravirt.o
+obj-y				+= pcspeaker.o
 
 EXTRA_AFLAGS   := -traditional
 
diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c
index cbcb2c27f48b..e5eb97a910ed 100644
--- a/arch/i386/kernel/acpi/boot.c
+++ b/arch/i386/kernel/acpi/boot.c
@@ -25,6 +25,7 @@
 
 #include <linux/init.h>
 #include <linux/acpi.h>
+#include <linux/acpi_pmtmr.h>
 #include <linux/efi.h>
 #include <linux/cpumask.h>
 #include <linux/module.h>
@@ -66,7 +67,7 @@ static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id) { return
 
 #define BAD_MADT_ENTRY(entry, end) (					    \
 		(!entry) || (unsigned long)entry + sizeof(*entry) > end ||  \
-		((acpi_table_entry_header *)entry)->length < sizeof(*entry))
+		((struct acpi_subtable_header *)entry)->length < sizeof(*entry))
 
 #define PREFIX			"ACPI: "
 
@@ -79,7 +80,7 @@ int acpi_ioapic;
 int acpi_strict;
 EXPORT_SYMBOL(acpi_strict);
 
-acpi_interrupt_flags acpi_sci_flags __initdata;
+u8 acpi_sci_flags __initdata;
 int acpi_sci_override_gsi __initdata;
 int acpi_skip_timer_override __initdata;
 int acpi_use_timer_override __initdata;
@@ -92,11 +93,6 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
 #warning ACPI uses CMPXCHG, i486 and later hardware
 #endif
 
-#define MAX_MADT_ENTRIES	256
-u8 x86_acpiid_to_apicid[MAX_MADT_ENTRIES] =
-    {[0 ... MAX_MADT_ENTRIES - 1] = 0xff };
-EXPORT_SYMBOL(x86_acpiid_to_apicid);
-
 /* --------------------------------------------------------------------------
                               Boot-time Configuration
    -------------------------------------------------------------------------- */
@@ -166,30 +162,26 @@ char *__acpi_map_table(unsigned long phys, unsigned long size)
 
 #ifdef CONFIG_PCI_MMCONFIG
 /* The physical address of the MMCONFIG aperture.  Set from ACPI tables. */
-struct acpi_table_mcfg_config *pci_mmcfg_config;
+struct acpi_mcfg_allocation *pci_mmcfg_config;
 int pci_mmcfg_config_num;
 
-int __init acpi_parse_mcfg(unsigned long phys_addr, unsigned long size)
+int __init acpi_parse_mcfg(struct acpi_table_header *header)
 {
 	struct acpi_table_mcfg *mcfg;
 	unsigned long i;
 	int config_size;
 
-	if (!phys_addr || !size)
+	if (!header)
 		return -EINVAL;
 
-	mcfg = (struct acpi_table_mcfg *)__acpi_map_table(phys_addr, size);
-	if (!mcfg) {
-		printk(KERN_WARNING PREFIX "Unable to map MCFG\n");
-		return -ENODEV;
-	}
+	mcfg = (struct acpi_table_mcfg *)header;
 
 	/* how many config structures do we have */
 	pci_mmcfg_config_num = 0;
-	i = size - sizeof(struct acpi_table_mcfg);
-	while (i >= sizeof(struct acpi_table_mcfg_config)) {
+	i = header->length - sizeof(struct acpi_table_mcfg);
+	while (i >= sizeof(struct acpi_mcfg_allocation)) {
 		++pci_mmcfg_config_num;
-		i -= sizeof(struct acpi_table_mcfg_config);
+		i -= sizeof(struct acpi_mcfg_allocation);
 	};
 	if (pci_mmcfg_config_num == 0) {
 		printk(KERN_ERR PREFIX "MMCONFIG has no entries\n");
@@ -204,9 +196,9 @@ int __init acpi_parse_mcfg(unsigned long phys_addr, unsigned long size)
 		return -ENOMEM;
 	}
 
-	memcpy(pci_mmcfg_config, &mcfg->config, config_size);
+	memcpy(pci_mmcfg_config, &mcfg[1], config_size);
 	for (i = 0; i < pci_mmcfg_config_num; ++i) {
-		if (mcfg->config[i].base_reserved) {
+		if (pci_mmcfg_config[i].address > 0xFFFFFFFF) {
 			printk(KERN_ERR PREFIX
 			       "MMCONFIG not in low 4GB of memory\n");
 			kfree(pci_mmcfg_config);
@@ -220,24 +212,24 @@ int __init acpi_parse_mcfg(unsigned long phys_addr, unsigned long size)
 #endif				/* CONFIG_PCI_MMCONFIG */
 
 #ifdef CONFIG_X86_LOCAL_APIC
-static int __init acpi_parse_madt(unsigned long phys_addr, unsigned long size)
+static int __init acpi_parse_madt(struct acpi_table_header *table)
 {
 	struct acpi_table_madt *madt = NULL;
 
-	if (!phys_addr || !size || !cpu_has_apic)
+	if (!cpu_has_apic)
 		return -EINVAL;
 
-	madt = (struct acpi_table_madt *)__acpi_map_table(phys_addr, size);
+	madt = (struct acpi_table_madt *)table;
 	if (!madt) {
 		printk(KERN_WARNING PREFIX "Unable to map MADT\n");
 		return -ENODEV;
 	}
 
-	if (madt->lapic_address) {
-		acpi_lapic_addr = (u64) madt->lapic_address;
+	if (madt->address) {
+		acpi_lapic_addr = (u64) madt->address;
 
 		printk(KERN_DEBUG PREFIX "Local APIC address 0x%08x\n",
-		       madt->lapic_address);
+		       madt->address);
 	}
 
 	acpi_madt_oem_check(madt->header.oem_id, madt->header.oem_table_id);
@@ -246,21 +238,17 @@ static int __init acpi_parse_madt(unsigned long phys_addr, unsigned long size)
 }
 
 static int __init
-acpi_parse_lapic(acpi_table_entry_header * header, const unsigned long end)
+acpi_parse_lapic(struct acpi_subtable_header * header, const unsigned long end)
 {
-	struct acpi_table_lapic *processor = NULL;
+	struct acpi_madt_local_apic *processor = NULL;
 
-	processor = (struct acpi_table_lapic *)header;
+	processor = (struct acpi_madt_local_apic *)header;
 
 	if (BAD_MADT_ENTRY(processor, end))
 		return -EINVAL;
 
 	acpi_table_print_madt_entry(header);
 
-	/* Record local apic id only when enabled */
-	if (processor->flags.enabled)
-		x86_acpiid_to_apicid[processor->acpi_id] = processor->id;
-
 	/*
 	 * We need to register disabled CPU as well to permit
 	 * counting disabled CPUs. This allows us to size
@@ -269,18 +257,18 @@ acpi_parse_lapic(acpi_table_entry_header * header, const unsigned long end)
 	 * when we use CPU hotplug.
 	 */
 	mp_register_lapic(processor->id,	/* APIC ID */
-			  processor->flags.enabled);	/* Enabled? */
+			  processor->lapic_flags & ACPI_MADT_ENABLED);	/* Enabled? */
 
 	return 0;
 }
 
 static int __init
-acpi_parse_lapic_addr_ovr(acpi_table_entry_header * header,
+acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header,
 			  const unsigned long end)
 {
-	struct acpi_table_lapic_addr_ovr *lapic_addr_ovr = NULL;
+	struct acpi_madt_local_apic_override *lapic_addr_ovr = NULL;
 
-	lapic_addr_ovr = (struct acpi_table_lapic_addr_ovr *)header;
+	lapic_addr_ovr = (struct acpi_madt_local_apic_override *)header;
 
 	if (BAD_MADT_ENTRY(lapic_addr_ovr, end))
 		return -EINVAL;
@@ -291,11 +279,11 @@ acpi_parse_lapic_addr_ovr(acpi_table_entry_header * header,
 }
 
 static int __init
-acpi_parse_lapic_nmi(acpi_table_entry_header * header, const unsigned long end)
+acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long end)
 {
-	struct acpi_table_lapic_nmi *lapic_nmi = NULL;
+	struct acpi_madt_local_apic_nmi *lapic_nmi = NULL;
 
-	lapic_nmi = (struct acpi_table_lapic_nmi *)header;
+	lapic_nmi = (struct acpi_madt_local_apic_nmi *)header;
 
 	if (BAD_MADT_ENTRY(lapic_nmi, end))
 		return -EINVAL;
@@ -313,11 +301,11 @@ acpi_parse_lapic_nmi(acpi_table_entry_header * header, const unsigned long end)
 #ifdef CONFIG_X86_IO_APIC
 
 static int __init
-acpi_parse_ioapic(acpi_table_entry_header * header, const unsigned long end)
+acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end)
 {
-	struct acpi_table_ioapic *ioapic = NULL;
+	struct acpi_madt_io_apic *ioapic = NULL;
 
-	ioapic = (struct acpi_table_ioapic *)header;
+	ioapic = (struct acpi_madt_io_apic *)header;
 
 	if (BAD_MADT_ENTRY(ioapic, end))
 		return -EINVAL;
@@ -342,11 +330,11 @@ static void __init acpi_sci_ioapic_setup(u32 gsi, u16 polarity, u16 trigger)
 		polarity = 3;
 
 	/* Command-line over-ride via acpi_sci= */
-	if (acpi_sci_flags.trigger)
-		trigger = acpi_sci_flags.trigger;
+	if (acpi_sci_flags & ACPI_MADT_TRIGGER_MASK)
+		trigger = (acpi_sci_flags & ACPI_MADT_TRIGGER_MASK) >> 2;
 
-	if (acpi_sci_flags.polarity)
-		polarity = acpi_sci_flags.polarity;
+	if (acpi_sci_flags & ACPI_MADT_POLARITY_MASK)
+		polarity = acpi_sci_flags & ACPI_MADT_POLARITY_MASK;
 
 	/*
 	 * mp_config_acpi_legacy_irqs() already setup IRQs < 16
@@ -357,51 +345,52 @@ static void __init acpi_sci_ioapic_setup(u32 gsi, u16 polarity, u16 trigger)
 
 	/*
 	 * stash over-ride to indicate we've been here
-	 * and for later update of acpi_fadt
+	 * and for later update of acpi_gbl_FADT
 	 */
 	acpi_sci_override_gsi = gsi;
 	return;
 }
 
 static int __init
-acpi_parse_int_src_ovr(acpi_table_entry_header * header,
+acpi_parse_int_src_ovr(struct acpi_subtable_header * header,
 		       const unsigned long end)
 {
-	struct acpi_table_int_src_ovr *intsrc = NULL;
+	struct acpi_madt_interrupt_override *intsrc = NULL;
 
-	intsrc = (struct acpi_table_int_src_ovr *)header;
+	intsrc = (struct acpi_madt_interrupt_override *)header;
 
 	if (BAD_MADT_ENTRY(intsrc, end))
 		return -EINVAL;
 
 	acpi_table_print_madt_entry(header);
 
-	if (intsrc->bus_irq == acpi_fadt.sci_int) {
+	if (intsrc->source_irq == acpi_gbl_FADT.sci_interrupt) {
 		acpi_sci_ioapic_setup(intsrc->global_irq,
-				      intsrc->flags.polarity,
-				      intsrc->flags.trigger);
+				      intsrc->inti_flags & ACPI_MADT_POLARITY_MASK,
+				      (intsrc->inti_flags & ACPI_MADT_TRIGGER_MASK) >> 2);
 		return 0;
 	}
 
 	if (acpi_skip_timer_override &&
-	    intsrc->bus_irq == 0 && intsrc->global_irq == 2) {
+	    intsrc->source_irq == 0 && intsrc->global_irq == 2) {
 		printk(PREFIX "BIOS IRQ0 pin2 override ignored.\n");
 		return 0;
 	}
 
-	mp_override_legacy_irq(intsrc->bus_irq,
-			       intsrc->flags.polarity,
-			       intsrc->flags.trigger, intsrc->global_irq);
+	mp_override_legacy_irq(intsrc->source_irq,
+				intsrc->inti_flags & ACPI_MADT_POLARITY_MASK,
+				(intsrc->inti_flags & ACPI_MADT_TRIGGER_MASK) >> 2,
+				intsrc->global_irq);
 
 	return 0;
 }
 
 static int __init
-acpi_parse_nmi_src(acpi_table_entry_header * header, const unsigned long end)
+acpi_parse_nmi_src(struct acpi_subtable_header * header, const unsigned long end)
 {
-	struct acpi_table_nmi_src *nmi_src = NULL;
+	struct acpi_madt_nmi_source *nmi_src = NULL;
 
-	nmi_src = (struct acpi_table_nmi_src *)header;
+	nmi_src = (struct acpi_madt_nmi_source *)header;
 
 	if (BAD_MADT_ENTRY(nmi_src, end))
 		return -EINVAL;
@@ -417,7 +406,7 @@ acpi_parse_nmi_src(acpi_table_entry_header * header, const unsigned long end)
 
 /*
  * acpi_pic_sci_set_trigger()
- * 
+ *
  * use ELCR to set PIC-mode trigger type for SCI
  *
  * If a PIC-mode SCI is not recognized or gives spurious IRQ7's
@@ -511,7 +500,7 @@ int acpi_map_lsapic(acpi_handle handle, int *pcpu)
 {
 	struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
 	union acpi_object *obj;
-	struct acpi_table_lapic *lapic;
+	struct acpi_madt_local_apic *lapic;
 	cpumask_t tmp_map, new_map;
 	u8 physid;
 	int cpu;
@@ -529,10 +518,10 @@ int acpi_map_lsapic(acpi_handle handle, int *pcpu)
 		return -EINVAL;
 	}
 
-	lapic = (struct acpi_table_lapic *)obj->buffer.pointer;
+	lapic = (struct acpi_madt_local_apic *)obj->buffer.pointer;
 
-	if ((lapic->header.type != ACPI_MADT_LAPIC) ||
-	    (!lapic->flags.enabled)) {
+	if (lapic->header.type != ACPI_MADT_TYPE_LOCAL_APIC ||
+	    !(lapic->lapic_flags & ACPI_MADT_ENABLED)) {
 		kfree(buffer.pointer);
 		return -EINVAL;
 	}
@@ -544,7 +533,7 @@ int acpi_map_lsapic(acpi_handle handle, int *pcpu)
 	buffer.pointer = NULL;
 
 	tmp_map = cpu_present_map;
-	mp_register_lapic(physid, lapic->flags.enabled);
+	mp_register_lapic(physid, lapic->lapic_flags & ACPI_MADT_ENABLED);
 
 	/*
 	 * If mp_register_lapic successfully generates a new logical cpu
@@ -566,14 +555,6 @@ EXPORT_SYMBOL(acpi_map_lsapic);
 
 int acpi_unmap_lsapic(int cpu)
 {
-	int i;
-
-	for_each_possible_cpu(i) {
-		if (x86_acpiid_to_apicid[i] == x86_cpu_to_apicid[cpu]) {
-			x86_acpiid_to_apicid[i] = -1;
-			break;
-		}
-	}
 	x86_cpu_to_apicid[cpu] = -1;
 	cpu_clear(cpu, cpu_present_map);
 	num_processors--;
@@ -619,42 +600,37 @@ acpi_scan_rsdp(unsigned long start, unsigned long length)
 	return 0;
 }
 
-static int __init acpi_parse_sbf(unsigned long phys_addr, unsigned long size)
+static int __init acpi_parse_sbf(struct acpi_table_header *table)
 {
-	struct acpi_table_sbf *sb;
-
-	if (!phys_addr || !size)
-		return -EINVAL;
+	struct acpi_table_boot *sb;
 
-	sb = (struct acpi_table_sbf *)__acpi_map_table(phys_addr, size);
+	sb = (struct acpi_table_boot *)table;
 	if (!sb) {
 		printk(KERN_WARNING PREFIX "Unable to map SBF\n");
 		return -ENODEV;
 	}
 
-	sbf_port = sb->sbf_cmos;	/* Save CMOS port */
+	sbf_port = sb->cmos_index;	/* Save CMOS port */
 
 	return 0;
 }
 
 #ifdef CONFIG_HPET_TIMER
+#include <asm/hpet.h>
 
-static int __init acpi_parse_hpet(unsigned long phys, unsigned long size)
+static int __init acpi_parse_hpet(struct acpi_table_header *table)
 {
 	struct acpi_table_hpet *hpet_tbl;
 	struct resource *hpet_res;
 	resource_size_t res_start;
 
-	if (!phys || !size)
-		return -EINVAL;
-
-	hpet_tbl = (struct acpi_table_hpet *)__acpi_map_table(phys, size);
+	hpet_tbl = (struct acpi_table_hpet *)table;
 	if (!hpet_tbl) {
 		printk(KERN_WARNING PREFIX "Unable to map HPET\n");
 		return -ENODEV;
 	}
 
-	if (hpet_tbl->addr.space_id != ACPI_SPACE_MEM) {
+	if (hpet_tbl->address.space_id != ACPI_SPACE_MEM) {
 		printk(KERN_WARNING PREFIX "HPET timers must be located in "
 		       "memory.\n");
 		return -1;
@@ -667,29 +643,15 @@ static int __init acpi_parse_hpet(unsigned long phys, unsigned long size)
 		hpet_res->name = (void *)&hpet_res[1];
 		hpet_res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
 		snprintf((char *)hpet_res->name, HPET_RESOURCE_NAME_SIZE,
-			 "HPET %u", hpet_tbl->number);
+			 "HPET %u", hpet_tbl->sequence);
 		hpet_res->end = (1 * 1024) - 1;
 	}
 
-#ifdef	CONFIG_X86_64
-	vxtime.hpet_address = hpet_tbl->addr.addrl |
-	    ((long)hpet_tbl->addr.addrh << 32);
-
+	hpet_address = hpet_tbl->address.address;
 	printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n",
-	       hpet_tbl->id, vxtime.hpet_address);
+	       hpet_tbl->id, hpet_address);
 
-	res_start = vxtime.hpet_address;
-#else				/* X86 */
-	{
-		extern unsigned long hpet_address;
-
-		hpet_address = hpet_tbl->addr.addrl;
-		printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n",
-		       hpet_tbl->id, hpet_address);
-
-		res_start = hpet_address;
-	}
-#endif				/* X86 */
+	res_start = hpet_address;
 
 	if (hpet_res) {
 		hpet_res->start = res_start;
@@ -703,46 +665,28 @@ static int __init acpi_parse_hpet(unsigned long phys, unsigned long size)
 #define	acpi_parse_hpet	NULL
 #endif
 
-#ifdef CONFIG_X86_PM_TIMER
-extern u32 pmtmr_ioport;
-#endif
-
-static int __init acpi_parse_fadt(unsigned long phys, unsigned long size)
+static int __init acpi_parse_fadt(struct acpi_table_header *table)
 {
-	struct fadt_descriptor *fadt = NULL;
-
-	fadt = (struct fadt_descriptor *)__acpi_map_table(phys, size);
-	if (!fadt) {
-		printk(KERN_WARNING PREFIX "Unable to map FADT\n");
-		return 0;
-	}
-	/* initialize sci_int early for INT_SRC_OVR MADT parsing */
-	acpi_fadt.sci_int = fadt->sci_int;
-
-	/* initialize rev and apic_phys_dest_mode for x86_64 genapic */
-	acpi_fadt.revision = fadt->revision;
-	acpi_fadt.force_apic_physical_destination_mode =
-	    fadt->force_apic_physical_destination_mode;
 
 #ifdef CONFIG_X86_PM_TIMER
 	/* detect the location of the ACPI PM Timer */
-	if (fadt->revision >= FADT2_REVISION_ID) {
+	if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID) {
 		/* FADT rev. 2 */
-		if (fadt->xpm_tmr_blk.address_space_id !=
+		if (acpi_gbl_FADT.xpm_timer_block.space_id !=
 		    ACPI_ADR_SPACE_SYSTEM_IO)
 			return 0;
 
-		pmtmr_ioport = fadt->xpm_tmr_blk.address;
+		pmtmr_ioport = acpi_gbl_FADT.xpm_timer_block.address;
 		/*
 		 * "X" fields are optional extensions to the original V1.0
 		 * fields, so we must selectively expand V1.0 fields if the
 		 * corresponding X field is zero.
 	 	 */
 		if (!pmtmr_ioport)
-			pmtmr_ioport = fadt->V1_pm_tmr_blk;
+			pmtmr_ioport = acpi_gbl_FADT.pm_timer_block;
 	} else {
 		/* FADT rev. 1 */
-		pmtmr_ioport = fadt->V1_pm_tmr_blk;
+		pmtmr_ioport = acpi_gbl_FADT.pm_timer_block;
 	}
 	if (pmtmr_ioport)
 		printk(KERN_INFO PREFIX "PM-Timer IO Port: %#x\n",
@@ -784,13 +728,13 @@ static int __init acpi_parse_madt_lapic_entries(void)
 	if (!cpu_has_apic)
 		return -ENODEV;
 
-	/* 
+	/*
 	 * Note that the LAPIC address is obtained from the MADT (32-bit value)
 	 * and (optionally) overriden by a LAPIC_ADDR_OVR entry (64-bit value).
 	 */
 
 	count =
-	    acpi_table_parse_madt(ACPI_MADT_LAPIC_ADDR_OVR,
+	    acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE,
 				  acpi_parse_lapic_addr_ovr, 0);
 	if (count < 0) {
 		printk(KERN_ERR PREFIX
@@ -800,7 +744,7 @@ static int __init acpi_parse_madt_lapic_entries(void)
 
 	mp_register_lapic_address(acpi_lapic_addr);
 
-	count = acpi_table_parse_madt(ACPI_MADT_LAPIC, acpi_parse_lapic,
+	count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC, acpi_parse_lapic,
 				      MAX_APICS);
 	if (!count) {
 		printk(KERN_ERR PREFIX "No LAPIC entries present\n");
@@ -813,7 +757,7 @@ static int __init acpi_parse_madt_lapic_entries(void)
 	}
 
 	count =
-	    acpi_table_parse_madt(ACPI_MADT_LAPIC_NMI, acpi_parse_lapic_nmi, 0);
+	    acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_NMI, acpi_parse_lapic_nmi, 0);
 	if (count < 0) {
 		printk(KERN_ERR PREFIX "Error parsing LAPIC NMI entry\n");
 		/* TBD: Cleanup to allow fallback to MPS */
@@ -842,7 +786,7 @@ static int __init acpi_parse_madt_ioapic_entries(void)
 		return -ENODEV;
 	}
 
-	if (!cpu_has_apic) 
+	if (!cpu_has_apic)
 		return -ENODEV;
 
 	/*
@@ -855,7 +799,7 @@ static int __init acpi_parse_madt_ioapic_entries(void)
 	}
 
 	count =
-	    acpi_table_parse_madt(ACPI_MADT_IOAPIC, acpi_parse_ioapic,
+	    acpi_table_parse_madt(ACPI_MADT_TYPE_IO_APIC, acpi_parse_ioapic,
 				  MAX_IO_APICS);
 	if (!count) {
 		printk(KERN_ERR PREFIX "No IOAPIC entries present\n");
@@ -866,7 +810,7 @@ static int __init acpi_parse_madt_ioapic_entries(void)
 	}
 
 	count =
-	    acpi_table_parse_madt(ACPI_MADT_INT_SRC_OVR, acpi_parse_int_src_ovr,
+	    acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, acpi_parse_int_src_ovr,
 				  NR_IRQ_VECTORS);
 	if (count < 0) {
 		printk(KERN_ERR PREFIX
@@ -880,13 +824,13 @@ static int __init acpi_parse_madt_ioapic_entries(void)
 	 * pretend we got one so we can set the SCI flags.
 	 */
 	if (!acpi_sci_override_gsi)
-		acpi_sci_ioapic_setup(acpi_fadt.sci_int, 0, 0);
+		acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0);
 
 	/* Fill in identity legacy mapings where no override */
 	mp_config_acpi_legacy_irqs();
 
 	count =
-	    acpi_table_parse_madt(ACPI_MADT_NMI_SRC, acpi_parse_nmi_src,
+	    acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE, acpi_parse_nmi_src,
 				  NR_IRQ_VECTORS);
 	if (count < 0) {
 		printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n");
@@ -906,10 +850,9 @@ static inline int acpi_parse_madt_ioapic_entries(void)
 static void __init acpi_process_madt(void)
 {
 #ifdef CONFIG_X86_LOCAL_APIC
-	int count, error;
+	int error;
 
-	count = acpi_table_parse(ACPI_APIC, acpi_parse_madt);
-	if (count >= 1) {
+	if (!acpi_table_parse(ACPI_SIG_MADT, acpi_parse_madt)) {
 
 		/*
 		 * Parse MADT LAPIC entries
@@ -1195,7 +1138,7 @@ int __init acpi_boot_table_init(void)
 	if (acpi_disabled && !acpi_ht)
 		return 1;
 
-	/* 
+	/*
 	 * Initialize the ACPI boot-time table parser.
 	 */
 	error = acpi_table_init();
@@ -1204,7 +1147,7 @@ int __init acpi_boot_table_init(void)
 		return error;
 	}
 
-	acpi_table_parse(ACPI_BOOT, acpi_parse_sbf);
+	acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf);
 
 	/*
 	 * blacklist may disable ACPI entirely
@@ -1232,19 +1175,19 @@ int __init acpi_boot_init(void)
 	if (acpi_disabled && !acpi_ht)
 		return 1;
 
-	acpi_table_parse(ACPI_BOOT, acpi_parse_sbf);
+	acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf);
 
 	/*
 	 * set sci_int and PM timer address
 	 */
-	acpi_table_parse(ACPI_FADT, acpi_parse_fadt);
+	acpi_table_parse(ACPI_SIG_FADT, acpi_parse_fadt);
 
 	/*
 	 * Process the Multiple APIC Description Table (MADT), if present
 	 */
 	acpi_process_madt();
 
-	acpi_table_parse(ACPI_HPET, acpi_parse_hpet);
+	acpi_table_parse(ACPI_SIG_HPET, acpi_parse_hpet);
 
 	return 0;
 }
@@ -1315,13 +1258,17 @@ static int __init setup_acpi_sci(char *s)
 	if (!s)
 		return -EINVAL;
 	if (!strcmp(s, "edge"))
-		acpi_sci_flags.trigger = 1;
+		acpi_sci_flags =  ACPI_MADT_TRIGGER_EDGE |
+			(acpi_sci_flags & ~ACPI_MADT_TRIGGER_MASK);
 	else if (!strcmp(s, "level"))
-		acpi_sci_flags.trigger = 3;
+		acpi_sci_flags = ACPI_MADT_TRIGGER_LEVEL |
+			(acpi_sci_flags & ~ACPI_MADT_TRIGGER_MASK);
 	else if (!strcmp(s, "high"))
-		acpi_sci_flags.polarity = 1;
+		acpi_sci_flags = ACPI_MADT_POLARITY_ACTIVE_HIGH |
+			(acpi_sci_flags & ~ACPI_MADT_POLARITY_MASK);
 	else if (!strcmp(s, "low"))
-		acpi_sci_flags.polarity = 3;
+		acpi_sci_flags = ACPI_MADT_POLARITY_ACTIVE_LOW |
+			(acpi_sci_flags & ~ACPI_MADT_POLARITY_MASK);
 	else
 		return -EINVAL;
 	return 0;
diff --git a/arch/i386/kernel/acpi/earlyquirk.c b/arch/i386/kernel/acpi/earlyquirk.c
index 4b60af7f91dd..bf86f7662d8b 100644
--- a/arch/i386/kernel/acpi/earlyquirk.c
+++ b/arch/i386/kernel/acpi/earlyquirk.c
@@ -16,7 +16,7 @@
 
 static int nvidia_hpet_detected __initdata;
 
-static int __init nvidia_hpet_check(unsigned long phys, unsigned long size)
+static int __init nvidia_hpet_check(struct acpi_table_header *header)
 {
 	nvidia_hpet_detected = 1;
 	return 0;
@@ -30,7 +30,7 @@ static int __init check_bridge(int vendor, int device)
 	   is enabled. */
 	if (!acpi_use_timer_override && vendor == PCI_VENDOR_ID_NVIDIA) {
 		nvidia_hpet_detected = 0;
-		acpi_table_parse(ACPI_HPET, nvidia_hpet_check);
+		acpi_table_parse(ACPI_SIG_HPET, nvidia_hpet_check);
 		if (nvidia_hpet_detected == 0) {
 			acpi_skip_timer_override = 1;
 			  printk(KERN_INFO "Nvidia board "
diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c
index 776d9be26af9..9655c233e6f1 100644
--- a/arch/i386/kernel/apic.c
+++ b/arch/i386/kernel/apic.c
@@ -25,6 +25,8 @@
 #include <linux/kernel_stat.h>
 #include <linux/sysdev.h>
 #include <linux/cpu.h>
+#include <linux/clockchips.h>
+#include <linux/acpi_pmtmr.h>
 #include <linux/module.h>
 
 #include <asm/atomic.h>
@@ -36,6 +38,7 @@
 #include <asm/hpet.h>
 #include <asm/i8253.h>
 #include <asm/nmi.h>
+#include <asm/idle.h>
 
 #include <mach_apic.h>
 #include <mach_apicdef.h>
@@ -44,128 +47,549 @@
 #include "io_ports.h"
 
 /*
- * cpu_mask that denotes the CPUs that needs timer interrupt coming in as
- * IPIs in place of local APIC timers
+ * Sanity check
  */
-static cpumask_t timer_bcast_ipi;
+#if (SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F
+# error SPURIOUS_APIC_VECTOR definition error
+#endif
 
 /*
  * Knob to control our willingness to enable the local APIC.
+ *
+ * -1=force-disable, +1=force-enable
  */
-static int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */
-
-static inline void lapic_disable(void)
-{
-	enable_local_apic = -1;
-	clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
-}
+static int enable_local_apic __initdata = 0;
 
-static inline void lapic_enable(void)
-{
-	enable_local_apic = 1;
-}
+/* Local APIC timer verification ok */
+static int local_apic_timer_verify_ok;
 
 /*
- * Debug level
+ * Debug level, exported for io_apic.c
  */
 int apic_verbosity;
 
+static unsigned int calibration_result;
 
+static int lapic_next_event(unsigned long delta,
+			    struct clock_event_device *evt);
+static void lapic_timer_setup(enum clock_event_mode mode,
+			      struct clock_event_device *evt);
+static void lapic_timer_broadcast(cpumask_t mask);
 static void apic_pm_activate(void);
 
+/*
+ * The local apic timer can be used for any function which is CPU local.
+ */
+static struct clock_event_device lapic_clockevent = {
+	.name		= "lapic",
+	.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
+			| CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY,
+	.shift		= 32,
+	.set_mode	= lapic_timer_setup,
+	.set_next_event	= lapic_next_event,
+	.broadcast	= lapic_timer_broadcast,
+	.rating		= 100,
+	.irq		= -1,
+};
+static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
+
+/* Local APIC was disabled by the BIOS and enabled by the kernel */
+static int enabled_via_apicbase;
+
+/*
+ * Get the LAPIC version
+ */
+static inline int lapic_get_version(void)
+{
+	return GET_APIC_VERSION(apic_read(APIC_LVR));
+}
+
+/*
+ * Check, if the APIC is integrated or a seperate chip
+ */
+static inline int lapic_is_integrated(void)
+{
+	return APIC_INTEGRATED(lapic_get_version());
+}
+
+/*
+ * Check, whether this is a modern or a first generation APIC
+ */
 static int modern_apic(void)
 {
-	unsigned int lvr, version;
 	/* AMD systems use old APIC versions, so check the CPU */
 	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
-		boot_cpu_data.x86 >= 0xf)
+	    boot_cpu_data.x86 >= 0xf)
 		return 1;
-	lvr = apic_read(APIC_LVR);
-	version = GET_APIC_VERSION(lvr);
-	return version >= 0x14;
+	return lapic_get_version() >= 0x14;
 }
 
+/**
+ * enable_NMI_through_LVT0 - enable NMI through local vector table 0
+ */
+void enable_NMI_through_LVT0 (void * dummy)
+{
+	unsigned int v = APIC_DM_NMI;
+
+	/* Level triggered for 82489DX */
+	if (!lapic_is_integrated())
+		v |= APIC_LVT_LEVEL_TRIGGER;
+	apic_write_around(APIC_LVT0, v);
+}
+
+/**
+ * get_physical_broadcast - Get number of physical broadcast IDs
+ */
+int get_physical_broadcast(void)
+{
+	return modern_apic() ? 0xff : 0xf;
+}
+
+/**
+ * lapic_get_maxlvt - get the maximum number of local vector table entries
+ */
+int lapic_get_maxlvt(void)
+{
+	unsigned int v = apic_read(APIC_LVR);
+
+	/* 82489DXs do not report # of LVT entries. */
+	return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2;
+}
+
+/*
+ * Local APIC timer
+ */
+
+/* Clock divisor is set to 16 */
+#define APIC_DIVISOR 16
+
 /*
- * 'what should we do if we get a hw irq event on an illegal vector'.
- * each architecture has to answer this themselves.
+ * This function sets up the local APIC timer, with a timeout of
+ * 'clocks' APIC bus clock. During calibration we actually call
+ * this function twice on the boot CPU, once with a bogus timeout
+ * value, second time for real. The other (noncalibrating) CPUs
+ * call this function only once, with the real, calibrated value.
+ *
+ * We do reads before writes even if unnecessary, to get around the
+ * P5 APIC double write bug.
  */
-void ack_bad_irq(unsigned int irq)
+static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
 {
-	printk("unexpected IRQ trap at vector %02x\n", irq);
+	unsigned int lvtt_value, tmp_value;
+
+	lvtt_value = LOCAL_TIMER_VECTOR;
+	if (!oneshot)
+		lvtt_value |= APIC_LVT_TIMER_PERIODIC;
+	if (!lapic_is_integrated())
+		lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV);
+
+	if (!irqen)
+		lvtt_value |= APIC_LVT_MASKED;
+
+	apic_write_around(APIC_LVTT, lvtt_value);
+
 	/*
-	 * Currently unexpected vectors happen only on SMP and APIC.
-	 * We _must_ ack these because every local APIC has only N
-	 * irq slots per priority level, and a 'hanging, unacked' IRQ
-	 * holds up an irq slot - in excessive cases (when multiple
-	 * unexpected vectors occur) that might lock up the APIC
-	 * completely.
-	 * But only ack when the APIC is enabled -AK
+	 * Divide PICLK by 16
 	 */
-	if (cpu_has_apic)
-		ack_APIC_irq();
+	tmp_value = apic_read(APIC_TDCR);
+	apic_write_around(APIC_TDCR, (tmp_value
+				& ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
+				| APIC_TDR_DIV_16);
+
+	if (!oneshot)
+		apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR);
 }
 
-void __init apic_intr_init(void)
+/*
+ * Program the next event, relative to now
+ */
+static int lapic_next_event(unsigned long delta,
+			    struct clock_event_device *evt)
+{
+	apic_write_around(APIC_TMICT, delta);
+	return 0;
+}
+
+/*
+ * Setup the lapic timer in periodic or oneshot mode
+ */
+static void lapic_timer_setup(enum clock_event_mode mode,
+			      struct clock_event_device *evt)
+{
+	unsigned long flags;
+	unsigned int v;
+
+	/* Lapic used for broadcast ? */
+	if (!local_apic_timer_verify_ok)
+		return;
+
+	local_irq_save(flags);
+
+	switch (mode) {
+	case CLOCK_EVT_MODE_PERIODIC:
+	case CLOCK_EVT_MODE_ONESHOT:
+		__setup_APIC_LVTT(calibration_result,
+				  mode != CLOCK_EVT_MODE_PERIODIC, 1);
+		break;
+	case CLOCK_EVT_MODE_UNUSED:
+	case CLOCK_EVT_MODE_SHUTDOWN:
+		v = apic_read(APIC_LVTT);
+		v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
+		apic_write_around(APIC_LVTT, v);
+		break;
+	}
+
+	local_irq_restore(flags);
+}
+
+/*
+ * Local APIC timer broadcast function
+ */
+static void lapic_timer_broadcast(cpumask_t mask)
 {
 #ifdef CONFIG_SMP
-	smp_intr_init();
+	send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
 #endif
-	/* self generated IPI for local APIC timer */
-	set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
+}
 
-	/* IPI vectors for APIC spurious and error interrupts */
-	set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
-	set_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
+/*
+ * Setup the local APIC timer for this CPU. Copy the initilized values
+ * of the boot CPU and register the clock event in the framework.
+ */
+static void __devinit setup_APIC_timer(void)
+{
+	struct clock_event_device *levt = &__get_cpu_var(lapic_events);
 
-	/* thermal monitor LVT interrupt */
-#ifdef CONFIG_X86_MCE_P4THERMAL
-	set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
-#endif
+	memcpy(levt, &lapic_clockevent, sizeof(*levt));
+	levt->cpumask = cpumask_of_cpu(smp_processor_id());
+
+	clockevents_register_device(levt);
 }
 
-/* Using APIC to generate smp_local_timer_interrupt? */
-int using_apic_timer __read_mostly = 0;
+/*
+ * In this functions we calibrate APIC bus clocks to the external timer.
+ *
+ * We want to do the calibration only once since we want to have local timer
+ * irqs syncron. CPUs connected by the same APIC bus have the very same bus
+ * frequency.
+ *
+ * This was previously done by reading the PIT/HPET and waiting for a wrap
+ * around to find out, that a tick has elapsed. I have a box, where the PIT
+ * readout is broken, so it never gets out of the wait loop again. This was
+ * also reported by others.
+ *
+ * Monitoring the jiffies value is inaccurate and the clockevents
+ * infrastructure allows us to do a simple substitution of the interrupt
+ * handler.
+ *
+ * The calibration routine also uses the pm_timer when possible, as the PIT
+ * happens to run way too slow (factor 2.3 on my VAIO CoreDuo, which goes
+ * back to normal later in the boot process).
+ */
+
+#define LAPIC_CAL_LOOPS		(HZ/10)
 
-static int enabled_via_apicbase;
+static __initdata volatile int lapic_cal_loops = -1;
+static __initdata long lapic_cal_t1, lapic_cal_t2;
+static __initdata unsigned long long lapic_cal_tsc1, lapic_cal_tsc2;
+static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2;
+static __initdata unsigned long lapic_cal_j1, lapic_cal_j2;
 
-void enable_NMI_through_LVT0 (void * dummy)
+/*
+ * Temporary interrupt handler.
+ */
+static void __init lapic_cal_handler(struct clock_event_device *dev)
 {
-	unsigned int v, ver;
+	unsigned long long tsc = 0;
+	long tapic = apic_read(APIC_TMCCT);
+	unsigned long pm = acpi_pm_read_early();
 
-	ver = apic_read(APIC_LVR);
-	ver = GET_APIC_VERSION(ver);
-	v = APIC_DM_NMI;			/* unmask and set to NMI */
-	if (!APIC_INTEGRATED(ver))		/* 82489DX */
-		v |= APIC_LVT_LEVEL_TRIGGER;
-	apic_write_around(APIC_LVT0, v);
+	if (cpu_has_tsc)
+		rdtscll(tsc);
+
+	switch (lapic_cal_loops++) {
+	case 0:
+		lapic_cal_t1 = tapic;
+		lapic_cal_tsc1 = tsc;
+		lapic_cal_pm1 = pm;
+		lapic_cal_j1 = jiffies;
+		break;
+
+	case LAPIC_CAL_LOOPS:
+		lapic_cal_t2 = tapic;
+		lapic_cal_tsc2 = tsc;
+		if (pm < lapic_cal_pm1)
+			pm += ACPI_PM_OVRRUN;
+		lapic_cal_pm2 = pm;
+		lapic_cal_j2 = jiffies;
+		break;
+	}
 }
 
-int get_physical_broadcast(void)
+/*
+ * Setup the boot APIC
+ *
+ * Calibrate and verify the result.
+ */
+void __init setup_boot_APIC_clock(void)
 {
-	if (modern_apic())
-		return 0xff;
-	else
-		return 0xf;
+	struct clock_event_device *levt = &__get_cpu_var(lapic_events);
+	const long pm_100ms = PMTMR_TICKS_PER_SEC/10;
+	const long pm_thresh = pm_100ms/100;
+	void (*real_handler)(struct clock_event_device *dev);
+	unsigned long deltaj;
+	long delta, deltapm;
+
+	apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
+		    "calibrating APIC timer ...\n");
+
+	local_irq_disable();
+
+	/* Replace the global interrupt handler */
+	real_handler = global_clock_event->event_handler;
+	global_clock_event->event_handler = lapic_cal_handler;
+
+	/*
+	 * Setup the APIC counter to 1e9. There is no way the lapic
+	 * can underflow in the 100ms detection time frame
+	 */
+	__setup_APIC_LVTT(1000000000, 0, 0);
+
+	/* Let the interrupts run */
+	local_irq_enable();
+
+	while(lapic_cal_loops <= LAPIC_CAL_LOOPS);
+
+	local_irq_disable();
+
+	/* Restore the real event handler */
+	global_clock_event->event_handler = real_handler;
+
+	/* Build delta t1-t2 as apic timer counts down */
+	delta = lapic_cal_t1 - lapic_cal_t2;
+	apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta);
+
+	/* Check, if the PM timer is available */
+	deltapm = lapic_cal_pm2 - lapic_cal_pm1;
+	apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm);
+
+	if (deltapm) {
+		unsigned long mult;
+		u64 res;
+
+		mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22);
+
+		if (deltapm > (pm_100ms - pm_thresh) &&
+		    deltapm < (pm_100ms + pm_thresh)) {
+			apic_printk(APIC_VERBOSE, "... PM timer result ok\n");
+		} else {
+			res = (((u64) deltapm) *  mult) >> 22;
+			do_div(res, 1000000);
+			printk(KERN_WARNING "APIC calibration not consistent "
+			       "with PM Timer: %ldms instead of 100ms\n",
+			       (long)res);
+			/* Correct the lapic counter value */
+			res = (((u64) delta ) * pm_100ms);
+			do_div(res, deltapm);
+			printk(KERN_INFO "APIC delta adjusted to PM-Timer: "
+			       "%lu (%ld)\n", (unsigned long) res, delta);
+			delta = (long) res;
+		}
+	}
+
+	/* Calculate the scaled math multiplication factor */
+	lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, 32);
+	lapic_clockevent.max_delta_ns =
+		clockevent_delta2ns(0x7FFFFF, &lapic_clockevent);
+	lapic_clockevent.min_delta_ns =
+		clockevent_delta2ns(0xF, &lapic_clockevent);
+
+	calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS;
+
+	apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta);
+	apic_printk(APIC_VERBOSE, "..... mult: %ld\n", lapic_clockevent.mult);
+	apic_printk(APIC_VERBOSE, "..... calibration result: %u\n",
+		    calibration_result);
+
+	if (cpu_has_tsc) {
+		delta = (long)(lapic_cal_tsc2 - lapic_cal_tsc1);
+		apic_printk(APIC_VERBOSE, "..... CPU clock speed is "
+			    "%ld.%04ld MHz.\n",
+			    (delta / LAPIC_CAL_LOOPS) / (1000000 / HZ),
+			    (delta / LAPIC_CAL_LOOPS) % (1000000 / HZ));
+	}
+
+	apic_printk(APIC_VERBOSE, "..... host bus clock speed is "
+		    "%u.%04u MHz.\n",
+		    calibration_result / (1000000 / HZ),
+		    calibration_result % (1000000 / HZ));
+
+
+	apic_printk(APIC_VERBOSE, "... verify APIC timer\n");
+
+	/*
+	 * Setup the apic timer manually
+	 */
+	local_apic_timer_verify_ok = 1;
+	levt->event_handler = lapic_cal_handler;
+	lapic_timer_setup(CLOCK_EVT_MODE_PERIODIC, levt);
+	lapic_cal_loops = -1;
+
+	/* Let the interrupts run */
+	local_irq_enable();
+
+	while(lapic_cal_loops <= LAPIC_CAL_LOOPS);
+
+	local_irq_disable();
+
+	/* Stop the lapic timer */
+	lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, levt);
+
+	local_irq_enable();
+
+	/* Jiffies delta */
+	deltaj = lapic_cal_j2 - lapic_cal_j1;
+	apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj);
+
+	/* Check, if the PM timer is available */
+	deltapm = lapic_cal_pm2 - lapic_cal_pm1;
+	apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm);
+
+	local_apic_timer_verify_ok = 0;
+
+	if (deltapm) {
+		if (deltapm > (pm_100ms - pm_thresh) &&
+		    deltapm < (pm_100ms + pm_thresh)) {
+			apic_printk(APIC_VERBOSE, "... PM timer result ok\n");
+			/* Check, if the jiffies result is consistent */
+			if (deltaj < LAPIC_CAL_LOOPS-2 ||
+			    deltaj > LAPIC_CAL_LOOPS+2) {
+				/*
+				 * Not sure, what we can do about this one.
+				 * When high resultion timers are active
+				 * and the lapic timer does not stop in C3
+				 * we are fine. Otherwise more trouble might
+				 * be waiting. -- tglx
+				 */
+				printk(KERN_WARNING "Global event device %s "
+				       "has wrong frequency "
+				       "(%lu ticks instead of %d)\n",
+				       global_clock_event->name, deltaj,
+				       LAPIC_CAL_LOOPS);
+			}
+			local_apic_timer_verify_ok = 1;
+		}
+	} else {
+		/* Check, if the jiffies result is consistent */
+		if (deltaj >= LAPIC_CAL_LOOPS-2 &&
+		    deltaj <= LAPIC_CAL_LOOPS+2) {
+			apic_printk(APIC_VERBOSE, "... jiffies result ok\n");
+			local_apic_timer_verify_ok = 1;
+		}
+	}
+
+	if (!local_apic_timer_verify_ok) {
+		printk(KERN_WARNING
+		       "APIC timer disabled due to verification failure.\n");
+		/* No broadcast on UP ! */
+		if (num_possible_cpus() == 1)
+			return;
+	} else
+		lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
+
+	/* Setup the lapic or request the broadcast */
+	setup_APIC_timer();
+}
+
+void __devinit setup_secondary_APIC_clock(void)
+{
+	setup_APIC_timer();
 }
 
-int get_maxlvt(void)
+/*
+ * The guts of the apic timer interrupt
+ */
+static void local_apic_timer_interrupt(void)
 {
-	unsigned int v, ver, maxlvt;
+	int cpu = smp_processor_id();
+	struct clock_event_device *evt = &per_cpu(lapic_events, cpu);
 
-	v = apic_read(APIC_LVR);
-	ver = GET_APIC_VERSION(v);
-	/* 82489DXs do not report # of LVT entries. */
-	maxlvt = APIC_INTEGRATED(ver) ? GET_APIC_MAXLVT(v) : 2;
-	return maxlvt;
+	/*
+	 * Normally we should not be here till LAPIC has been initialized but
+	 * in some cases like kdump, its possible that there is a pending LAPIC
+	 * timer interrupt from previous kernel's context and is delivered in
+	 * new kernel the moment interrupts are enabled.
+	 *
+	 * Interrupts are enabled early and LAPIC is setup much later, hence
+	 * its possible that when we get here evt->event_handler is NULL.
+	 * Check for event_handler being NULL and discard the interrupt as
+	 * spurious.
+	 */
+	if (!evt->event_handler) {
+		printk(KERN_WARNING
+		       "Spurious LAPIC timer interrupt on cpu %d\n", cpu);
+		/* Switch it off */
+		lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt);
+		return;
+	}
+
+	per_cpu(irq_stat, cpu).apic_timer_irqs++;
+
+	evt->event_handler(evt);
 }
 
+/*
+ * Local APIC timer interrupt. This is the most natural way for doing
+ * local interrupts, but local timer interrupts can be emulated by
+ * broadcast interrupts too. [in case the hw doesn't support APIC timers]
+ *
+ * [ if a single-CPU system runs an SMP kernel then we call the local
+ *   interrupt as well. Thus we cannot inline the local irq ... ]
+ */
+
+void fastcall smp_apic_timer_interrupt(struct pt_regs *regs)
+{
+	struct pt_regs *old_regs = set_irq_regs(regs);
+
+	/*
+	 * NOTE! We'd better ACK the irq immediately,
+	 * because timer handling can be slow.
+	 */
+	ack_APIC_irq();
+	/*
+	 * update_process_times() expects us to have done irq_enter().
+	 * Besides, if we don't timer interrupts ignore the global
+	 * interrupt lock, which is the WrongThing (tm) to do.
+	 */
+	exit_idle();
+	irq_enter();
+	local_apic_timer_interrupt();
+	irq_exit();
+
+	set_irq_regs(old_regs);
+}
+
+int setup_profiling_timer(unsigned int multiplier)
+{
+	return -EINVAL;
+}
+
+/*
+ * Local APIC start and shutdown
+ */
+
+/**
+ * clear_local_APIC - shutdown the local APIC
+ *
+ * This is called, when a CPU is disabled and before rebooting, so the state of
+ * the local APIC has no dangling leftovers. Also used to cleanout any BIOS
+ * leftovers during boot.
+ */
 void clear_local_APIC(void)
 {
-	int maxlvt;
+	int maxlvt = lapic_get_maxlvt();
 	unsigned long v;
 
-	maxlvt = get_maxlvt();
-
 	/*
 	 * Masking an LVT entry can trigger a local APIC error
 	 * if the vector is zero. Mask LVTERR first to prevent this.
@@ -189,7 +613,7 @@ void clear_local_APIC(void)
 		apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED);
 	}
 
-/* lets not touch this if we didn't frob it */
+	/* lets not touch this if we didn't frob it */
 #ifdef CONFIG_X86_MCE_P4THERMAL
 	if (maxlvt >= 5) {
 		v = apic_read(APIC_LVTTHMR);
@@ -211,85 +635,18 @@ void clear_local_APIC(void)
 	if (maxlvt >= 5)
 		apic_write_around(APIC_LVTTHMR, APIC_LVT_MASKED);
 #endif
-	v = GET_APIC_VERSION(apic_read(APIC_LVR));
-	if (APIC_INTEGRATED(v)) {	/* !82489DX */
-		if (maxlvt > 3)		/* Due to Pentium errata 3AP and 11AP. */
+	/* Integrated APIC (!82489DX) ? */
+	if (lapic_is_integrated()) {
+		if (maxlvt > 3)
+			/* Clear ESR due to Pentium errata 3AP and 11AP */
 			apic_write(APIC_ESR, 0);
 		apic_read(APIC_ESR);
 	}
 }
 
-void __init connect_bsp_APIC(void)
-{
-	if (pic_mode) {
-		/*
-		 * Do not trust the local APIC being empty at bootup.
-		 */
-		clear_local_APIC();
-		/*
-		 * PIC mode, enable APIC mode in the IMCR, i.e.
-		 * connect BSP's local APIC to INT and NMI lines.
-		 */
-		apic_printk(APIC_VERBOSE, "leaving PIC mode, "
-				"enabling APIC mode.\n");
-		outb(0x70, 0x22);
-		outb(0x01, 0x23);
-	}
-	enable_apic_mode();
-}
-
-void disconnect_bsp_APIC(int virt_wire_setup)
-{
-	if (pic_mode) {
-		/*
-		 * Put the board back into PIC mode (has an effect
-		 * only on certain older boards).  Note that APIC
-		 * interrupts, including IPIs, won't work beyond
-		 * this point!  The only exception are INIT IPIs.
-		 */
-		apic_printk(APIC_VERBOSE, "disabling APIC mode, "
-				"entering PIC mode.\n");
-		outb(0x70, 0x22);
-		outb(0x00, 0x23);
-	}
-	else {
-		/* Go back to Virtual Wire compatibility mode */
-		unsigned long value;
-
-		/* For the spurious interrupt use vector F, and enable it */
-		value = apic_read(APIC_SPIV);
-		value &= ~APIC_VECTOR_MASK;
-		value |= APIC_SPIV_APIC_ENABLED;
-		value |= 0xf;
-		apic_write_around(APIC_SPIV, value);
-
-		if (!virt_wire_setup) {
-			/* For LVT0 make it edge triggered, active high, external and enabled */
-			value = apic_read(APIC_LVT0);
-			value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
-				APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
-				APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED );
-			value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
-			value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
-			apic_write_around(APIC_LVT0, value);
-		}
-		else {
-			/* Disable LVT0 */
-			apic_write_around(APIC_LVT0, APIC_LVT_MASKED);
-		}
-
-		/* For LVT1 make it edge triggered, active high, nmi and enabled */
-		value = apic_read(APIC_LVT1);
-		value &= ~(
-			APIC_MODE_MASK | APIC_SEND_PENDING |
-			APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
-			APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
-		value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
-		value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
-		apic_write_around(APIC_LVT1, value);
-	}
-}
-
+/**
+ * disable_local_APIC - clear and disable the local APIC
+ */
 void disable_local_APIC(void)
 {
 	unsigned long value;
@@ -304,8 +661,13 @@ void disable_local_APIC(void)
 	value &= ~APIC_SPIV_APIC_ENABLED;
 	apic_write_around(APIC_SPIV, value);
 
+	/*
+	 * When LAPIC was disabled by the BIOS and enabled by the kernel,
+	 * restore the disabled state.
+	 */
 	if (enabled_via_apicbase) {
 		unsigned int l, h;
+
 		rdmsr(MSR_IA32_APICBASE, l, h);
 		l &= ~MSR_IA32_APICBASE_ENABLE;
 		wrmsr(MSR_IA32_APICBASE, l, h);
@@ -313,6 +675,28 @@ void disable_local_APIC(void)
 }
 
 /*
+ * If Linux enabled the LAPIC against the BIOS default disable it down before
+ * re-entering the BIOS on shutdown.  Otherwise the BIOS may get confused and
+ * not power-off.  Additionally clear all LVT entries before disable_local_APIC
+ * for the case where Linux didn't enable the LAPIC.
+ */
+void lapic_shutdown(void)
+{
+	unsigned long flags;
+
+	if (!cpu_has_apic)
+		return;
+
+	local_irq_save(flags);
+	clear_local_APIC();
+
+	if (enabled_via_apicbase)
+		disable_local_APIC();
+
+	local_irq_restore(flags);
+}
+
+/*
  * This is to verify that we're looking at a real local APIC.
  * Check these against your board if the CPUs aren't getting
  * started for no apparent reason.
@@ -344,7 +728,7 @@ int __init verify_local_APIC(void)
 	reg1 = GET_APIC_VERSION(reg0);
 	if (reg1 == 0x00 || reg1 == 0xff)
 		return 0;
-	reg1 = get_maxlvt();
+	reg1 = lapic_get_maxlvt();
 	if (reg1 < 0x02 || reg1 == 0xff)
 		return 0;
 
@@ -367,10 +751,15 @@ int __init verify_local_APIC(void)
 	return 1;
 }
 
+/**
+ * sync_Arb_IDs - synchronize APIC bus arbitration IDs
+ */
 void __init sync_Arb_IDs(void)
 {
-	/* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1
-	   And not needed on AMD */
+	/*
+	 * Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 And not
+	 * needed on AMD.
+	 */
 	if (modern_apic())
 		return;
 	/*
@@ -383,14 +772,12 @@ void __init sync_Arb_IDs(void)
 				| APIC_DM_INIT);
 }
 
-extern void __error_in_apic_c (void);
-
 /*
  * An initial setup of the virtual wire mode.
  */
 void __init init_bsp_APIC(void)
 {
-	unsigned long value, ver;
+	unsigned long value;
 
 	/*
 	 * Don't do the setup now if we have a SMP BIOS as the
@@ -399,9 +786,6 @@ void __init init_bsp_APIC(void)
 	if (smp_found_config || !cpu_has_apic)
 		return;
 
-	value = apic_read(APIC_LVR);
-	ver = GET_APIC_VERSION(value);
-
 	/*
 	 * Do not trust the local APIC being empty at bootup.
 	 */
@@ -413,9 +797,10 @@ void __init init_bsp_APIC(void)
 	value = apic_read(APIC_SPIV);
 	value &= ~APIC_VECTOR_MASK;
 	value |= APIC_SPIV_APIC_ENABLED;
-	
+
 	/* This bit is reserved on P4/Xeon and should be cleared */
-	if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 15))
+	if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
+	    (boot_cpu_data.x86 == 15))
 		value &= ~APIC_SPIV_FOCUS_DISABLED;
 	else
 		value |= APIC_SPIV_FOCUS_DISABLED;
@@ -427,14 +812,17 @@ void __init init_bsp_APIC(void)
 	 */
 	apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
 	value = APIC_DM_NMI;
-	if (!APIC_INTEGRATED(ver))		/* 82489DX */
+	if (!lapic_is_integrated())		/* 82489DX */
 		value |= APIC_LVT_LEVEL_TRIGGER;
 	apic_write_around(APIC_LVT1, value);
 }
 
+/**
+ * setup_local_APIC - setup the local APIC
+ */
 void __devinit setup_local_APIC(void)
 {
-	unsigned long oldvalue, value, ver, maxlvt;
+	unsigned long oldvalue, value, maxlvt, integrated;
 	int i, j;
 
 	/* Pound the ESR really hard over the head with a big hammer - mbligh */
@@ -445,11 +833,7 @@ void __devinit setup_local_APIC(void)
 		apic_write(APIC_ESR, 0);
 	}
 
-	value = apic_read(APIC_LVR);
-	ver = GET_APIC_VERSION(value);
-
-	if ((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f)
-		__error_in_apic_c();
+	integrated = lapic_is_integrated();
 
 	/*
 	 * Double-check whether this APIC is really registered.
@@ -520,13 +904,10 @@ void __devinit setup_local_APIC(void)
 	 * like LRU than MRU (the short-term load is more even across CPUs).
 	 * See also the comment in end_level_ioapic_irq().  --macro
 	 */
-#if 1
+
 	/* Enable focus processor (bit==0) */
 	value &= ~APIC_SPIV_FOCUS_DISABLED;
-#else
-	/* Disable focus processor (bit==1) */
-	value |= APIC_SPIV_FOCUS_DISABLED;
-#endif
+
 	/*
 	 * Set spurious IRQ vector
 	 */
@@ -562,17 +943,18 @@ void __devinit setup_local_APIC(void)
 		value = APIC_DM_NMI;
 	else
 		value = APIC_DM_NMI | APIC_LVT_MASKED;
-	if (!APIC_INTEGRATED(ver))		/* 82489DX */
+	if (!integrated)		/* 82489DX */
 		value |= APIC_LVT_LEVEL_TRIGGER;
 	apic_write_around(APIC_LVT1, value);
 
-	if (APIC_INTEGRATED(ver) && !esr_disable) {		/* !82489DX */
-		maxlvt = get_maxlvt();
+	if (integrated && !esr_disable) {		/* !82489DX */
+		maxlvt = lapic_get_maxlvt();
 		if (maxlvt > 3)		/* Due to the Pentium erratum 3AP. */
 			apic_write(APIC_ESR, 0);
 		oldvalue = apic_read(APIC_ESR);
 
-		value = ERROR_APIC_VECTOR;      // enables sending errors
+		/* enables sending errors */
+		value = ERROR_APIC_VECTOR;
 		apic_write_around(APIC_LVTERR, value);
 		/*
 		 * spec says clear errors after enabling vector.
@@ -585,207 +967,30 @@ void __devinit setup_local_APIC(void)
 				"vector: 0x%08lx  after: 0x%08lx\n",
 				oldvalue, value);
 	} else {
-		if (esr_disable)	
-			/* 
-			 * Something untraceble is creating bad interrupts on 
+		if (esr_disable)
+			/*
+			 * Something untraceble is creating bad interrupts on
 			 * secondary quads ... for the moment, just leave the
 			 * ESR disabled - we can't do anything useful with the
 			 * errors anyway - mbligh
 			 */
-			printk("Leaving ESR disabled.\n");
-		else 
-			printk("No ESR for 82489DX.\n");
+			printk(KERN_INFO "Leaving ESR disabled.\n");
+		else
+			printk(KERN_INFO "No ESR for 82489DX.\n");
 	}
 
+	/* Disable the local apic timer */
+	value = apic_read(APIC_LVTT);
+	value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
+	apic_write_around(APIC_LVTT, value);
+
 	setup_apic_nmi_watchdog(NULL);
 	apic_pm_activate();
 }
 
 /*
- * If Linux enabled the LAPIC against the BIOS default
- * disable it down before re-entering the BIOS on shutdown.
- * Otherwise the BIOS may get confused and not power-off.
- * Additionally clear all LVT entries before disable_local_APIC
- * for the case where Linux didn't enable the LAPIC.
+ * Detect and initialize APIC
  */
-void lapic_shutdown(void)
-{
-	unsigned long flags;
-
-	if (!cpu_has_apic)
-		return;
-
-	local_irq_save(flags);
-	clear_local_APIC();
-
-	if (enabled_via_apicbase)
-		disable_local_APIC();
-
-	local_irq_restore(flags);
-}
-
-#ifdef CONFIG_PM
-
-static struct {
-	int active;
-	/* r/w apic fields */
-	unsigned int apic_id;
-	unsigned int apic_taskpri;
-	unsigned int apic_ldr;
-	unsigned int apic_dfr;
-	unsigned int apic_spiv;
-	unsigned int apic_lvtt;
-	unsigned int apic_lvtpc;
-	unsigned int apic_lvt0;
-	unsigned int apic_lvt1;
-	unsigned int apic_lvterr;
-	unsigned int apic_tmict;
-	unsigned int apic_tdcr;
-	unsigned int apic_thmr;
-} apic_pm_state;
-
-static int lapic_suspend(struct sys_device *dev, pm_message_t state)
-{
-	unsigned long flags;
-	int maxlvt;
-
-	if (!apic_pm_state.active)
-		return 0;
-
-	maxlvt = get_maxlvt();
-
-	apic_pm_state.apic_id = apic_read(APIC_ID);
-	apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
-	apic_pm_state.apic_ldr = apic_read(APIC_LDR);
-	apic_pm_state.apic_dfr = apic_read(APIC_DFR);
-	apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
-	apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
-	if (maxlvt >= 4)
-		apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
-	apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
-	apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
-	apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
-	apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
-	apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
-#ifdef CONFIG_X86_MCE_P4THERMAL
-	if (maxlvt >= 5)
-		apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
-#endif
-	
-	local_irq_save(flags);
-	disable_local_APIC();
-	local_irq_restore(flags);
-	return 0;
-}
-
-static int lapic_resume(struct sys_device *dev)
-{
-	unsigned int l, h;
-	unsigned long flags;
-	int maxlvt;
-
-	if (!apic_pm_state.active)
-		return 0;
-
-	maxlvt = get_maxlvt();
-
-	local_irq_save(flags);
-
-	/*
-	 * Make sure the APICBASE points to the right address
-	 *
-	 * FIXME! This will be wrong if we ever support suspend on
-	 * SMP! We'll need to do this as part of the CPU restore!
-	 */
-	rdmsr(MSR_IA32_APICBASE, l, h);
-	l &= ~MSR_IA32_APICBASE_BASE;
-	l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
-	wrmsr(MSR_IA32_APICBASE, l, h);
-
-	apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
-	apic_write(APIC_ID, apic_pm_state.apic_id);
-	apic_write(APIC_DFR, apic_pm_state.apic_dfr);
-	apic_write(APIC_LDR, apic_pm_state.apic_ldr);
-	apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri);
-	apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
-	apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
-	apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
-#ifdef CONFIG_X86_MCE_P4THERMAL
-	if (maxlvt >= 5)
-		apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
-#endif
-	if (maxlvt >= 4)
-		apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
-	apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
-	apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
-	apic_write(APIC_TMICT, apic_pm_state.apic_tmict);
-	apic_write(APIC_ESR, 0);
-	apic_read(APIC_ESR);
-	apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
-	apic_write(APIC_ESR, 0);
-	apic_read(APIC_ESR);
-	local_irq_restore(flags);
-	return 0;
-}
-
-/*
- * This device has no shutdown method - fully functioning local APICs
- * are needed on every CPU up until machine_halt/restart/poweroff.
- */
-
-static struct sysdev_class lapic_sysclass = {
-	set_kset_name("lapic"),
-	.resume		= lapic_resume,
-	.suspend	= lapic_suspend,
-};
-
-static struct sys_device device_lapic = {
-	.id	= 0,
-	.cls	= &lapic_sysclass,
-};
-
-static void __devinit apic_pm_activate(void)
-{
-	apic_pm_state.active = 1;
-}
-
-static int __init init_lapic_sysfs(void)
-{
-	int error;
-
-	if (!cpu_has_apic)
-		return 0;
-	/* XXX: remove suspend/resume procs if !apic_pm_state.active? */
-
-	error = sysdev_class_register(&lapic_sysclass);
-	if (!error)
-		error = sysdev_register(&device_lapic);
-	return error;
-}
-device_initcall(init_lapic_sysfs);
-
-#else	/* CONFIG_PM */
-
-static void apic_pm_activate(void) { }
-
-#endif	/* CONFIG_PM */
-
-/*
- * Detect and enable local APICs on non-SMP boards.
- * Original code written by Keir Fraser.
- */
-
-static int __init apic_set_verbosity(char *str)
-{
-	if (strcmp("debug", str) == 0)
-		apic_verbosity = APIC_DEBUG;
-	else if (strcmp("verbose", str) == 0)
-		apic_verbosity = APIC_VERBOSE;
-	return 1;
-}
-
-__setup("apic=", apic_set_verbosity);
-
 static int __init detect_init_APIC (void)
 {
 	u32 h, l, features;
@@ -797,7 +1002,7 @@ static int __init detect_init_APIC (void)
 	switch (boot_cpu_data.x86_vendor) {
 	case X86_VENDOR_AMD:
 		if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model > 1) ||
-		    (boot_cpu_data.x86 == 15))	    
+		    (boot_cpu_data.x86 == 15))
 			break;
 		goto no_apic;
 	case X86_VENDOR_INTEL:
@@ -811,23 +1016,23 @@ static int __init detect_init_APIC (void)
 
 	if (!cpu_has_apic) {
 		/*
-		 * Over-ride BIOS and try to enable the local
-		 * APIC only if "lapic" specified.
+		 * Over-ride BIOS and try to enable the local APIC only if
+		 * "lapic" specified.
 		 */
 		if (enable_local_apic <= 0) {
-			printk("Local APIC disabled by BIOS -- "
+			printk(KERN_INFO "Local APIC disabled by BIOS -- "
 			       "you can enable it with \"lapic\"\n");
 			return -1;
 		}
 		/*
-		 * Some BIOSes disable the local APIC in the
-		 * APIC_BASE MSR. This can only be done in
-		 * software for Intel P6 or later and AMD K7
-		 * (Model > 1) or later.
+		 * Some BIOSes disable the local APIC in the APIC_BASE
+		 * MSR. This can only be done in software for Intel P6 or later
+		 * and AMD K7 (Model > 1) or later.
 		 */
 		rdmsr(MSR_IA32_APICBASE, l, h);
 		if (!(l & MSR_IA32_APICBASE_ENABLE)) {
-			printk("Local APIC disabled by BIOS -- reenabling.\n");
+			printk(KERN_INFO
+			       "Local APIC disabled by BIOS -- reenabling.\n");
 			l &= ~MSR_IA32_APICBASE_BASE;
 			l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE;
 			wrmsr(MSR_IA32_APICBASE, l, h);
@@ -840,7 +1045,7 @@ static int __init detect_init_APIC (void)
 	 */
 	features = cpuid_edx(1);
 	if (!(features & (1 << X86_FEATURE_APIC))) {
-		printk("Could not enable APIC!\n");
+		printk(KERN_WARNING "Could not enable APIC!\n");
 		return -1;
 	}
 	set_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
@@ -854,17 +1059,20 @@ static int __init detect_init_APIC (void)
 	if (nmi_watchdog != NMI_NONE)
 		nmi_watchdog = NMI_LOCAL_APIC;
 
-	printk("Found and enabled local APIC!\n");
+	printk(KERN_INFO "Found and enabled local APIC!\n");
 
 	apic_pm_activate();
 
 	return 0;
 
 no_apic:
-	printk("No local APIC present or hardware disabled\n");
+	printk(KERN_INFO "No local APIC present or hardware disabled\n");
 	return -1;
 }
 
+/**
+ * init_apic_mappings - initialize APIC mappings
+ */
 void __init init_apic_mappings(void)
 {
 	unsigned long apic_phys;
@@ -924,387 +1132,96 @@ fake_ioapic_page:
 }
 
 /*
- * This part sets up the APIC 32 bit clock in LVTT1, with HZ interrupts
- * per second. We assume that the caller has already set up the local
- * APIC.
- *
- * The APIC timer is not exactly sync with the external timer chip, it
- * closely follows bus clocks.
- */
-
-/*
- * The timer chip is already set up at HZ interrupts per second here,
- * but we do not accept timer interrupts yet. We only allow the BP
- * to calibrate.
- */
-static unsigned int __devinit get_8254_timer_count(void)
-{
-	unsigned long flags;
-
-	unsigned int count;
-
-	spin_lock_irqsave(&i8253_lock, flags);
-
-	outb_p(0x00, PIT_MODE);
-	count = inb_p(PIT_CH0);
-	count |= inb_p(PIT_CH0) << 8;
-
-	spin_unlock_irqrestore(&i8253_lock, flags);
-
-	return count;
-}
-
-/* next tick in 8254 can be caught by catching timer wraparound */
-static void __devinit wait_8254_wraparound(void)
-{
-	unsigned int curr_count, prev_count;
-
-	curr_count = get_8254_timer_count();
-	do {
-		prev_count = curr_count;
-		curr_count = get_8254_timer_count();
-
-		/* workaround for broken Mercury/Neptune */
-		if (prev_count >= curr_count + 0x100)
-			curr_count = get_8254_timer_count();
-
-	} while (prev_count >= curr_count);
-}
-
-/*
- * Default initialization for 8254 timers. If we use other timers like HPET,
- * we override this later
- */
-void (*wait_timer_tick)(void) __devinitdata = wait_8254_wraparound;
-
-/*
- * This function sets up the local APIC timer, with a timeout of
- * 'clocks' APIC bus clock. During calibration we actually call
- * this function twice on the boot CPU, once with a bogus timeout
- * value, second time for real. The other (noncalibrating) CPUs
- * call this function only once, with the real, calibrated value.
- *
- * We do reads before writes even if unnecessary, to get around the
- * P5 APIC double write bug.
+ * This initializes the IO-APIC and APIC hardware if this is
+ * a UP kernel.
  */
-
-#define APIC_DIVISOR 16
-
-static void __setup_APIC_LVTT(unsigned int clocks)
+int __init APIC_init_uniprocessor (void)
 {
-	unsigned int lvtt_value, tmp_value, ver;
-	int cpu = smp_processor_id();
-
-	ver = GET_APIC_VERSION(apic_read(APIC_LVR));
-	lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR;
-	if (!APIC_INTEGRATED(ver))
-		lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV);
-
-	if (cpu_isset(cpu, timer_bcast_ipi))
-		lvtt_value |= APIC_LVT_MASKED;
+	if (enable_local_apic < 0)
+		clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
 
-	apic_write_around(APIC_LVTT, lvtt_value);
+	if (!smp_found_config && !cpu_has_apic)
+		return -1;
 
 	/*
-	 * Divide PICLK by 16
+	 * Complain if the BIOS pretends there is one.
 	 */
-	tmp_value = apic_read(APIC_TDCR);
-	apic_write_around(APIC_TDCR, (tmp_value
-				& ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
-				| APIC_TDR_DIV_16);
-
-	apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR);
-}
+	if (!cpu_has_apic &&
+	    APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
+		printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
+		       boot_cpu_physical_apicid);
+		clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
+		return -1;
+	}
 
-static void __devinit setup_APIC_timer(unsigned int clocks)
-{
-	unsigned long flags;
+	verify_local_APIC();
 
-	local_irq_save(flags);
+	connect_bsp_APIC();
 
 	/*
-	 * Wait for IRQ0's slice:
+	 * Hack: In case of kdump, after a crash, kernel might be booting
+	 * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid
+	 * might be zero if read from MP tables. Get it from LAPIC.
 	 */
-	wait_timer_tick();
+#ifdef CONFIG_CRASH_DUMP
+	boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
+#endif
+	phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid);
 
-	__setup_APIC_LVTT(clocks);
+	setup_local_APIC();
 
-	local_irq_restore(flags);
+#ifdef CONFIG_X86_IO_APIC
+	if (smp_found_config)
+		if (!skip_ioapic_setup && nr_ioapics)
+			setup_IO_APIC();
+#endif
+	setup_boot_clock();
+
+	return 0;
 }
 
 /*
- * In this function we calibrate APIC bus clocks to the external
- * timer. Unfortunately we cannot use jiffies and the timer irq
- * to calibrate, since some later bootup code depends on getting
- * the first irq? Ugh.
- *
- * We want to do the calibration only once since we
- * want to have local timer irqs syncron. CPUs connected
- * by the same APIC bus have the very same bus frequency.
- * And we want to have irqs off anyways, no accidental
- * APIC irq that way.
+ * APIC command line parameters
  */
-
-static int __init calibrate_APIC_clock(void)
-{
-	unsigned long long t1 = 0, t2 = 0;
-	long tt1, tt2;
-	long result;
-	int i;
-	const int LOOPS = HZ/10;
-
-	apic_printk(APIC_VERBOSE, "calibrating APIC timer ...\n");
-
-	/*
-	 * Put whatever arbitrary (but long enough) timeout
-	 * value into the APIC clock, we just want to get the
-	 * counter running for calibration.
-	 */
-	__setup_APIC_LVTT(1000000000);
-
-	/*
-	 * The timer chip counts down to zero. Let's wait
-	 * for a wraparound to start exact measurement:
-	 * (the current tick might have been already half done)
-	 */
-
-	wait_timer_tick();
-
-	/*
-	 * We wrapped around just now. Let's start:
-	 */
-	if (cpu_has_tsc)
-		rdtscll(t1);
-	tt1 = apic_read(APIC_TMCCT);
-
-	/*
-	 * Let's wait LOOPS wraprounds:
-	 */
-	for (i = 0; i < LOOPS; i++)
-		wait_timer_tick();
-
-	tt2 = apic_read(APIC_TMCCT);
-	if (cpu_has_tsc)
-		rdtscll(t2);
-
-	/*
-	 * The APIC bus clock counter is 32 bits only, it
-	 * might have overflown, but note that we use signed
-	 * longs, thus no extra care needed.
-	 *
-	 * underflown to be exact, as the timer counts down ;)
-	 */
-
-	result = (tt1-tt2)*APIC_DIVISOR/LOOPS;
-
-	if (cpu_has_tsc)
-		apic_printk(APIC_VERBOSE, "..... CPU clock speed is "
-			"%ld.%04ld MHz.\n",
-			((long)(t2-t1)/LOOPS)/(1000000/HZ),
-			((long)(t2-t1)/LOOPS)%(1000000/HZ));
-
-	apic_printk(APIC_VERBOSE, "..... host bus clock speed is "
-		"%ld.%04ld MHz.\n",
-		result/(1000000/HZ),
-		result%(1000000/HZ));
-
-	return result;
-}
-
-static unsigned int calibration_result;
-
-void __init setup_boot_APIC_clock(void)
-{
-	unsigned long flags;
-	apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n");
-	using_apic_timer = 1;
-
-	local_irq_save(flags);
-
-	calibration_result = calibrate_APIC_clock();
-	/*
-	 * Now set up the timer for real.
-	 */
-	setup_APIC_timer(calibration_result);
-
-	local_irq_restore(flags);
-}
-
-void __devinit setup_secondary_APIC_clock(void)
-{
-	setup_APIC_timer(calibration_result);
-}
-
-void disable_APIC_timer(void)
-{
-	if (using_apic_timer) {
-		unsigned long v;
-
-		v = apic_read(APIC_LVTT);
-		/*
-		 * When an illegal vector value (0-15) is written to an LVT
-		 * entry and delivery mode is Fixed, the APIC may signal an
-		 * illegal vector error, with out regard to whether the mask
-		 * bit is set or whether an interrupt is actually seen on input.
-		 *
-		 * Boot sequence might call this function when the LVTT has
-		 * '0' vector value. So make sure vector field is set to
-		 * valid value.
-		 */
-		v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
-		apic_write_around(APIC_LVTT, v);
-	}
-}
-
-void enable_APIC_timer(void)
+static int __init parse_lapic(char *arg)
 {
-	int cpu = smp_processor_id();
-
-	if (using_apic_timer &&
-	    !cpu_isset(cpu, timer_bcast_ipi)) {
-		unsigned long v;
-
-		v = apic_read(APIC_LVTT);
-		apic_write_around(APIC_LVTT, v & ~APIC_LVT_MASKED);
-	}
+	enable_local_apic = 1;
+	return 0;
 }
+early_param("lapic", parse_lapic);
 
-void switch_APIC_timer_to_ipi(void *cpumask)
+static int __init parse_nolapic(char *arg)
 {
-	cpumask_t mask = *(cpumask_t *)cpumask;
-	int cpu = smp_processor_id();
-
-	if (cpu_isset(cpu, mask) &&
-	    !cpu_isset(cpu, timer_bcast_ipi)) {
-		disable_APIC_timer();
-		cpu_set(cpu, timer_bcast_ipi);
-	}
+	enable_local_apic = -1;
+	clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
+	return 0;
 }
-EXPORT_SYMBOL(switch_APIC_timer_to_ipi);
+early_param("nolapic", parse_nolapic);
 
-void switch_ipi_to_APIC_timer(void *cpumask)
+static int __init apic_set_verbosity(char *str)
 {
-	cpumask_t mask = *(cpumask_t *)cpumask;
-	int cpu = smp_processor_id();
-
-	if (cpu_isset(cpu, mask) &&
-	    cpu_isset(cpu, timer_bcast_ipi)) {
-		cpu_clear(cpu, timer_bcast_ipi);
-		enable_APIC_timer();
-	}
+	if (strcmp("debug", str) == 0)
+		apic_verbosity = APIC_DEBUG;
+	else if (strcmp("verbose", str) == 0)
+		apic_verbosity = APIC_VERBOSE;
+	return 1;
 }
-EXPORT_SYMBOL(switch_ipi_to_APIC_timer);
-
-#undef APIC_DIVISOR
 
-/*
- * Local timer interrupt handler. It does both profiling and
- * process statistics/rescheduling.
- *
- * We do profiling in every local tick, statistics/rescheduling
- * happen only every 'profiling multiplier' ticks. The default
- * multiplier is 1 and it can be changed by writing the new multiplier
- * value into /proc/profile.
- */
-
-inline void smp_local_timer_interrupt(void)
-{
-	profile_tick(CPU_PROFILING);
-#ifdef CONFIG_SMP
-	update_process_times(user_mode_vm(get_irq_regs()));
-#endif
+__setup("apic=", apic_set_verbosity);
 
-	/*
-	 * We take the 'long' return path, and there every subsystem
-	 * grabs the apropriate locks (kernel lock/ irq lock).
-	 *
-	 * we might want to decouple profiling from the 'long path',
-	 * and do the profiling totally in assembly.
-	 *
-	 * Currently this isn't too much of an issue (performance wise),
-	 * we can take more than 100K local irqs per second on a 100 MHz P5.
-	 */
-}
 
 /*
- * Local APIC timer interrupt. This is the most natural way for doing
- * local interrupts, but local timer interrupts can be emulated by
- * broadcast interrupts too. [in case the hw doesn't support APIC timers]
- *
- * [ if a single-CPU system runs an SMP kernel then we call the local
- *   interrupt as well. Thus we cannot inline the local irq ... ]
+ * Local APIC interrupts
  */
 
-fastcall void smp_apic_timer_interrupt(struct pt_regs *regs)
-{
-	struct pt_regs *old_regs = set_irq_regs(regs);
-	int cpu = smp_processor_id();
-
-	/*
-	 * the NMI deadlock-detector uses this.
-	 */
-	per_cpu(irq_stat, cpu).apic_timer_irqs++;
-
-	/*
-	 * NOTE! We'd better ACK the irq immediately,
-	 * because timer handling can be slow.
-	 */
-	ack_APIC_irq();
-	/*
-	 * update_process_times() expects us to have done irq_enter().
-	 * Besides, if we don't timer interrupts ignore the global
-	 * interrupt lock, which is the WrongThing (tm) to do.
-	 */
-	irq_enter();
-	smp_local_timer_interrupt();
-	irq_exit();
-	set_irq_regs(old_regs);
-}
-
-#ifndef CONFIG_SMP
-static void up_apic_timer_interrupt_call(void)
-{
-	int cpu = smp_processor_id();
-
-	/*
-	 * the NMI deadlock-detector uses this.
-	 */
-	per_cpu(irq_stat, cpu).apic_timer_irqs++;
-
-	smp_local_timer_interrupt();
-}
-#endif
-
-void smp_send_timer_broadcast_ipi(void)
-{
-	cpumask_t mask;
-
-	cpus_and(mask, cpu_online_map, timer_bcast_ipi);
-	if (!cpus_empty(mask)) {
-#ifdef CONFIG_SMP
-		send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
-#else
-		/*
-		 * We can directly call the apic timer interrupt handler
-		 * in UP case. Minus all irq related functions
-		 */
-		up_apic_timer_interrupt_call();
-#endif
-	}
-}
-
-int setup_profiling_timer(unsigned int multiplier)
-{
-	return -EINVAL;
-}
-
 /*
  * This interrupt should _never_ happen with our APIC/SMP architecture
  */
-fastcall void smp_spurious_interrupt(struct pt_regs *regs)
+void smp_spurious_interrupt(struct pt_regs *regs)
 {
 	unsigned long v;
 
+	exit_idle();
 	irq_enter();
 	/*
 	 * Check if this really is a spurious interrupt and ACK it
@@ -1316,19 +1233,19 @@ fastcall void smp_spurious_interrupt(struct pt_regs *regs)
 		ack_APIC_irq();
 
 	/* see sw-dev-man vol 3, chapter 7.4.13.5 */
-	printk(KERN_INFO "spurious APIC interrupt on CPU#%d, should never happen.\n",
-			smp_processor_id());
+	printk(KERN_INFO "spurious APIC interrupt on CPU#%d, "
+	       "should never happen.\n", smp_processor_id());
 	irq_exit();
 }
 
 /*
  * This interrupt should never happen with our APIC/SMP architecture
  */
-
-fastcall void smp_error_interrupt(struct pt_regs *regs)
+void smp_error_interrupt(struct pt_regs *regs)
 {
 	unsigned long v, v1;
 
+	exit_idle();
 	irq_enter();
 	/* First tickle the hardware, only then report what went on. -- REW */
 	v = apic_read(APIC_ESR);
@@ -1348,69 +1265,261 @@ fastcall void smp_error_interrupt(struct pt_regs *regs)
 	   7: Illegal register address
 	*/
 	printk (KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n",
-	        smp_processor_id(), v , v1);
+		smp_processor_id(), v , v1);
 	irq_exit();
 }
 
 /*
- * This initializes the IO-APIC and APIC hardware if this is
- * a UP kernel.
+ * Initialize APIC interrupts
  */
-int __init APIC_init_uniprocessor (void)
+void __init apic_intr_init(void)
 {
-	if (enable_local_apic < 0)
-		clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
+#ifdef CONFIG_SMP
+	smp_intr_init();
+#endif
+	/* self generated IPI for local APIC timer */
+	set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
 
-	if (!smp_found_config && !cpu_has_apic)
-		return -1;
+	/* IPI vectors for APIC spurious and error interrupts */
+	set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
+	set_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
 
-	/*
-	 * Complain if the BIOS pretends there is one.
-	 */
-	if (!cpu_has_apic && APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
-		printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
-			boot_cpu_physical_apicid);
-		clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
-		return -1;
+	/* thermal monitor LVT interrupt */
+#ifdef CONFIG_X86_MCE_P4THERMAL
+	set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
+#endif
+}
+
+/**
+ * connect_bsp_APIC - attach the APIC to the interrupt system
+ */
+void __init connect_bsp_APIC(void)
+{
+	if (pic_mode) {
+		/*
+		 * Do not trust the local APIC being empty at bootup.
+		 */
+		clear_local_APIC();
+		/*
+		 * PIC mode, enable APIC mode in the IMCR, i.e.  connect BSP's
+		 * local APIC to INT and NMI lines.
+		 */
+		apic_printk(APIC_VERBOSE, "leaving PIC mode, "
+				"enabling APIC mode.\n");
+		outb(0x70, 0x22);
+		outb(0x01, 0x23);
 	}
+	enable_apic_mode();
+}
 
-	verify_local_APIC();
+/**
+ * disconnect_bsp_APIC - detach the APIC from the interrupt system
+ * @virt_wire_setup:	indicates, whether virtual wire mode is selected
+ *
+ * Virtual wire mode is necessary to deliver legacy interrupts even when the
+ * APIC is disabled.
+ */
+void disconnect_bsp_APIC(int virt_wire_setup)
+{
+	if (pic_mode) {
+		/*
+		 * Put the board back into PIC mode (has an effect only on
+		 * certain older boards).  Note that APIC interrupts, including
+		 * IPIs, won't work beyond this point!  The only exception are
+		 * INIT IPIs.
+		 */
+		apic_printk(APIC_VERBOSE, "disabling APIC mode, "
+				"entering PIC mode.\n");
+		outb(0x70, 0x22);
+		outb(0x00, 0x23);
+	} else {
+		/* Go back to Virtual Wire compatibility mode */
+		unsigned long value;
 
-	connect_bsp_APIC();
+		/* For the spurious interrupt use vector F, and enable it */
+		value = apic_read(APIC_SPIV);
+		value &= ~APIC_VECTOR_MASK;
+		value |= APIC_SPIV_APIC_ENABLED;
+		value |= 0xf;
+		apic_write_around(APIC_SPIV, value);
 
-	/*
-	 * Hack: In case of kdump, after a crash, kernel might be booting
-	 * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid
-	 * might be zero if read from MP tables. Get it from LAPIC.
-	 */
-#ifdef CONFIG_CRASH_DUMP
-	boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
-#endif
-	phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid);
+		if (!virt_wire_setup) {
+			/*
+			 * For LVT0 make it edge triggered, active high,
+			 * external and enabled
+			 */
+			value = apic_read(APIC_LVT0);
+			value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
+				APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
+				APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED );
+			value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
+			value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
+			apic_write_around(APIC_LVT0, value);
+		} else {
+			/* Disable LVT0 */
+			apic_write_around(APIC_LVT0, APIC_LVT_MASKED);
+		}
 
-	setup_local_APIC();
+		/*
+		 * For LVT1 make it edge triggered, active high, nmi and
+		 * enabled
+		 */
+		value = apic_read(APIC_LVT1);
+		value &= ~(
+			APIC_MODE_MASK | APIC_SEND_PENDING |
+			APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
+			APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
+		value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
+		value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
+		apic_write_around(APIC_LVT1, value);
+	}
+}
 
-#ifdef CONFIG_X86_IO_APIC
-	if (smp_found_config)
-		if (!skip_ioapic_setup && nr_ioapics)
-			setup_IO_APIC();
+/*
+ * Power management
+ */
+#ifdef CONFIG_PM
+
+static struct {
+	int active;
+	/* r/w apic fields */
+	unsigned int apic_id;
+	unsigned int apic_taskpri;
+	unsigned int apic_ldr;
+	unsigned int apic_dfr;
+	unsigned int apic_spiv;
+	unsigned int apic_lvtt;
+	unsigned int apic_lvtpc;
+	unsigned int apic_lvt0;
+	unsigned int apic_lvt1;
+	unsigned int apic_lvterr;
+	unsigned int apic_tmict;
+	unsigned int apic_tdcr;
+	unsigned int apic_thmr;
+} apic_pm_state;
+
+static int lapic_suspend(struct sys_device *dev, pm_message_t state)
+{
+	unsigned long flags;
+	int maxlvt;
+
+	if (!apic_pm_state.active)
+		return 0;
+
+	maxlvt = lapic_get_maxlvt();
+
+	apic_pm_state.apic_id = apic_read(APIC_ID);
+	apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
+	apic_pm_state.apic_ldr = apic_read(APIC_LDR);
+	apic_pm_state.apic_dfr = apic_read(APIC_DFR);
+	apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
+	apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
+	if (maxlvt >= 4)
+		apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
+	apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
+	apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
+	apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
+	apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
+	apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
+#ifdef CONFIG_X86_MCE_P4THERMAL
+	if (maxlvt >= 5)
+		apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
 #endif
-	setup_boot_APIC_clock();
 
+	local_irq_save(flags);
+	disable_local_APIC();
+	local_irq_restore(flags);
 	return 0;
 }
 
-static int __init parse_lapic(char *arg)
+static int lapic_resume(struct sys_device *dev)
 {
-	lapic_enable();
+	unsigned int l, h;
+	unsigned long flags;
+	int maxlvt;
+
+	if (!apic_pm_state.active)
+		return 0;
+
+	maxlvt = lapic_get_maxlvt();
+
+	local_irq_save(flags);
+
+	/*
+	 * Make sure the APICBASE points to the right address
+	 *
+	 * FIXME! This will be wrong if we ever support suspend on
+	 * SMP! We'll need to do this as part of the CPU restore!
+	 */
+	rdmsr(MSR_IA32_APICBASE, l, h);
+	l &= ~MSR_IA32_APICBASE_BASE;
+	l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
+	wrmsr(MSR_IA32_APICBASE, l, h);
+
+	apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
+	apic_write(APIC_ID, apic_pm_state.apic_id);
+	apic_write(APIC_DFR, apic_pm_state.apic_dfr);
+	apic_write(APIC_LDR, apic_pm_state.apic_ldr);
+	apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri);
+	apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
+	apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
+	apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
+#ifdef CONFIG_X86_MCE_P4THERMAL
+	if (maxlvt >= 5)
+		apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
+#endif
+	if (maxlvt >= 4)
+		apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
+	apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
+	apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
+	apic_write(APIC_TMICT, apic_pm_state.apic_tmict);
+	apic_write(APIC_ESR, 0);
+	apic_read(APIC_ESR);
+	apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
+	apic_write(APIC_ESR, 0);
+	apic_read(APIC_ESR);
+	local_irq_restore(flags);
 	return 0;
 }
-early_param("lapic", parse_lapic);
 
-static int __init parse_nolapic(char *arg)
+/*
+ * This device has no shutdown method - fully functioning local APICs
+ * are needed on every CPU up until machine_halt/restart/poweroff.
+ */
+
+static struct sysdev_class lapic_sysclass = {
+	set_kset_name("lapic"),
+	.resume		= lapic_resume,
+	.suspend	= lapic_suspend,
+};
+
+static struct sys_device device_lapic = {
+	.id	= 0,
+	.cls	= &lapic_sysclass,
+};
+
+static void __devinit apic_pm_activate(void)
 {
-	lapic_disable();
-	return 0;
+	apic_pm_state.active = 1;
 }
-early_param("nolapic", parse_nolapic);
 
+static int __init init_lapic_sysfs(void)
+{
+	int error;
+
+	if (!cpu_has_apic)
+		return 0;
+	/* XXX: remove suspend/resume procs if !apic_pm_state.active? */
+
+	error = sysdev_class_register(&lapic_sysclass);
+	if (!error)
+		error = sysdev_register(&device_lapic);
+	return error;
+}
+device_initcall(init_lapic_sysfs);
+
+#else	/* CONFIG_PM */
+
+static void apic_pm_activate(void) { }
+
+#endif	/* CONFIG_PM */
diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c
index 199016927541..064bbf2861f4 100644
--- a/arch/i386/kernel/apm.c
+++ b/arch/i386/kernel/apm.c
@@ -211,6 +211,7 @@
 #include <linux/slab.h>
 #include <linux/stat.h>
 #include <linux/proc_fs.h>
+#include <linux/seq_file.h>
 #include <linux/miscdevice.h>
 #include <linux/apm_bios.h>
 #include <linux/init.h>
@@ -235,7 +236,6 @@
 
 #include "io_ports.h"
 
-extern unsigned long get_cmos_time(void);
 extern void machine_real_restart(unsigned char *, int);
 
 #if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
@@ -1175,28 +1175,6 @@ out:
 	spin_unlock(&user_list_lock);
 }
 
-static void set_time(void)
-{
-	struct timespec ts;
-	if (got_clock_diff) {	/* Must know time zone in order to set clock */
-		ts.tv_sec = get_cmos_time() + clock_cmos_diff;
-		ts.tv_nsec = 0;
-		do_settimeofday(&ts);
-	} 
-}
-
-static void get_time_diff(void)
-{
-#ifndef CONFIG_APM_RTC_IS_GMT
-	/*
-	 * Estimate time zone so that set_time can update the clock
-	 */
-	clock_cmos_diff = -get_cmos_time();
-	clock_cmos_diff += get_seconds();
-	got_clock_diff = 1;
-#endif
-}
-
 static void reinit_timer(void)
 {
 #ifdef INIT_TIMER_AFTER_SUSPEND
@@ -1236,19 +1214,6 @@ static int suspend(int vetoable)
 	local_irq_disable();
 	device_power_down(PMSG_SUSPEND);
 
-	/* serialize with the timer interrupt */
-	write_seqlock(&xtime_lock);
-
-	/* protect against access to timer chip registers */
-	spin_lock(&i8253_lock);
-
-	get_time_diff();
-	/*
-	 * Irq spinlock must be dropped around set_system_power_state.
-	 * We'll undo any timer changes due to interrupts below.
-	 */
-	spin_unlock(&i8253_lock);
-	write_sequnlock(&xtime_lock);
 	local_irq_enable();
 
 	save_processor_state();
@@ -1257,7 +1222,6 @@ static int suspend(int vetoable)
 	restore_processor_state();
 
 	local_irq_disable();
-	set_time();
 	reinit_timer();
 
 	if (err == APM_NO_ERROR)
@@ -1287,11 +1251,6 @@ static void standby(void)
 
 	local_irq_disable();
 	device_power_down(PMSG_SUSPEND);
-	/* serialize with the timer interrupt */
-	write_seqlock(&xtime_lock);
-	/* If needed, notify drivers here */
-	get_time_diff();
-	write_sequnlock(&xtime_lock);
 	local_irq_enable();
 
 	err = set_system_power_state(APM_STATE_STANDBY);
@@ -1385,7 +1344,6 @@ static void check_events(void)
 			ignore_bounce = 1;
 			if ((event != APM_NORMAL_RESUME)
 			    || (ignore_normal_resume == 0)) {
-				set_time();
 				device_resume();
 				pm_send_all(PM_RESUME, (void *)0);
 				queue_event(event, NULL);
@@ -1401,7 +1359,6 @@ static void check_events(void)
 			break;
 
 		case APM_UPDATE_TIME:
-			set_time();
 			break;
 
 		case APM_CRITICAL_SUSPEND:
@@ -1636,9 +1593,8 @@ static int do_open(struct inode * inode, struct file * filp)
 	return 0;
 }
 
-static int apm_get_info(char *buf, char **start, off_t fpos, int length)
+static int proc_apm_show(struct seq_file *m, void *v)
 {
-	char *		p;
 	unsigned short	bx;
 	unsigned short	cx;
 	unsigned short	dx;
@@ -1650,8 +1606,6 @@ static int apm_get_info(char *buf, char **start, off_t fpos, int length)
 	int             time_units     = -1;
 	char            *units         = "?";
 
-	p = buf;
-
 	if ((num_online_cpus() == 1) &&
 	    !(error = apm_get_power_status(&bx, &cx, &dx))) {
 		ac_line_status = (bx >> 8) & 0xff;
@@ -1705,7 +1659,7 @@ static int apm_get_info(char *buf, char **start, off_t fpos, int length)
 	      -1: Unknown
 	   8) min = minutes; sec = seconds */
 
-	p += sprintf(p, "%s %d.%d 0x%02x 0x%02x 0x%02x 0x%02x %d%% %d %s\n",
+	seq_printf(m, "%s %d.%d 0x%02x 0x%02x 0x%02x 0x%02x %d%% %d %s\n",
 		     driver_version,
 		     (apm_info.bios.version >> 8) & 0xff,
 		     apm_info.bios.version & 0xff,
@@ -1716,10 +1670,22 @@ static int apm_get_info(char *buf, char **start, off_t fpos, int length)
 		     percentage,
 		     time_units,
 		     units);
+	return 0;
+}
 
-	return p - buf;
+static int proc_apm_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, proc_apm_show, NULL);
 }
 
+static const struct file_operations apm_file_ops = {
+	.owner		= THIS_MODULE,
+	.open		= proc_apm_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
 static int apm(void *unused)
 {
 	unsigned short	bx;
@@ -1894,7 +1860,7 @@ static int __init apm_setup(char *str)
 __setup("apm=", apm_setup);
 #endif
 
-static struct file_operations apm_bios_fops = {
+static const struct file_operations apm_bios_fops = {
 	.owner		= THIS_MODULE,
 	.read		= do_read,
 	.poll		= do_poll,
@@ -2341,9 +2307,9 @@ static int __init apm_init(void)
 	set_base(gdt[APM_DS >> 3],
 		 __va((unsigned long)apm_info.bios.dseg << 4));
 
-	apm_proc = create_proc_info_entry("apm", 0, NULL, apm_get_info);
+	apm_proc = create_proc_entry("apm", 0, NULL);
 	if (apm_proc)
-		apm_proc->owner = THIS_MODULE;
+		apm_proc->proc_fops = &apm_file_ops;
 
 	kapmd_task = kthread_create(apm, NULL, "kapmd");
 	if (IS_ERR(kapmd_task)) {
diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c
index 1b2f3cd33270..c37535163bfc 100644
--- a/arch/i386/kernel/asm-offsets.c
+++ b/arch/i386/kernel/asm-offsets.c
@@ -72,7 +72,7 @@ void foo(void)
 	OFFSET(PT_EAX, pt_regs, eax);
 	OFFSET(PT_DS,  pt_regs, xds);
 	OFFSET(PT_ES,  pt_regs, xes);
-	OFFSET(PT_GS,  pt_regs, xgs);
+	OFFSET(PT_FS,  pt_regs, xfs);
 	OFFSET(PT_ORIG_EAX, pt_regs, orig_eax);
 	OFFSET(PT_EIP, pt_regs, eip);
 	OFFSET(PT_CS,  pt_regs, xcs);
diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c
index 8a8bbdaaf38a..dcbbd0a8bfc2 100644
--- a/arch/i386/kernel/cpu/common.c
+++ b/arch/i386/kernel/cpu/common.c
@@ -605,7 +605,7 @@ void __init early_cpu_init(void)
 struct pt_regs * __devinit idle_regs(struct pt_regs *regs)
 {
 	memset(regs, 0, sizeof(struct pt_regs));
-	regs->xgs = __KERNEL_PDA;
+	regs->xfs = __KERNEL_PDA;
 	return regs;
 }
 
@@ -662,12 +662,12 @@ struct i386_pda boot_pda = {
 	.pcurrent = &init_task,
 };
 
-static inline void set_kernel_gs(void)
+static inline void set_kernel_fs(void)
 {
-	/* Set %gs for this CPU's PDA.  Memory clobber is to create a
+	/* Set %fs for this CPU's PDA.  Memory clobber is to create a
 	   barrier with respect to any PDA operations, so the compiler
 	   doesn't move any before here. */
-	asm volatile ("mov %0, %%gs" : : "r" (__KERNEL_PDA) : "memory");
+	asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_PDA) : "memory");
 }
 
 /* Initialize the CPU's GDT and PDA.  The boot CPU does this for
@@ -718,7 +718,7 @@ void __cpuinit cpu_set_gdt(int cpu)
 	   the boot CPU, this will transition from the boot gdt+pda to
 	   the real ones). */
 	load_gdt(cpu_gdt_descr);
-	set_kernel_gs();
+	set_kernel_fs();
 }
 
 /* Common CPU init for both boot and secondary CPUs */
@@ -764,8 +764,8 @@ static void __cpuinit _cpu_init(int cpu, struct task_struct *curr)
 	__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
 #endif
 
-	/* Clear %fs. */
-	asm volatile ("mov %0, %%fs" : : "r" (0));
+	/* Clear %gs. */
+	asm volatile ("mov %0, %%gs" : : "r" (0));
 
 	/* Clear all 6 debug registers: */
 	set_debugreg(0, 0);
diff --git a/arch/i386/kernel/cpu/cpufreq/Kconfig b/arch/i386/kernel/cpu/cpufreq/Kconfig
index 5299c5bf4454..6c52182ca323 100644
--- a/arch/i386/kernel/cpu/cpufreq/Kconfig
+++ b/arch/i386/kernel/cpu/cpufreq/Kconfig
@@ -217,6 +217,15 @@ config X86_LONGHAUL
 
 	  If in doubt, say N.
 
+config X86_E_POWERSAVER
+	tristate "VIA C7 Enhanced PowerSaver (EXPERIMENTAL)"
+	select CPU_FREQ_TABLE
+	depends on EXPERIMENTAL
+	help
+	  This adds the CPUFreq driver for VIA C7 processors.
+
+	  If in doubt, say N.
+
 comment "shared options"
 
 config X86_ACPI_CPUFREQ_PROC_INTF
diff --git a/arch/i386/kernel/cpu/cpufreq/Makefile b/arch/i386/kernel/cpu/cpufreq/Makefile
index 8de3abe322a9..560f7760dae5 100644
--- a/arch/i386/kernel/cpu/cpufreq/Makefile
+++ b/arch/i386/kernel/cpu/cpufreq/Makefile
@@ -2,6 +2,7 @@ obj-$(CONFIG_X86_POWERNOW_K6)		+= powernow-k6.o
 obj-$(CONFIG_X86_POWERNOW_K7)		+= powernow-k7.o
 obj-$(CONFIG_X86_POWERNOW_K8)		+= powernow-k8.o
 obj-$(CONFIG_X86_LONGHAUL)		+= longhaul.o
+obj-$(CONFIG_X86_E_POWERSAVER)		+= e_powersaver.o
 obj-$(CONFIG_ELAN_CPUFREQ)		+= elanfreq.o
 obj-$(CONFIG_SC520_CPUFREQ)		+= sc520_freq.o
 obj-$(CONFIG_X86_LONGRUN)		+= longrun.o  
diff --git a/arch/i386/kernel/cpu/cpufreq/e_powersaver.c b/arch/i386/kernel/cpu/cpufreq/e_powersaver.c
new file mode 100644
index 000000000000..f43d98e11cc7
--- /dev/null
+++ b/arch/i386/kernel/cpu/cpufreq/e_powersaver.c
@@ -0,0 +1,334 @@
+/*
+ *  Based on documentation provided by Dave Jones. Thanks!
+ *
+ *  Licensed under the terms of the GNU GPL License version 2.
+ *
+ *  BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/cpufreq.h>
+#include <linux/ioport.h>
+#include <linux/slab.h>
+
+#include <asm/msr.h>
+#include <asm/tsc.h>
+#include <asm/timex.h>
+#include <asm/io.h>
+#include <asm/delay.h>
+
+#define EPS_BRAND_C7M	0
+#define EPS_BRAND_C7	1
+#define EPS_BRAND_EDEN	2
+#define EPS_BRAND_C3	3
+
+struct eps_cpu_data {
+	u32 fsb;
+	struct cpufreq_frequency_table freq_table[];
+};
+
+static struct eps_cpu_data *eps_cpu[NR_CPUS];
+
+
+static unsigned int eps_get(unsigned int cpu)
+{
+	struct eps_cpu_data *centaur;
+	u32 lo, hi;
+
+	if (cpu)
+		return 0;
+	centaur = eps_cpu[cpu];
+	if (centaur == NULL)
+		return 0;
+
+	/* Return current frequency */
+	rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
+	return centaur->fsb * ((lo >> 8) & 0xff);
+}
+
+static int eps_set_state(struct eps_cpu_data *centaur,
+			 unsigned int cpu,
+			 u32 dest_state)
+{
+	struct cpufreq_freqs freqs;
+	u32 lo, hi;
+	int err = 0;
+	int i;
+
+	freqs.old = eps_get(cpu);
+	freqs.new = centaur->fsb * ((dest_state >> 8) & 0xff);
+	freqs.cpu = cpu;
+	cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
+
+	/* Wait while CPU is busy */
+	rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
+	i = 0;
+	while (lo & ((1 << 16) | (1 << 17))) {
+		udelay(16);
+		rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
+		i++;
+		if (unlikely(i > 64)) {
+			err = -ENODEV;
+			goto postchange;
+		}
+	}
+	/* Set new multiplier and voltage */
+	wrmsr(MSR_IA32_PERF_CTL, dest_state & 0xffff, 0);
+	/* Wait until transition end */
+	i = 0;
+	do {
+		udelay(16);
+		rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
+		i++;
+		if (unlikely(i > 64)) {
+			err = -ENODEV;
+			goto postchange;
+		}
+	} while (lo & ((1 << 16) | (1 << 17)));
+
+	/* Return current frequency */
+postchange:
+	rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
+	freqs.new = centaur->fsb * ((lo >> 8) & 0xff);
+
+	cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
+	return err;
+}
+
+static int eps_target(struct cpufreq_policy *policy,
+			       unsigned int target_freq,
+			       unsigned int relation)
+{
+	struct eps_cpu_data *centaur;
+	unsigned int newstate = 0;
+	unsigned int cpu = policy->cpu;
+	unsigned int dest_state;
+	int ret;
+
+	if (unlikely(eps_cpu[cpu] == NULL))
+		return -ENODEV;
+	centaur = eps_cpu[cpu];
+
+	if (unlikely(cpufreq_frequency_table_target(policy,
+			&eps_cpu[cpu]->freq_table[0],
+			target_freq,
+			relation,
+			&newstate))) {
+		return -EINVAL;
+	}
+
+	/* Make frequency transition */
+	dest_state = centaur->freq_table[newstate].index & 0xffff;
+	ret = eps_set_state(centaur, cpu, dest_state);
+	if (ret)
+		printk(KERN_ERR "eps: Timeout!\n");
+	return ret;
+}
+
+static int eps_verify(struct cpufreq_policy *policy)
+{
+	return cpufreq_frequency_table_verify(policy,
+			&eps_cpu[policy->cpu]->freq_table[0]);
+}
+
+static int eps_cpu_init(struct cpufreq_policy *policy)
+{
+	unsigned int i;
+	u32 lo, hi;
+	u64 val;
+	u8 current_multiplier, current_voltage;
+	u8 max_multiplier, max_voltage;
+	u8 min_multiplier, min_voltage;
+	u8 brand;
+	u32 fsb;
+	struct eps_cpu_data *centaur;
+	struct cpufreq_frequency_table *f_table;
+	int k, step, voltage;
+	int ret;
+	int states;
+
+	if (policy->cpu != 0)
+		return -ENODEV;
+
+	/* Check brand */
+	printk("eps: Detected VIA ");
+	rdmsr(0x1153, lo, hi);
+	brand = (((lo >> 2) ^ lo) >> 18) & 3;
+	switch(brand) {
+	case EPS_BRAND_C7M:
+		printk("C7-M\n");
+		break;
+	case EPS_BRAND_C7:
+		printk("C7\n");
+		break;
+	case EPS_BRAND_EDEN:
+		printk("Eden\n");
+		break;
+	case EPS_BRAND_C3:
+		printk("C3\n");
+		return -ENODEV;
+		break;
+	}
+	/* Enable Enhanced PowerSaver */
+	rdmsrl(MSR_IA32_MISC_ENABLE, val);
+	if (!(val & 1 << 16)) {
+		val |= 1 << 16;
+		wrmsrl(MSR_IA32_MISC_ENABLE, val);
+		/* Can be locked at 0 */
+		rdmsrl(MSR_IA32_MISC_ENABLE, val);
+		if (!(val & 1 << 16)) {
+			printk("eps: Can't enable Enhanced PowerSaver\n");
+			return -ENODEV;
+		}
+	}
+
+	/* Print voltage and multiplier */
+	rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
+	current_voltage = lo & 0xff;
+	printk("eps: Current voltage = %dmV\n", current_voltage * 16 + 700);
+	current_multiplier = (lo >> 8) & 0xff;
+	printk("eps: Current multiplier = %d\n", current_multiplier);
+
+	/* Print limits */
+	max_voltage = hi & 0xff;
+	printk("eps: Highest voltage = %dmV\n", max_voltage * 16 + 700);
+	max_multiplier = (hi >> 8) & 0xff;
+	printk("eps: Highest multiplier = %d\n", max_multiplier);
+	min_voltage = (hi >> 16) & 0xff;
+	printk("eps: Lowest voltage = %dmV\n", min_voltage * 16 + 700);
+	min_multiplier = (hi >> 24) & 0xff;
+	printk("eps: Lowest multiplier = %d\n", min_multiplier);
+
+	/* Sanity checks */
+	if (current_multiplier == 0 || max_multiplier == 0
+	    || min_multiplier == 0)
+		return -EINVAL;
+	if (current_multiplier > max_multiplier
+	    || max_multiplier <= min_multiplier)
+		return -EINVAL;
+	if (current_voltage > 0x1c || max_voltage > 0x1c)
+		return -EINVAL;
+	if (max_voltage < min_voltage)
+		return -EINVAL;
+
+	/* Calc FSB speed */
+	fsb = cpu_khz / current_multiplier;
+	/* Calc number of p-states supported */
+	if (brand == EPS_BRAND_C7M)
+		states = max_multiplier - min_multiplier + 1;
+	else
+		states = 2;
+
+	/* Allocate private data and frequency table for current cpu */
+	centaur = kzalloc(sizeof(struct eps_cpu_data)
+		    + (states + 1) * sizeof(struct cpufreq_frequency_table),
+		    GFP_KERNEL);
+	if (!centaur)
+		return -ENOMEM;
+	eps_cpu[0] = centaur;
+
+	/* Copy basic values */
+	centaur->fsb = fsb;
+
+	/* Fill frequency and MSR value table */
+	f_table = &centaur->freq_table[0];
+	if (brand != EPS_BRAND_C7M) {
+		f_table[0].frequency = fsb * min_multiplier;
+		f_table[0].index = (min_multiplier << 8) | min_voltage;
+		f_table[1].frequency = fsb * max_multiplier;
+		f_table[1].index = (max_multiplier << 8) | max_voltage;
+		f_table[2].frequency = CPUFREQ_TABLE_END;
+	} else {
+		k = 0;
+		step = ((max_voltage - min_voltage) * 256)
+			/ (max_multiplier - min_multiplier);
+		for (i = min_multiplier; i <= max_multiplier; i++) {
+			voltage = (k * step) / 256 + min_voltage;
+			f_table[k].frequency = fsb * i;
+			f_table[k].index = (i << 8) | voltage;
+			k++;
+		}
+		f_table[k].frequency = CPUFREQ_TABLE_END;
+	}
+
+	policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
+	policy->cpuinfo.transition_latency = 140000; /* 844mV -> 700mV in ns */
+	policy->cur = fsb * current_multiplier;
+
+	ret = cpufreq_frequency_table_cpuinfo(policy, &centaur->freq_table[0]);
+	if (ret) {
+		kfree(centaur);
+		return ret;
+	}
+
+	cpufreq_frequency_table_get_attr(&centaur->freq_table[0], policy->cpu);
+	return 0;
+}
+
+static int eps_cpu_exit(struct cpufreq_policy *policy)
+{
+	unsigned int cpu = policy->cpu;
+	struct eps_cpu_data *centaur;
+	u32 lo, hi;
+
+	if (eps_cpu[cpu] == NULL)
+		return -ENODEV;
+	centaur = eps_cpu[cpu];
+
+	/* Get max frequency */
+	rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
+	/* Set max frequency */
+	eps_set_state(centaur, cpu, hi & 0xffff);
+	/* Bye */
+	cpufreq_frequency_table_put_attr(policy->cpu);
+	kfree(eps_cpu[cpu]);
+	eps_cpu[cpu] = NULL;
+	return 0;
+}
+
+static struct freq_attr* eps_attr[] = {
+	&cpufreq_freq_attr_scaling_available_freqs,
+	NULL,
+};
+
+static struct cpufreq_driver eps_driver = {
+	.verify		= eps_verify,
+	.target		= eps_target,
+	.init		= eps_cpu_init,
+	.exit		= eps_cpu_exit,
+	.get		= eps_get,
+	.name		= "e_powersaver",
+	.owner		= THIS_MODULE,
+	.attr		= eps_attr,
+};
+
+static int __init eps_init(void)
+{
+	struct cpuinfo_x86 *c = cpu_data;
+
+	/* This driver will work only on Centaur C7 processors with
+	 * Enhanced SpeedStep/PowerSaver registers */
+	if (c->x86_vendor != X86_VENDOR_CENTAUR
+	    || c->x86 != 6 || c->x86_model != 10)
+		return -ENODEV;
+	if (!cpu_has(c, X86_FEATURE_EST))
+		return -ENODEV;
+
+	if (cpufreq_register_driver(&eps_driver))
+		return -EINVAL;
+	return 0;
+}
+
+static void __exit eps_exit(void)
+{
+	cpufreq_unregister_driver(&eps_driver);
+}
+
+MODULE_AUTHOR("Rafa� Bilski <rafalbilski@interia.pl>");
+MODULE_DESCRIPTION("Enhanced PowerSaver driver for VIA C7 CPU's.");
+MODULE_LICENSE("GPL");
+
+module_init(eps_init);
+module_exit(eps_exit);
diff --git a/arch/i386/kernel/cpu/cpufreq/longhaul.c b/arch/i386/kernel/cpu/cpufreq/longhaul.c
index e940e00b96c9..b59878a0d9b3 100644
--- a/arch/i386/kernel/cpu/cpufreq/longhaul.c
+++ b/arch/i386/kernel/cpu/cpufreq/longhaul.c
@@ -8,12 +8,11 @@
  *  VIA have currently 3 different versions of Longhaul.
  *  Version 1 (Longhaul) uses the BCR2 MSR at 0x1147.
  *   It is present only in Samuel 1 (C5A), Samuel 2 (C5B) stepping 0.
- *  Version 2 of longhaul is the same as v1, but adds voltage scaling.
- *   Present in Samuel 2 (steppings 1-7 only) (C5B), and Ezra (C5C)
- *   voltage scaling support has currently been disabled in this driver
- *   until we have code that gets it right.
+ *  Version 2 of longhaul is backward compatible with v1, but adds
+ *   LONGHAUL MSR for purpose of both frequency and voltage scaling.
+ *   Present in Samuel 2 (steppings 1-7 only) (C5B), and Ezra (C5C).
  *  Version 3 of longhaul got renamed to Powersaver and redesigned
- *   to use the POWERSAVER MSR at 0x110a.
+ *   to use only the POWERSAVER MSR at 0x110a.
  *   It is present in Ezra-T (C5M), Nehemiah (C5X) and above.
  *   It's pretty much the same feature wise to longhaul v2, though
  *   there is provision for scaling FSB too, but this doesn't work
@@ -51,10 +50,12 @@
 #define	CPU_EZRA	3
 #define	CPU_EZRA_T	4
 #define	CPU_NEHEMIAH	5
+#define	CPU_NEHEMIAH_C	6
 
 /* Flags */
 #define USE_ACPI_C3		(1 << 1)
 #define USE_NORTHBRIDGE		(1 << 2)
+#define USE_VT8235		(1 << 3)
 
 static int cpu_model;
 static unsigned int numscales=16;
@@ -63,7 +64,8 @@ static unsigned int fsb;
 static struct mV_pos *vrm_mV_table;
 static unsigned char *mV_vrm_table;
 struct f_msr {
-	unsigned char vrm;
+	u8 vrm;
+	u8 pos;
 };
 static struct f_msr f_msr_table[32];
 
@@ -73,10 +75,10 @@ static int can_scale_voltage;
 static struct acpi_processor *pr = NULL;
 static struct acpi_processor_cx *cx = NULL;
 static u8 longhaul_flags;
+static u8 longhaul_pos;
 
 /* Module parameters */
 static int scale_voltage;
-static int ignore_latency;
 
 #define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "longhaul", msg)
 
@@ -164,41 +166,79 @@ static void do_longhaul1(unsigned int clock_ratio_index)
 static void do_powersaver(int cx_address, unsigned int clock_ratio_index)
 {
 	union msr_longhaul longhaul;
+	u8 dest_pos;
 	u32 t;
 
+	dest_pos = f_msr_table[clock_ratio_index].pos;
+
 	rdmsrl(MSR_VIA_LONGHAUL, longhaul.val);
+	/* Setup new frequency */
 	longhaul.bits.RevisionKey = longhaul.bits.RevisionID;
 	longhaul.bits.SoftBusRatio = clock_ratio_index & 0xf;
 	longhaul.bits.SoftBusRatio4 = (clock_ratio_index & 0x10) >> 4;
-	longhaul.bits.EnableSoftBusRatio = 1;
-
-	if (can_scale_voltage) {
+	/* Setup new voltage */
+	if (can_scale_voltage)
 		longhaul.bits.SoftVID = f_msr_table[clock_ratio_index].vrm;
+	/* Sync to timer tick */
+	safe_halt();
+	/* Raise voltage if necessary */
+	if (can_scale_voltage && longhaul_pos < dest_pos) {
 		longhaul.bits.EnableSoftVID = 1;
+		wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
+		/* Change voltage */
+		if (!cx_address) {
+			ACPI_FLUSH_CPU_CACHE();
+			halt();
+		} else {
+			ACPI_FLUSH_CPU_CACHE();
+			/* Invoke C3 */
+			inb(cx_address);
+			/* Dummy op - must do something useless after P_LVL3
+			 * read */
+			t = inl(acpi_gbl_FADT.xpm_timer_block.address);
+		}
+		longhaul.bits.EnableSoftVID = 0;
+		wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
+		longhaul_pos = dest_pos;
 	}
 
-	/* Sync to timer tick */
-	safe_halt();
 	/* Change frequency on next halt or sleep */
+	longhaul.bits.EnableSoftBusRatio = 1;
 	wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
 	if (!cx_address) {
 		ACPI_FLUSH_CPU_CACHE();
-		/* Invoke C1 */
 		halt();
 	} else {
 		ACPI_FLUSH_CPU_CACHE();
 		/* Invoke C3 */
 		inb(cx_address);
 		/* Dummy op - must do something useless after P_LVL3 read */
-		t = inl(acpi_fadt.xpm_tmr_blk.address);
+		t = inl(acpi_gbl_FADT.xpm_timer_block.address);
 	}
 	/* Disable bus ratio bit */
-	local_irq_disable();
-	longhaul.bits.RevisionKey = longhaul.bits.RevisionID;
 	longhaul.bits.EnableSoftBusRatio = 0;
-	longhaul.bits.EnableSoftBSEL = 0;
-	longhaul.bits.EnableSoftVID = 0;
 	wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
+
+	/* Reduce voltage if necessary */
+	if (can_scale_voltage && longhaul_pos > dest_pos) {
+		longhaul.bits.EnableSoftVID = 1;
+		wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
+		/* Change voltage */
+		if (!cx_address) {
+			ACPI_FLUSH_CPU_CACHE();
+			halt();
+		} else {
+			ACPI_FLUSH_CPU_CACHE();
+			/* Invoke C3 */
+			inb(cx_address);
+			/* Dummy op - must do something useless after P_LVL3
+			 * read */
+			t = inl(acpi_gbl_FADT.xpm_timer_block.address);
+		}
+		longhaul.bits.EnableSoftVID = 0;
+		wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
+		longhaul_pos = dest_pos;
+	}
 }
 
 /**
@@ -250,39 +290,30 @@ static void longhaul_setstate(unsigned int clock_ratio_index)
 		outb(3, 0x22);
 	} else if ((pr != NULL) && pr->flags.bm_control) {
  		/* Disable bus master arbitration */
-		acpi_set_register(ACPI_BITREG_ARB_DISABLE, 1,
-				  ACPI_MTX_DO_NOT_LOCK);
+		acpi_set_register(ACPI_BITREG_ARB_DISABLE, 1);
 	}
 	switch (longhaul_version) {
 
 	/*
 	 * Longhaul v1. (Samuel[C5A] and Samuel2 stepping 0[C5B])
 	 * Software controlled multipliers only.
-	 *
-	 * *NB* Until we get voltage scaling working v1 & v2 are the same code.
-	 * Longhaul v2 appears in Samuel2 Steppings 1->7 [C5b] and Ezra [C5C]
 	 */
 	case TYPE_LONGHAUL_V1:
-	case TYPE_LONGHAUL_V2:
 		do_longhaul1(clock_ratio_index);
 		break;
 
 	/*
+	 * Longhaul v2 appears in Samuel2 Steppings 1->7 [C5B] and Ezra [C5C]
+	 *
 	 * Longhaul v3 (aka Powersaver). (Ezra-T [C5M] & Nehemiah [C5N])
-	 * We can scale voltage with this too, but that's currently
-	 * disabled until we come up with a decent 'match freq to voltage'
-	 * algorithm.
-	 * When we add voltage scaling, we will also need to do the
-	 * voltage/freq setting in order depending on the direction
-	 * of scaling (like we do in powernow-k7.c)
 	 * Nehemiah can do FSB scaling too, but this has never been proven
 	 * to work in practice.
 	 */
+	case TYPE_LONGHAUL_V2:
 	case TYPE_POWERSAVER:
 		if (longhaul_flags & USE_ACPI_C3) {
 			/* Don't allow wakeup */
-			acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0,
-					  ACPI_MTX_DO_NOT_LOCK);
+			acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
 			do_powersaver(cx->address, clock_ratio_index);
 		} else {
 			do_powersaver(0, clock_ratio_index);
@@ -295,8 +326,7 @@ static void longhaul_setstate(unsigned int clock_ratio_index)
 		outb(0, 0x22);
 	} else if ((pr != NULL) && pr->flags.bm_control) {
 		/* Enable bus master arbitration */
-		acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0,
-				  ACPI_MTX_DO_NOT_LOCK);
+		acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0);
 	}
 	outb(pic2_mask,0xA1);	/* restore mask */
 	outb(pic1_mask,0x21);
@@ -304,6 +334,7 @@ static void longhaul_setstate(unsigned int clock_ratio_index)
 	local_irq_restore(flags);
 	preempt_enable();
 
+	freqs.new = calc_speed(longhaul_get_cpu_mult());
 	cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
 }
 
@@ -318,31 +349,19 @@ static void longhaul_setstate(unsigned int clock_ratio_index)
 
 #define ROUNDING	0xf
 
-static int _guess(int guess, int mult)
-{
-	int target;
-
-	target = ((mult/10)*guess);
-	if (mult%10 != 0)
-		target += (guess/2);
-	target += ROUNDING/2;
-	target &= ~ROUNDING;
-	return target;
-}
-
-
 static int guess_fsb(int mult)
 {
-	int speed = (cpu_khz/1000);
+	int speed = cpu_khz / 1000;
 	int i;
-	int speeds[] = { 66, 100, 133, 200 };
-
-	speed += ROUNDING/2;
-	speed &= ~ROUNDING;
-
-	for (i=0; i<4; i++) {
-		if (_guess(speeds[i], mult) == speed)
-			return speeds[i];
+	int speeds[] = { 666, 1000, 1333, 2000 };
+	int f_max, f_min;
+
+	for (i = 0; i < 4; i++) {
+		f_max = ((speeds[i] * mult) + 50) / 100;
+		f_max += (ROUNDING / 2);
+		f_min = f_max - ROUNDING;
+		if ((speed <= f_max) && (speed >= f_min))
+			return speeds[i] / 10;
 	}
 	return 0;
 }
@@ -350,71 +369,44 @@ static int guess_fsb(int mult)
 
 static int __init longhaul_get_ranges(void)
 {
-	unsigned long invalue;
-	unsigned int ezra_t_multipliers[32]= {
-			90,  30,  40, 100,  55,  35,  45,  95,
-			50,  70,  80,  60, 120,  75,  85,  65,
-			-1, 110, 120,  -1, 135, 115, 125, 105,
-			130, 150, 160, 140,  -1, 155,  -1, 145 };
 	unsigned int j, k = 0;
-	union msr_longhaul longhaul;
-	int mult = 0;
+	int mult;
 
-	switch (longhaul_version) {
-	case TYPE_LONGHAUL_V1:
-	case TYPE_LONGHAUL_V2:
-		/* Ugh, Longhaul v1 didn't have the min/max MSRs.
-		   Assume min=3.0x & max = whatever we booted at. */
+	/* Get current frequency */
+	mult = longhaul_get_cpu_mult();
+	if (mult == -1) {
+		printk(KERN_INFO PFX "Invalid (reserved) multiplier!\n");
+		return -EINVAL;
+	}
+	fsb = guess_fsb(mult);
+	if (fsb == 0) {
+		printk(KERN_INFO PFX "Invalid (reserved) FSB!\n");
+		return -EINVAL;
+	}
+	/* Get max multiplier - as we always did.
+	 * Longhaul MSR is usefull only when voltage scaling is enabled.
+	 * C3 is booting at max anyway. */
+	maxmult = mult;
+	/* Get min multiplier */
+	switch (cpu_model) {
+	case CPU_NEHEMIAH:
+		minmult = 50;
+		break;
+	case CPU_NEHEMIAH_C:
+		minmult = 40;
+		break;
+	default:
 		minmult = 30;
-		maxmult = mult = longhaul_get_cpu_mult();
 		break;
-
-	case TYPE_POWERSAVER:
-		/* Ezra-T */
-		if (cpu_model==CPU_EZRA_T) {
-			minmult = 30;
-			rdmsrl (MSR_VIA_LONGHAUL, longhaul.val);
-			invalue = longhaul.bits.MaxMHzBR;
-			if (longhaul.bits.MaxMHzBR4)
-				invalue += 16;
-			maxmult = mult = ezra_t_multipliers[invalue];
-			break;
-		}
-
-		/* Nehemiah */
-		if (cpu_model==CPU_NEHEMIAH) {
-			rdmsrl (MSR_VIA_LONGHAUL, longhaul.val);
-
-			/*
-			 * TODO: This code works, but raises a lot of questions.
-			 * - Some Nehemiah's seem to have broken Min/MaxMHzBR's.
-			 *   We get around this by using a hardcoded multiplier of 4.0x
-			 *   for the minimimum speed, and the speed we booted up at for the max.
-			 *   This is done in longhaul_get_cpu_mult() by reading the EBLCR register.
-			 * - According to some VIA documentation EBLCR is only
-			 *   in pre-Nehemiah C3s. How this still works is a mystery.
-			 *   We're possibly using something undocumented and unsupported,
-			 *   But it works, so we don't grumble.
-			 */
-			minmult=40;
-			maxmult = mult = longhaul_get_cpu_mult();
-			break;
-		}
 	}
-	fsb = guess_fsb(mult);
 
 	dprintk ("MinMult:%d.%dx MaxMult:%d.%dx\n",
 		 minmult/10, minmult%10, maxmult/10, maxmult%10);
 
-	if (fsb == 0) {
-		printk (KERN_INFO PFX "Invalid (reserved) FSB!\n");
-		return -EINVAL;
-	}
-
 	highest_speed = calc_speed(maxmult);
 	lowest_speed = calc_speed(minmult);
 	dprintk ("FSB:%dMHz  Lowest speed: %s   Highest speed:%s\n", fsb,
-		 print_speed(lowest_speed/1000), 
+		 print_speed(lowest_speed/1000),
 		 print_speed(highest_speed/1000));
 
 	if (lowest_speed == highest_speed) {
@@ -458,6 +450,7 @@ static void __init longhaul_setup_voltagescaling(void)
 	union msr_longhaul longhaul;
 	struct mV_pos minvid, maxvid;
 	unsigned int j, speed, pos, kHz_step, numvscales;
+	int min_vid_speed;
 
 	rdmsrl(MSR_VIA_LONGHAUL, longhaul.val);
 	if (!(longhaul.bits.RevisionID & 1)) {
@@ -471,14 +464,14 @@ static void __init longhaul_setup_voltagescaling(void)
 		mV_vrm_table = &mV_vrm85[0];
 	} else {
 		printk (KERN_INFO PFX "Mobile VRM\n");
+		if (cpu_model < CPU_NEHEMIAH)
+			return;
 		vrm_mV_table = &mobilevrm_mV[0];
 		mV_vrm_table = &mV_mobilevrm[0];
 	}
 
 	minvid = vrm_mV_table[longhaul.bits.MinimumVID];
 	maxvid = vrm_mV_table[longhaul.bits.MaximumVID];
-	numvscales = maxvid.pos - minvid.pos + 1;
-	kHz_step = (highest_speed - lowest_speed) / numvscales;
 
 	if (minvid.mV == 0 || maxvid.mV == 0 || minvid.mV > maxvid.mV) {
 		printk (KERN_INFO PFX "Bogus values Min:%d.%03d Max:%d.%03d. "
@@ -494,20 +487,59 @@ static void __init longhaul_setup_voltagescaling(void)
 		return;
 	}
 
-	printk(KERN_INFO PFX "Max VID=%d.%03d  Min VID=%d.%03d, %d possible voltage scales\n",
+	/* How many voltage steps */
+	numvscales = maxvid.pos - minvid.pos + 1;
+	printk(KERN_INFO PFX
+		"Max VID=%d.%03d  "
+		"Min VID=%d.%03d, "
+		"%d possible voltage scales\n",
 		maxvid.mV/1000, maxvid.mV%1000,
 		minvid.mV/1000, minvid.mV%1000,
 		numvscales);
-	
+
+	/* Calculate max frequency at min voltage */
+	j = longhaul.bits.MinMHzBR;
+	if (longhaul.bits.MinMHzBR4)
+		j += 16;
+	min_vid_speed = eblcr_table[j];
+	if (min_vid_speed == -1)
+		return;
+	switch (longhaul.bits.MinMHzFSB) {
+	case 0:
+		min_vid_speed *= 13333;
+		break;
+	case 1:
+		min_vid_speed *= 10000;
+		break;
+	case 3:
+		min_vid_speed *= 6666;
+		break;
+	default:
+		return;
+		break;
+	}
+	if (min_vid_speed >= highest_speed)
+		return;
+	/* Calculate kHz for one voltage step */
+	kHz_step = (highest_speed - min_vid_speed) / numvscales;
+
+
 	j = 0;
 	while (longhaul_table[j].frequency != CPUFREQ_TABLE_END) {
 		speed = longhaul_table[j].frequency;
-		pos = (speed - lowest_speed) / kHz_step + minvid.pos;
+		if (speed > min_vid_speed)
+			pos = (speed - min_vid_speed) / kHz_step + minvid.pos;
+		else
+			pos = minvid.pos;
 		f_msr_table[longhaul_table[j].index].vrm = mV_vrm_table[pos];
+		f_msr_table[longhaul_table[j].index].pos = pos;
 		j++;
 	}
 
+	longhaul_pos = maxvid.pos;
 	can_scale_voltage = 1;
+	printk(KERN_INFO PFX "Voltage scaling enabled. "
+		"Use of \"conservative\" governor is highly recommended.\n");
 }
 
 
@@ -576,20 +608,51 @@ static int enable_arbiter_disable(void)
 	if (dev != NULL) {
 		/* Enable access to port 0x22 */
 		pci_read_config_byte(dev, reg, &pci_cmd);
-		if ( !(pci_cmd & 1<<7) ) {
+		if (!(pci_cmd & 1<<7)) {
 			pci_cmd |= 1<<7;
 			pci_write_config_byte(dev, reg, pci_cmd);
+			pci_read_config_byte(dev, reg, &pci_cmd);
+			if (!(pci_cmd & 1<<7)) {
+				printk(KERN_ERR PFX
+					"Can't enable access to port 0x22.\n");
+				return 0;
+			}
 		}
 		return 1;
 	}
 	return 0;
 }
 
+static int longhaul_setup_vt8235(void)
+{
+	struct pci_dev *dev;
+	u8 pci_cmd;
+
+	/* Find VT8235 southbridge */
+	dev = pci_find_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, NULL);
+	if (dev != NULL) {
+		/* Set transition time to max */
+		pci_read_config_byte(dev, 0xec, &pci_cmd);
+		pci_cmd &= ~(1 << 2);
+		pci_write_config_byte(dev, 0xec, pci_cmd);
+		pci_read_config_byte(dev, 0xe4, &pci_cmd);
+		pci_cmd &= ~(1 << 7);
+		pci_write_config_byte(dev, 0xe4, pci_cmd);
+		pci_read_config_byte(dev, 0xe5, &pci_cmd);
+		pci_cmd |= 1 << 7;
+		pci_write_config_byte(dev, 0xe5, pci_cmd);
+		return 1;
+	}
+	return 0;
+}
+
 static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
 {
 	struct cpuinfo_x86 *c = cpu_data;
 	char *cpuname=NULL;
 	int ret;
+	u32 lo, hi;
+	int vt8235_present;
 
 	/* Check what we have on this motherboard */
 	switch (c->x86_model) {
@@ -602,16 +665,20 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
 		break;
 
 	case 7:
-		longhaul_version = TYPE_LONGHAUL_V1;
 		switch (c->x86_mask) {
 		case 0:
+			longhaul_version = TYPE_LONGHAUL_V1;
 			cpu_model = CPU_SAMUEL2;
 			cpuname = "C3 'Samuel 2' [C5B]";
-			/* Note, this is not a typo, early Samuel2's had Samuel1 ratios. */
-			memcpy (clock_ratio, samuel1_clock_ratio, sizeof(samuel1_clock_ratio));
-			memcpy (eblcr_table, samuel2_eblcr, sizeof(samuel2_eblcr));
+			/* Note, this is not a typo, early Samuel2's had
+			 * Samuel1 ratios. */
+			memcpy(clock_ratio, samuel1_clock_ratio,
+				sizeof(samuel1_clock_ratio));
+			memcpy(eblcr_table, samuel2_eblcr,
+				sizeof(samuel2_eblcr));
 			break;
 		case 1 ... 15:
+			longhaul_version = TYPE_LONGHAUL_V2;
 			if (c->x86_mask < 8) {
 				cpu_model = CPU_SAMUEL2;
 				cpuname = "C3 'Samuel 2' [C5B]";
@@ -619,8 +686,10 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
 				cpu_model = CPU_EZRA;
 				cpuname = "C3 'Ezra' [C5C]";
 			}
-			memcpy (clock_ratio, ezra_clock_ratio, sizeof(ezra_clock_ratio));
-			memcpy (eblcr_table, ezra_eblcr, sizeof(ezra_eblcr));
+			memcpy(clock_ratio, ezra_clock_ratio,
+				sizeof(ezra_clock_ratio));
+			memcpy(eblcr_table, ezra_eblcr,
+				sizeof(ezra_eblcr));
 			break;
 		}
 		break;
@@ -635,24 +704,24 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
 		break;
 
 	case 9:
-		cpu_model = CPU_NEHEMIAH;
 		longhaul_version = TYPE_POWERSAVER;
-		numscales=32;
+		numscales = 32;
+		memcpy(clock_ratio,
+		       nehemiah_clock_ratio,
+		       sizeof(nehemiah_clock_ratio));
+		memcpy(eblcr_table, nehemiah_eblcr, sizeof(nehemiah_eblcr));
 		switch (c->x86_mask) {
 		case 0 ... 1:
-			cpuname = "C3 'Nehemiah A' [C5N]";
-			memcpy (clock_ratio, nehemiah_a_clock_ratio, sizeof(nehemiah_a_clock_ratio));
-			memcpy (eblcr_table, nehemiah_a_eblcr, sizeof(nehemiah_a_eblcr));
+			cpu_model = CPU_NEHEMIAH;
+			cpuname = "C3 'Nehemiah A' [C5XLOE]";
 			break;
 		case 2 ... 4:
-			cpuname = "C3 'Nehemiah B' [C5N]";
-			memcpy (clock_ratio, nehemiah_b_clock_ratio, sizeof(nehemiah_b_clock_ratio));
-			memcpy (eblcr_table, nehemiah_b_eblcr, sizeof(nehemiah_b_eblcr));
+			cpu_model = CPU_NEHEMIAH;
+			cpuname = "C3 'Nehemiah B' [C5XLOH]";
 			break;
 		case 5 ... 15:
-			cpuname = "C3 'Nehemiah C' [C5N]";
-			memcpy (clock_ratio, nehemiah_c_clock_ratio, sizeof(nehemiah_c_clock_ratio));
-			memcpy (eblcr_table, nehemiah_c_eblcr, sizeof(nehemiah_c_eblcr));
+			cpu_model = CPU_NEHEMIAH_C;
+			cpuname = "C3 'Nehemiah C' [C5P]";
 			break;
 		}
 		break;
@@ -661,6 +730,13 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
 		cpuname = "Unknown";
 		break;
 	}
+	/* Check Longhaul ver. 2 */
+	if (longhaul_version == TYPE_LONGHAUL_V2) {
+		rdmsr(MSR_VIA_LONGHAUL, lo, hi);
+		if (lo == 0 && hi == 0)
+			/* Looks like MSR isn't present */
+			longhaul_version = TYPE_LONGHAUL_V1;
+	}
 
 	printk (KERN_INFO PFX "VIA %s CPU detected.  ", cpuname);
 	switch (longhaul_version) {
@@ -673,15 +749,18 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
 		break;
 	};
 
+	/* Doesn't hurt */
+	vt8235_present = longhaul_setup_vt8235();
+
 	/* Find ACPI data for processor */
-	acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT, ACPI_UINT32_MAX,
-			    &longhaul_walk_callback, NULL, (void *)&pr);
+	acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT,
+				ACPI_UINT32_MAX, &longhaul_walk_callback,
+				NULL, (void *)&pr);
 
 	/* Check ACPI support for C3 state */
-	if ((pr != NULL) && (longhaul_version == TYPE_POWERSAVER)) {
+	if (pr != NULL && longhaul_version != TYPE_LONGHAUL_V1) {
 		cx = &pr->power.states[ACPI_STATE_C3];
-		if (cx->address > 0 &&
-		   (cx->latency <= 1000 || ignore_latency != 0) ) {
+		if (cx->address > 0 && cx->latency <= 1000) {
 			longhaul_flags |= USE_ACPI_C3;
 			goto print_support_type;
 		}
@@ -691,8 +770,11 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
 		longhaul_flags |= USE_NORTHBRIDGE;
 		goto print_support_type;
 	}
-
-	/* No ACPI C3 or we can't use it */
+	/* Use VT8235 southbridge if present */
+	if (longhaul_version == TYPE_POWERSAVER && vt8235_present) {
+		longhaul_flags |= USE_VT8235;
+		goto print_support_type;
+	}
 	/* Check ACPI support for bus master arbiter disable */
 	if ((pr == NULL) || !(pr->flags.bm_control)) {
 		printk(KERN_ERR PFX
@@ -701,18 +783,18 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
 	}
 
 print_support_type:
-	if (!(longhaul_flags & USE_NORTHBRIDGE)) {
-		printk (KERN_INFO PFX "Using ACPI support.\n");
-	} else {
+	if (longhaul_flags & USE_NORTHBRIDGE)
 		printk (KERN_INFO PFX "Using northbridge support.\n");
-	}
+	else if (longhaul_flags & USE_VT8235)
+		printk (KERN_INFO PFX "Using VT8235 support.\n");
+	else
+		printk (KERN_INFO PFX "Using ACPI support.\n");
 
 	ret = longhaul_get_ranges();
 	if (ret != 0)
 		return ret;
 
-	if ((longhaul_version==TYPE_LONGHAUL_V2 || longhaul_version==TYPE_POWERSAVER) &&
-		 (scale_voltage != 0))
+	if ((longhaul_version != TYPE_LONGHAUL_V1) && (scale_voltage != 0))
 		longhaul_setup_voltagescaling();
 
 	policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
@@ -800,8 +882,6 @@ static void __exit longhaul_exit(void)
 
 module_param (scale_voltage, int, 0644);
 MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor");
-module_param(ignore_latency, int, 0644);
-MODULE_PARM_DESC(ignore_latency, "Skip ACPI C3 latency test");
 
 MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>");
 MODULE_DESCRIPTION ("Longhaul driver for VIA Cyrix processors.");
diff --git a/arch/i386/kernel/cpu/cpufreq/longhaul.h b/arch/i386/kernel/cpu/cpufreq/longhaul.h
index bc4682aad69b..bb0a04b1d1ab 100644
--- a/arch/i386/kernel/cpu/cpufreq/longhaul.h
+++ b/arch/i386/kernel/cpu/cpufreq/longhaul.h
@@ -235,84 +235,14 @@ static int __initdata ezrat_eblcr[32] = {
 /*
  * VIA C3 Nehemiah */
 
-static int __initdata nehemiah_a_clock_ratio[32] = {
+static int __initdata  nehemiah_clock_ratio[32] = {
 	100, /* 0000 -> 10.0x */
 	160, /* 0001 -> 16.0x */
-	-1,  /* 0010 ->  RESERVED */
-	90,  /* 0011 ->  9.0x */
-	95,  /* 0100 ->  9.5x */
-	-1,  /* 0101 ->  RESERVED */
-	-1,  /* 0110 ->  RESERVED */
-	55,  /* 0111 ->  5.5x */
-	60,  /* 1000 ->  6.0x */
-	70,  /* 1001 ->  7.0x */
-	80,  /* 1010 ->  8.0x */
-	50,  /* 1011 ->  5.0x */
-	65,  /* 1100 ->  6.5x */
-	75,  /* 1101 ->  7.5x */
-	85,  /* 1110 ->  8.5x */
-	120, /* 1111 -> 12.0x */
-	100, /* 0000 -> 10.0x */
-	-1,  /* 0001 -> RESERVED */
-	120, /* 0010 -> 12.0x */
-	90,  /* 0011 ->  9.0x */
-	105, /* 0100 -> 10.5x */
-	115, /* 0101 -> 11.5x */
-	125, /* 0110 -> 12.5x */
-	135, /* 0111 -> 13.5x */
-	140, /* 1000 -> 14.0x */
-	150, /* 1001 -> 15.0x */
-	160, /* 1010 -> 16.0x */
-	130, /* 1011 -> 13.0x */
-	145, /* 1100 -> 14.5x */
-	155, /* 1101 -> 15.5x */
-	-1,  /* 1110 -> RESERVED (13.0x) */
-	120, /* 1111 -> 12.0x */
-};
-
-static int __initdata  nehemiah_b_clock_ratio[32] = {
-	100, /* 0000 -> 10.0x */
-	160, /* 0001 -> 16.0x */
-	-1,  /* 0010 ->  RESERVED */
-	90,  /* 0011 ->  9.0x */
-	95,  /* 0100 ->  9.5x */
-	-1,  /* 0101 ->  RESERVED */
-	-1,  /* 0110 ->  RESERVED */
-	55,  /* 0111 ->  5.5x */
-	60,  /* 1000 ->  6.0x */
-	70,  /* 1001 ->  7.0x */
-	80,  /* 1010 ->  8.0x */
-	50,  /* 1011 ->  5.0x */
-	65,  /* 1100 ->  6.5x */
-	75,  /* 1101 ->  7.5x */
-	85,  /* 1110 ->  8.5x */
-	120, /* 1111 -> 12.0x */
-	100, /* 0000 -> 10.0x */
-	110, /* 0001 -> 11.0x */
-	120, /* 0010 -> 12.0x */
-	90,  /* 0011 ->  9.0x */
-	105, /* 0100 -> 10.5x */
-	115, /* 0101 -> 11.5x */
-	125, /* 0110 -> 12.5x */
-	135, /* 0111 -> 13.5x */
-	140, /* 1000 -> 14.0x */
-	150, /* 1001 -> 15.0x */
-	160, /* 1010 -> 16.0x */
-	130, /* 1011 -> 13.0x */
-	145, /* 1100 -> 14.5x */
-	155, /* 1101 -> 15.5x */
-	-1,  /* 1110 -> RESERVED (13.0x) */
-	120, /* 1111 -> 12.0x */
-};
-
-static int __initdata  nehemiah_c_clock_ratio[32] = {
-	100, /* 0000 -> 10.0x */
-	160, /* 0001 -> 16.0x */
-	40,  /* 0010 ->  RESERVED */
+	40,  /* 0010 ->  4.0x */
 	90,  /* 0011 ->  9.0x */
 	95,  /* 0100 ->  9.5x */
 	-1,  /* 0101 ->  RESERVED */
-	45,  /* 0110 ->  RESERVED */
+	45,  /* 0110 ->  4.5x */
 	55,  /* 0111 ->  5.5x */
 	60,  /* 1000 ->  6.0x */
 	70,  /* 1001 ->  7.0x */
@@ -340,84 +270,14 @@ static int __initdata  nehemiah_c_clock_ratio[32] = {
 	120, /* 1111 -> 12.0x */
 };
 
-static int __initdata nehemiah_a_eblcr[32] = {
-	50,  /* 0000 ->  5.0x */
-	160, /* 0001 -> 16.0x */
-	-1,  /* 0010 ->  RESERVED */
-	100, /* 0011 -> 10.0x */
-	55,  /* 0100 ->  5.5x */
-	-1,  /* 0101 ->  RESERVED */
-	-1,  /* 0110 ->  RESERVED */
-	95,  /* 0111 ->  9.5x */
-	90,  /* 1000 ->  9.0x */
-	70,  /* 1001 ->  7.0x */
-	80,  /* 1010 ->  8.0x */
-	60,  /* 1011 ->  6.0x */
-	120, /* 1100 -> 12.0x */
-	75,  /* 1101 ->  7.5x */
-	85,  /* 1110 ->  8.5x */
-	65,  /* 1111 ->  6.5x */
-	90,  /* 0000 ->  9.0x */
-	-1,  /* 0001 -> RESERVED */
-	120, /* 0010 -> 12.0x */
-	100, /* 0011 -> 10.0x */
-	135, /* 0100 -> 13.5x */
-	115, /* 0101 -> 11.5x */
-	125, /* 0110 -> 12.5x */
-	105, /* 0111 -> 10.5x */
-	130, /* 1000 -> 13.0x */
-	150, /* 1001 -> 15.0x */
-	160, /* 1010 -> 16.0x */
-	140, /* 1011 -> 14.0x */
-	120, /* 1100 -> 12.0x */
-	155, /* 1101 -> 15.5x */
-	-1,  /* 1110 -> RESERVED (13.0x) */
-	145 /* 1111 -> 14.5x */
-   /* end of table  */
-};
-static int __initdata nehemiah_b_eblcr[32] = {
-	50,  /* 0000 ->  5.0x */
-	160, /* 0001 -> 16.0x */
-	-1,  /* 0010 ->  RESERVED */
-	100, /* 0011 -> 10.0x */
-	55,  /* 0100 ->  5.5x */
-	-1,  /* 0101 ->  RESERVED */
-	-1,  /* 0110 ->  RESERVED */
-	95,  /* 0111 ->  9.5x */
-	90,  /* 1000 ->  9.0x */
-	70,  /* 1001 ->  7.0x */
-	80,  /* 1010 ->  8.0x */
-	60,  /* 1011 ->  6.0x */
-	120, /* 1100 -> 12.0x */
-	75,  /* 1101 ->  7.5x */
-	85,  /* 1110 ->  8.5x */
-	65,  /* 1111 ->  6.5x */
-	90,  /* 0000 ->  9.0x */
-	110, /* 0001 -> 11.0x */
-	120, /* 0010 -> 12.0x */
-	100, /* 0011 -> 10.0x */
-	135, /* 0100 -> 13.5x */
-	115, /* 0101 -> 11.5x */
-	125, /* 0110 -> 12.5x */
-	105, /* 0111 -> 10.5x */
-	130, /* 1000 -> 13.0x */
-	150, /* 1001 -> 15.0x */
-	160, /* 1010 -> 16.0x */
-	140, /* 1011 -> 14.0x */
-	120, /* 1100 -> 12.0x */
-	155, /* 1101 -> 15.5x */
-	-1,  /* 1110 -> RESERVED (13.0x) */
-	145 /* 1111 -> 14.5x */
-	   /* end of table  */
-};
-static int __initdata nehemiah_c_eblcr[32] = {
+static int __initdata nehemiah_eblcr[32] = {
 	50,  /* 0000 ->  5.0x */
 	160, /* 0001 -> 16.0x */
-	40,  /* 0010 ->  RESERVED */
+	40,  /* 0010 ->  4.0x */
 	100, /* 0011 -> 10.0x */
 	55,  /* 0100 ->  5.5x */
 	-1,  /* 0101 ->  RESERVED */
-	45,  /* 0110 ->  RESERVED */
+	45,  /* 0110 ->  4.5x */
 	95,  /* 0111 ->  9.5x */
 	90,  /* 1000 ->  9.0x */
 	70,  /* 1001 ->  7.0x */
@@ -443,7 +303,6 @@ static int __initdata nehemiah_c_eblcr[32] = {
 	155, /* 1101 -> 15.5x */
 	-1,  /* 1110 -> RESERVED (13.0x) */
 	145 /* 1111 -> 14.5x */
-	  /* end of table  */
 };
 
 /*
diff --git a/arch/i386/kernel/cpu/cpufreq/powernow-k8.c b/arch/i386/kernel/cpu/cpufreq/powernow-k8.c
index 2d6491672559..fe3b67005ebb 100644
--- a/arch/i386/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/i386/kernel/cpu/cpufreq/powernow-k8.c
@@ -1289,7 +1289,11 @@ static unsigned int powernowk8_get (unsigned int cpu)
 	if (query_current_values_with_pending_wait(data))
 		goto out;
 
-	khz = find_khz_freq_from_fid(data->currfid);
+	if (cpu_family == CPU_HW_PSTATE)
+		khz = find_khz_freq_from_fiddid(data->currfid, data->currdid);
+	else
+		khz = find_khz_freq_from_fid(data->currfid);
+
 
 out:
 	set_cpus_allowed(current, oldmask);
diff --git a/arch/i386/kernel/cpu/cyrix.c b/arch/i386/kernel/cpu/cyrix.c
index c0c3b59de32c..de27bd07bc9c 100644
--- a/arch/i386/kernel/cpu/cyrix.c
+++ b/arch/i386/kernel/cpu/cyrix.c
@@ -6,6 +6,7 @@
 #include <asm/io.h>
 #include <asm/processor.h>
 #include <asm/timer.h>
+#include <asm/pci-direct.h>
 
 #include "cpu.h"
 
@@ -161,19 +162,19 @@ static void __cpuinit set_cx86_inc(void)
 static void __cpuinit geode_configure(void)
 {
 	unsigned long flags;
-	u8 ccr3, ccr4;
+	u8 ccr3;
 	local_irq_save(flags);
 
 	/* Suspend on halt power saving and enable #SUSP pin */
 	setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88);
 
 	ccr3 = getCx86(CX86_CCR3);
-	setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10);	/* Enable */
-	
-	ccr4 = getCx86(CX86_CCR4);
-	ccr4 |= 0x38;		/* FPU fast, DTE cache, Mem bypass */
+	setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10);	/* enable MAPEN */
 	
-	setCx86(CX86_CCR3, ccr3);
+
+	/* FPU fast, DTE cache, Mem bypass */
+	setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x38);
+	setCx86(CX86_CCR3, ccr3);			/* disable MAPEN */
 	
 	set_cx86_memwb();
 	set_cx86_reorder();	
@@ -183,14 +184,6 @@ static void __cpuinit geode_configure(void)
 }
 
 
-#ifdef CONFIG_PCI
-static struct pci_device_id __cpuinitdata cyrix_55x0[] = {
-	{ PCI_DEVICE(PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5510) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5520) },
-	{ },
-};
-#endif
-
 static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
 {
 	unsigned char dir0, dir0_msn, dir0_lsn, dir1 = 0;
@@ -258,6 +251,8 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
 
 	case 4: /* MediaGX/GXm or Geode GXM/GXLV/GX1 */
 #ifdef CONFIG_PCI
+	{
+		u32 vendor, device;
 		/* It isn't really a PCI quirk directly, but the cure is the
 		   same. The MediaGX has deep magic SMM stuff that handles the
 		   SB emulation. It thows away the fifo on disable_dma() which
@@ -273,22 +268,34 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
 		printk(KERN_INFO "Working around Cyrix MediaGX virtual DMA bugs.\n");
 		isa_dma_bridge_buggy = 2;
 
+		/* We do this before the PCI layer is running. However we
+		   are safe here as we know the bridge must be a Cyrix
+		   companion and must be present */
+		vendor = read_pci_config_16(0, 0, 0x12, PCI_VENDOR_ID);
+		device = read_pci_config_16(0, 0, 0x12, PCI_DEVICE_ID);
 
 		/*
 		 *  The 5510/5520 companion chips have a funky PIT.
 		 */  
-		if (pci_dev_present(cyrix_55x0))
+		if (vendor == PCI_VENDOR_ID_CYRIX &&
+	 (device == PCI_DEVICE_ID_CYRIX_5510 || device == PCI_DEVICE_ID_CYRIX_5520))
 			pit_latch_buggy = 1;
+	}
 #endif
 		c->x86_cache_size=16;	/* Yep 16K integrated cache thats it */
 
 		/* GXm supports extended cpuid levels 'ala' AMD */
 		if (c->cpuid_level == 2) {
 			/* Enable cxMMX extensions (GX1 Datasheet 54) */
-			setCx86(CX86_CCR7, getCx86(CX86_CCR7)|1);
+			setCx86(CX86_CCR7, getCx86(CX86_CCR7) | 1);
 			
-			/* GXlv/GXm/GX1 */
-			if((dir1 >= 0x50 && dir1 <= 0x54) || dir1 >= 0x63)
+			/*
+			 * GXm : 0x30 ... 0x5f GXm  datasheet 51
+			 * GXlv: 0x6x          GXlv datasheet 54
+			 *  ?  : 0x7x
+			 * GX1 : 0x8x          GX1  datasheet 56
+			 */
+			if((0x30 <= dir1 && dir1 <= 0x6f) || (0x80 <=dir1 && dir1 <= 0x8f))
 				geode_configure();
 			get_model_name(c);  /* get CPU marketing name */
 			return;
@@ -415,15 +422,14 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 * c)
 		
    	        if (dir0 == 5 || dir0 == 3)
    	        {
-			unsigned char ccr3, ccr4;
+			unsigned char ccr3;
 			unsigned long flags;
 			printk(KERN_INFO "Enabling CPUID on Cyrix processor.\n");
 			local_irq_save(flags);
 			ccr3 = getCx86(CX86_CCR3);
-			setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN  */
-			ccr4 = getCx86(CX86_CCR4);
-			setCx86(CX86_CCR4, ccr4 | 0x80);          /* enable cpuid  */
-			setCx86(CX86_CCR3, ccr3);                 /* disable MAPEN */
+			setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10);       /* enable MAPEN  */
+			setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x80);  /* enable cpuid  */
+			setCx86(CX86_CCR3, ccr3);                       /* disable MAPEN */
 			local_irq_restore(flags);
 		}
 	}
diff --git a/arch/i386/kernel/cpu/mcheck/mce.c b/arch/i386/kernel/cpu/mcheck/mce.c
index d555bec0db99..4f10c62d180c 100644
--- a/arch/i386/kernel/cpu/mcheck/mce.c
+++ b/arch/i386/kernel/cpu/mcheck/mce.c
@@ -12,6 +12,7 @@
 
 #include <asm/processor.h> 
 #include <asm/system.h>
+#include <asm/mce.h>
 
 #include "mce.h"
 
diff --git a/arch/i386/kernel/cpu/mcheck/mce.h b/arch/i386/kernel/cpu/mcheck/mce.h
index 84fd4cf7d0fb..81fb6e2d35f3 100644
--- a/arch/i386/kernel/cpu/mcheck/mce.h
+++ b/arch/i386/kernel/cpu/mcheck/mce.h
@@ -1,4 +1,5 @@
 #include <linux/init.h>
+#include <asm/mce.h>
 
 void amd_mcheck_init(struct cpuinfo_x86 *c);
 void intel_p4_mcheck_init(struct cpuinfo_x86 *c);
@@ -9,6 +10,5 @@ void winchip_mcheck_init(struct cpuinfo_x86 *c);
 /* Call the installed machine check handler for this CPU setup. */
 extern fastcall void (*machine_check_vector)(struct pt_regs *, long error_code);
 
-extern int mce_disabled;
 extern int nr_mce_banks;
 
diff --git a/arch/i386/kernel/cpu/mcheck/p4.c b/arch/i386/kernel/cpu/mcheck/p4.c
index 504434a46011..8359c19d3a23 100644
--- a/arch/i386/kernel/cpu/mcheck/p4.c
+++ b/arch/i386/kernel/cpu/mcheck/p4.c
@@ -12,6 +12,7 @@
 #include <asm/system.h>
 #include <asm/msr.h>
 #include <asm/apic.h>
+#include <asm/idle.h>
 
 #include <asm/therm_throt.h>
 
@@ -59,6 +60,7 @@ static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_therm
 
 fastcall void smp_thermal_interrupt(struct pt_regs *regs)
 {
+	exit_idle();
 	irq_enter();
 	vendor_thermal_interrupt(regs);
 	irq_exit();
diff --git a/arch/i386/kernel/cpu/mtrr/if.c b/arch/i386/kernel/cpu/mtrr/if.c
index 5ae1705eafa6..c7d8f1756745 100644
--- a/arch/i386/kernel/cpu/mtrr/if.c
+++ b/arch/i386/kernel/cpu/mtrr/if.c
@@ -211,6 +211,9 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
 	default:
 		return -ENOTTY;
 	case MTRRIOC_ADD_ENTRY:
+#ifdef CONFIG_COMPAT
+	case MTRRIOC32_ADD_ENTRY:
+#endif
 		if (!capable(CAP_SYS_ADMIN))
 			return -EPERM;
 		err =
@@ -218,21 +221,33 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
 				  file, 0);
 		break;
 	case MTRRIOC_SET_ENTRY:
+#ifdef CONFIG_COMPAT
+	case MTRRIOC32_SET_ENTRY:
+#endif
 		if (!capable(CAP_SYS_ADMIN))
 			return -EPERM;
 		err = mtrr_add(sentry.base, sentry.size, sentry.type, 0);
 		break;
 	case MTRRIOC_DEL_ENTRY:
+#ifdef CONFIG_COMPAT
+	case MTRRIOC32_DEL_ENTRY:
+#endif
 		if (!capable(CAP_SYS_ADMIN))
 			return -EPERM;
 		err = mtrr_file_del(sentry.base, sentry.size, file, 0);
 		break;
 	case MTRRIOC_KILL_ENTRY:
+#ifdef CONFIG_COMPAT
+	case MTRRIOC32_KILL_ENTRY:
+#endif
 		if (!capable(CAP_SYS_ADMIN))
 			return -EPERM;
 		err = mtrr_del(-1, sentry.base, sentry.size);
 		break;
 	case MTRRIOC_GET_ENTRY:
+#ifdef CONFIG_COMPAT
+	case MTRRIOC32_GET_ENTRY:
+#endif
 		if (gentry.regnum >= num_var_ranges)
 			return -EINVAL;
 		mtrr_if->get(gentry.regnum, &gentry.base, &size, &type);
@@ -249,6 +264,9 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
 
 		break;
 	case MTRRIOC_ADD_PAGE_ENTRY:
+#ifdef CONFIG_COMPAT
+	case MTRRIOC32_ADD_PAGE_ENTRY:
+#endif
 		if (!capable(CAP_SYS_ADMIN))
 			return -EPERM;
 		err =
@@ -256,21 +274,33 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
 				  file, 1);
 		break;
 	case MTRRIOC_SET_PAGE_ENTRY:
+#ifdef CONFIG_COMPAT
+	case MTRRIOC32_SET_PAGE_ENTRY:
+#endif
 		if (!capable(CAP_SYS_ADMIN))
 			return -EPERM;
 		err = mtrr_add_page(sentry.base, sentry.size, sentry.type, 0);
 		break;
 	case MTRRIOC_DEL_PAGE_ENTRY:
+#ifdef CONFIG_COMPAT
+	case MTRRIOC32_DEL_PAGE_ENTRY:
+#endif
 		if (!capable(CAP_SYS_ADMIN))
 			return -EPERM;
 		err = mtrr_file_del(sentry.base, sentry.size, file, 1);
 		break;
 	case MTRRIOC_KILL_PAGE_ENTRY:
+#ifdef CONFIG_COMPAT
+	case MTRRIOC32_KILL_PAGE_ENTRY:
+#endif
 		if (!capable(CAP_SYS_ADMIN))
 			return -EPERM;
 		err = mtrr_del_page(-1, sentry.base, sentry.size);
 		break;
 	case MTRRIOC_GET_PAGE_ENTRY:
+#ifdef CONFIG_COMPAT
+	case MTRRIOC32_GET_PAGE_ENTRY:
+#endif
 		if (gentry.regnum >= num_var_ranges)
 			return -EINVAL;
 		mtrr_if->get(gentry.regnum, &gentry.base, &size, &type);
@@ -339,7 +369,7 @@ static int mtrr_open(struct inode *inode, struct file *file)
 	return single_open(file, mtrr_seq_show, NULL);
 }
 
-static struct file_operations mtrr_fops = {
+static const struct file_operations mtrr_fops = {
 	.owner   = THIS_MODULE,
 	.open	 = mtrr_open, 
 	.read    = seq_read,
diff --git a/arch/i386/kernel/cpu/mtrr/main.c b/arch/i386/kernel/cpu/mtrr/main.c
index 16bb7ea87145..0acfb6a5a220 100644
--- a/arch/i386/kernel/cpu/mtrr/main.c
+++ b/arch/i386/kernel/cpu/mtrr/main.c
@@ -50,7 +50,7 @@ u32 num_var_ranges = 0;
 unsigned int *usage_table;
 static DEFINE_MUTEX(mtrr_mutex);
 
-u32 size_or_mask, size_and_mask;
+u64 size_or_mask, size_and_mask;
 
 static struct mtrr_ops * mtrr_ops[X86_VENDOR_NUM] = {};
 
@@ -662,8 +662,8 @@ void __init mtrr_bp_init(void)
 			     boot_cpu_data.x86_mask == 0x4))
 				phys_addr = 36;
 
-			size_or_mask = ~((1 << (phys_addr - PAGE_SHIFT)) - 1);
-			size_and_mask = ~size_or_mask & 0xfff00000;
+			size_or_mask = ~((1ULL << (phys_addr - PAGE_SHIFT)) - 1);
+			size_and_mask = ~size_or_mask & 0xfffff00000ULL;
 		} else if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR &&
 			   boot_cpu_data.x86 == 6) {
 			/* VIA C* family have Intel style MTRRs, but
diff --git a/arch/i386/kernel/cpu/mtrr/mtrr.h b/arch/i386/kernel/cpu/mtrr/mtrr.h
index d61ea9db6cfe..289dfe6030e3 100644
--- a/arch/i386/kernel/cpu/mtrr/mtrr.h
+++ b/arch/i386/kernel/cpu/mtrr/mtrr.h
@@ -84,7 +84,7 @@ void get_mtrr_state(void);
 
 extern void set_mtrr_ops(struct mtrr_ops * ops);
 
-extern u32 size_or_mask, size_and_mask;
+extern u64 size_or_mask, size_and_mask;
 extern struct mtrr_ops * mtrr_if;
 
 #define is_cpu(vnd)	(mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd)
diff --git a/arch/i386/kernel/cpu/proc.c b/arch/i386/kernel/cpu/proc.c
index 6624d8583c42..47e3ebbfb28d 100644
--- a/arch/i386/kernel/cpu/proc.c
+++ b/arch/i386/kernel/cpu/proc.c
@@ -29,7 +29,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 		NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
 		NULL, NULL, NULL, "mp", "nx", NULL, "mmxext", NULL,
-		NULL, "fxsr_opt", "rdtscp", NULL, NULL, "lm", "3dnowext", "3dnow",
+		NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm", "3dnowext", "3dnow",
 
 		/* Transmeta-defined */
 		"recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
@@ -47,7 +47,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 		/* Intel-defined (#2) */
 		"pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
 		"tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
-		NULL, NULL, "dca", NULL, NULL, NULL, NULL, NULL,
+		NULL, NULL, "dca", NULL, NULL, NULL, NULL, "popcnt",
 		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 
 		/* VIA/Cyrix/Centaur-defined */
@@ -57,8 +57,9 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 
 		/* AMD-defined (#2) */
-		"lahf_lm", "cmp_legacy", "svm", NULL, "cr8legacy", NULL, NULL, NULL,
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+		"lahf_lm", "cmp_legacy", "svm", "extapic", "cr8legacy", "abm",
+		"sse4a", "misalignsse",
+		"3dnowprefetch", "osvw", "ibs", NULL, NULL, NULL, NULL, NULL,
 		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 	};
@@ -69,8 +70,11 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 		"ttp",  /* thermal trip */
 		"tm",
 		"stc",
+		"100mhzsteps",
+		"hwpstate",
 		NULL,
-		/* nothing */	/* constant_tsc - moved to flags */
+		NULL,	/* constant_tsc - moved to flags */
+		/* nothing */
 	};
 	struct cpuinfo_x86 *c = v;
 	int i, n = c - cpu_data;
diff --git a/arch/i386/kernel/cpu/transmeta.c b/arch/i386/kernel/cpu/transmeta.c
index 4056fb7d2cdf..5678d46863c6 100644
--- a/arch/i386/kernel/cpu/transmeta.c
+++ b/arch/i386/kernel/cpu/transmeta.c
@@ -9,7 +9,7 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
 {
 	unsigned int cap_mask, uk, max, dummy;
 	unsigned int cms_rev1, cms_rev2;
-	unsigned int cpu_rev, cpu_freq, cpu_flags, new_cpu_rev;
+	unsigned int cpu_rev, cpu_freq = 0, cpu_flags, new_cpu_rev;
 	char cpu_info[65];
 
 	get_model_name(c);	/* Same as AMD/Cyrix */
@@ -72,6 +72,9 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
 	wrmsr(0x80860004, ~0, uk);
 	c->x86_capability[0] = cpuid_edx(0x00000001);
 	wrmsr(0x80860004, cap_mask, uk);
+
+	/* All Transmeta CPUs have a constant TSC */
+	set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
 	
 	/* If we can run i686 user-space code, call us an i686 */
 #define USER686 (X86_FEATURE_TSC|X86_FEATURE_CX8|X86_FEATURE_CMOV)
diff --git a/arch/i386/kernel/cpuid.c b/arch/i386/kernel/cpuid.c
index 51130b39cd2e..eeae0d992337 100644
--- a/arch/i386/kernel/cpuid.c
+++ b/arch/i386/kernel/cpuid.c
@@ -48,7 +48,6 @@ static struct class *cpuid_class;
 #ifdef CONFIG_SMP
 
 struct cpuid_command {
-	int cpu;
 	u32 reg;
 	u32 *data;
 };
@@ -57,8 +56,7 @@ static void cpuid_smp_cpuid(void *cmd_block)
 {
 	struct cpuid_command *cmd = (struct cpuid_command *)cmd_block;
 
-	if (cmd->cpu == smp_processor_id())
-		cpuid(cmd->reg, &cmd->data[0], &cmd->data[1], &cmd->data[2],
+	cpuid(cmd->reg, &cmd->data[0], &cmd->data[1], &cmd->data[2],
 		      &cmd->data[3]);
 }
 
@@ -70,11 +68,10 @@ static inline void do_cpuid(int cpu, u32 reg, u32 * data)
 	if (cpu == smp_processor_id()) {
 		cpuid(reg, &data[0], &data[1], &data[2], &data[3]);
 	} else {
-		cmd.cpu = cpu;
 		cmd.reg = reg;
 		cmd.data = data;
 
-		smp_call_function(cpuid_smp_cpuid, &cmd, 1, 1);
+		smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1, 1);
 	}
 	preempt_enable();
 }
@@ -148,7 +145,7 @@ static int cpuid_open(struct inode *inode, struct file *file)
 /*
  * File operations we support
  */
-static struct file_operations cpuid_fops = {
+static const struct file_operations cpuid_fops = {
 	.owner = THIS_MODULE,
 	.llseek = cpuid_seek,
 	.read = cpuid_read,
diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c
index f391abcf7da9..70f39560846a 100644
--- a/arch/i386/kernel/e820.c
+++ b/arch/i386/kernel/e820.c
@@ -14,6 +14,7 @@
 #include <asm/pgtable.h>
 #include <asm/page.h>
 #include <asm/e820.h>
+#include <asm/setup.h>
 
 #ifdef CONFIG_EFI
 int efi_enabled = 0;
@@ -156,21 +157,22 @@ static struct resource standard_io_resources[] = { {
 	.flags	= IORESOURCE_BUSY | IORESOURCE_IO
 } };
 
-static int romsignature(const unsigned char *x)
+#define ROMSIGNATURE 0xaa55
+
+static int __init romsignature(const unsigned char *rom)
 {
 	unsigned short sig;
-	int ret = 0;
-	if (probe_kernel_address((const unsigned short *)x, sig) == 0)
-		ret = (sig == 0xaa55);
-	return ret;
+
+	return probe_kernel_address((const unsigned short *)rom, sig) == 0 &&
+	       sig == ROMSIGNATURE;
 }
 
 static int __init romchecksum(unsigned char *rom, unsigned long length)
 {
-	unsigned char *p, sum = 0;
+	unsigned char sum;
 
-	for (p = rom; p < rom + length; p++)
-		sum += *p;
+	for (sum = 0; length; length--)
+		sum += *rom++;
 	return sum == 0;
 }
 
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index 5e47683fc63a..18bddcb8e9e8 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -30,7 +30,7 @@
  *	18(%esp) - %eax
  *	1C(%esp) - %ds
  *	20(%esp) - %es
- *	24(%esp) - %gs
+ *	24(%esp) - %fs
  *	28(%esp) - orig_eax
  *	2C(%esp) - %eip
  *	30(%esp) - %cs
@@ -99,9 +99,9 @@ VM_MASK		= 0x00020000
 
 #define SAVE_ALL \
 	cld; \
-	pushl %gs; \
+	pushl %fs; \
 	CFI_ADJUST_CFA_OFFSET 4;\
-	/*CFI_REL_OFFSET gs, 0;*/\
+	/*CFI_REL_OFFSET fs, 0;*/\
 	pushl %es; \
 	CFI_ADJUST_CFA_OFFSET 4;\
 	/*CFI_REL_OFFSET es, 0;*/\
@@ -133,7 +133,7 @@ VM_MASK		= 0x00020000
 	movl %edx, %ds; \
 	movl %edx, %es; \
 	movl $(__KERNEL_PDA), %edx; \
-	movl %edx, %gs
+	movl %edx, %fs
 
 #define RESTORE_INT_REGS \
 	popl %ebx;	\
@@ -166,9 +166,9 @@ VM_MASK		= 0x00020000
 2:	popl %es;	\
 	CFI_ADJUST_CFA_OFFSET -4;\
 	/*CFI_RESTORE es;*/\
-3:	popl %gs;	\
+3:	popl %fs;	\
 	CFI_ADJUST_CFA_OFFSET -4;\
-	/*CFI_RESTORE gs;*/\
+	/*CFI_RESTORE fs;*/\
 .pushsection .fixup,"ax";	\
 4:	movl $0,(%esp);	\
 	jmp 1b;		\
@@ -227,6 +227,7 @@ ENTRY(ret_from_fork)
 	CFI_ADJUST_CFA_OFFSET -4
 	jmp syscall_exit
 	CFI_ENDPROC
+END(ret_from_fork)
 
 /*
  * Return to user mode is not as complex as all this looks,
@@ -258,6 +259,7 @@ ENTRY(resume_userspace)
 					# int/exception return?
 	jne work_pending
 	jmp restore_all
+END(ret_from_exception)
 
 #ifdef CONFIG_PREEMPT
 ENTRY(resume_kernel)
@@ -272,6 +274,7 @@ need_resched:
 	jz restore_all
 	call preempt_schedule_irq
 	jmp need_resched
+END(resume_kernel)
 #endif
 	CFI_ENDPROC
 
@@ -349,16 +352,17 @@ sysenter_past_esp:
 	movl PT_OLDESP(%esp), %ecx
 	xorl %ebp,%ebp
 	TRACE_IRQS_ON
-1:	mov  PT_GS(%esp), %gs
+1:	mov  PT_FS(%esp), %fs
 	ENABLE_INTERRUPTS_SYSEXIT
 	CFI_ENDPROC
 .pushsection .fixup,"ax"
-2:	movl $0,PT_GS(%esp)
+2:	movl $0,PT_FS(%esp)
 	jmp 1b
 .section __ex_table,"a"
 	.align 4
 	.long 1b,2b
 .popsection
+ENDPROC(sysenter_entry)
 
 	# system call handler stub
 ENTRY(system_call)
@@ -459,6 +463,7 @@ ldt_ss:
 	CFI_ADJUST_CFA_OFFSET -8
 	jmp restore_nocheck
 	CFI_ENDPROC
+ENDPROC(system_call)
 
 	# perform work that needs to be done immediately before resumption
 	ALIGN
@@ -504,6 +509,7 @@ work_notifysig_v86:
 	xorl %edx, %edx
 	call do_notify_resume
 	jmp resume_userspace_sig
+END(work_pending)
 
 	# perform syscall exit tracing
 	ALIGN
@@ -519,6 +525,7 @@ syscall_trace_entry:
 	cmpl $(nr_syscalls), %eax
 	jnae syscall_call
 	jmp syscall_exit
+END(syscall_trace_entry)
 
 	# perform syscall exit tracing
 	ALIGN
@@ -532,6 +539,7 @@ syscall_exit_work:
 	movl $1, %edx
 	call do_syscall_trace
 	jmp resume_userspace
+END(syscall_exit_work)
 	CFI_ENDPROC
 
 	RING0_INT_FRAME			# can't unwind into user space anyway
@@ -542,15 +550,17 @@ syscall_fault:
 	GET_THREAD_INFO(%ebp)
 	movl $-EFAULT,PT_EAX(%esp)
 	jmp resume_userspace
+END(syscall_fault)
 
 syscall_badsys:
 	movl $-ENOSYS,PT_EAX(%esp)
 	jmp resume_userspace
+END(syscall_badsys)
 	CFI_ENDPROC
 
 #define FIXUP_ESPFIX_STACK \
 	/* since we are on a wrong stack, we cant make it a C code :( */ \
-	movl %gs:PDA_cpu, %ebx; \
+	movl %fs:PDA_cpu, %ebx; \
 	PER_CPU(cpu_gdt_descr, %ebx); \
 	movl GDS_address(%ebx), %ebx; \
 	GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \
@@ -581,9 +591,9 @@ syscall_badsys:
 ENTRY(interrupt)
 .text
 
-vector=0
 ENTRY(irq_entries_start)
 	RING0_INT_FRAME
+vector=0
 .rept NR_IRQS
 	ALIGN
  .if vector
@@ -592,11 +602,16 @@ ENTRY(irq_entries_start)
 1:	pushl $~(vector)
 	CFI_ADJUST_CFA_OFFSET 4
 	jmp common_interrupt
-.data
+ .previous
 	.long 1b
-.text
+ .text
 vector=vector+1
 .endr
+END(irq_entries_start)
+
+.previous
+END(interrupt)
+.previous
 
 /*
  * the CPU automatically disables interrupts when executing an IRQ vector,
@@ -609,6 +624,7 @@ common_interrupt:
 	movl %esp,%eax
 	call do_IRQ
 	jmp ret_from_intr
+ENDPROC(common_interrupt)
 	CFI_ENDPROC
 
 #define BUILD_INTERRUPT(name, nr)	\
@@ -621,18 +637,24 @@ ENTRY(name)				\
 	movl %esp,%eax;			\
 	call smp_/**/name;		\
 	jmp ret_from_intr;		\
-	CFI_ENDPROC
+	CFI_ENDPROC;			\
+ENDPROC(name)
 
 /* The include is where all of the SMP etc. interrupts come from */
 #include "entry_arch.h"
 
+/* This alternate entry is needed because we hijack the apic LVTT */
+#if defined(CONFIG_VMI) && defined(CONFIG_X86_LOCAL_APIC)
+BUILD_INTERRUPT(apic_vmi_timer_interrupt,LOCAL_TIMER_VECTOR)
+#endif
+
 KPROBE_ENTRY(page_fault)
 	RING0_EC_FRAME
 	pushl $do_page_fault
 	CFI_ADJUST_CFA_OFFSET 4
 	ALIGN
 error_code:
-	/* the function address is in %gs's slot on the stack */
+	/* the function address is in %fs's slot on the stack */
 	pushl %es
 	CFI_ADJUST_CFA_OFFSET 4
 	/*CFI_REL_OFFSET es, 0*/
@@ -661,20 +683,20 @@ error_code:
 	CFI_ADJUST_CFA_OFFSET 4
 	CFI_REL_OFFSET ebx, 0
 	cld
-	pushl %gs
+	pushl %fs
 	CFI_ADJUST_CFA_OFFSET 4
-	/*CFI_REL_OFFSET gs, 0*/
+	/*CFI_REL_OFFSET fs, 0*/
 	movl $(__KERNEL_PDA), %ecx
-	movl %ecx, %gs
+	movl %ecx, %fs
 	UNWIND_ESPFIX_STACK
 	popl %ecx
 	CFI_ADJUST_CFA_OFFSET -4
 	/*CFI_REGISTER es, ecx*/
-	movl PT_GS(%esp), %edi		# get the function address
+	movl PT_FS(%esp), %edi		# get the function address
 	movl PT_ORIG_EAX(%esp), %edx	# get the error code
 	movl $-1, PT_ORIG_EAX(%esp)	# no syscall to restart
-	mov  %ecx, PT_GS(%esp)
-	/*CFI_REL_OFFSET gs, ES*/
+	mov  %ecx, PT_FS(%esp)
+	/*CFI_REL_OFFSET fs, ES*/
 	movl $(__USER_DS), %ecx
 	movl %ecx, %ds
 	movl %ecx, %es
@@ -692,6 +714,7 @@ ENTRY(coprocessor_error)
 	CFI_ADJUST_CFA_OFFSET 4
 	jmp error_code
 	CFI_ENDPROC
+END(coprocessor_error)
 
 ENTRY(simd_coprocessor_error)
 	RING0_INT_FRAME
@@ -701,6 +724,7 @@ ENTRY(simd_coprocessor_error)
 	CFI_ADJUST_CFA_OFFSET 4
 	jmp error_code
 	CFI_ENDPROC
+END(simd_coprocessor_error)
 
 ENTRY(device_not_available)
 	RING0_INT_FRAME
@@ -721,6 +745,7 @@ device_not_available_emulate:
 	CFI_ADJUST_CFA_OFFSET -4
 	jmp ret_from_exception
 	CFI_ENDPROC
+END(device_not_available)
 
 /*
  * Debug traps and NMI can happen at the one SYSENTER instruction
@@ -864,10 +889,12 @@ ENTRY(native_iret)
 	.align 4
 	.long 1b,iret_exc
 .previous
+END(native_iret)
 
 ENTRY(native_irq_enable_sysexit)
 	sti
 	sysexit
+END(native_irq_enable_sysexit)
 #endif
 
 KPROBE_ENTRY(int3)
@@ -890,6 +917,7 @@ ENTRY(overflow)
 	CFI_ADJUST_CFA_OFFSET 4
 	jmp error_code
 	CFI_ENDPROC
+END(overflow)
 
 ENTRY(bounds)
 	RING0_INT_FRAME
@@ -899,6 +927,7 @@ ENTRY(bounds)
 	CFI_ADJUST_CFA_OFFSET 4
 	jmp error_code
 	CFI_ENDPROC
+END(bounds)
 
 ENTRY(invalid_op)
 	RING0_INT_FRAME
@@ -908,6 +937,7 @@ ENTRY(invalid_op)
 	CFI_ADJUST_CFA_OFFSET 4
 	jmp error_code
 	CFI_ENDPROC
+END(invalid_op)
 
 ENTRY(coprocessor_segment_overrun)
 	RING0_INT_FRAME
@@ -917,6 +947,7 @@ ENTRY(coprocessor_segment_overrun)
 	CFI_ADJUST_CFA_OFFSET 4
 	jmp error_code
 	CFI_ENDPROC
+END(coprocessor_segment_overrun)
 
 ENTRY(invalid_TSS)
 	RING0_EC_FRAME
@@ -924,6 +955,7 @@ ENTRY(invalid_TSS)
 	CFI_ADJUST_CFA_OFFSET 4
 	jmp error_code
 	CFI_ENDPROC
+END(invalid_TSS)
 
 ENTRY(segment_not_present)
 	RING0_EC_FRAME
@@ -931,6 +963,7 @@ ENTRY(segment_not_present)
 	CFI_ADJUST_CFA_OFFSET 4
 	jmp error_code
 	CFI_ENDPROC
+END(segment_not_present)
 
 ENTRY(stack_segment)
 	RING0_EC_FRAME
@@ -938,6 +971,7 @@ ENTRY(stack_segment)
 	CFI_ADJUST_CFA_OFFSET 4
 	jmp error_code
 	CFI_ENDPROC
+END(stack_segment)
 
 KPROBE_ENTRY(general_protection)
 	RING0_EC_FRAME
@@ -953,6 +987,7 @@ ENTRY(alignment_check)
 	CFI_ADJUST_CFA_OFFSET 4
 	jmp error_code
 	CFI_ENDPROC
+END(alignment_check)
 
 ENTRY(divide_error)
 	RING0_INT_FRAME
@@ -962,6 +997,7 @@ ENTRY(divide_error)
 	CFI_ADJUST_CFA_OFFSET 4
 	jmp error_code
 	CFI_ENDPROC
+END(divide_error)
 
 #ifdef CONFIG_X86_MCE
 ENTRY(machine_check)
@@ -972,6 +1008,7 @@ ENTRY(machine_check)
 	CFI_ADJUST_CFA_OFFSET 4
 	jmp error_code
 	CFI_ENDPROC
+END(machine_check)
 #endif
 
 ENTRY(spurious_interrupt_bug)
@@ -982,6 +1019,7 @@ ENTRY(spurious_interrupt_bug)
 	CFI_ADJUST_CFA_OFFSET 4
 	jmp error_code
 	CFI_ENDPROC
+END(spurious_interrupt_bug)
 
 ENTRY(kernel_thread_helper)
 	pushl $0		# fake return address for unwinder
diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S
index edef5084ce17..3fa7f9389afe 100644
--- a/arch/i386/kernel/head.S
+++ b/arch/i386/kernel/head.S
@@ -53,6 +53,7 @@
  * any particular GDT layout, because we load our own as soon as we
  * can.
  */
+.section .text.head,"ax",@progbits
 ENTRY(startup_32)
 
 #ifdef CONFIG_PARAVIRT
@@ -103,7 +104,7 @@ ENTRY(startup_32)
 	movzwl OLD_CL_OFFSET,%esi
 	addl $(OLD_CL_BASE_ADDR),%esi
 2:
-	movl $(saved_command_line - __PAGE_OFFSET),%edi
+	movl $(boot_command_line - __PAGE_OFFSET),%edi
 	movl $(COMMAND_LINE_SIZE/4),%ecx
 	rep
 	movsl
@@ -141,16 +142,25 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
 	jb 10b
 	movl %edi,(init_pg_tables_end - __PAGE_OFFSET)
 
-#ifdef CONFIG_SMP
 	xorl %ebx,%ebx				/* This is the boot CPU (BSP) */
 	jmp 3f
-
 /*
  * Non-boot CPU entry point; entered from trampoline.S
  * We can't lgdt here, because lgdt itself uses a data segment, but
  * we know the trampoline has already loaded the boot_gdt_table GDT
  * for us.
+ *
+ * If cpu hotplug is not supported then this code can go in init section
+ * which will be freed later
  */
+
+#ifdef CONFIG_HOTPLUG_CPU
+.section .text,"ax",@progbits
+#else
+.section .init.text,"ax",@progbits
+#endif
+
+#ifdef CONFIG_SMP
 ENTRY(startup_32_smp)
 	cld
 	movl $(__BOOT_DS),%eax
@@ -208,8 +218,8 @@ ENTRY(startup_32_smp)
 	xorl %ebx,%ebx
 	incl %ebx
 
-3:
 #endif /* CONFIG_SMP */
+3:
 
 /*
  * Enable paging
@@ -309,7 +319,7 @@ is386:	movl $2,%ecx		# set MP
 
 	call check_x87
 	call setup_pda
-	lgdt cpu_gdt_descr
+	lgdt early_gdt_descr
 	lidt idt_descr
 	ljmp $(__KERNEL_CS),$1f
 1:	movl $(__KERNEL_DS),%eax	# reload all the segment registers
@@ -319,12 +329,12 @@ is386:	movl $2,%ecx		# set MP
 	movl %eax,%ds
 	movl %eax,%es
 
-	xorl %eax,%eax			# Clear FS and LDT
-	movl %eax,%fs
+	xorl %eax,%eax			# Clear GS and LDT
+	movl %eax,%gs
 	lldt %ax
 
 	movl $(__KERNEL_PDA),%eax
-	mov  %eax,%gs
+	mov  %eax,%fs
 
 	cld			# gcc2 wants the direction flag cleared at all times
 	pushl $0		# fake return address for unwinder
@@ -360,12 +370,12 @@ check_x87:
  * cpu_gdt_table and boot_pda; for secondary CPUs, these will be
  * that CPU's GDT and PDA.
  */
-setup_pda:
+ENTRY(setup_pda)
 	/* get the PDA pointer */
 	movl start_pda, %eax
 
 	/* slot the PDA address into the GDT */
-	mov cpu_gdt_descr+2, %ecx
+	mov early_gdt_descr+2, %ecx
 	mov %ax, (__KERNEL_PDA+0+2)(%ecx)		/* base & 0x0000ffff */
 	shr $16, %eax
 	mov %al, (__KERNEL_PDA+4+0)(%ecx)		/* base & 0x00ff0000 */
@@ -492,6 +502,7 @@ ignore_int:
 #endif
 	iret
 
+.section .text
 #ifdef CONFIG_PARAVIRT
 startup_paravirt:
 	cld
@@ -502,10 +513,11 @@ startup_paravirt:
 	pushl	%ecx
 	pushl	%eax
 
-	/* paravirt.o is last in link, and that probe fn never returns */
 	pushl	$__start_paravirtprobe
 1:
 	movl	0(%esp), %eax
+	cmpl	$__stop_paravirtprobe, %eax
+	je	unhandled_paravirt
 	pushl	(%eax)
 	movl	8(%esp), %eax
 	call	*(%esp)
@@ -517,6 +529,10 @@ startup_paravirt:
 
 	addl	$4, (%esp)
 	jmp	1b
+
+unhandled_paravirt:
+	/* Nothing wanted us: we're screwed. */
+	ud2
 #endif
 
 /*
@@ -581,7 +597,7 @@ idt_descr:
 
 # boot GDT descriptor (later on used by CPU#0):
 	.word 0				# 32 bit align gdt_desc.address
-ENTRY(cpu_gdt_descr)
+ENTRY(early_gdt_descr)
 	.word GDT_ENTRIES*8-1
 	.long cpu_gdt_table
 
diff --git a/arch/i386/kernel/hpet.c b/arch/i386/kernel/hpet.c
index 45a8685bb60b..e1006b7acc9e 100644
--- a/arch/i386/kernel/hpet.c
+++ b/arch/i386/kernel/hpet.c
@@ -1,4 +1,5 @@
 #include <linux/clocksource.h>
+#include <linux/clockchips.h>
 #include <linux/errno.h>
 #include <linux/hpet.h>
 #include <linux/init.h>
@@ -6,17 +7,278 @@
 #include <asm/hpet.h>
 #include <asm/io.h>
 
+extern struct clock_event_device *global_clock_event;
+
 #define HPET_MASK	CLOCKSOURCE_MASK(32)
 #define HPET_SHIFT	22
 
 /* FSEC = 10^-15 NSEC = 10^-9 */
 #define FSEC_PER_NSEC	1000000
 
-static void *hpet_ptr;
+/*
+ * HPET address is set in acpi/boot.c, when an ACPI entry exists
+ */
+unsigned long hpet_address;
+static void __iomem * hpet_virt_address;
+
+static inline unsigned long hpet_readl(unsigned long a)
+{
+	return readl(hpet_virt_address + a);
+}
+
+static inline void hpet_writel(unsigned long d, unsigned long a)
+{
+	writel(d, hpet_virt_address + a);
+}
+
+/*
+ * HPET command line enable / disable
+ */
+static int boot_hpet_disable;
+
+static int __init hpet_setup(char* str)
+{
+	if (str) {
+		if (!strncmp("disable", str, 7))
+			boot_hpet_disable = 1;
+	}
+	return 1;
+}
+__setup("hpet=", hpet_setup);
+
+static inline int is_hpet_capable(void)
+{
+	return (!boot_hpet_disable && hpet_address);
+}
+
+/*
+ * HPET timer interrupt enable / disable
+ */
+static int hpet_legacy_int_enabled;
+
+/**
+ * is_hpet_enabled - check whether the hpet timer interrupt is enabled
+ */
+int is_hpet_enabled(void)
+{
+	return is_hpet_capable() && hpet_legacy_int_enabled;
+}
+
+/*
+ * When the hpet driver (/dev/hpet) is enabled, we need to reserve
+ * timer 0 and timer 1 in case of RTC emulation.
+ */
+#ifdef CONFIG_HPET
+static void hpet_reserve_platform_timers(unsigned long id)
+{
+	struct hpet __iomem *hpet = hpet_virt_address;
+	struct hpet_timer __iomem *timer = &hpet->hpet_timers[2];
+	unsigned int nrtimers, i;
+	struct hpet_data hd;
+
+	nrtimers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1;
+
+	memset(&hd, 0, sizeof (hd));
+	hd.hd_phys_address = hpet_address;
+	hd.hd_address = hpet_virt_address;
+	hd.hd_nirqs = nrtimers;
+	hd.hd_flags = HPET_DATA_PLATFORM;
+	hpet_reserve_timer(&hd, 0);
+
+#ifdef CONFIG_HPET_EMULATE_RTC
+	hpet_reserve_timer(&hd, 1);
+#endif
+
+	hd.hd_irq[0] = HPET_LEGACY_8254;
+	hd.hd_irq[1] = HPET_LEGACY_RTC;
+
+	for (i = 2; i < nrtimers; timer++, i++)
+		hd.hd_irq[i] = (timer->hpet_config & Tn_INT_ROUTE_CNF_MASK) >>
+			Tn_INT_ROUTE_CNF_SHIFT;
+
+	hpet_alloc(&hd);
+
+}
+#else
+static void hpet_reserve_platform_timers(unsigned long id) { }
+#endif
+
+/*
+ * Common hpet info
+ */
+static unsigned long hpet_period;
+
+static void hpet_set_mode(enum clock_event_mode mode,
+			  struct clock_event_device *evt);
+static int hpet_next_event(unsigned long delta,
+			   struct clock_event_device *evt);
+
+/*
+ * The hpet clock event device
+ */
+static struct clock_event_device hpet_clockevent = {
+	.name		= "hpet",
+	.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
+	.set_mode	= hpet_set_mode,
+	.set_next_event = hpet_next_event,
+	.shift		= 32,
+	.irq		= 0,
+};
+
+static void hpet_start_counter(void)
+{
+	unsigned long cfg = hpet_readl(HPET_CFG);
+
+	cfg &= ~HPET_CFG_ENABLE;
+	hpet_writel(cfg, HPET_CFG);
+	hpet_writel(0, HPET_COUNTER);
+	hpet_writel(0, HPET_COUNTER + 4);
+	cfg |= HPET_CFG_ENABLE;
+	hpet_writel(cfg, HPET_CFG);
+}
+
+static void hpet_enable_int(void)
+{
+	unsigned long cfg = hpet_readl(HPET_CFG);
+
+	cfg |= HPET_CFG_LEGACY;
+	hpet_writel(cfg, HPET_CFG);
+	hpet_legacy_int_enabled = 1;
+}
+
+static void hpet_set_mode(enum clock_event_mode mode,
+			  struct clock_event_device *evt)
+{
+	unsigned long cfg, cmp, now;
+	uint64_t delta;
+
+	switch(mode) {
+	case CLOCK_EVT_MODE_PERIODIC:
+		delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * hpet_clockevent.mult;
+		delta >>= hpet_clockevent.shift;
+		now = hpet_readl(HPET_COUNTER);
+		cmp = now + (unsigned long) delta;
+		cfg = hpet_readl(HPET_T0_CFG);
+		cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC |
+		       HPET_TN_SETVAL | HPET_TN_32BIT;
+		hpet_writel(cfg, HPET_T0_CFG);
+		/*
+		 * The first write after writing TN_SETVAL to the
+		 * config register sets the counter value, the second
+		 * write sets the period.
+		 */
+		hpet_writel(cmp, HPET_T0_CMP);
+		udelay(1);
+		hpet_writel((unsigned long) delta, HPET_T0_CMP);
+		break;
+
+	case CLOCK_EVT_MODE_ONESHOT:
+		cfg = hpet_readl(HPET_T0_CFG);
+		cfg &= ~HPET_TN_PERIODIC;
+		cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
+		hpet_writel(cfg, HPET_T0_CFG);
+		break;
+
+	case CLOCK_EVT_MODE_UNUSED:
+	case CLOCK_EVT_MODE_SHUTDOWN:
+		cfg = hpet_readl(HPET_T0_CFG);
+		cfg &= ~HPET_TN_ENABLE;
+		hpet_writel(cfg, HPET_T0_CFG);
+		break;
+	}
+}
+
+static int hpet_next_event(unsigned long delta,
+			   struct clock_event_device *evt)
+{
+	unsigned long cnt;
+
+	cnt = hpet_readl(HPET_COUNTER);
+	cnt += delta;
+	hpet_writel(cnt, HPET_T0_CMP);
+
+	return ((long)(hpet_readl(HPET_COUNTER) - cnt ) > 0);
+}
+
+/*
+ * Try to setup the HPET timer
+ */
+int __init hpet_enable(void)
+{
+	unsigned long id;
+	uint64_t hpet_freq;
+
+	if (!is_hpet_capable())
+		return 0;
+
+	hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
+
+	/*
+	 * Read the period and check for a sane value:
+	 */
+	hpet_period = hpet_readl(HPET_PERIOD);
+	if (hpet_period < HPET_MIN_PERIOD || hpet_period > HPET_MAX_PERIOD)
+		goto out_nohpet;
+
+	/*
+	 * The period is a femto seconds value. We need to calculate the
+	 * scaled math multiplication factor for nanosecond to hpet tick
+	 * conversion.
+	 */
+	hpet_freq = 1000000000000000ULL;
+	do_div(hpet_freq, hpet_period);
+	hpet_clockevent.mult = div_sc((unsigned long) hpet_freq,
+				      NSEC_PER_SEC, 32);
+	/* Calculate the min / max delta */
+	hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
+							   &hpet_clockevent);
+	hpet_clockevent.min_delta_ns = clockevent_delta2ns(0x30,
+							   &hpet_clockevent);
+
+	/*
+	 * Read the HPET ID register to retrieve the IRQ routing
+	 * information and the number of channels
+	 */
+	id = hpet_readl(HPET_ID);
+
+#ifdef CONFIG_HPET_EMULATE_RTC
+	/*
+	 * The legacy routing mode needs at least two channels, tick timer
+	 * and the rtc emulation channel.
+	 */
+	if (!(id & HPET_ID_NUMBER))
+		goto out_nohpet;
+#endif
+
+	/* Start the counter */
+	hpet_start_counter();
+
+	if (id & HPET_ID_LEGSUP) {
+		hpet_enable_int();
+		hpet_reserve_platform_timers(id);
+		/*
+		 * Start hpet with the boot cpu mask and make it
+		 * global after the IO_APIC has been initialized.
+		 */
+		hpet_clockevent.cpumask =cpumask_of_cpu(0);
+		clockevents_register_device(&hpet_clockevent);
+		global_clock_event = &hpet_clockevent;
+		return 1;
+	}
+	return 0;
 
+out_nohpet:
+	iounmap(hpet_virt_address);
+	hpet_virt_address = NULL;
+	return 0;
+}
+
+/*
+ * Clock source related code
+ */
 static cycle_t read_hpet(void)
 {
-	return (cycle_t)readl(hpet_ptr);
+	return (cycle_t)hpet_readl(HPET_COUNTER);
 }
 
 static struct clocksource clocksource_hpet = {
@@ -24,29 +286,17 @@ static struct clocksource clocksource_hpet = {
 	.rating		= 250,
 	.read		= read_hpet,
 	.mask		= HPET_MASK,
-	.mult		= 0, /* set below */
 	.shift		= HPET_SHIFT,
-	.is_continuous	= 1,
+	.flags		= CLOCK_SOURCE_IS_CONTINUOUS,
 };
 
 static int __init init_hpet_clocksource(void)
 {
-	unsigned long hpet_period;
-	void __iomem* hpet_base;
 	u64 tmp;
-	int err;
 
-	if (!is_hpet_enabled())
+	if (!hpet_virt_address)
 		return -ENODEV;
 
-	/* calculate the hpet address: */
-	hpet_base =
-		(void __iomem*)ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
-	hpet_ptr = hpet_base + HPET_COUNTER;
-
-	/* calculate the frequency: */
-	hpet_period = readl(hpet_base + HPET_PERIOD);
-
 	/*
 	 * hpet period is in femto seconds per cycle
 	 * so we need to convert this to ns/cyc units
@@ -62,11 +312,218 @@ static int __init init_hpet_clocksource(void)
 	do_div(tmp, FSEC_PER_NSEC);
 	clocksource_hpet.mult = (u32)tmp;
 
-	err = clocksource_register(&clocksource_hpet);
-	if (err)
-		iounmap(hpet_base);
-
-	return err;
+	return clocksource_register(&clocksource_hpet);
 }
 
 module_init(init_hpet_clocksource);
+
+#ifdef CONFIG_HPET_EMULATE_RTC
+
+/* HPET in LegacyReplacement Mode eats up RTC interrupt line. When, HPET
+ * is enabled, we support RTC interrupt functionality in software.
+ * RTC has 3 kinds of interrupts:
+ * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock
+ *    is updated
+ * 2) Alarm Interrupt - generate an interrupt at a specific time of day
+ * 3) Periodic Interrupt - generate periodic interrupt, with frequencies
+ *    2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2)
+ * (1) and (2) above are implemented using polling at a frequency of
+ * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt
+ * overhead. (DEFAULT_RTC_INT_FREQ)
+ * For (3), we use interrupts at 64Hz or user specified periodic
+ * frequency, whichever is higher.
+ */
+#include <linux/mc146818rtc.h>
+#include <linux/rtc.h>
+
+#define DEFAULT_RTC_INT_FREQ	64
+#define DEFAULT_RTC_SHIFT	6
+#define RTC_NUM_INTS		1
+
+static unsigned long hpet_rtc_flags;
+static unsigned long hpet_prev_update_sec;
+static struct rtc_time hpet_alarm_time;
+static unsigned long hpet_pie_count;
+static unsigned long hpet_t1_cmp;
+static unsigned long hpet_default_delta;
+static unsigned long hpet_pie_delta;
+static unsigned long hpet_pie_limit;
+
+/*
+ * Timer 1 for RTC emulation. We use one shot mode, as periodic mode
+ * is not supported by all HPET implementations for timer 1.
+ *
+ * hpet_rtc_timer_init() is called when the rtc is initialized.
+ */
+int hpet_rtc_timer_init(void)
+{
+	unsigned long cfg, cnt, delta, flags;
+
+	if (!is_hpet_enabled())
+		return 0;
+
+	if (!hpet_default_delta) {
+		uint64_t clc;
+
+		clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC;
+		clc >>= hpet_clockevent.shift + DEFAULT_RTC_SHIFT;
+		hpet_default_delta = (unsigned long) clc;
+	}
+
+	if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit)
+		delta = hpet_default_delta;
+	else
+		delta = hpet_pie_delta;
+
+	local_irq_save(flags);
+
+	cnt = delta + hpet_readl(HPET_COUNTER);
+	hpet_writel(cnt, HPET_T1_CMP);
+	hpet_t1_cmp = cnt;
+
+	cfg = hpet_readl(HPET_T1_CFG);
+	cfg &= ~HPET_TN_PERIODIC;
+	cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
+	hpet_writel(cfg, HPET_T1_CFG);
+
+	local_irq_restore(flags);
+
+	return 1;
+}
+
+/*
+ * The functions below are called from rtc driver.
+ * Return 0 if HPET is not being used.
+ * Otherwise do the necessary changes and return 1.
+ */
+int hpet_mask_rtc_irq_bit(unsigned long bit_mask)
+{
+	if (!is_hpet_enabled())
+		return 0;
+
+	hpet_rtc_flags &= ~bit_mask;
+	return 1;
+}
+
+int hpet_set_rtc_irq_bit(unsigned long bit_mask)
+{
+	unsigned long oldbits = hpet_rtc_flags;
+
+	if (!is_hpet_enabled())
+		return 0;
+
+	hpet_rtc_flags |= bit_mask;
+
+	if (!oldbits)
+		hpet_rtc_timer_init();
+
+	return 1;
+}
+
+int hpet_set_alarm_time(unsigned char hrs, unsigned char min,
+			unsigned char sec)
+{
+	if (!is_hpet_enabled())
+		return 0;
+
+	hpet_alarm_time.tm_hour = hrs;
+	hpet_alarm_time.tm_min = min;
+	hpet_alarm_time.tm_sec = sec;
+
+	return 1;
+}
+
+int hpet_set_periodic_freq(unsigned long freq)
+{
+	uint64_t clc;
+
+	if (!is_hpet_enabled())
+		return 0;
+
+	if (freq <= DEFAULT_RTC_INT_FREQ)
+		hpet_pie_limit = DEFAULT_RTC_INT_FREQ / freq;
+	else {
+		clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC;
+		do_div(clc, freq);
+		clc >>= hpet_clockevent.shift;
+		hpet_pie_delta = (unsigned long) clc;
+	}
+	return 1;
+}
+
+int hpet_rtc_dropped_irq(void)
+{
+	return is_hpet_enabled();
+}
+
+static void hpet_rtc_timer_reinit(void)
+{
+	unsigned long cfg, delta;
+	int lost_ints = -1;
+
+	if (unlikely(!hpet_rtc_flags)) {
+		cfg = hpet_readl(HPET_T1_CFG);
+		cfg &= ~HPET_TN_ENABLE;
+		hpet_writel(cfg, HPET_T1_CFG);
+		return;
+	}
+
+	if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit)
+		delta = hpet_default_delta;
+	else
+		delta = hpet_pie_delta;
+
+	/*
+	 * Increment the comparator value until we are ahead of the
+	 * current count.
+	 */
+	do {
+		hpet_t1_cmp += delta;
+		hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
+		lost_ints++;
+	} while ((long)(hpet_readl(HPET_COUNTER) - hpet_t1_cmp) > 0);
+
+	if (lost_ints) {
+		if (hpet_rtc_flags & RTC_PIE)
+			hpet_pie_count += lost_ints;
+		if (printk_ratelimit())
+			printk(KERN_WARNING "rtc: lost %d interrupts\n",
+				lost_ints);
+	}
+}
+
+irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
+{
+	struct rtc_time curr_time;
+	unsigned long rtc_int_flag = 0;
+
+	hpet_rtc_timer_reinit();
+
+	if (hpet_rtc_flags & (RTC_UIE | RTC_AIE))
+		rtc_get_rtc_time(&curr_time);
+
+	if (hpet_rtc_flags & RTC_UIE &&
+	    curr_time.tm_sec != hpet_prev_update_sec) {
+		rtc_int_flag = RTC_UF;
+		hpet_prev_update_sec = curr_time.tm_sec;
+	}
+
+	if (hpet_rtc_flags & RTC_PIE &&
+	    ++hpet_pie_count >= hpet_pie_limit) {
+		rtc_int_flag |= RTC_PF;
+		hpet_pie_count = 0;
+	}
+
+	if (hpet_rtc_flags & RTC_PIE &&
+	    (curr_time.tm_sec == hpet_alarm_time.tm_sec) &&
+	    (curr_time.tm_min == hpet_alarm_time.tm_min) &&
+	    (curr_time.tm_hour == hpet_alarm_time.tm_hour))
+			rtc_int_flag |= RTC_AF;
+
+	if (rtc_int_flag) {
+		rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8));
+		rtc_interrupt(rtc_int_flag, dev_id);
+	}
+	return IRQ_HANDLED;
+}
+#endif
diff --git a/arch/i386/kernel/i8253.c b/arch/i386/kernel/i8253.c
index 9a0060b92e32..a6bc7bb38834 100644
--- a/arch/i386/kernel/i8253.c
+++ b/arch/i386/kernel/i8253.c
@@ -2,7 +2,7 @@
  * i8253.c  8253/PIT functions
  *
  */
-#include <linux/clocksource.h>
+#include <linux/clockchips.h>
 #include <linux/spinlock.h>
 #include <linux/jiffies.h>
 #include <linux/sysdev.h>
@@ -19,17 +19,97 @@
 DEFINE_SPINLOCK(i8253_lock);
 EXPORT_SYMBOL(i8253_lock);
 
-void setup_pit_timer(void)
+/*
+ * HPET replaces the PIT, when enabled. So we need to know, which of
+ * the two timers is used
+ */
+struct clock_event_device *global_clock_event;
+
+/*
+ * Initialize the PIT timer.
+ *
+ * This is also called after resume to bring the PIT into operation again.
+ */
+static void init_pit_timer(enum clock_event_mode mode,
+			   struct clock_event_device *evt)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&i8253_lock, flags);
+
+	switch(mode) {
+	case CLOCK_EVT_MODE_PERIODIC:
+		/* binary, mode 2, LSB/MSB, ch 0 */
+		outb_p(0x34, PIT_MODE);
+		udelay(10);
+		outb_p(LATCH & 0xff , PIT_CH0);	/* LSB */
+		udelay(10);
+		outb(LATCH >> 8 , PIT_CH0);	/* MSB */
+		break;
+
+	case CLOCK_EVT_MODE_ONESHOT:
+	case CLOCK_EVT_MODE_SHUTDOWN:
+	case CLOCK_EVT_MODE_UNUSED:
+		/* One shot setup */
+		outb_p(0x38, PIT_MODE);
+		udelay(10);
+		break;
+	}
+	spin_unlock_irqrestore(&i8253_lock, flags);
+}
+
+/*
+ * Program the next event in oneshot mode
+ *
+ * Delta is given in PIT ticks
+ */
+static int pit_next_event(unsigned long delta, struct clock_event_device *evt)
 {
 	unsigned long flags;
 
 	spin_lock_irqsave(&i8253_lock, flags);
-	outb_p(0x34,PIT_MODE);		/* binary, mode 2, LSB/MSB, ch 0 */
-	udelay(10);
-	outb_p(LATCH & 0xff , PIT_CH0);	/* LSB */
-	udelay(10);
-	outb(LATCH >> 8 , PIT_CH0);	/* MSB */
+	outb_p(delta & 0xff , PIT_CH0);	/* LSB */
+	outb(delta >> 8 , PIT_CH0);	/* MSB */
 	spin_unlock_irqrestore(&i8253_lock, flags);
+
+	return 0;
+}
+
+/*
+ * On UP the PIT can serve all of the possible timer functions. On SMP systems
+ * it can be solely used for the global tick.
+ *
+ * The profiling and update capabilites are switched off once the local apic is
+ * registered. This mechanism replaces the previous #ifdef LOCAL_APIC -
+ * !using_apic_timer decisions in do_timer_interrupt_hook()
+ */
+struct clock_event_device pit_clockevent = {
+	.name		= "pit",
+	.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
+	.set_mode	= init_pit_timer,
+	.set_next_event = pit_next_event,
+	.shift		= 32,
+	.irq		= 0,
+};
+
+/*
+ * Initialize the conversion factor and the min/max deltas of the clock event
+ * structure and register the clock event source with the framework.
+ */
+void __init setup_pit_timer(void)
+{
+	/*
+	 * Start pit with the boot cpu mask and make it global after the
+	 * IO_APIC has been initialized.
+	 */
+	pit_clockevent.cpumask = cpumask_of_cpu(0);
+	pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, 32);
+	pit_clockevent.max_delta_ns =
+		clockevent_delta2ns(0x7FFF, &pit_clockevent);
+	pit_clockevent.min_delta_ns =
+		clockevent_delta2ns(0xF, &pit_clockevent);
+	clockevents_register_device(&pit_clockevent);
+	global_clock_event = &pit_clockevent;
 }
 
 /*
@@ -46,7 +126,7 @@ static cycle_t pit_read(void)
 	static u32 old_jifs;
 
 	spin_lock_irqsave(&i8253_lock, flags);
-        /*
+	/*
 	 * Although our caller may have the read side of xtime_lock,
 	 * this is now a seqlock, and we are cheating in this routine
 	 * by having side effects on state that we cannot undo if
diff --git a/arch/i386/kernel/i8259.c b/arch/i386/kernel/i8259.c
index c8d45821c788..03abfdb1a6e4 100644
--- a/arch/i386/kernel/i8259.c
+++ b/arch/i386/kernel/i8259.c
@@ -41,6 +41,7 @@ static void mask_and_ack_8259A(unsigned int);
 static struct irq_chip i8259A_chip = {
 	.name		= "XT-PIC",
 	.mask		= disable_8259A_irq,
+	.disable	= disable_8259A_irq,
 	.unmask		= enable_8259A_irq,
 	.mask_ack	= mask_and_ack_8259A,
 };
@@ -410,12 +411,6 @@ void __init native_init_IRQ(void)
 	intr_init_hook();
 
 	/*
-	 * Set the clock to HZ Hz, we already have a valid
-	 * vector now:
-	 */
-	setup_pit_timer();
-
-	/*
 	 * External FPU? Set up irq13 if so, for
 	 * original braindamaged IBM FERR coupling.
 	 */
diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index 6a3875f81a0a..4ccebd454e25 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -126,7 +126,7 @@ static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned i
  */
 static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
 {
-	volatile struct io_apic *io_apic = io_apic_base(apic);
+	volatile struct io_apic __iomem *io_apic = io_apic_base(apic);
 	if (sis_apic_bug)
 		writel(reg, &io_apic->index);
 	writel(value, &io_apic->data);
@@ -482,8 +482,8 @@ static void do_irq_balance(void)
 		package_index = CPU_TO_PACKAGEINDEX(i);
 		for (j = 0; j < NR_IRQS; j++) {
 			unsigned long value_now, delta;
-			/* Is this an active IRQ? */
-			if (!irq_desc[j].action)
+			/* Is this an active IRQ or balancing disabled ? */
+			if (!irq_desc[j].action || irq_balancing_disabled(j))
 				continue;
 			if ( package_index == i )
 				IRQ_DELTA(package_index,j) = 0;
@@ -1281,11 +1281,9 @@ static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
 			trigger == IOAPIC_LEVEL)
 		set_irq_chip_and_handler_name(irq, &ioapic_chip,
 					 handle_fasteoi_irq, "fasteoi");
-	else {
-		irq_desc[irq].status |= IRQ_DELAYED_DISABLE;
+	else
 		set_irq_chip_and_handler_name(irq, &ioapic_chip,
 					 handle_edge_irq, "edge");
-	}
 	set_intr_gate(vector, interrupt[irq]);
 }
 
@@ -1588,7 +1586,7 @@ void /*__init*/ print_local_APIC(void * dummy)
 	v = apic_read(APIC_LVR);
 	printk(KERN_INFO "... APIC VERSION: %08x\n", v);
 	ver = GET_APIC_VERSION(v);
-	maxlvt = get_maxlvt();
+	maxlvt = lapic_get_maxlvt();
 
 	v = apic_read(APIC_TASKPRI);
 	printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
@@ -1920,7 +1918,7 @@ static void __init setup_ioapic_ids_from_mpc(void)
 static void __init setup_ioapic_ids_from_mpc(void) { }
 #endif
 
-static int no_timer_check __initdata;
+int no_timer_check __initdata;
 
 static int __init notimercheck(char *s)
 {
@@ -2310,7 +2308,7 @@ static inline void __init check_timer(void)
 
 	disable_8259A_irq(0);
 	set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq,
-				      "fasteio");
+				      "fasteoi");
 	apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector);	/* Fixed mode */
 	enable_8259A_irq(0);
 
@@ -2606,25 +2604,32 @@ static struct irq_chip msi_chip = {
 	.retrigger	= ioapic_retrigger_irq,
 };
 
-int arch_setup_msi_irq(unsigned int irq, struct pci_dev *dev)
+int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
 {
 	struct msi_msg msg;
-	int ret;
+	int irq, ret;
+	irq = create_irq();
+	if (irq < 0)
+		return irq;
+
+	set_irq_msi(irq, desc);
 	ret = msi_compose_msg(dev, irq, &msg);
-	if (ret < 0)
+	if (ret < 0) {
+		destroy_irq(irq);
 		return ret;
+	}
 
 	write_msi_msg(irq, &msg);
 
 	set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq,
 				      "edge");
 
-	return 0;
+	return irq;
 }
 
 void arch_teardown_msi_irq(unsigned int irq)
 {
-	return;
+	destroy_irq(irq);
 }
 
 #endif /* CONFIG_PCI_MSI */
diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c
index 3201d421090a..0f2ca590bf23 100644
--- a/arch/i386/kernel/irq.c
+++ b/arch/i386/kernel/irq.c
@@ -10,7 +10,6 @@
  * io_apic.c.)
  */
 
-#include <asm/uaccess.h>
 #include <linux/module.h>
 #include <linux/seq_file.h>
 #include <linux/interrupt.h>
@@ -19,19 +18,36 @@
 #include <linux/cpu.h>
 #include <linux/delay.h>
 
+#include <asm/idle.h>
+
+#include <asm/apic.h>
+#include <asm/uaccess.h>
+
 DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp;
 EXPORT_PER_CPU_SYMBOL(irq_stat);
 
-#ifndef CONFIG_X86_LOCAL_APIC
 /*
  * 'what should we do if we get a hw irq event on an illegal vector'.
  * each architecture has to answer this themselves.
  */
 void ack_bad_irq(unsigned int irq)
 {
-	printk("unexpected IRQ trap at vector %02x\n", irq);
-}
+	printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq);
+
+#ifdef CONFIG_X86_LOCAL_APIC
+	/*
+	 * Currently unexpected vectors happen only on SMP and APIC.
+	 * We _must_ ack these because every local APIC has only N
+	 * irq slots per priority level, and a 'hanging, unacked' IRQ
+	 * holds up an irq slot - in excessive cases (when multiple
+	 * unexpected vectors occur) that might lock up the APIC
+	 * completely.
+	 * But only ack when the APIC is enabled -AK
+	 */
+	if (cpu_has_apic)
+		ack_APIC_irq();
 #endif
+}
 
 #ifdef CONFIG_4KSTACKS
 /*
@@ -61,6 +77,7 @@ fastcall unsigned int do_IRQ(struct pt_regs *regs)
 	union irq_ctx *curctx, *irqctx;
 	u32 *isp;
 #endif
+	exit_idle();
 
 	if (unlikely((unsigned)irq >= NR_IRQS)) {
 		printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c
index af1d53344993..b545bc746fce 100644
--- a/arch/i386/kernel/kprobes.c
+++ b/arch/i386/kernel/kprobes.c
@@ -363,7 +363,7 @@ no_kprobe:
 			"	pushf\n"
 			/* skip cs, eip, orig_eax */
 			"	subl $12, %esp\n"
-			"	pushl %gs\n"
+			"	pushl %fs\n"
 			"	pushl %ds\n"
 			"	pushl %es\n"
 			"	pushl %eax\n"
@@ -387,7 +387,7 @@ no_kprobe:
 			"	popl %edi\n"
 			"	popl %ebp\n"
 			"	popl %eax\n"
-			/* skip eip, orig_eax, es, ds, gs */
+			/* skip eip, orig_eax, es, ds, fs */
 			"	addl $20, %esp\n"
 			"	popf\n"
 			"	ret\n");
@@ -408,7 +408,7 @@ fastcall void *__kprobes trampoline_handler(struct pt_regs *regs)
 	spin_lock_irqsave(&kretprobe_lock, flags);
 	head = kretprobe_inst_table_head(current);
 	/* fixup registers */
-	regs->xcs = __KERNEL_CS;
+	regs->xcs = __KERNEL_CS | get_kernel_rpl();
 	regs->eip = trampoline_address;
 	regs->orig_eax = 0xffffffff;
 
diff --git a/arch/i386/kernel/microcode.c b/arch/i386/kernel/microcode.c
index c8fa13721bcb..b8f16633a6ec 100644
--- a/arch/i386/kernel/microcode.c
+++ b/arch/i386/kernel/microcode.c
@@ -384,7 +384,7 @@ static int do_microcode_update (void)
 {
 	long cursor = 0;
 	int error = 0;
-	void *new_mc;
+	void *new_mc = NULL;
 	int cpu;
 	cpumask_t old;
 
@@ -451,7 +451,7 @@ static ssize_t microcode_write (struct file *file, const char __user *buf, size_
 	return ret;
 }
 
-static struct file_operations microcode_fops = {
+static const struct file_operations microcode_fops = {
 	.owner		= THIS_MODULE,
 	.write		= microcode_write,
 	.open		= microcode_open,
diff --git a/arch/i386/kernel/mpparse.c b/arch/i386/kernel/mpparse.c
index 49bff3596bff..4f5983c98669 100644
--- a/arch/i386/kernel/mpparse.c
+++ b/arch/i386/kernel/mpparse.c
@@ -1057,7 +1057,7 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity)
 	static int		gsi_to_irq[MAX_GSI_NUM];
 
 	/* Don't set up the ACPI SCI because it's already set up */
-	if (acpi_fadt.sci_int == gsi)
+	if (acpi_gbl_FADT.sci_interrupt == gsi)
 		return gsi;
 
 	ioapic = mp_find_ioapic(gsi);
@@ -1114,7 +1114,7 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity)
 			/*
 			 * Don't assign IRQ used by ACPI SCI
 			 */
-			if (gsi == acpi_fadt.sci_int)
+			if (gsi == acpi_gbl_FADT.sci_interrupt)
 				gsi = pci_irq++;
 			gsi_to_irq[irq] = gsi;
 		} else {
diff --git a/arch/i386/kernel/msr.c b/arch/i386/kernel/msr.c
index 4a472a17d1c6..bcaa6e9b6197 100644
--- a/arch/i386/kernel/msr.c
+++ b/arch/i386/kernel/msr.c
@@ -68,7 +68,6 @@ static inline int rdmsr_eio(u32 reg, u32 *eax, u32 *edx)
 #ifdef CONFIG_SMP
 
 struct msr_command {
-	int cpu;
 	int err;
 	u32 reg;
 	u32 data[2];
@@ -78,16 +77,14 @@ static void msr_smp_wrmsr(void *cmd_block)
 {
 	struct msr_command *cmd = (struct msr_command *)cmd_block;
 
-	if (cmd->cpu == smp_processor_id())
-		cmd->err = wrmsr_eio(cmd->reg, cmd->data[0], cmd->data[1]);
+	cmd->err = wrmsr_eio(cmd->reg, cmd->data[0], cmd->data[1]);
 }
 
 static void msr_smp_rdmsr(void *cmd_block)
 {
 	struct msr_command *cmd = (struct msr_command *)cmd_block;
 
-	if (cmd->cpu == smp_processor_id())
-		cmd->err = rdmsr_eio(cmd->reg, &cmd->data[0], &cmd->data[1]);
+	cmd->err = rdmsr_eio(cmd->reg, &cmd->data[0], &cmd->data[1]);
 }
 
 static inline int do_wrmsr(int cpu, u32 reg, u32 eax, u32 edx)
@@ -99,12 +96,11 @@ static inline int do_wrmsr(int cpu, u32 reg, u32 eax, u32 edx)
 	if (cpu == smp_processor_id()) {
 		ret = wrmsr_eio(reg, eax, edx);
 	} else {
-		cmd.cpu = cpu;
 		cmd.reg = reg;
 		cmd.data[0] = eax;
 		cmd.data[1] = edx;
 
-		smp_call_function(msr_smp_wrmsr, &cmd, 1, 1);
+		smp_call_function_single(cpu, msr_smp_wrmsr, &cmd, 1, 1);
 		ret = cmd.err;
 	}
 	preempt_enable();
@@ -120,10 +116,9 @@ static inline int do_rdmsr(int cpu, u32 reg, u32 * eax, u32 * edx)
 	if (cpu == smp_processor_id()) {
 		ret = rdmsr_eio(reg, eax, edx);
 	} else {
-		cmd.cpu = cpu;
 		cmd.reg = reg;
 
-		smp_call_function(msr_smp_rdmsr, &cmd, 1, 1);
+		smp_call_function_single(cpu, msr_smp_rdmsr, &cmd, 1, 1);
 
 		*eax = cmd.data[0];
 		*edx = cmd.data[1];
@@ -230,7 +225,7 @@ static int msr_open(struct inode *inode, struct file *file)
 /*
  * File operations we support
  */
-static struct file_operations msr_fops = {
+static const struct file_operations msr_fops = {
 	.owner = THIS_MODULE,
 	.llseek = msr_seek,
 	.read = msr_read,
diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index 1a6f8bb8881c..821df34d2b3a 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -23,6 +23,7 @@
 #include <linux/dmi.h>
 #include <linux/kprobes.h>
 #include <linux/cpumask.h>
+#include <linux/kernel_stat.h>
 
 #include <asm/smp.h>
 #include <asm/nmi.h>
@@ -185,7 +186,8 @@ static __cpuinit inline int nmi_known_cpu(void)
 {
 	switch (boot_cpu_data.x86_vendor) {
 	case X86_VENDOR_AMD:
-		return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6));
+		return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6)
+			|| (boot_cpu_data.x86 == 16));
 	case X86_VENDOR_INTEL:
 		if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
 			return 1;
@@ -216,6 +218,28 @@ static __init void nmi_cpu_busy(void *data)
 }
 #endif
 
+static unsigned int adjust_for_32bit_ctr(unsigned int hz)
+{
+	u64 counter_val;
+	unsigned int retval = hz;
+
+	/*
+	 * On Intel CPUs with P6/ARCH_PERFMON only 32 bits in the counter
+	 * are writable, with higher bits sign extending from bit 31.
+	 * So, we can only program the counter with 31 bit values and
+	 * 32nd bit should be 1, for 33.. to be 1.
+	 * Find the appropriate nmi_hz
+	 */
+	counter_val = (u64)cpu_khz * 1000;
+	do_div(counter_val, retval);
+ 	if (counter_val > 0x7fffffffULL) {
+		u64 count = (u64)cpu_khz * 1000;
+		do_div(count, 0x7fffffffUL);
+		retval = count + 1;
+	}
+	return retval;
+}
+
 static int __init check_nmi_watchdog(void)
 {
 	unsigned int *prev_nmi_count;
@@ -281,18 +305,10 @@ static int __init check_nmi_watchdog(void)
 		struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
 
 		nmi_hz = 1;
-		/*
-		 * On Intel CPUs with ARCH_PERFMON only 32 bits in the counter
-		 * are writable, with higher bits sign extending from bit 31.
-		 * So, we can only program the counter with 31 bit values and
-		 * 32nd bit should be 1, for 33.. to be 1.
-		 * Find the appropriate nmi_hz
-		 */
-	 	if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0 &&
-			((u64)cpu_khz * 1000) > 0x7fffffffULL) {
-			u64 count = (u64)cpu_khz * 1000;
-			do_div(count, 0x7fffffffUL);
-			nmi_hz = count + 1;
+
+		if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
+		    wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
+			nmi_hz = adjust_for_32bit_ctr(nmi_hz);
 		}
 	}
 
@@ -369,6 +385,34 @@ void enable_timer_nmi_watchdog(void)
 	}
 }
 
+static void __acpi_nmi_disable(void *__unused)
+{
+	apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
+}
+
+/*
+ * Disable timer based NMIs on all CPUs:
+ */
+void acpi_nmi_disable(void)
+{
+	if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
+		on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
+}
+
+static void __acpi_nmi_enable(void *__unused)
+{
+	apic_write_around(APIC_LVT0, APIC_DM_NMI);
+}
+
+/*
+ * Enable timer based NMIs on all CPUs:
+ */
+void acpi_nmi_enable(void)
+{
+	if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
+		on_each_cpu(__acpi_nmi_enable, NULL, 0, 1);
+}
+
 #ifdef CONFIG_PM
 
 static int nmi_pm_active; /* nmi_active before suspend */
@@ -442,6 +486,17 @@ static void write_watchdog_counter(unsigned int perfctr_msr, const char *descr)
 	wrmsrl(perfctr_msr, 0 - count);
 }
 
+static void write_watchdog_counter32(unsigned int perfctr_msr,
+		const char *descr)
+{
+	u64 count = (u64)cpu_khz * 1000;
+
+	do_div(count, nmi_hz);
+	if(descr)
+		Dprintk("setting %s to -0x%08Lx\n", descr, count);
+	wrmsr(perfctr_msr, (u32)(-count), 0);
+}
+
 /* Note that these events don't tick when the CPU idles. This means
    the frequency varies with CPU load. */
 
@@ -531,7 +586,8 @@ static int setup_p6_watchdog(void)
 
 	/* setup the timer */
 	wrmsr(evntsel_msr, evntsel, 0);
-	write_watchdog_counter(perfctr_msr, "P6_PERFCTR0");
+	nmi_hz = adjust_for_32bit_ctr(nmi_hz);
+	write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0");
 	apic_write(APIC_LVTPC, APIC_DM_NMI);
 	evntsel |= P6_EVNTSEL0_ENABLE;
 	wrmsr(evntsel_msr, evntsel, 0);
@@ -704,7 +760,8 @@ static int setup_intel_arch_watchdog(void)
 
 	/* setup the timer */
 	wrmsr(evntsel_msr, evntsel, 0);
-	write_watchdog_counter(perfctr_msr, "INTEL_ARCH_PERFCTR0");
+	nmi_hz = adjust_for_32bit_ctr(nmi_hz);
+	write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0");
 	apic_write(APIC_LVTPC, APIC_DM_NMI);
 	evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
 	wrmsr(evntsel_msr, evntsel, 0);
@@ -762,7 +819,8 @@ void setup_apic_nmi_watchdog (void *unused)
 	if (nmi_watchdog == NMI_LOCAL_APIC) {
 		switch (boot_cpu_data.x86_vendor) {
 		case X86_VENDOR_AMD:
-			if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15)
+			if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 &&
+				boot_cpu_data.x86 != 16)
 				return;
 			if (!setup_k7_watchdog())
 				return;
@@ -916,9 +974,13 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
 		cpu_clear(cpu, backtrace_mask);
 	}
 
-	sum = per_cpu(irq_stat, cpu).apic_timer_irqs;
+	/*
+	 * Take the local apic timer and PIT/HPET into account. We don't
+	 * know which one is active, when we have highres/dyntick on
+	 */
+	sum = per_cpu(irq_stat, cpu).apic_timer_irqs + kstat_irqs(0);
 
-	/* if the apic timer isn't firing, this cpu isn't doing much */
+	/* if the none of the timers isn't firing, this cpu isn't doing much */
 	if (!touched && last_irq_sums[cpu] == sum) {
 		/*
 		 * Ayiee, looks like this CPU is stuck ...
@@ -956,6 +1018,8 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
 				dummy &= ~P4_CCCR_OVF;
 	 			wrmsrl(wd->cccr_msr, dummy);
 	 			apic_write(APIC_LVTPC, APIC_DM_NMI);
+				/* start the cycle over again */
+				write_watchdog_counter(wd->perfctr_msr, NULL);
 	 		}
 			else if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
 				 wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
@@ -964,9 +1028,12 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
 				 * other P6 variant.
 				 * ArchPerfom/Core Duo also needs this */
 				apic_write(APIC_LVTPC, APIC_DM_NMI);
+				/* P6/ARCH_PERFMON has 32 bit counter write */
+				write_watchdog_counter32(wd->perfctr_msr, NULL);
+			} else {
+				/* start the cycle over again */
+				write_watchdog_counter(wd->perfctr_msr, NULL);
 			}
-			/* start the cycle over again */
-			write_watchdog_counter(wd->perfctr_msr, NULL);
 			rc = 1;
 		} else if (nmi_watchdog == NMI_IO_APIC) {
 			/* don't know how to accurately check for this.
diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c
index e55fd05da0f5..c156ecfa3872 100644
--- a/arch/i386/kernel/paravirt.c
+++ b/arch/i386/kernel/paravirt.c
@@ -92,7 +92,7 @@ static unsigned native_patch(u8 type, u16 clobbers, void *insns, unsigned len)
 	return insn_len;
 }
 
-static fastcall unsigned long native_get_debugreg(int regno)
+static unsigned long native_get_debugreg(int regno)
 {
 	unsigned long val = 0; 	/* Damn you, gcc! */
 
@@ -115,7 +115,7 @@ static fastcall unsigned long native_get_debugreg(int regno)
 	return val;
 }
 
-static fastcall void native_set_debugreg(int regno, unsigned long value)
+static void native_set_debugreg(int regno, unsigned long value)
 {
 	switch (regno) {
 	case 0:
@@ -146,55 +146,55 @@ void init_IRQ(void)
 	paravirt_ops.init_IRQ();
 }
 
-static fastcall void native_clts(void)
+static void native_clts(void)
 {
 	asm volatile ("clts");
 }
 
-static fastcall unsigned long native_read_cr0(void)
+static unsigned long native_read_cr0(void)
 {
 	unsigned long val;
 	asm volatile("movl %%cr0,%0\n\t" :"=r" (val));
 	return val;
 }
 
-static fastcall void native_write_cr0(unsigned long val)
+static void native_write_cr0(unsigned long val)
 {
 	asm volatile("movl %0,%%cr0": :"r" (val));
 }
 
-static fastcall unsigned long native_read_cr2(void)
+static unsigned long native_read_cr2(void)
 {
 	unsigned long val;
 	asm volatile("movl %%cr2,%0\n\t" :"=r" (val));
 	return val;
 }
 
-static fastcall void native_write_cr2(unsigned long val)
+static void native_write_cr2(unsigned long val)
 {
 	asm volatile("movl %0,%%cr2": :"r" (val));
 }
 
-static fastcall unsigned long native_read_cr3(void)
+static unsigned long native_read_cr3(void)
 {
 	unsigned long val;
 	asm volatile("movl %%cr3,%0\n\t" :"=r" (val));
 	return val;
 }
 
-static fastcall void native_write_cr3(unsigned long val)
+static void native_write_cr3(unsigned long val)
 {
 	asm volatile("movl %0,%%cr3": :"r" (val));
 }
 
-static fastcall unsigned long native_read_cr4(void)
+static unsigned long native_read_cr4(void)
 {
 	unsigned long val;
 	asm volatile("movl %%cr4,%0\n\t" :"=r" (val));
 	return val;
 }
 
-static fastcall unsigned long native_read_cr4_safe(void)
+static unsigned long native_read_cr4_safe(void)
 {
 	unsigned long val;
 	/* This could fault if %cr4 does not exist */
@@ -207,51 +207,51 @@ static fastcall unsigned long native_read_cr4_safe(void)
 	return val;
 }
 
-static fastcall void native_write_cr4(unsigned long val)
+static void native_write_cr4(unsigned long val)
 {
 	asm volatile("movl %0,%%cr4": :"r" (val));
 }
 
-static fastcall unsigned long native_save_fl(void)
+static unsigned long native_save_fl(void)
 {
 	unsigned long f;
 	asm volatile("pushfl ; popl %0":"=g" (f): /* no input */);
 	return f;
 }
 
-static fastcall void native_restore_fl(unsigned long f)
+static void native_restore_fl(unsigned long f)
 {
 	asm volatile("pushl %0 ; popfl": /* no output */
 			     :"g" (f)
 			     :"memory", "cc");
 }
 
-static fastcall void native_irq_disable(void)
+static void native_irq_disable(void)
 {
 	asm volatile("cli": : :"memory");
 }
 
-static fastcall void native_irq_enable(void)
+static void native_irq_enable(void)
 {
 	asm volatile("sti": : :"memory");
 }
 
-static fastcall void native_safe_halt(void)
+static void native_safe_halt(void)
 {
 	asm volatile("sti; hlt": : :"memory");
 }
 
-static fastcall void native_halt(void)
+static void native_halt(void)
 {
 	asm volatile("hlt": : :"memory");
 }
 
-static fastcall void native_wbinvd(void)
+static void native_wbinvd(void)
 {
 	asm volatile("wbinvd": : :"memory");
 }
 
-static fastcall unsigned long long native_read_msr(unsigned int msr, int *err)
+static unsigned long long native_read_msr(unsigned int msr, int *err)
 {
 	unsigned long long val;
 
@@ -270,7 +270,7 @@ static fastcall unsigned long long native_read_msr(unsigned int msr, int *err)
 	return val;
 }
 
-static fastcall int native_write_msr(unsigned int msr, unsigned long long val)
+static int native_write_msr(unsigned int msr, unsigned long long val)
 {
 	int err;
 	asm volatile("2: wrmsr ; xorl %0,%0\n"
@@ -288,53 +288,53 @@ static fastcall int native_write_msr(unsigned int msr, unsigned long long val)
 	return err;
 }
 
-static fastcall unsigned long long native_read_tsc(void)
+static unsigned long long native_read_tsc(void)
 {
 	unsigned long long val;
 	asm volatile("rdtsc" : "=A" (val));
 	return val;
 }
 
-static fastcall unsigned long long native_read_pmc(void)
+static unsigned long long native_read_pmc(void)
 {
 	unsigned long long val;
 	asm volatile("rdpmc" : "=A" (val));
 	return val;
 }
 
-static fastcall void native_load_tr_desc(void)
+static void native_load_tr_desc(void)
 {
 	asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
 }
 
-static fastcall void native_load_gdt(const struct Xgt_desc_struct *dtr)
+static void native_load_gdt(const struct Xgt_desc_struct *dtr)
 {
 	asm volatile("lgdt %0"::"m" (*dtr));
 }
 
-static fastcall void native_load_idt(const struct Xgt_desc_struct *dtr)
+static void native_load_idt(const struct Xgt_desc_struct *dtr)
 {
 	asm volatile("lidt %0"::"m" (*dtr));
 }
 
-static fastcall void native_store_gdt(struct Xgt_desc_struct *dtr)
+static void native_store_gdt(struct Xgt_desc_struct *dtr)
 {
 	asm ("sgdt %0":"=m" (*dtr));
 }
 
-static fastcall void native_store_idt(struct Xgt_desc_struct *dtr)
+static void native_store_idt(struct Xgt_desc_struct *dtr)
 {
 	asm ("sidt %0":"=m" (*dtr));
 }
 
-static fastcall unsigned long native_store_tr(void)
+static unsigned long native_store_tr(void)
 {
 	unsigned long tr;
 	asm ("str %0":"=r" (tr));
 	return tr;
 }
 
-static fastcall void native_load_tls(struct thread_struct *t, unsigned int cpu)
+static void native_load_tls(struct thread_struct *t, unsigned int cpu)
 {
 #define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]
 	C(0); C(1); C(2);
@@ -348,22 +348,22 @@ static inline void native_write_dt_entry(void *dt, int entry, u32 entry_low, u32
 	lp[1] = entry_high;
 }
 
-static fastcall void native_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high)
+static void native_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high)
 {
 	native_write_dt_entry(dt, entrynum, low, high);
 }
 
-static fastcall void native_write_gdt_entry(void *dt, int entrynum, u32 low, u32 high)
+static void native_write_gdt_entry(void *dt, int entrynum, u32 low, u32 high)
 {
 	native_write_dt_entry(dt, entrynum, low, high);
 }
 
-static fastcall void native_write_idt_entry(void *dt, int entrynum, u32 low, u32 high)
+static void native_write_idt_entry(void *dt, int entrynum, u32 low, u32 high)
 {
 	native_write_dt_entry(dt, entrynum, low, high);
 }
 
-static fastcall void native_load_esp0(struct tss_struct *tss,
+static void native_load_esp0(struct tss_struct *tss,
 				      struct thread_struct *thread)
 {
 	tss->esp0 = thread->esp0;
@@ -375,12 +375,12 @@ static fastcall void native_load_esp0(struct tss_struct *tss,
 	}
 }
 
-static fastcall void native_io_delay(void)
+static void native_io_delay(void)
 {
 	asm volatile("outb %al,$0x80");
 }
 
-static fastcall void native_flush_tlb(void)
+static void native_flush_tlb(void)
 {
 	__native_flush_tlb();
 }
@@ -389,49 +389,49 @@ static fastcall void native_flush_tlb(void)
  * Global pages have to be flushed a bit differently. Not a real
  * performance problem because this does not happen often.
  */
-static fastcall void native_flush_tlb_global(void)
+static void native_flush_tlb_global(void)
 {
 	__native_flush_tlb_global();
 }
 
-static fastcall void native_flush_tlb_single(u32 addr)
+static void native_flush_tlb_single(u32 addr)
 {
 	__native_flush_tlb_single(addr);
 }
 
 #ifndef CONFIG_X86_PAE
-static fastcall void native_set_pte(pte_t *ptep, pte_t pteval)
+static void native_set_pte(pte_t *ptep, pte_t pteval)
 {
 	*ptep = pteval;
 }
 
-static fastcall void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval)
+static void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval)
 {
 	*ptep = pteval;
 }
 
-static fastcall void native_set_pmd(pmd_t *pmdp, pmd_t pmdval)
+static void native_set_pmd(pmd_t *pmdp, pmd_t pmdval)
 {
 	*pmdp = pmdval;
 }
 
 #else /* CONFIG_X86_PAE */
 
-static fastcall void native_set_pte(pte_t *ptep, pte_t pte)
+static void native_set_pte(pte_t *ptep, pte_t pte)
 {
 	ptep->pte_high = pte.pte_high;
 	smp_wmb();
 	ptep->pte_low = pte.pte_low;
 }
 
-static fastcall void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte)
+static void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte)
 {
 	ptep->pte_high = pte.pte_high;
 	smp_wmb();
 	ptep->pte_low = pte.pte_low;
 }
 
-static fastcall void native_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
+static void native_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
 {
 	ptep->pte_low = 0;
 	smp_wmb();
@@ -440,29 +440,29 @@ static fastcall void native_set_pte_present(struct mm_struct *mm, unsigned long
 	ptep->pte_low = pte.pte_low;
 }
 
-static fastcall void native_set_pte_atomic(pte_t *ptep, pte_t pteval)
+static void native_set_pte_atomic(pte_t *ptep, pte_t pteval)
 {
 	set_64bit((unsigned long long *)ptep,pte_val(pteval));
 }
 
-static fastcall void native_set_pmd(pmd_t *pmdp, pmd_t pmdval)
+static void native_set_pmd(pmd_t *pmdp, pmd_t pmdval)
 {
 	set_64bit((unsigned long long *)pmdp,pmd_val(pmdval));
 }
 
-static fastcall void native_set_pud(pud_t *pudp, pud_t pudval)
+static void native_set_pud(pud_t *pudp, pud_t pudval)
 {
 	*pudp = pudval;
 }
 
-static fastcall void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+static void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 {
 	ptep->pte_low = 0;
 	smp_wmb();
 	ptep->pte_high = 0;
 }
 
-static fastcall void native_pmd_clear(pmd_t *pmd)
+static void native_pmd_clear(pmd_t *pmd)
 {
 	u32 *tmp = (u32 *)pmd;
 	*tmp = 0;
@@ -472,8 +472,8 @@ static fastcall void native_pmd_clear(pmd_t *pmd)
 #endif /* CONFIG_X86_PAE */
 
 /* These are in entry.S */
-extern fastcall void native_iret(void);
-extern fastcall void native_irq_enable_sysexit(void);
+extern void native_iret(void);
+extern void native_irq_enable_sysexit(void);
 
 static int __init print_banner(void)
 {
@@ -482,9 +482,6 @@ static int __init print_banner(void)
 }
 core_initcall(print_banner);
 
-/* We simply declare start_kernel to be the paravirt probe of last resort. */
-paravirt_probe(start_kernel);
-
 struct paravirt_ops paravirt_ops = {
 	.name = "bare hardware",
 	.paravirt_enabled = 0,
@@ -544,12 +541,21 @@ struct paravirt_ops paravirt_ops = {
 	.apic_write = native_apic_write,
 	.apic_write_atomic = native_apic_write_atomic,
 	.apic_read = native_apic_read,
+	.setup_boot_clock = setup_boot_APIC_clock,
+	.setup_secondary_clock = setup_secondary_APIC_clock,
 #endif
+	.set_lazy_mode = (void *)native_nop,
 
 	.flush_tlb_user = native_flush_tlb,
 	.flush_tlb_kernel = native_flush_tlb_global,
 	.flush_tlb_single = native_flush_tlb_single,
 
+	.alloc_pt = (void *)native_nop,
+	.alloc_pd = (void *)native_nop,
+	.alloc_pd_clone = (void *)native_nop,
+	.release_pt = (void *)native_nop,
+	.release_pd = (void *)native_nop,
+
 	.set_pte = native_set_pte,
 	.set_pte_at = native_set_pte_at,
 	.set_pmd = native_set_pmd,
@@ -565,6 +571,8 @@ struct paravirt_ops paravirt_ops = {
 
 	.irq_enable_sysexit = native_irq_enable_sysexit,
 	.iret = native_iret,
+
+	.startup_ipi_hook = (void *)native_nop,
 };
 
 /*
diff --git a/arch/i386/kernel/pcspeaker.c b/arch/i386/kernel/pcspeaker.c
new file mode 100644
index 000000000000..bc1f2d3ea277
--- /dev/null
+++ b/arch/i386/kernel/pcspeaker.c
@@ -0,0 +1,20 @@
+#include <linux/platform_device.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+
+static __init int add_pcspkr(void)
+{
+	struct platform_device *pd;
+	int ret;
+
+	pd = platform_device_alloc("pcspkr", -1);
+	if (!pd)
+		return -ENOMEM;
+
+	ret = platform_device_add(pd);
+	if (ret)
+		platform_device_put(pd);
+
+	return ret;
+}
+device_initcall(add_pcspkr);
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index c641056233a6..bea304d48cdb 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -38,6 +38,7 @@
 #include <linux/ptrace.h>
 #include <linux/random.h>
 #include <linux/personality.h>
+#include <linux/tick.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -48,6 +49,7 @@
 #include <asm/i387.h>
 #include <asm/desc.h>
 #include <asm/vm86.h>
+#include <asm/idle.h>
 #ifdef CONFIG_MATH_EMULATION
 #include <asm/math_emu.h>
 #endif
@@ -80,6 +82,42 @@ void (*pm_idle)(void);
 EXPORT_SYMBOL(pm_idle);
 static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
 
+static ATOMIC_NOTIFIER_HEAD(idle_notifier);
+
+void idle_notifier_register(struct notifier_block *n)
+{
+	atomic_notifier_chain_register(&idle_notifier, n);
+}
+
+void idle_notifier_unregister(struct notifier_block *n)
+{
+	atomic_notifier_chain_unregister(&idle_notifier, n);
+}
+
+static DEFINE_PER_CPU(volatile unsigned long, idle_state);
+
+void enter_idle(void)
+{
+	/* needs to be atomic w.r.t. interrupts, not against other CPUs */
+	__set_bit(0, &__get_cpu_var(idle_state));
+	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
+}
+
+static void __exit_idle(void)
+{
+	/* needs to be atomic w.r.t. interrupts, not against other CPUs */
+	if (__test_and_clear_bit(0, &__get_cpu_var(idle_state)) == 0)
+		return;
+	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
+}
+
+void exit_idle(void)
+{
+	if (current->pid)
+		return;
+	__exit_idle();
+}
+
 void disable_hlt(void)
 {
 	hlt_counter++;
@@ -130,6 +168,7 @@ EXPORT_SYMBOL(default_idle);
  */
 static void poll_idle (void)
 {
+	local_irq_enable();
 	cpu_relax();
 }
 
@@ -173,6 +212,7 @@ void cpu_idle(void)
 
 	/* endless idle loop with no priority at all */
 	while (1) {
+		tick_nohz_stop_sched_tick();
 		while (!need_resched()) {
 			void (*idle)(void);
 
@@ -189,8 +229,18 @@ void cpu_idle(void)
 				play_dead();
 
 			__get_cpu_var(irq_stat).idle_timestamp = jiffies;
+
+			/*
+			 * Idle routines should keep interrupts disabled
+			 * from here on, until they go to idle.
+			 * Otherwise, idle callbacks can misfire.
+			 */
+			local_irq_disable();
+			enter_idle();
 			idle();
+			__exit_idle();
 		}
+		tick_nohz_restart_sched_tick();
 		preempt_enable_no_resched();
 		schedule();
 		preempt_disable();
@@ -243,7 +293,11 @@ void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
 		__monitor((void *)&current_thread_info()->flags, 0, 0);
 		smp_mb();
 		if (!need_resched())
-			__mwait(eax, ecx);
+			__sti_mwait(eax, ecx);
+		else
+			local_irq_enable();
+	} else {
+		local_irq_enable();
 	}
 }
 
@@ -308,8 +362,8 @@ void show_regs(struct pt_regs * regs)
 		regs->eax,regs->ebx,regs->ecx,regs->edx);
 	printk("ESI: %08lx EDI: %08lx EBP: %08lx",
 		regs->esi, regs->edi, regs->ebp);
-	printk(" DS: %04x ES: %04x GS: %04x\n",
-	       0xffff & regs->xds,0xffff & regs->xes, 0xffff & regs->xgs);
+	printk(" DS: %04x ES: %04x FS: %04x\n",
+	       0xffff & regs->xds,0xffff & regs->xes, 0xffff & regs->xfs);
 
 	cr0 = read_cr0();
 	cr2 = read_cr2();
@@ -340,7 +394,7 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
 
 	regs.xds = __USER_DS;
 	regs.xes = __USER_DS;
-	regs.xgs = __KERNEL_PDA;
+	regs.xfs = __KERNEL_PDA;
 	regs.orig_eax = -1;
 	regs.eip = (unsigned long) kernel_thread_helper;
 	regs.xcs = __KERNEL_CS | get_kernel_rpl();
@@ -425,7 +479,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
 
 	p->thread.eip = (unsigned long) ret_from_fork;
 
-	savesegment(fs,p->thread.fs);
+	savesegment(gs,p->thread.gs);
 
 	tsk = current;
 	if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
@@ -501,8 +555,8 @@ void dump_thread(struct pt_regs * regs, struct user * dump)
 	dump->regs.eax = regs->eax;
 	dump->regs.ds = regs->xds;
 	dump->regs.es = regs->xes;
-	savesegment(fs,dump->regs.fs);
-	dump->regs.gs = regs->xgs;
+	dump->regs.fs = regs->xfs;
+	savesegment(gs,dump->regs.gs);
 	dump->regs.orig_eax = regs->orig_eax;
 	dump->regs.eip = regs->eip;
 	dump->regs.cs = regs->xcs;
@@ -653,7 +707,7 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
 	load_esp0(tss, next);
 
 	/*
-	 * Save away %fs. No need to save %gs, as it was saved on the
+	 * Save away %gs. No need to save %fs, as it was saved on the
 	 * stack on entry.  No need to save %es and %ds, as those are
 	 * always kernel segments while inside the kernel.  Doing this
 	 * before setting the new TLS descriptors avoids the situation
@@ -662,7 +716,7 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
 	 * used %fs or %gs (it does not today), or if the kernel is
 	 * running inside of a hypervisor layer.
 	 */
-	savesegment(fs, prev->fs);
+	savesegment(gs, prev->gs);
 
 	/*
 	 * Load the per-thread Thread-Local Storage descriptor.
@@ -670,14 +724,13 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
 	load_TLS(next, cpu);
 
 	/*
-	 * Restore %fs if needed.
-	 *
-	 * Glibc normally makes %fs be zero.
+	 * Restore IOPL if needed.  In normal use, the flags restore
+	 * in the switch assembly will handle this.  But if the kernel
+	 * is running virtualized at a non-zero CPL, the popf will
+	 * not restore flags, so it must be done in a separate step.
 	 */
-	if (unlikely(prev->fs | next->fs))
-		loadsegment(fs, next->fs);
-
-	write_pda(pcurrent, next_p);
+	if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl))
+		set_iopl_mask(next->iopl);
 
 	/*
 	 * Now maybe handle debug registers and/or IO bitmaps
@@ -688,6 +741,15 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
 
 	disable_tsc(prev_p, next_p);
 
+	/*
+	 * Leave lazy mode, flushing any hypercalls made here.
+	 * This must be done before restoring TLS segments so
+	 * the GDT and LDT are properly updated, and must be
+	 * done before math_state_restore, so the TS bit is up
+	 * to date.
+	 */
+	arch_leave_lazy_cpu_mode();
+
 	/* If the task has used fpu the last 5 timeslices, just do a full
 	 * restore of the math state immediately to avoid the trap; the
 	 * chances of needing FPU soon are obviously high now
@@ -695,6 +757,14 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
 	if (next_p->fpu_counter > 5)
 		math_state_restore();
 
+	/*
+	 * Restore %gs if needed (which is common)
+	 */
+	if (prev->gs | next->gs)
+		loadsegment(gs, next->gs);
+
+	write_pda(pcurrent, next_p);
+
 	return prev_p;
 }
 
diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c
index af8aabe85800..4a8f8a259723 100644
--- a/arch/i386/kernel/ptrace.c
+++ b/arch/i386/kernel/ptrace.c
@@ -89,14 +89,14 @@ static int putreg(struct task_struct *child,
 	unsigned long regno, unsigned long value)
 {
 	switch (regno >> 2) {
-		case FS:
+		case GS:
 			if (value && (value & 3) != 3)
 				return -EIO;
-			child->thread.fs = value;
+			child->thread.gs = value;
 			return 0;
 		case DS:
 		case ES:
-		case GS:
+		case FS:
 			if (value && (value & 3) != 3)
 				return -EIO;
 			value &= 0xffff;
@@ -112,7 +112,7 @@ static int putreg(struct task_struct *child,
 			value |= get_stack_long(child, EFL_OFFSET) & ~FLAG_MASK;
 			break;
 	}
-	if (regno > ES*4)
+	if (regno > FS*4)
 		regno -= 1*4;
 	put_stack_long(child, regno, value);
 	return 0;
@@ -124,18 +124,18 @@ static unsigned long getreg(struct task_struct *child,
 	unsigned long retval = ~0UL;
 
 	switch (regno >> 2) {
-		case FS:
-			retval = child->thread.fs;
+		case GS:
+			retval = child->thread.gs;
 			break;
 		case DS:
 		case ES:
-		case GS:
+		case FS:
 		case SS:
 		case CS:
 			retval = 0xffff;
 			/* fall through */
 		default:
-			if (regno > ES*4)
+			if (regno > FS*4)
 				regno -= 1*4;
 			retval &= get_stack_long(child, regno);
 	}
diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index 4b31ad70c1ac..122623dcc6e1 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -33,7 +33,6 @@
 #include <linux/initrd.h>
 #include <linux/bootmem.h>
 #include <linux/seq_file.h>
-#include <linux/platform_device.h>
 #include <linux/console.h>
 #include <linux/mca.h>
 #include <linux/root_dev.h>
@@ -60,6 +59,7 @@
 #include <asm/io_apic.h>
 #include <asm/ist.h>
 #include <asm/io.h>
+#include <asm/vmi.h>
 #include <setup_arch.h>
 #include <bios_ebda.h>
 
@@ -132,7 +132,7 @@ unsigned long saved_videomode;
 #define RAMDISK_PROMPT_FLAG		0x8000
 #define RAMDISK_LOAD_FLAG		0x4000	
 
-static char command_line[COMMAND_LINE_SIZE];
+static char __initdata command_line[COMMAND_LINE_SIZE];
 
 unsigned char __initdata boot_params[PARAM_SIZE];
 
@@ -576,11 +576,19 @@ void __init setup_arch(char **cmdline_p)
 		print_memory_map("user");
 	}
 
-	strlcpy(command_line, saved_command_line, COMMAND_LINE_SIZE);
+	strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
 	*cmdline_p = command_line;
 
 	max_low_pfn = setup_memory();
 
+#ifdef CONFIG_VMI
+	/*
+	 * Must be after max_low_pfn is determined, and before kernel
+	 * pagetables are setup.
+	 */
+	vmi_init();
+#endif
+
 	/*
 	 * NOTE: before this point _nobody_ is allowed to allocate
 	 * any memory using the bootmem allocator.  Although the
@@ -651,28 +659,3 @@ void __init setup_arch(char **cmdline_p)
 #endif
 	tsc_init();
 }
-
-static __init int add_pcspkr(void)
-{
-	struct platform_device *pd;
-	int ret;
-
-	pd = platform_device_alloc("pcspkr", -1);
-	if (!pd)
-		return -ENOMEM;
-
-	ret = platform_device_add(pd);
-	if (ret)
-		platform_device_put(pd);
-
-	return ret;
-}
-device_initcall(add_pcspkr);
-
-/*
- * Local Variables:
- * mode:c
- * c-file-style:"k&r"
- * c-basic-offset:8
- * End:
- */
diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c
index 65d7620eaa09..4f99e870c986 100644
--- a/arch/i386/kernel/signal.c
+++ b/arch/i386/kernel/signal.c
@@ -21,6 +21,7 @@
 #include <linux/suspend.h>
 #include <linux/ptrace.h>
 #include <linux/elf.h>
+#include <linux/binfmts.h>
 #include <asm/processor.h>
 #include <asm/ucontext.h>
 #include <asm/uaccess.h>
@@ -128,8 +129,8 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax
 			 X86_EFLAGS_TF | X86_EFLAGS_SF | X86_EFLAGS_ZF | \
 			 X86_EFLAGS_AF | X86_EFLAGS_PF | X86_EFLAGS_CF)
 
-	COPY_SEG(gs);
-	GET_SEG(fs);
+	GET_SEG(gs);
+	COPY_SEG(fs);
 	COPY_SEG(es);
 	COPY_SEG(ds);
 	COPY(edi);
@@ -244,9 +245,9 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
 {
 	int tmp, err = 0;
 
-	err |= __put_user(regs->xgs, (unsigned int __user *)&sc->gs);
-	savesegment(fs, tmp);
-	err |= __put_user(tmp, (unsigned int __user *)&sc->fs);
+	err |= __put_user(regs->xfs, (unsigned int __user *)&sc->fs);
+	savesegment(gs, tmp);
+	err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
 
 	err |= __put_user(regs->xes, (unsigned int __user *)&sc->es);
 	err |= __put_user(regs->xds, (unsigned int __user *)&sc->ds);
@@ -349,7 +350,10 @@ static int setup_frame(int sig, struct k_sigaction *ka,
 			goto give_sigsegv;
 	}
 
-	restorer = (void *)VDSO_SYM(&__kernel_sigreturn);
+	if (current->binfmt->hasvdso)
+		restorer = (void *)VDSO_SYM(&__kernel_sigreturn);
+	else
+		restorer = (void *)&frame->retcode;
 	if (ka->sa.sa_flags & SA_RESTORER)
 		restorer = ka->sa.sa_restorer;
 
diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c
index 5285aff8367f..9bd9637ae692 100644
--- a/arch/i386/kernel/smp.c
+++ b/arch/i386/kernel/smp.c
@@ -23,6 +23,7 @@
 
 #include <asm/mtrr.h>
 #include <asm/tlbflush.h>
+#include <asm/idle.h>
 #include <mach_apic.h>
 
 /*
@@ -374,8 +375,7 @@ static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
 	/*
 	 * i'm not happy about this global shared spinlock in the
 	 * MM hot path, but we'll see how contended it is.
-	 * Temporarily this turns IRQs off, so that lockups are
-	 * detected by the NMI watchdog.
+	 * AK: x86-64 has a faster method that could be ported.
 	 */
 	spin_lock(&tlbstate_lock);
 	
@@ -400,7 +400,7 @@ static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
 
 	while (!cpus_empty(flush_cpumask))
 		/* nothing. lockup detection does not belong here */
-		mb();
+		cpu_relax();
 
 	flush_mm = NULL;
 	flush_va = 0;
@@ -624,6 +624,7 @@ fastcall void smp_call_function_interrupt(struct pt_regs *regs)
 	/*
 	 * At this point the info structure may be out of scope unless wait==1
 	 */
+	exit_idle();
 	irq_enter();
 	(*func)(info);
 	irq_exit();
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index 8c6c8c52b95c..48bfcaa13ecc 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -63,6 +63,7 @@
 #include <mach_apic.h>
 #include <mach_wakecpu.h>
 #include <smpboot_hooks.h>
+#include <asm/vmi.h>
 
 /* Set if we find a B stepping CPU */
 static int __devinitdata smp_b_stepping;
@@ -93,12 +94,6 @@ cpumask_t cpu_possible_map;
 EXPORT_SYMBOL(cpu_possible_map);
 static cpumask_t smp_commenced_mask;
 
-/* TSC's upper 32 bits can't be written in eariler CPU (before prescott), there
- * is no way to resync one AP against BP. TBD: for prescott and above, we
- * should use IA64's algorithm
- */
-static int __devinitdata tsc_sync_disabled;
-
 /* Per CPU bogomips and other parameters */
 struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
 EXPORT_SYMBOL(cpu_data);
@@ -215,151 +210,6 @@ valid_k7:
 	;
 }
 
-/*
- * TSC synchronization.
- *
- * We first check whether all CPUs have their TSC's synchronized,
- * then we print a warning if not, and always resync.
- */
-
-static struct {
-	atomic_t start_flag;
-	atomic_t count_start;
-	atomic_t count_stop;
-	unsigned long long values[NR_CPUS];
-} tsc __cpuinitdata = {
-	.start_flag = ATOMIC_INIT(0),
-	.count_start = ATOMIC_INIT(0),
-	.count_stop = ATOMIC_INIT(0),
-};
-
-#define NR_LOOPS 5
-
-static void __init synchronize_tsc_bp(void)
-{
-	int i;
-	unsigned long long t0;
-	unsigned long long sum, avg;
-	long long delta;
-	unsigned int one_usec;
-	int buggy = 0;
-
-	printk(KERN_INFO "checking TSC synchronization across %u CPUs: ", num_booting_cpus());
-
-	/* convert from kcyc/sec to cyc/usec */
-	one_usec = cpu_khz / 1000;
-
-	atomic_set(&tsc.start_flag, 1);
-	wmb();
-
-	/*
-	 * We loop a few times to get a primed instruction cache,
-	 * then the last pass is more or less synchronized and
-	 * the BP and APs set their cycle counters to zero all at
-	 * once. This reduces the chance of having random offsets
-	 * between the processors, and guarantees that the maximum
-	 * delay between the cycle counters is never bigger than
-	 * the latency of information-passing (cachelines) between
-	 * two CPUs.
-	 */
-	for (i = 0; i < NR_LOOPS; i++) {
-		/*
-		 * all APs synchronize but they loop on '== num_cpus'
-		 */
-		while (atomic_read(&tsc.count_start) != num_booting_cpus()-1)
-			cpu_relax();
-		atomic_set(&tsc.count_stop, 0);
-		wmb();
-		/*
-		 * this lets the APs save their current TSC:
-		 */
-		atomic_inc(&tsc.count_start);
-
-		rdtscll(tsc.values[smp_processor_id()]);
-		/*
-		 * We clear the TSC in the last loop:
-		 */
-		if (i == NR_LOOPS-1)
-			write_tsc(0, 0);
-
-		/*
-		 * Wait for all APs to leave the synchronization point:
-		 */
-		while (atomic_read(&tsc.count_stop) != num_booting_cpus()-1)
-			cpu_relax();
-		atomic_set(&tsc.count_start, 0);
-		wmb();
-		atomic_inc(&tsc.count_stop);
-	}
-
-	sum = 0;
-	for (i = 0; i < NR_CPUS; i++) {
-		if (cpu_isset(i, cpu_callout_map)) {
-			t0 = tsc.values[i];
-			sum += t0;
-		}
-	}
-	avg = sum;
-	do_div(avg, num_booting_cpus());
-
-	for (i = 0; i < NR_CPUS; i++) {
-		if (!cpu_isset(i, cpu_callout_map))
-			continue;
-		delta = tsc.values[i] - avg;
-		if (delta < 0)
-			delta = -delta;
-		/*
-		 * We report bigger than 2 microseconds clock differences.
-		 */
-		if (delta > 2*one_usec) {
-			long long realdelta;
-
-			if (!buggy) {
-				buggy = 1;
-				printk("\n");
-			}
-			realdelta = delta;
-			do_div(realdelta, one_usec);
-			if (tsc.values[i] < avg)
-				realdelta = -realdelta;
-
-			if (realdelta)
-				printk(KERN_INFO "CPU#%d had %Ld usecs TSC "
-					"skew, fixed it up.\n", i, realdelta);
-		}
-	}
-	if (!buggy)
-		printk("passed.\n");
-}
-
-static void __cpuinit synchronize_tsc_ap(void)
-{
-	int i;
-
-	/*
-	 * Not every cpu is online at the time
-	 * this gets called, so we first wait for the BP to
-	 * finish SMP initialization:
-	 */
-	while (!atomic_read(&tsc.start_flag))
-		cpu_relax();
-
-	for (i = 0; i < NR_LOOPS; i++) {
-		atomic_inc(&tsc.count_start);
-		while (atomic_read(&tsc.count_start) != num_booting_cpus())
-			cpu_relax();
-
-		rdtscll(tsc.values[smp_processor_id()]);
-		if (i == NR_LOOPS-1)
-			write_tsc(0, 0);
-
-		atomic_inc(&tsc.count_stop);
-		while (atomic_read(&tsc.count_stop) != num_booting_cpus())
-			cpu_relax();
-	}
-}
-#undef NR_LOOPS
-
 extern void calibrate_delay(void);
 
 static atomic_t init_deasserted;
@@ -437,20 +287,12 @@ static void __cpuinit smp_callin(void)
 	/*
 	 * Save our processor parameters
 	 */
- 	smp_store_cpu_info(cpuid);
-
-	disable_APIC_timer();
+	smp_store_cpu_info(cpuid);
 
 	/*
 	 * Allow the master to continue.
 	 */
 	cpu_set(cpuid, cpu_callin_map);
-
-	/*
-	 *      Synchronize the TSC with the BP
-	 */
-	if (cpu_has_tsc && cpu_khz && !tsc_sync_disabled)
-		synchronize_tsc_ap();
 }
 
 static int cpucount;
@@ -545,18 +387,25 @@ static void __cpuinit start_secondary(void *unused)
 	 * booting is too fragile that we want to limit the
 	 * things done here to the most necessary things.
 	 */
+#ifdef CONFIG_VMI
+	vmi_bringup();
+#endif
 	secondary_cpu_init();
 	preempt_disable();
 	smp_callin();
 	while (!cpu_isset(smp_processor_id(), smp_commenced_mask))
 		rep_nop();
-	setup_secondary_APIC_clock();
+	/*
+	 * Check TSC synchronization with the BP:
+	 */
+	check_tsc_sync_target();
+
+	setup_secondary_clock();
 	if (nmi_watchdog == NMI_IO_APIC) {
 		disable_8259A_irq(0);
 		enable_NMI_through_LVT0(NULL);
 		enable_8259A_irq(0);
 	}
-	enable_APIC_timer();
 	/*
 	 * low-memory mappings have been cleared, flush them from
 	 * the local TLBs too.
@@ -619,7 +468,6 @@ extern struct {
 	unsigned short ss;
 } stack_start;
 extern struct i386_pda *start_pda;
-extern struct Xgt_desc_struct cpu_gdt_descr;
 
 #ifdef CONFIG_NUMA
 
@@ -749,7 +597,7 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
 	/*
 	 * Due to the Pentium erratum 3AP.
 	 */
-	maxlvt = get_maxlvt();
+	maxlvt = lapic_get_maxlvt();
 	if (maxlvt > 3) {
 		apic_read_around(APIC_SPIV);
 		apic_write(APIC_ESR, 0);
@@ -835,11 +683,18 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
 		num_starts = 0;
 
 	/*
+	 * Paravirt / VMI wants a startup IPI hook here to set up the
+	 * target processor state.
+	 */
+	startup_ipi_hook(phys_apicid, (unsigned long) start_secondary,
+		         (unsigned long) stack_start.esp);
+
+	/*
 	 * Run STARTUP IPI loop.
 	 */
 	Dprintk("#startup loops: %d.\n", num_starts);
 
-	maxlvt = get_maxlvt();
+	maxlvt = lapic_get_maxlvt();
 
 	for (j = 1; j <= num_starts; j++) {
 		Dprintk("Sending STARTUP #%d.\n",j);
@@ -1115,8 +970,6 @@ static int __cpuinit __smp_prepare_cpu(int cpu)
 	info.cpu = cpu;
 	INIT_WORK(&info.task, do_warm_boot_cpu);
 
-	tsc_sync_disabled = 1;
-
 	/* init low mem mapping */
 	clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
 			min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS));
@@ -1124,7 +977,6 @@ static int __cpuinit __smp_prepare_cpu(int cpu)
 	schedule_work(&info.task);
 	wait_for_completion(&done);
 
-	tsc_sync_disabled = 0;
 	zap_low_mappings();
 	ret = 0;
 exit:
@@ -1320,13 +1172,7 @@ static void __init smp_boot_cpus(unsigned int max_cpus)
 
 	smpboot_setup_io_apic();
 
-	setup_boot_APIC_clock();
-
-	/*
-	 * Synchronize the TSC with the AP
-	 */
-	if (cpu_has_tsc && cpucount && cpu_khz)
-		synchronize_tsc_bp();
+	setup_boot_clock();
 }
 
 /* These are wrappers to interface to the new boot process.  Someone
@@ -1461,9 +1307,16 @@ int __cpuinit __cpu_up(unsigned int cpu)
 	}
 
 	local_irq_enable();
+
 	per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
 	/* Unleash the CPU! */
 	cpu_set(cpu, smp_commenced_mask);
+
+	/*
+	 * Check TSC synchronization with the AP:
+	 */
+	check_tsc_sync_source(cpu);
+
 	while (!cpu_isset(cpu, cpu_online_map))
 		cpu_relax();
 
diff --git a/arch/i386/kernel/srat.c b/arch/i386/kernel/srat.c
index f7e735c077c3..2a8713ec0f9a 100644
--- a/arch/i386/kernel/srat.c
+++ b/arch/i386/kernel/srat.c
@@ -62,19 +62,19 @@ extern void * boot_ioremap(unsigned long, unsigned long);
 /* Identify CPU proximity domains */
 static void __init parse_cpu_affinity_structure(char *p)
 {
-	struct acpi_table_processor_affinity *cpu_affinity = 
-				(struct acpi_table_processor_affinity *) p;
+	struct acpi_srat_cpu_affinity *cpu_affinity =
+				(struct acpi_srat_cpu_affinity *) p;
 
-	if (!cpu_affinity->flags.enabled)
+	if ((cpu_affinity->flags & ACPI_SRAT_CPU_ENABLED) == 0)
 		return;		/* empty entry */
 
 	/* mark this node as "seen" in node bitmap */
-	BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain);
+	BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo);
 
-	apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain;
+	apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo;
 
 	printk("CPU 0x%02X in proximity domain 0x%02X\n",
-		cpu_affinity->apic_id, cpu_affinity->proximity_domain);
+		cpu_affinity->apic_id, cpu_affinity->proximity_domain_lo);
 }
 
 /*
@@ -84,28 +84,27 @@ static void __init parse_cpu_affinity_structure(char *p)
 static void __init parse_memory_affinity_structure (char *sratp)
 {
 	unsigned long long paddr, size;
-	unsigned long start_pfn, end_pfn; 
+	unsigned long start_pfn, end_pfn;
 	u8 pxm;
 	struct node_memory_chunk_s *p, *q, *pend;
-	struct acpi_table_memory_affinity *memory_affinity =
-			(struct acpi_table_memory_affinity *) sratp;
+	struct acpi_srat_mem_affinity *memory_affinity =
+			(struct acpi_srat_mem_affinity *) sratp;
 
-	if (!memory_affinity->flags.enabled)
+	if ((memory_affinity->flags & ACPI_SRAT_MEM_ENABLED) == 0)
 		return;		/* empty entry */
 
+	pxm = memory_affinity->proximity_domain & 0xff;
+
 	/* mark this node as "seen" in node bitmap */
-	BMAP_SET(pxm_bitmap, memory_affinity->proximity_domain);
+	BMAP_SET(pxm_bitmap, pxm);
 
 	/* calculate info for memory chunk structure */
-	paddr = memory_affinity->base_addr_hi;
-	paddr = (paddr << 32) | memory_affinity->base_addr_lo;
-	size = memory_affinity->length_hi;
-	size = (size << 32) | memory_affinity->length_lo;
-	
+	paddr = memory_affinity->base_address;
+	size = memory_affinity->length;
+
 	start_pfn = paddr >> PAGE_SHIFT;
 	end_pfn = (paddr + size) >> PAGE_SHIFT;
-	
-	pxm = memory_affinity->proximity_domain;
+
 
 	if (num_memory_chunks >= MAXCHUNKS) {
 		printk("Too many mem chunks in SRAT. Ignoring %lld MBytes at %llx\n",
@@ -132,8 +131,8 @@ static void __init parse_memory_affinity_structure (char *sratp)
 	printk("Memory range 0x%lX to 0x%lX (type 0x%X) in proximity domain 0x%02X %s\n",
 		start_pfn, end_pfn,
 		memory_affinity->memory_type,
-		memory_affinity->proximity_domain,
-		(memory_affinity->flags.hot_pluggable ?
+		pxm,
+		((memory_affinity->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ?
 		 "enabled and removable" : "enabled" ) );
 }
 
@@ -185,10 +184,10 @@ static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
 	num_memory_chunks = 0;
 	while (p < end) {
 		switch (*p) {
-		case ACPI_SRAT_PROCESSOR_AFFINITY:
+		case ACPI_SRAT_TYPE_CPU_AFFINITY:
 			parse_cpu_affinity_structure(p);
 			break;
-		case ACPI_SRAT_MEMORY_AFFINITY:
+		case ACPI_SRAT_TYPE_MEMORY_AFFINITY:
 			parse_memory_affinity_structure(p);
 			break;
 		default:
@@ -262,31 +261,30 @@ out_fail:
 	return 0;
 }
 
+struct acpi_static_rsdt {
+	struct acpi_table_rsdt table;
+	u32 padding[7]; /* Allow for 7 more table entries */
+};
+
 int __init get_memcfg_from_srat(void)
 {
 	struct acpi_table_header *header = NULL;
 	struct acpi_table_rsdp *rsdp = NULL;
 	struct acpi_table_rsdt *rsdt = NULL;
-	struct acpi_pointer *rsdp_address = NULL;
-	struct acpi_table_rsdt saved_rsdt;
+	acpi_native_uint rsdp_address = 0;
+	struct acpi_static_rsdt saved_rsdt;
 	int tables = 0;
 	int i = 0;
 
-	if (ACPI_FAILURE(acpi_find_root_pointer(ACPI_PHYSICAL_ADDRESSING,
-						rsdp_address))) {
+	rsdp_address = acpi_find_rsdp();
+	if (!rsdp_address) {
 		printk("%s: System description tables not found\n",
 		       __FUNCTION__);
 		goto out_err;
 	}
 
-	if (rsdp_address->pointer_type == ACPI_PHYSICAL_POINTER) {
-		printk("%s: assigning address to rsdp\n", __FUNCTION__);
-		rsdp = (struct acpi_table_rsdp *)
-				(u32)rsdp_address->pointer.physical;
-	} else {
-		printk("%s: rsdp_address is not a physical pointer\n", __FUNCTION__);
-		goto out_err;
-	}
+	printk("%s: assigning address to rsdp\n", __FUNCTION__);
+	rsdp = (struct acpi_table_rsdp *)(u32)rsdp_address;
 	if (!rsdp) {
 		printk("%s: Didn't find ACPI root!\n", __FUNCTION__);
 		goto out_err;
@@ -295,13 +293,13 @@ int __init get_memcfg_from_srat(void)
 	printk(KERN_INFO "%.8s v%d [%.6s]\n", rsdp->signature, rsdp->revision,
 		rsdp->oem_id);
 
-	if (strncmp(rsdp->signature, RSDP_SIG,strlen(RSDP_SIG))) {
+	if (strncmp(rsdp->signature, ACPI_SIG_RSDP,strlen(ACPI_SIG_RSDP))) {
 		printk(KERN_WARNING "%s: RSDP table signature incorrect\n", __FUNCTION__);
 		goto out_err;
 	}
 
 	rsdt = (struct acpi_table_rsdt *)
-	    boot_ioremap(rsdp->rsdt_address, sizeof(struct acpi_table_rsdt));
+	    boot_ioremap(rsdp->rsdt_physical_address, sizeof(struct acpi_table_rsdt));
 
 	if (!rsdt) {
 		printk(KERN_WARNING
@@ -310,9 +308,9 @@ int __init get_memcfg_from_srat(void)
 		goto out_err;
 	}
 
-	header = & rsdt->header;
+	header = &rsdt->header;
 
-	if (strncmp(header->signature, RSDT_SIG, strlen(RSDT_SIG))) {
+	if (strncmp(header->signature, ACPI_SIG_RSDT, strlen(ACPI_SIG_RSDT))) {
 		printk(KERN_WARNING "ACPI: RSDT signature incorrect\n");
 		goto out_err;
 	}
@@ -330,9 +328,9 @@ int __init get_memcfg_from_srat(void)
 
 	memcpy(&saved_rsdt, rsdt, sizeof(saved_rsdt));
 
-	if (saved_rsdt.header.length > sizeof(saved_rsdt)) {
+	if (saved_rsdt.table.header.length > sizeof(saved_rsdt)) {
 		printk(KERN_WARNING "ACPI: Too big length in RSDT: %d\n",
-		       saved_rsdt.header.length);
+		       saved_rsdt.table.header.length);
 		goto out_err;
 	}
 
@@ -341,15 +339,15 @@ int __init get_memcfg_from_srat(void)
 	for (i = 0; i < tables; i++) {
 		/* Map in header, then map in full table length. */
 		header = (struct acpi_table_header *)
-			boot_ioremap(saved_rsdt.entry[i], sizeof(struct acpi_table_header));
+			boot_ioremap(saved_rsdt.table.table_offset_entry[i], sizeof(struct acpi_table_header));
 		if (!header)
 			break;
 		header = (struct acpi_table_header *)
-			boot_ioremap(saved_rsdt.entry[i], header->length);
+			boot_ioremap(saved_rsdt.table.table_offset_entry[i], header->length);
 		if (!header)
 			break;
 
-		if (strncmp((char *) &header->signature, "SRAT", 4))
+		if (strncmp((char *) &header->signature, ACPI_SIG_SRAT, 4))
 			continue;
 
 		/* we've found the srat table. don't need to look at any more tables */
diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c
index 5da744204d10..13ca54a85a1c 100644
--- a/arch/i386/kernel/sysenter.c
+++ b/arch/i386/kernel/sysenter.c
@@ -70,14 +70,15 @@ void enable_sep_cpu(void)
  */
 extern const char vsyscall_int80_start, vsyscall_int80_end;
 extern const char vsyscall_sysenter_start, vsyscall_sysenter_end;
-static void *syscall_page;
+static struct page *syscall_pages[1];
 
 int __init sysenter_setup(void)
 {
-	syscall_page = (void *)get_zeroed_page(GFP_ATOMIC);
+	void *syscall_page = (void *)get_zeroed_page(GFP_ATOMIC);
+	syscall_pages[0] = virt_to_page(syscall_page);
 
 #ifdef CONFIG_COMPAT_VDSO
-	__set_fixmap(FIX_VDSO, __pa(syscall_page), PAGE_READONLY);
+	__set_fixmap(FIX_VDSO, __pa(syscall_page), PAGE_READONLY_EXEC);
 	printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO));
 #endif
 
@@ -96,31 +97,12 @@ int __init sysenter_setup(void)
 }
 
 #ifndef CONFIG_COMPAT_VDSO
-static struct page *syscall_nopage(struct vm_area_struct *vma,
-				unsigned long adr, int *type)
-{
-	struct page *p = virt_to_page(adr - vma->vm_start + syscall_page);
-	get_page(p);
-	return p;
-}
-
-/* Prevent VMA merging */
-static void syscall_vma_close(struct vm_area_struct *vma)
-{
-}
-
-static struct vm_operations_struct syscall_vm_ops = {
-	.close = syscall_vma_close,
-	.nopage = syscall_nopage,
-};
-
 /* Defined in vsyscall-sysenter.S */
 extern void SYSENTER_RETURN;
 
 /* Setup a VMA at program startup for the vsyscall page */
 int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
 {
-	struct vm_area_struct *vma;
 	struct mm_struct *mm = current->mm;
 	unsigned long addr;
 	int ret;
@@ -132,38 +114,25 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
 		goto up_fail;
 	}
 
-	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
-	if (!vma) {
-		ret = -ENOMEM;
-		goto up_fail;
-	}
-
-	vma->vm_start = addr;
-	vma->vm_end = addr + PAGE_SIZE;
-	/* MAYWRITE to allow gdb to COW and set breakpoints */
-	vma->vm_flags = VM_READ|VM_EXEC|VM_MAYREAD|VM_MAYEXEC|VM_MAYWRITE;
 	/*
+	 * MAYWRITE to allow gdb to COW and set breakpoints
+	 *
 	 * Make sure the vDSO gets into every core dump.
 	 * Dumping its contents makes post-mortem fully interpretable later
 	 * without matching up the same kernel and hardware config to see
 	 * what PC values meant.
 	 */
-	vma->vm_flags |= VM_ALWAYSDUMP;
-	vma->vm_flags |= mm->def_flags;
-	vma->vm_page_prot = protection_map[vma->vm_flags & 7];
-	vma->vm_ops = &syscall_vm_ops;
-	vma->vm_mm = mm;
-
-	ret = insert_vm_struct(mm, vma);
-	if (unlikely(ret)) {
-		kmem_cache_free(vm_area_cachep, vma);
+	ret = install_special_mapping(mm, addr, PAGE_SIZE,
+				      VM_READ|VM_EXEC|
+				      VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
+				      VM_ALWAYSDUMP,
+				      syscall_pages);
+	if (ret)
 		goto up_fail;
-	}
 
 	current->mm->context.vdso = (void *)addr;
 	current_thread_info()->sysenter_return =
 				    (void *)VDSO_SYM(&SYSENTER_RETURN);
-	mm->total_vm++;
 up_fail:
 	up_write(&mm->mmap_sem);
 	return ret;
diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c
index c505b16c0990..a5350059557a 100644
--- a/arch/i386/kernel/time.c
+++ b/arch/i386/kernel/time.c
@@ -131,15 +131,13 @@ unsigned long profile_pc(struct pt_regs *regs)
 	unsigned long pc = instruction_pointer(regs);
 
 #ifdef CONFIG_SMP
-	if (!user_mode_vm(regs) && in_lock_functions(pc)) {
+	if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->xcs) &&
+	    in_lock_functions(pc)) {
 #ifdef CONFIG_FRAME_POINTER
 		return *(unsigned long *)(regs->ebp + 4);
 #else
-		unsigned long *sp;
-		if ((regs->xcs & 3) == 0)
-			sp = (unsigned long *)&regs->esp;
-		else
-			sp = (unsigned long *)regs->esp;
+		unsigned long *sp = (unsigned long *)&regs->esp;
+
 		/* Return address is either directly at stack pointer
 		   or above a saved eflags. Eflags has bits 22-31 zero,
 		   kernel addresses don't. */
@@ -161,15 +159,6 @@ EXPORT_SYMBOL(profile_pc);
  */
 irqreturn_t timer_interrupt(int irq, void *dev_id)
 {
-	/*
-	 * Here we are in the timer irq handler. We just have irqs locally
-	 * disabled but we don't know if the timer_bh is running on the other
-	 * CPU. We need to avoid to SMP race with it. NOTE: we don' t need
-	 * the irq version of write_lock because as just said we have irq
-	 * locally disabled. -arca
-	 */
-	write_seqlock(&xtime_lock);
-
 #ifdef CONFIG_X86_IO_APIC
 	if (timer_ack) {
 		/*
@@ -188,7 +177,6 @@ irqreturn_t timer_interrupt(int irq, void *dev_id)
 
 	do_timer_interrupt_hook();
 
-
 	if (MCA_bus) {
 		/* The PS/2 uses level-triggered interrupts.  You can't
 		turn them off, nor would you want to (any attempt to
@@ -203,18 +191,11 @@ irqreturn_t timer_interrupt(int irq, void *dev_id)
 		outb_p( irq_v|0x80, 0x61 );	/* reset the IRQ */
 	}
 
-	write_sequnlock(&xtime_lock);
-
-#ifdef CONFIG_X86_LOCAL_APIC
-	if (using_apic_timer)
-		smp_send_timer_broadcast_ipi();
-#endif
-
 	return IRQ_HANDLED;
 }
 
 /* not static: needed by APM */
-unsigned long get_cmos_time(void)
+unsigned long read_persistent_clock(void)
 {
 	unsigned long retval;
 	unsigned long flags;
@@ -227,11 +208,11 @@ unsigned long get_cmos_time(void)
 
 	return retval;
 }
-EXPORT_SYMBOL(get_cmos_time);
 
 static void sync_cmos_clock(unsigned long dummy);
 
 static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0);
+int no_sync_cmos_clock;
 
 static void sync_cmos_clock(unsigned long dummy)
 {
@@ -275,117 +256,20 @@ static void sync_cmos_clock(unsigned long dummy)
 
 void notify_arch_cmos_timer(void)
 {
-	mod_timer(&sync_cmos_timer, jiffies + 1);
-}
-
-static long clock_cmos_diff;
-static unsigned long sleep_start;
-
-static int timer_suspend(struct sys_device *dev, pm_message_t state)
-{
-	/*
-	 * Estimate time zone so that set_time can update the clock
-	 */
-	unsigned long ctime =  get_cmos_time();
-
-	clock_cmos_diff = -ctime;
-	clock_cmos_diff += get_seconds();
-	sleep_start = ctime;
-	return 0;
-}
-
-static int timer_resume(struct sys_device *dev)
-{
-	unsigned long flags;
-	unsigned long sec;
-	unsigned long ctime = get_cmos_time();
-	long sleep_length = (ctime - sleep_start) * HZ;
-	struct timespec ts;
-
-	if (sleep_length < 0) {
-		printk(KERN_WARNING "CMOS clock skew detected in timer resume!\n");
-		/* The time after the resume must not be earlier than the time
-		 * before the suspend or some nasty things will happen
-		 */
-		sleep_length = 0;
-		ctime = sleep_start;
-	}
-#ifdef CONFIG_HPET_TIMER
-	if (is_hpet_enabled())
-		hpet_reenable();
-#endif
-	setup_pit_timer();
-
-	sec = ctime + clock_cmos_diff;
-	ts.tv_sec = sec;
-	ts.tv_nsec = 0;
-	do_settimeofday(&ts);
-	write_seqlock_irqsave(&xtime_lock, flags);
-	jiffies_64 += sleep_length;
-	write_sequnlock_irqrestore(&xtime_lock, flags);
-	touch_softlockup_watchdog();
-	return 0;
-}
-
-static struct sysdev_class timer_sysclass = {
-	.resume = timer_resume,
-	.suspend = timer_suspend,
-	set_kset_name("timer"),
-};
-
-
-/* XXX this driverfs stuff should probably go elsewhere later -john */
-static struct sys_device device_timer = {
-	.id	= 0,
-	.cls	= &timer_sysclass,
-};
-
-static int time_init_device(void)
-{
-	int error = sysdev_class_register(&timer_sysclass);
-	if (!error)
-		error = sysdev_register(&device_timer);
-	return error;
+	if (!no_sync_cmos_clock)
+		mod_timer(&sync_cmos_timer, jiffies + 1);
 }
 
-device_initcall(time_init_device);
-
-#ifdef CONFIG_HPET_TIMER
 extern void (*late_time_init)(void);
 /* Duplicate of time_init() below, with hpet_enable part added */
 static void __init hpet_time_init(void)
 {
-	struct timespec ts;
-	ts.tv_sec = get_cmos_time();
-	ts.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
-
-	do_settimeofday(&ts);
-
-	if ((hpet_enable() >= 0) && hpet_use_timer) {
-		printk("Using HPET for base-timer\n");
-	}
-
+	if (!hpet_enable())
+		setup_pit_timer();
 	do_time_init();
 }
-#endif
 
 void __init time_init(void)
 {
-	struct timespec ts;
-#ifdef CONFIG_HPET_TIMER
-	if (is_hpet_capable()) {
-		/*
-		 * HPET initialization needs to do memory-mapped io. So, let
-		 * us do a late initialization after mem_init().
-		 */
-		late_time_init = hpet_time_init;
-		return;
-	}
-#endif
-	ts.tv_sec = get_cmos_time();
-	ts.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
-
-	do_settimeofday(&ts);
-
-	do_time_init();
+	late_time_init = hpet_time_init;
 }
diff --git a/arch/i386/kernel/time_hpet.c b/arch/i386/kernel/time_hpet.c
deleted file mode 100644
index 1e4702dfcd01..000000000000
--- a/arch/i386/kernel/time_hpet.c
+++ /dev/null
@@ -1,497 +0,0 @@
-/*
- *  linux/arch/i386/kernel/time_hpet.c
- *  This code largely copied from arch/x86_64/kernel/time.c
- *  See that file for credits.
- *
- *  2003-06-30    Venkatesh Pallipadi - Additional changes for HPET support
- */
-
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/param.h>
-#include <linux/string.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-
-#include <asm/timer.h>
-#include <asm/fixmap.h>
-#include <asm/apic.h>
-
-#include <linux/timex.h>
-
-#include <asm/hpet.h>
-#include <linux/hpet.h>
-
-static unsigned long hpet_period;	/* fsecs / HPET clock */
-unsigned long hpet_tick;		/* hpet clks count per tick */
-unsigned long hpet_address;		/* hpet memory map physical address */
-int hpet_use_timer;
-
-static int use_hpet; 		/* can be used for runtime check of hpet */
-static int boot_hpet_disable; 	/* boottime override for HPET timer */
-static void __iomem * hpet_virt_address;	/* hpet kernel virtual address */
-
-#define FSEC_TO_USEC (1000000000UL)
-
-int hpet_readl(unsigned long a)
-{
-	return readl(hpet_virt_address + a);
-}
-
-static void hpet_writel(unsigned long d, unsigned long a)
-{
-	writel(d, hpet_virt_address + a);
-}
-
-#ifdef CONFIG_X86_LOCAL_APIC
-/*
- * HPET counters dont wrap around on every tick. They just change the
- * comparator value and continue. Next tick can be caught by checking
- * for a change in the comparator value. Used in apic.c.
- */
-static void __devinit wait_hpet_tick(void)
-{
-	unsigned int start_cmp_val, end_cmp_val;
-
-	start_cmp_val = hpet_readl(HPET_T0_CMP);
-	do {
-		end_cmp_val = hpet_readl(HPET_T0_CMP);
-	} while (start_cmp_val == end_cmp_val);
-}
-#endif
-
-static int hpet_timer_stop_set_go(unsigned long tick)
-{
-	unsigned int cfg;
-
-	/*
-	 * Stop the timers and reset the main counter.
-	 */
-	cfg = hpet_readl(HPET_CFG);
-	cfg &= ~HPET_CFG_ENABLE;
-	hpet_writel(cfg, HPET_CFG);
-	hpet_writel(0, HPET_COUNTER);
-	hpet_writel(0, HPET_COUNTER + 4);
-
-	if (hpet_use_timer) {
-		/*
-		 * Set up timer 0, as periodic with first interrupt to happen at
-		 * hpet_tick, and period also hpet_tick.
-		 */
-		cfg = hpet_readl(HPET_T0_CFG);
-		cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC |
-		       HPET_TN_SETVAL | HPET_TN_32BIT;
-		hpet_writel(cfg, HPET_T0_CFG);
-
-		/*
-		 * The first write after writing TN_SETVAL to the config register sets
-		 * the counter value, the second write sets the threshold.
-		 */
-		hpet_writel(tick, HPET_T0_CMP);
-		hpet_writel(tick, HPET_T0_CMP);
-	}
-	/*
- 	 * Go!
- 	 */
-	cfg = hpet_readl(HPET_CFG);
-	if (hpet_use_timer)
-		cfg |= HPET_CFG_LEGACY;
-	cfg |= HPET_CFG_ENABLE;
-	hpet_writel(cfg, HPET_CFG);
-
-	return 0;
-}
-
-/*
- * Check whether HPET was found by ACPI boot parse. If yes setup HPET
- * counter 0 for kernel base timer.
- */
-int __init hpet_enable(void)
-{
-	unsigned int id;
-	unsigned long tick_fsec_low, tick_fsec_high; /* tick in femto sec */
-	unsigned long hpet_tick_rem;
-
-	if (boot_hpet_disable)
-		return -1;
-
-	if (!hpet_address) {
-		return -1;
-	}
-	hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
-	/*
-	 * Read the period, compute tick and quotient.
-	 */
-	id = hpet_readl(HPET_ID);
-
-	/*
-	 * We are checking for value '1' or more in number field if
-	 * CONFIG_HPET_EMULATE_RTC is set because we will need an
-	 * additional timer for RTC emulation.
-	 * However, we can do with one timer otherwise using the
-	 * the single HPET timer for system time.
-	 */
-#ifdef CONFIG_HPET_EMULATE_RTC
-	if (!(id & HPET_ID_NUMBER)) {
-		iounmap(hpet_virt_address);
-		hpet_virt_address = NULL;
-		return -1;
-	}
-#endif
-
-
-	hpet_period = hpet_readl(HPET_PERIOD);
-	if ((hpet_period < HPET_MIN_PERIOD) || (hpet_period > HPET_MAX_PERIOD)) {
-		iounmap(hpet_virt_address);
-		hpet_virt_address = NULL;
-		return -1;
-	}
-
-	/*
-	 * 64 bit math
-	 * First changing tick into fsec
-	 * Then 64 bit div to find number of hpet clk per tick
-	 */
-	ASM_MUL64_REG(tick_fsec_low, tick_fsec_high,
-			KERNEL_TICK_USEC, FSEC_TO_USEC);
-	ASM_DIV64_REG(hpet_tick, hpet_tick_rem,
-			hpet_period, tick_fsec_low, tick_fsec_high);
-
-	if (hpet_tick_rem > (hpet_period >> 1))
-		hpet_tick++; /* rounding the result */
-
-	hpet_use_timer = id & HPET_ID_LEGSUP;
-
-	if (hpet_timer_stop_set_go(hpet_tick)) {
-		iounmap(hpet_virt_address);
-		hpet_virt_address = NULL;
-		return -1;
-	}
-
-	use_hpet = 1;
-
-#ifdef	CONFIG_HPET
-	{
-		struct hpet_data	hd;
-		unsigned int 		ntimer;
-
-		memset(&hd, 0, sizeof (hd));
-
-		ntimer = hpet_readl(HPET_ID);
-		ntimer = (ntimer & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT;
-		ntimer++;
-
-		/*
-		 * Register with driver.
-		 * Timer0 and Timer1 is used by platform.
-		 */
-		hd.hd_phys_address = hpet_address;
-		hd.hd_address = hpet_virt_address;
-		hd.hd_nirqs = ntimer;
-		hd.hd_flags = HPET_DATA_PLATFORM;
-		hpet_reserve_timer(&hd, 0);
-#ifdef	CONFIG_HPET_EMULATE_RTC
-		hpet_reserve_timer(&hd, 1);
-#endif
-		hd.hd_irq[0] = HPET_LEGACY_8254;
-		hd.hd_irq[1] = HPET_LEGACY_RTC;
-		if (ntimer > 2) {
-			struct hpet __iomem	*hpet;
-			struct hpet_timer __iomem *timer;
-			int			i;
-
-			hpet = hpet_virt_address;
-
-			for (i = 2, timer = &hpet->hpet_timers[2]; i < ntimer;
-				timer++, i++)
-				hd.hd_irq[i] = (timer->hpet_config &
-					Tn_INT_ROUTE_CNF_MASK) >>
-					Tn_INT_ROUTE_CNF_SHIFT;
-
-		}
-
-		hpet_alloc(&hd);
-	}
-#endif
-
-#ifdef CONFIG_X86_LOCAL_APIC
-	if (hpet_use_timer)
-		wait_timer_tick = wait_hpet_tick;
-#endif
-	return 0;
-}
-
-int hpet_reenable(void)
-{
-	return hpet_timer_stop_set_go(hpet_tick);
-}
-
-int is_hpet_enabled(void)
-{
-	return use_hpet;
-}
-
-int is_hpet_capable(void)
-{
-	if (!boot_hpet_disable && hpet_address)
-		return 1;
-	return 0;
-}
-
-static int __init hpet_setup(char* str)
-{
-	if (str) {
-		if (!strncmp("disable", str, 7))
-			boot_hpet_disable = 1;
-	}
-	return 1;
-}
-
-__setup("hpet=", hpet_setup);
-
-#ifdef CONFIG_HPET_EMULATE_RTC
-/* HPET in LegacyReplacement Mode eats up RTC interrupt line. When, HPET
- * is enabled, we support RTC interrupt functionality in software.
- * RTC has 3 kinds of interrupts:
- * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock
- *    is updated
- * 2) Alarm Interrupt - generate an interrupt at a specific time of day
- * 3) Periodic Interrupt - generate periodic interrupt, with frequencies
- *    2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2)
- * (1) and (2) above are implemented using polling at a frequency of
- * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt
- * overhead. (DEFAULT_RTC_INT_FREQ)
- * For (3), we use interrupts at 64Hz or user specified periodic
- * frequency, whichever is higher.
- */
-#include <linux/mc146818rtc.h>
-#include <linux/rtc.h>
-
-#define DEFAULT_RTC_INT_FREQ 	64
-#define RTC_NUM_INTS 		1
-
-static unsigned long UIE_on;
-static unsigned long prev_update_sec;
-
-static unsigned long AIE_on;
-static struct rtc_time alarm_time;
-
-static unsigned long PIE_on;
-static unsigned long PIE_freq = DEFAULT_RTC_INT_FREQ;
-static unsigned long PIE_count;
-
-static unsigned long hpet_rtc_int_freq; /* RTC interrupt frequency */
-static unsigned int hpet_t1_cmp; /* cached comparator register */
-
-/*
- * Timer 1 for RTC, we do not use periodic interrupt feature,
- * even if HPET supports periodic interrupts on Timer 1.
- * The reason being, to set up a periodic interrupt in HPET, we need to
- * stop the main counter. And if we do that everytime someone diables/enables
- * RTC, we will have adverse effect on main kernel timer running on Timer 0.
- * So, for the time being, simulate the periodic interrupt in software.
- *
- * hpet_rtc_timer_init() is called for the first time and during subsequent
- * interuppts reinit happens through hpet_rtc_timer_reinit().
- */
-int hpet_rtc_timer_init(void)
-{
-	unsigned int cfg, cnt;
-	unsigned long flags;
-
-	if (!is_hpet_enabled())
-		return 0;
-	/*
-	 * Set the counter 1 and enable the interrupts.
-	 */
-	if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ))
-		hpet_rtc_int_freq = PIE_freq;
-	else
-		hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
-
-	local_irq_save(flags);
-
-	cnt = hpet_readl(HPET_COUNTER);
-	cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq);
-	hpet_writel(cnt, HPET_T1_CMP);
-	hpet_t1_cmp = cnt;
-
-	cfg = hpet_readl(HPET_T1_CFG);
-	cfg &= ~HPET_TN_PERIODIC;
-	cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
-	hpet_writel(cfg, HPET_T1_CFG);
-
-	local_irq_restore(flags);
-
-	return 1;
-}
-
-static void hpet_rtc_timer_reinit(void)
-{
-	unsigned int cfg, cnt, ticks_per_int, lost_ints;
-
-	if (unlikely(!(PIE_on | AIE_on | UIE_on))) {
-		cfg = hpet_readl(HPET_T1_CFG);
-		cfg &= ~HPET_TN_ENABLE;
-		hpet_writel(cfg, HPET_T1_CFG);
-		return;
-	}
-
-	if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ))
-		hpet_rtc_int_freq = PIE_freq;
-	else
-		hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
-
-	/* It is more accurate to use the comparator value than current count.*/
-	ticks_per_int = hpet_tick * HZ / hpet_rtc_int_freq;
-	hpet_t1_cmp += ticks_per_int;
-	hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
-
-	/*
-	 * If the interrupt handler was delayed too long, the write above tries
-	 * to schedule the next interrupt in the past and the hardware would
-	 * not interrupt until the counter had wrapped around.
-	 * So we have to check that the comparator wasn't set to a past time.
-	 */
-	cnt = hpet_readl(HPET_COUNTER);
-	if (unlikely((int)(cnt - hpet_t1_cmp) > 0)) {
-		lost_ints = (cnt - hpet_t1_cmp) / ticks_per_int + 1;
-		/* Make sure that, even with the time needed to execute
-		 * this code, the next scheduled interrupt has been moved
-		 * back to the future: */
-		lost_ints++;
-
-		hpet_t1_cmp += lost_ints * ticks_per_int;
-		hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
-
-		if (PIE_on)
-			PIE_count += lost_ints;
-
-		printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n",
-		       hpet_rtc_int_freq);
-	}
-}
-
-/*
- * The functions below are called from rtc driver.
- * Return 0 if HPET is not being used.
- * Otherwise do the necessary changes and return 1.
- */
-int hpet_mask_rtc_irq_bit(unsigned long bit_mask)
-{
-	if (!is_hpet_enabled())
-		return 0;
-
-	if (bit_mask & RTC_UIE)
-		UIE_on = 0;
-	if (bit_mask & RTC_PIE)
-		PIE_on = 0;
-	if (bit_mask & RTC_AIE)
-		AIE_on = 0;
-
-	return 1;
-}
-
-int hpet_set_rtc_irq_bit(unsigned long bit_mask)
-{
-	int timer_init_reqd = 0;
-
-	if (!is_hpet_enabled())
-		return 0;
-
-	if (!(PIE_on | AIE_on | UIE_on))
-		timer_init_reqd = 1;
-
-	if (bit_mask & RTC_UIE) {
-		UIE_on = 1;
-	}
-	if (bit_mask & RTC_PIE) {
-		PIE_on = 1;
-		PIE_count = 0;
-	}
-	if (bit_mask & RTC_AIE) {
-		AIE_on = 1;
-	}
-
-	if (timer_init_reqd)
-		hpet_rtc_timer_init();
-
-	return 1;
-}
-
-int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec)
-{
-	if (!is_hpet_enabled())
-		return 0;
-
-	alarm_time.tm_hour = hrs;
-	alarm_time.tm_min = min;
-	alarm_time.tm_sec = sec;
-
-	return 1;
-}
-
-int hpet_set_periodic_freq(unsigned long freq)
-{
-	if (!is_hpet_enabled())
-		return 0;
-
-	PIE_freq = freq;
-	PIE_count = 0;
-
-	return 1;
-}
-
-int hpet_rtc_dropped_irq(void)
-{
-	if (!is_hpet_enabled())
-		return 0;
-
-	return 1;
-}
-
-irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
-{
-	struct rtc_time curr_time;
-	unsigned long rtc_int_flag = 0;
-	int call_rtc_interrupt = 0;
-
-	hpet_rtc_timer_reinit();
-
-	if (UIE_on | AIE_on) {
-		rtc_get_rtc_time(&curr_time);
-	}
-	if (UIE_on) {
-		if (curr_time.tm_sec != prev_update_sec) {
-			/* Set update int info, call real rtc int routine */
-			call_rtc_interrupt = 1;
-			rtc_int_flag = RTC_UF;
-			prev_update_sec = curr_time.tm_sec;
-		}
-	}
-	if (PIE_on) {
-		PIE_count++;
-		if (PIE_count >= hpet_rtc_int_freq/PIE_freq) {
-			/* Set periodic int info, call real rtc int routine */
-			call_rtc_interrupt = 1;
-			rtc_int_flag |= RTC_PF;
-			PIE_count = 0;
-		}
-	}
-	if (AIE_on) {
-		if ((curr_time.tm_sec == alarm_time.tm_sec) &&
-		    (curr_time.tm_min == alarm_time.tm_min) &&
-		    (curr_time.tm_hour == alarm_time.tm_hour)) {
-			/* Set alarm int info, call real rtc int routine */
-			call_rtc_interrupt = 1;
-			rtc_int_flag |= RTC_AF;
-		}
-	}
-	if (call_rtc_interrupt) {
-		rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8));
-		rtc_interrupt(rtc_int_flag, dev_id);
-	}
-	return IRQ_HANDLED;
-}
-#endif
-
diff --git a/arch/i386/kernel/topology.c b/arch/i386/kernel/topology.c
index 79cf608e14ca..45782356a618 100644
--- a/arch/i386/kernel/topology.c
+++ b/arch/i386/kernel/topology.c
@@ -1,5 +1,5 @@
 /*
- * arch/i386/kernel/topology.c - Populate driverfs with topology information
+ * arch/i386/kernel/topology.c - Populate sysfs with topology information
  *
  * Written by: Matthew Dobson, IBM Corporation
  * Original Code: Paul Dorwin, IBM Corporation, Patrick Mochel, OSDL
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 0efad8aeb41a..af0d3f70a817 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -94,6 +94,7 @@ asmlinkage void spurious_interrupt_bug(void);
 asmlinkage void machine_check(void);
 
 int kstack_depth_to_print = 24;
+static unsigned int code_bytes = 64;
 ATOMIC_NOTIFIER_HEAD(i386die_chain);
 
 int register_die_notifier(struct notifier_block *nb)
@@ -291,10 +292,11 @@ void show_registers(struct pt_regs *regs)
 	int i;
 	int in_kernel = 1;
 	unsigned long esp;
-	unsigned short ss;
+	unsigned short ss, gs;
 
 	esp = (unsigned long) (&regs->esp);
 	savesegment(ss, ss);
+	savesegment(gs, gs);
 	if (user_mode_vm(regs)) {
 		in_kernel = 0;
 		esp = regs->esp;
@@ -313,8 +315,8 @@ void show_registers(struct pt_regs *regs)
 		regs->eax, regs->ebx, regs->ecx, regs->edx);
 	printk(KERN_EMERG "esi: %08lx   edi: %08lx   ebp: %08lx   esp: %08lx\n",
 		regs->esi, regs->edi, regs->ebp, esp);
-	printk(KERN_EMERG "ds: %04x   es: %04x   ss: %04x\n",
-		regs->xds & 0xffff, regs->xes & 0xffff, ss);
+	printk(KERN_EMERG "ds: %04x   es: %04x   fs: %04x  gs: %04x  ss: %04x\n",
+	       regs->xds & 0xffff, regs->xes & 0xffff, regs->xfs & 0xffff, gs, ss);
 	printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)",
 		TASK_COMM_LEN, current->comm, current->pid,
 		current_thread_info(), current, current->thread_info);
@@ -324,7 +326,8 @@ void show_registers(struct pt_regs *regs)
 	 */
 	if (in_kernel) {
 		u8 *eip;
-		int code_bytes = 64;
+		unsigned int code_prologue = code_bytes * 43 / 64;
+		unsigned int code_len = code_bytes;
 		unsigned char c;
 
 		printk("\n" KERN_EMERG "Stack: ");
@@ -332,14 +335,14 @@ void show_registers(struct pt_regs *regs)
 
 		printk(KERN_EMERG "Code: ");
 
-		eip = (u8 *)regs->eip - 43;
+		eip = (u8 *)regs->eip - code_prologue;
 		if (eip < (u8 *)PAGE_OFFSET ||
 			probe_kernel_address(eip, c)) {
 			/* try starting at EIP */
 			eip = (u8 *)regs->eip;
-			code_bytes = 32;
+			code_len = code_len - code_prologue + 1;
 		}
-		for (i = 0; i < code_bytes; i++, eip++) {
+		for (i = 0; i < code_len; i++, eip++) {
 			if (eip < (u8 *)PAGE_OFFSET ||
 				probe_kernel_address(eip, c)) {
 				printk(" Bad EIP value.");
@@ -1191,3 +1194,13 @@ static int __init kstack_setup(char *s)
 	return 1;
 }
 __setup("kstack=", kstack_setup);
+
+static int __init code_bytes_setup(char *s)
+{
+	code_bytes = simple_strtoul(s, NULL, 0);
+	if (code_bytes > 8192)
+		code_bytes = 8192;
+
+	return 1;
+}
+__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c
index 2cfc7b09b925..3082a418635c 100644
--- a/arch/i386/kernel/tsc.c
+++ b/arch/i386/kernel/tsc.c
@@ -23,6 +23,7 @@
  * an extra value to store the TSC freq
  */
 unsigned int tsc_khz;
+unsigned long long (*custom_sched_clock)(void);
 
 int tsc_disable;
 
@@ -59,12 +60,6 @@ static inline int check_tsc_unstable(void)
 	return tsc_unstable;
 }
 
-void mark_tsc_unstable(void)
-{
-	tsc_unstable = 1;
-}
-EXPORT_SYMBOL_GPL(mark_tsc_unstable);
-
 /* Accellerators for sched_clock()
  * convert from cycles(64bits) => nanoseconds (64bits)
  *  basic equation:
@@ -107,14 +102,14 @@ unsigned long long sched_clock(void)
 {
 	unsigned long long this_offset;
 
+	if (unlikely(custom_sched_clock))
+		return (*custom_sched_clock)();
+
 	/*
-	 * in the NUMA case we dont use the TSC as they are not
-	 * synchronized across all CPUs.
+	 * Fall back to jiffies if there's no TSC available:
 	 */
-#ifndef CONFIG_NUMA
-	if (!cpu_khz || check_tsc_unstable())
-#endif
-		/* no locking but a rare wrong value is not a big deal */
+	if (unlikely(tsc_disable))
+		/* No locking but a rare wrong value is not a big deal: */
 		return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
 
 	/* read the Time Stamp Counter: */
@@ -194,13 +189,13 @@ EXPORT_SYMBOL(recalibrate_cpu_khz);
 void __init tsc_init(void)
 {
 	if (!cpu_has_tsc || tsc_disable)
-		return;
+		goto out_no_tsc;
 
 	cpu_khz = calculate_cpu_khz();
 	tsc_khz = cpu_khz;
 
 	if (!cpu_khz)
-		return;
+		goto out_no_tsc;
 
 	printk("Detected %lu.%03lu MHz processor.\n",
 				(unsigned long)cpu_khz / 1000,
@@ -208,37 +203,18 @@ void __init tsc_init(void)
 
 	set_cyc2ns_scale(cpu_khz);
 	use_tsc_delay();
-}
+	return;
 
-#ifdef CONFIG_CPU_FREQ
-
-static unsigned int cpufreq_delayed_issched = 0;
-static unsigned int cpufreq_init = 0;
-static struct work_struct cpufreq_delayed_get_work;
-
-static void handle_cpufreq_delayed_get(struct work_struct *work)
-{
-	unsigned int cpu;
-
-	for_each_online_cpu(cpu)
-		cpufreq_get(cpu);
-
-	cpufreq_delayed_issched = 0;
+out_no_tsc:
+	/*
+	 * Set the tsc_disable flag if there's no TSC support, this
+	 * makes it a fast flag for the kernel to see whether it
+	 * should be using the TSC.
+	 */
+	tsc_disable = 1;
 }
 
-/*
- * if we notice cpufreq oddness, schedule a call to cpufreq_get() as it tries
- * to verify the CPU frequency the timing core thinks the CPU is running
- * at is still correct.
- */
-static inline void cpufreq_delayed_get(void)
-{
-	if (cpufreq_init && !cpufreq_delayed_issched) {
-		cpufreq_delayed_issched = 1;
-		printk(KERN_DEBUG "Checking if CPU frequency changed.\n");
-		schedule_work(&cpufreq_delayed_get_work);
-	}
-}
+#ifdef CONFIG_CPU_FREQ
 
 /*
  * if the CPU frequency is scaled, TSC-based delays will need a different
@@ -303,17 +279,9 @@ static struct notifier_block time_cpufreq_notifier_block = {
 
 static int __init cpufreq_tsc(void)
 {
-	int ret;
-
-	INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get);
-	ret = cpufreq_register_notifier(&time_cpufreq_notifier_block,
-					CPUFREQ_TRANSITION_NOTIFIER);
-	if (!ret)
-		cpufreq_init = 1;
-
-	return ret;
+	return cpufreq_register_notifier(&time_cpufreq_notifier_block,
+					 CPUFREQ_TRANSITION_NOTIFIER);
 }
-
 core_initcall(cpufreq_tsc);
 
 #endif
@@ -321,7 +289,6 @@ core_initcall(cpufreq_tsc);
 /* clock source code */
 
 static unsigned long current_tsc_khz = 0;
-static int tsc_update_callback(void);
 
 static cycle_t read_tsc(void)
 {
@@ -339,37 +306,28 @@ static struct clocksource clocksource_tsc = {
 	.mask			= CLOCKSOURCE_MASK(64),
 	.mult			= 0, /* to be set */
 	.shift			= 22,
-	.update_callback	= tsc_update_callback,
-	.is_continuous		= 1,
+	.flags			= CLOCK_SOURCE_IS_CONTINUOUS |
+				  CLOCK_SOURCE_MUST_VERIFY,
 };
 
-static int tsc_update_callback(void)
+void mark_tsc_unstable(void)
 {
-	int change = 0;
-
-	/* check to see if we should switch to the safe clocksource: */
-	if (clocksource_tsc.rating != 0 && check_tsc_unstable()) {
-		clocksource_tsc.rating = 0;
-		clocksource_reselect();
-		change = 1;
-	}
-
-	/* only update if tsc_khz has changed: */
-	if (current_tsc_khz != tsc_khz) {
-		current_tsc_khz = tsc_khz;
-		clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz,
-							clocksource_tsc.shift);
-		change = 1;
+	if (!tsc_unstable) {
+		tsc_unstable = 1;
+		/* Can be called before registration */
+		if (clocksource_tsc.mult)
+			clocksource_change_rating(&clocksource_tsc, 0);
+		else
+			clocksource_tsc.rating = 0;
 	}
-
-	return change;
 }
+EXPORT_SYMBOL_GPL(mark_tsc_unstable);
 
 static int __init dmi_mark_tsc_unstable(struct dmi_system_id *d)
 {
 	printk(KERN_NOTICE "%s detected: marking TSC unstable.\n",
 		       d->ident);
-	mark_tsc_unstable();
+	tsc_unstable = 1;
 	return 0;
 }
 
@@ -386,65 +344,44 @@ static struct dmi_system_id __initdata bad_tsc_dmi_table[] = {
 	 {}
 };
 
-#define TSC_FREQ_CHECK_INTERVAL (10*MSEC_PER_SEC) /* 10sec in MS */
-static struct timer_list verify_tsc_freq_timer;
-
-/* XXX - Probably should add locking */
-static void verify_tsc_freq(unsigned long unused)
-{
-	static u64 last_tsc;
-	static unsigned long last_jiffies;
-
-	u64 now_tsc, interval_tsc;
-	unsigned long now_jiffies, interval_jiffies;
-
-
-	if (check_tsc_unstable())
-		return;
-
-	rdtscll(now_tsc);
-	now_jiffies = jiffies;
-
-	if (!last_jiffies) {
-		goto out;
-	}
-
-	interval_jiffies = now_jiffies - last_jiffies;
-	interval_tsc = now_tsc - last_tsc;
-	interval_tsc *= HZ;
-	do_div(interval_tsc, cpu_khz*1000);
-
-	if (interval_tsc < (interval_jiffies * 3 / 4)) {
-		printk("TSC appears to be running slowly. "
-			"Marking it as unstable\n");
-		mark_tsc_unstable();
-		return;
-	}
-
-out:
-	last_tsc = now_tsc;
-	last_jiffies = now_jiffies;
-	/* set us up to go off on the next interval: */
-	mod_timer(&verify_tsc_freq_timer,
-		jiffies + msecs_to_jiffies(TSC_FREQ_CHECK_INTERVAL));
-}
-
 /*
  * Make an educated guess if the TSC is trustworthy and synchronized
  * over all CPUs.
  */
-static __init int unsynchronized_tsc(void)
+__cpuinit int unsynchronized_tsc(void)
 {
+	if (!cpu_has_tsc || tsc_unstable)
+		return 1;
 	/*
 	 * Intel systems are normally all synchronized.
 	 * Exceptions must mark TSC as unstable:
 	 */
-	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
- 		return 0;
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
+		/* assume multi socket systems are not synchronized: */
+		if (num_possible_cpus() > 1)
+			tsc_unstable = 1;
+	}
+	return tsc_unstable;
+}
+
+/*
+ * Geode_LX - the OLPC CPU has a possibly a very reliable TSC
+ */
+#ifdef CONFIG_MGEODE_LX
+/* RTSC counts during suspend */
+#define RTSC_SUSP 0x100
+
+static void __init check_geode_tsc_reliable(void)
+{
+	unsigned long val;
 
-	/* assume multi socket systems are not synchronized: */
- 	return num_possible_cpus() > 1;
+	rdmsrl(MSR_GEODE_BUSCONT_CONF0, val);
+	if ((val & RTSC_SUSP))
+		clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
 }
+#else
+static inline void check_geode_tsc_reliable(void) { }
+#endif
 
 static int __init init_tsc_clocksource(void)
 {
@@ -453,20 +390,16 @@ static int __init init_tsc_clocksource(void)
 		/* check blacklist */
 		dmi_check_system(bad_tsc_dmi_table);
 
-		if (unsynchronized_tsc()) /* mark unstable if unsynced */
-			mark_tsc_unstable();
+		unsynchronized_tsc();
+		check_geode_tsc_reliable();
 		current_tsc_khz = tsc_khz;
 		clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz,
 							clocksource_tsc.shift);
 		/* lower the rating if we already know its unstable: */
-		if (check_tsc_unstable())
+		if (check_tsc_unstable()) {
 			clocksource_tsc.rating = 0;
-
-		init_timer(&verify_tsc_freq_timer);
-		verify_tsc_freq_timer.function = verify_tsc_freq;
-		verify_tsc_freq_timer.expires =
-			jiffies + msecs_to_jiffies(TSC_FREQ_CHECK_INTERVAL);
-		add_timer(&verify_tsc_freq_timer);
+			clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
+		}
 
 		return clocksource_register(&clocksource_tsc);
 	}
diff --git a/arch/i386/kernel/tsc_sync.c b/arch/i386/kernel/tsc_sync.c
new file mode 100644
index 000000000000..12424629af87
--- /dev/null
+++ b/arch/i386/kernel/tsc_sync.c
@@ -0,0 +1 @@
+#include "../../x86_64/kernel/tsc_sync.c"
diff --git a/arch/i386/kernel/vm86.c b/arch/i386/kernel/vm86.c
index be2f96e67f78..d1b8f2b7aea6 100644
--- a/arch/i386/kernel/vm86.c
+++ b/arch/i386/kernel/vm86.c
@@ -96,12 +96,12 @@ static int copy_vm86_regs_to_user(struct vm86_regs __user *user,
 {
 	int ret = 0;
 
-	/* kernel_vm86_regs is missing xfs, so copy everything up to
-	   (but not including) xgs, and then rest after xgs. */
-	ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.xgs));
-	ret += copy_to_user(&user->__null_gs, &regs->pt.xgs,
+	/* kernel_vm86_regs is missing xgs, so copy everything up to
+	   (but not including) orig_eax, and then rest including orig_eax. */
+	ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.orig_eax));
+	ret += copy_to_user(&user->orig_eax, &regs->pt.orig_eax,
 			    sizeof(struct kernel_vm86_regs) -
-			    offsetof(struct kernel_vm86_regs, pt.xgs));
+			    offsetof(struct kernel_vm86_regs, pt.orig_eax));
 
 	return ret;
 }
@@ -113,12 +113,13 @@ static int copy_vm86_regs_from_user(struct kernel_vm86_regs *regs,
 {
 	int ret = 0;
 
-	ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.xgs));
-	ret += copy_from_user(&regs->pt.xgs, &user->__null_gs,
+	/* copy eax-xfs inclusive */
+	ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.orig_eax));
+	/* copy orig_eax-__gsh+extra */
+	ret += copy_from_user(&regs->pt.orig_eax, &user->orig_eax,
 			      sizeof(struct kernel_vm86_regs) -
-			      offsetof(struct kernel_vm86_regs, pt.xgs) +
+			      offsetof(struct kernel_vm86_regs, pt.orig_eax) +
 			      extra);
-
 	return ret;
 }
 
@@ -157,8 +158,8 @@ struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
 
 	ret = KVM86->regs32;
 
-	loadsegment(fs, current->thread.saved_fs);
-	ret->xgs = current->thread.saved_gs;
+	ret->xfs = current->thread.saved_fs;
+	loadsegment(gs, current->thread.saved_gs);
 
 	return ret;
 }
@@ -285,9 +286,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
  */
 	info->regs.pt.xds = 0;
 	info->regs.pt.xes = 0;
-	info->regs.pt.xgs = 0;
+	info->regs.pt.xfs = 0;
 
-/* we are clearing fs later just before "jmp resume_userspace",
+/* we are clearing gs later just before "jmp resume_userspace",
  * because it is not saved/restored.
  */
 
@@ -321,8 +322,8 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
  */
 	info->regs32->eax = 0;
 	tsk->thread.saved_esp0 = tsk->thread.esp0;
-	savesegment(fs, tsk->thread.saved_fs);
-	tsk->thread.saved_gs = info->regs32->xgs;
+	tsk->thread.saved_fs = info->regs32->xfs;
+	savesegment(gs, tsk->thread.saved_gs);
 
 	tss = &per_cpu(init_tss, get_cpu());
 	tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0;
@@ -342,7 +343,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
 	__asm__ __volatile__(
 		"movl %0,%%esp\n\t"
 		"movl %1,%%ebp\n\t"
-		"mov  %2, %%fs\n\t"
+		"mov  %2, %%gs\n\t"
 		"jmp resume_userspace"
 		: /* no outputs */
 		:"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0));
diff --git a/arch/i386/kernel/vmi.c b/arch/i386/kernel/vmi.c
new file mode 100644
index 000000000000..bb5a7abf949c
--- /dev/null
+++ b/arch/i386/kernel/vmi.c
@@ -0,0 +1,949 @@
+/*
+ * VMI specific paravirt-ops implementation
+ *
+ * Copyright (C) 2005, VMware, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to zach@vmware.com
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/license.h>
+#include <linux/cpu.h>
+#include <linux/bootmem.h>
+#include <linux/mm.h>
+#include <asm/vmi.h>
+#include <asm/io.h>
+#include <asm/fixmap.h>
+#include <asm/apicdef.h>
+#include <asm/apic.h>
+#include <asm/processor.h>
+#include <asm/timer.h>
+#include <asm/vmi_time.h>
+
+/* Convenient for calling VMI functions indirectly in the ROM */
+typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void);
+typedef u64 __attribute__((regparm(2))) (VROMLONGFUNC)(int);
+
+#define call_vrom_func(rom,func) \
+   (((VROMFUNC *)(rom->func))())
+
+#define call_vrom_long_func(rom,func,arg) \
+   (((VROMLONGFUNC *)(rom->func)) (arg))
+
+static struct vrom_header *vmi_rom;
+static int license_gplok;
+static int disable_nodelay;
+static int disable_pge;
+static int disable_pse;
+static int disable_sep;
+static int disable_tsc;
+static int disable_mtrr;
+
+/* Cached VMI operations */
+struct {
+	void (*cpuid)(void /* non-c */);
+	void (*_set_ldt)(u32 selector);
+	void (*set_tr)(u32 selector);
+	void (*set_kernel_stack)(u32 selector, u32 esp0);
+	void (*allocate_page)(u32, u32, u32, u32, u32);
+	void (*release_page)(u32, u32);
+	void (*set_pte)(pte_t, pte_t *, unsigned);
+	void (*update_pte)(pte_t *, unsigned);
+	void (*set_linear_mapping)(int, u32, u32, u32);
+	void (*flush_tlb)(int);
+	void (*set_initial_ap_state)(int, int);
+	void (*halt)(void);
+} vmi_ops;
+
+/* XXX move this to alternative.h */
+extern struct paravirt_patch __start_parainstructions[],
+	__stop_parainstructions[];
+
+/*
+ * VMI patching routines.
+ */
+#define MNEM_CALL 0xe8
+#define MNEM_JMP  0xe9
+#define MNEM_RET  0xc3
+
+static char irq_save_disable_callout[] = {
+	MNEM_CALL, 0, 0, 0, 0,
+	MNEM_CALL, 0, 0, 0, 0,
+	MNEM_RET
+};
+#define IRQ_PATCH_INT_MASK 0
+#define IRQ_PATCH_DISABLE  5
+
+static inline void patch_offset(unsigned char *eip, unsigned char *dest)
+{
+        *(unsigned long *)(eip+1) = dest-eip-5;
+}
+
+static unsigned patch_internal(int call, unsigned len, void *insns)
+{
+	u64 reloc;
+	struct vmi_relocation_info *const rel = (struct vmi_relocation_info *)&reloc;
+	reloc = call_vrom_long_func(vmi_rom, get_reloc,	call);
+	switch(rel->type) {
+		case VMI_RELOCATION_CALL_REL:
+			BUG_ON(len < 5);
+			*(char *)insns = MNEM_CALL;
+			patch_offset(insns, rel->eip);
+			return 5;
+
+		case VMI_RELOCATION_JUMP_REL:
+			BUG_ON(len < 5);
+			*(char *)insns = MNEM_JMP;
+			patch_offset(insns, rel->eip);
+			return 5;
+
+		case VMI_RELOCATION_NOP:
+			/* obliterate the whole thing */
+			return 0;
+
+		case VMI_RELOCATION_NONE:
+			/* leave native code in place */
+			break;
+
+		default:
+			BUG();
+	}
+	return len;
+}
+
+/*
+ * Apply patch if appropriate, return length of new instruction
+ * sequence.  The callee does nop padding for us.
+ */
+static unsigned vmi_patch(u8 type, u16 clobbers, void *insns, unsigned len)
+{
+	switch (type) {
+		case PARAVIRT_IRQ_DISABLE:
+			return patch_internal(VMI_CALL_DisableInterrupts, len, insns);
+		case PARAVIRT_IRQ_ENABLE:
+			return patch_internal(VMI_CALL_EnableInterrupts, len, insns);
+		case PARAVIRT_RESTORE_FLAGS:
+			return patch_internal(VMI_CALL_SetInterruptMask, len, insns);
+		case PARAVIRT_SAVE_FLAGS:
+			return patch_internal(VMI_CALL_GetInterruptMask, len, insns);
+        	case PARAVIRT_SAVE_FLAGS_IRQ_DISABLE:
+			if (len >= 10) {
+				patch_internal(VMI_CALL_GetInterruptMask, len, insns);
+				patch_internal(VMI_CALL_DisableInterrupts, len-5, insns+5);
+				return 10;
+			} else {
+				/*
+				 * You bastards didn't leave enough room to
+				 * patch save_flags_irq_disable inline.  Patch
+				 * to a helper
+				 */
+				BUG_ON(len < 5);
+				*(char *)insns = MNEM_CALL;
+				patch_offset(insns, irq_save_disable_callout);
+				return 5;
+			}
+		case PARAVIRT_INTERRUPT_RETURN:
+			return patch_internal(VMI_CALL_IRET, len, insns);
+		case PARAVIRT_STI_SYSEXIT:
+			return patch_internal(VMI_CALL_SYSEXIT, len, insns);
+		default:
+			break;
+	}
+	return len;
+}
+
+/* CPUID has non-C semantics, and paravirt-ops API doesn't match hardware ISA */
+static void vmi_cpuid(unsigned int *eax, unsigned int *ebx,
+                               unsigned int *ecx, unsigned int *edx)
+{
+	int override = 0;
+	if (*eax == 1)
+		override = 1;
+        asm volatile ("call *%6"
+                      : "=a" (*eax),
+                        "=b" (*ebx),
+                        "=c" (*ecx),
+                        "=d" (*edx)
+                      : "0" (*eax), "2" (*ecx), "r" (vmi_ops.cpuid));
+	if (override) {
+		if (disable_pse)
+			*edx &= ~X86_FEATURE_PSE;
+		if (disable_pge)
+			*edx &= ~X86_FEATURE_PGE;
+		if (disable_sep)
+			*edx &= ~X86_FEATURE_SEP;
+		if (disable_tsc)
+			*edx &= ~X86_FEATURE_TSC;
+		if (disable_mtrr)
+			*edx &= ~X86_FEATURE_MTRR;
+	}
+}
+
+static inline void vmi_maybe_load_tls(struct desc_struct *gdt, int nr, struct desc_struct *new)
+{
+	if (gdt[nr].a != new->a || gdt[nr].b != new->b)
+		write_gdt_entry(gdt, nr, new->a, new->b);
+}
+
+static void vmi_load_tls(struct thread_struct *t, unsigned int cpu)
+{
+	struct desc_struct *gdt = get_cpu_gdt_table(cpu);
+	vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 0, &t->tls_array[0]);
+	vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 1, &t->tls_array[1]);
+	vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 2, &t->tls_array[2]);
+}
+
+static void vmi_set_ldt(const void *addr, unsigned entries)
+{
+	unsigned cpu = smp_processor_id();
+	u32 low, high;
+
+	pack_descriptor(&low, &high, (unsigned long)addr,
+			entries * sizeof(struct desc_struct) - 1,
+			DESCTYPE_LDT, 0);
+	write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, low, high);
+	vmi_ops._set_ldt(entries ? GDT_ENTRY_LDT*sizeof(struct desc_struct) : 0);
+}
+
+static void vmi_set_tr(void)
+{
+	vmi_ops.set_tr(GDT_ENTRY_TSS*sizeof(struct desc_struct));
+}
+
+static void vmi_load_esp0(struct tss_struct *tss,
+				   struct thread_struct *thread)
+{
+	tss->esp0 = thread->esp0;
+
+	/* This can only happen when SEP is enabled, no need to test "SEP"arately */
+	if (unlikely(tss->ss1 != thread->sysenter_cs)) {
+		tss->ss1 = thread->sysenter_cs;
+		wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
+	}
+	vmi_ops.set_kernel_stack(__KERNEL_DS, tss->esp0);
+}
+
+static void vmi_flush_tlb_user(void)
+{
+	vmi_ops.flush_tlb(VMI_FLUSH_TLB);
+}
+
+static void vmi_flush_tlb_kernel(void)
+{
+	vmi_ops.flush_tlb(VMI_FLUSH_TLB | VMI_FLUSH_GLOBAL);
+}
+
+/* Stub to do nothing at all; used for delays and unimplemented calls */
+static void vmi_nop(void)
+{
+}
+
+/* For NO_IDLE_HZ, we stop the clock when halting the kernel */
+#ifdef CONFIG_NO_IDLE_HZ
+static fastcall void vmi_safe_halt(void)
+{
+	int idle = vmi_stop_hz_timer();
+	vmi_ops.halt();
+	if (idle) {
+		local_irq_disable();
+		vmi_account_time_restart_hz_timer();
+		local_irq_enable();
+	}
+}
+#endif
+
+#ifdef CONFIG_DEBUG_PAGE_TYPE
+
+#ifdef CONFIG_X86_PAE
+#define MAX_BOOT_PTS (2048+4+1)
+#else
+#define MAX_BOOT_PTS (1024+1)
+#endif
+
+/*
+ * During boot, mem_map is not yet available in paging_init, so stash
+ * all the boot page allocations here.
+ */
+static struct {
+	u32 pfn;
+	int type;
+} boot_page_allocations[MAX_BOOT_PTS];
+static int num_boot_page_allocations;
+static int boot_allocations_applied;
+
+void vmi_apply_boot_page_allocations(void)
+{
+	int i;
+	BUG_ON(!mem_map);
+	for (i = 0; i < num_boot_page_allocations; i++) {
+		struct page *page = pfn_to_page(boot_page_allocations[i].pfn);
+		page->type = boot_page_allocations[i].type;
+		page->type = boot_page_allocations[i].type &
+				~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
+	}
+	boot_allocations_applied = 1;
+}
+
+static void record_page_type(u32 pfn, int type)
+{
+	BUG_ON(num_boot_page_allocations >= MAX_BOOT_PTS);
+	boot_page_allocations[num_boot_page_allocations].pfn = pfn;
+	boot_page_allocations[num_boot_page_allocations].type = type;
+	num_boot_page_allocations++;
+}
+
+static void check_zeroed_page(u32 pfn, int type, struct page *page)
+{
+	u32 *ptr;
+	int i;
+	int limit = PAGE_SIZE / sizeof(int);
+
+	if (page_address(page))
+		ptr = (u32 *)page_address(page);
+	else
+		ptr = (u32 *)__va(pfn << PAGE_SHIFT);
+	/*
+	 * When cloning the root in non-PAE mode, only the userspace
+	 * pdes need to be zeroed.
+	 */
+	if (type & VMI_PAGE_CLONE)
+		limit = USER_PTRS_PER_PGD;
+	for (i = 0; i < limit; i++)
+		BUG_ON(ptr[i]);
+}
+
+/*
+ * We stash the page type into struct page so we can verify the page
+ * types are used properly.
+ */
+static void vmi_set_page_type(u32 pfn, int type)
+{
+	/* PAE can have multiple roots per page - don't track */
+	if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP))
+		return;
+
+	if (boot_allocations_applied) {
+		struct page *page = pfn_to_page(pfn);
+		if (type != VMI_PAGE_NORMAL)
+			BUG_ON(page->type);
+		else
+			BUG_ON(page->type == VMI_PAGE_NORMAL);
+		page->type = type & ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
+		if (type & VMI_PAGE_ZEROED)
+			check_zeroed_page(pfn, type, page);
+	} else {
+		record_page_type(pfn, type);
+	}
+}
+
+static void vmi_check_page_type(u32 pfn, int type)
+{
+	/* PAE can have multiple roots per page - skip checks */
+	if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP))
+		return;
+
+	type &= ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
+	if (boot_allocations_applied) {
+		struct page *page = pfn_to_page(pfn);
+		BUG_ON((page->type ^ type) & VMI_PAGE_PAE);
+		BUG_ON(type == VMI_PAGE_NORMAL && page->type);
+		BUG_ON((type & page->type) == 0);
+	}
+}
+#else
+#define vmi_set_page_type(p,t) do { } while (0)
+#define vmi_check_page_type(p,t) do { } while (0)
+#endif
+
+static void vmi_allocate_pt(u32 pfn)
+{
+	vmi_set_page_type(pfn, VMI_PAGE_L1);
+	vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
+}
+
+static void vmi_allocate_pd(u32 pfn)
+{
+ 	/*
+	 * This call comes in very early, before mem_map is setup.
+	 * It is called only for swapper_pg_dir, which already has
+	 * data on it.
+	 */
+ 	vmi_set_page_type(pfn, VMI_PAGE_L2);
+	vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
+}
+
+static void vmi_allocate_pd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count)
+{
+ 	vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE);
+	vmi_check_page_type(clonepfn, VMI_PAGE_L2);
+	vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
+}
+
+static void vmi_release_pt(u32 pfn)
+{
+	vmi_ops.release_page(pfn, VMI_PAGE_L1);
+	vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
+}
+
+static void vmi_release_pd(u32 pfn)
+{
+	vmi_ops.release_page(pfn, VMI_PAGE_L2);
+	vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
+}
+
+/*
+ * Helper macros for MMU update flags.  We can defer updates until a flush
+ * or page invalidation only if the update is to the current address space
+ * (otherwise, there is no flush).  We must check against init_mm, since
+ * this could be a kernel update, which usually passes init_mm, although
+ * sometimes this check can be skipped if we know the particular function
+ * is only called on user mode PTEs.  We could change the kernel to pass
+ * current->active_mm here, but in particular, I was unsure if changing
+ * mm/highmem.c to do this would still be correct on other architectures.
+ */
+#define is_current_as(mm, mustbeuser) ((mm) == current->active_mm ||    \
+                                       (!mustbeuser && (mm) == &init_mm))
+#define vmi_flags_addr(mm, addr, level, user)                           \
+        ((level) | (is_current_as(mm, user) ?                           \
+                (VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
+#define vmi_flags_addr_defer(mm, addr, level, user)                     \
+        ((level) | (is_current_as(mm, user) ?                           \
+                (VMI_PAGE_DEFER | VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
+
+static void vmi_update_pte(struct mm_struct *mm, u32 addr, pte_t *ptep)
+{
+	vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
+	vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
+}
+
+static void vmi_update_pte_defer(struct mm_struct *mm, u32 addr, pte_t *ptep)
+{
+	vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
+	vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0));
+}
+
+static void vmi_set_pte(pte_t *ptep, pte_t pte)
+{
+	/* XXX because of set_pmd_pte, this can be called on PT or PD layers */
+	vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE | VMI_PAGE_PD);
+	vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT);
+}
+
+static void vmi_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte)
+{
+	vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
+	vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
+}
+
+static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval)
+{
+#ifdef CONFIG_X86_PAE
+	const pte_t pte = { pmdval.pmd, pmdval.pmd >> 32 };
+	vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PMD);
+#else
+	const pte_t pte = { pmdval.pud.pgd.pgd };
+	vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PGD);
+#endif
+	vmi_ops.set_pte(pte, (pte_t *)pmdp, VMI_PAGE_PD);
+}
+
+#ifdef CONFIG_X86_PAE
+
+static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval)
+{
+	/*
+	 * XXX This is called from set_pmd_pte, but at both PT
+	 * and PD layers so the VMI_PAGE_PT flag is wrong.  But
+	 * it is only called for large page mapping changes,
+	 * the Xen backend, doesn't support large pages, and the
+	 * ESX backend doesn't depend on the flag.
+	 */
+	set_64bit((unsigned long long *)ptep,pte_val(pteval));
+	vmi_ops.update_pte(ptep, VMI_PAGE_PT);
+}
+
+static void vmi_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
+{
+	vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
+	vmi_ops.set_pte(pte, ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 1));
+}
+
+static void vmi_set_pud(pud_t *pudp, pud_t pudval)
+{
+	/* Um, eww */
+	const pte_t pte = { pudval.pgd.pgd, pudval.pgd.pgd >> 32 };
+	vmi_check_page_type(__pa(pudp) >> PAGE_SHIFT, VMI_PAGE_PGD);
+	vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP);
+}
+
+static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+	const pte_t pte = { 0 };
+	vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
+	vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
+}
+
+void vmi_pmd_clear(pmd_t *pmd)
+{
+	const pte_t pte = { 0 };
+	vmi_check_page_type(__pa(pmd) >> PAGE_SHIFT, VMI_PAGE_PMD);
+	vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD);
+}
+#endif
+
+#ifdef CONFIG_SMP
+struct vmi_ap_state ap;
+extern void setup_pda(void);
+
+static void __init /* XXX cpu hotplug */
+vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
+		     unsigned long start_esp)
+{
+	/* Default everything to zero.  This is fine for most GPRs. */
+	memset(&ap, 0, sizeof(struct vmi_ap_state));
+
+	ap.gdtr_limit = GDT_SIZE - 1;
+	ap.gdtr_base = (unsigned long) get_cpu_gdt_table(phys_apicid);
+
+	ap.idtr_limit = IDT_ENTRIES * 8 - 1;
+	ap.idtr_base = (unsigned long) idt_table;
+
+	ap.ldtr = 0;
+
+	ap.cs = __KERNEL_CS;
+	ap.eip = (unsigned long) start_eip;
+	ap.ss = __KERNEL_DS;
+	ap.esp = (unsigned long) start_esp;
+
+	ap.ds = __USER_DS;
+	ap.es = __USER_DS;
+	ap.fs = __KERNEL_PDA;
+	ap.gs = 0;
+
+	ap.eflags = 0;
+
+	setup_pda();
+
+#ifdef CONFIG_X86_PAE
+	/* efer should match BSP efer. */
+	if (cpu_has_nx) {
+		unsigned l, h;
+		rdmsr(MSR_EFER, l, h);
+		ap.efer = (unsigned long long) h << 32 | l;
+	}
+#endif
+
+	ap.cr3 = __pa(swapper_pg_dir);
+	/* Protected mode, paging, AM, WP, NE, MP. */
+	ap.cr0 = 0x80050023;
+	ap.cr4 = mmu_cr4_features;
+	vmi_ops.set_initial_ap_state(__pa(&ap), phys_apicid);
+}
+#endif
+
+static inline int __init check_vmi_rom(struct vrom_header *rom)
+{
+	struct pci_header *pci;
+	struct pnp_header *pnp;
+	const char *manufacturer = "UNKNOWN";
+	const char *product = "UNKNOWN";
+	const char *license = "unspecified";
+
+	if (rom->rom_signature != 0xaa55)
+		return 0;
+	if (rom->vrom_signature != VMI_SIGNATURE)
+		return 0;
+	if (rom->api_version_maj != VMI_API_REV_MAJOR ||
+	    rom->api_version_min+1 < VMI_API_REV_MINOR+1) {
+		printk(KERN_WARNING "VMI: Found mismatched rom version %d.%d\n",
+				rom->api_version_maj,
+				rom->api_version_min);
+		return 0;
+	}
+
+	/*
+	 * Relying on the VMI_SIGNATURE field is not 100% safe, so check
+	 * the PCI header and device type to make sure this is really a
+	 * VMI device.
+	 */
+	if (!rom->pci_header_offs) {
+		printk(KERN_WARNING "VMI: ROM does not contain PCI header.\n");
+		return 0;
+	}
+
+	pci = (struct pci_header *)((char *)rom+rom->pci_header_offs);
+	if (pci->vendorID != PCI_VENDOR_ID_VMWARE ||
+	    pci->deviceID != PCI_DEVICE_ID_VMWARE_VMI) {
+		/* Allow it to run... anyways, but warn */
+		printk(KERN_WARNING "VMI: ROM from unknown manufacturer\n");
+	}
+
+	if (rom->pnp_header_offs) {
+		pnp = (struct pnp_header *)((char *)rom+rom->pnp_header_offs);
+		if (pnp->manufacturer_offset)
+			manufacturer = (const char *)rom+pnp->manufacturer_offset;
+		if (pnp->product_offset)
+			product = (const char *)rom+pnp->product_offset;
+	}
+
+	if (rom->license_offs)
+		license = (char *)rom+rom->license_offs;
+
+	printk(KERN_INFO "VMI: Found %s %s, API version %d.%d, ROM version %d.%d\n",
+		manufacturer, product,
+		rom->api_version_maj, rom->api_version_min,
+		pci->rom_version_maj, pci->rom_version_min);
+
+        license_gplok = license_is_gpl_compatible(license);
+        if (!license_gplok) {
+                printk(KERN_WARNING "VMI: ROM license '%s' taints kernel... "
+		       "inlining disabled\n",
+                       license);
+                add_taint(TAINT_PROPRIETARY_MODULE);
+        }
+	return 1;
+}
+
+/*
+ * Probe for the VMI option ROM
+ */
+static inline int __init probe_vmi_rom(void)
+{
+	unsigned long base;
+
+	/* VMI ROM is in option ROM area, check signature */
+	for (base = 0xC0000; base < 0xE0000; base += 2048) {
+		struct vrom_header *romstart;
+		romstart = (struct vrom_header *)isa_bus_to_virt(base);
+		if (check_vmi_rom(romstart)) {
+			vmi_rom = romstart;
+			return 1;
+		}
+	}
+	return 0;
+}
+
+/*
+ * VMI setup common to all processors
+ */
+void vmi_bringup(void)
+{
+ 	/* We must establish the lowmem mapping for MMU ops to work */
+	if (vmi_rom)
+		vmi_ops.set_linear_mapping(0, __PAGE_OFFSET, max_low_pfn, 0);
+}
+
+/*
+ * Return a pointer to the VMI function or a NOP stub
+ */
+static void *vmi_get_function(int vmicall)
+{
+	u64 reloc;
+	const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
+	reloc = call_vrom_long_func(vmi_rom, get_reloc,	vmicall);
+	BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL);
+	if (rel->type == VMI_RELOCATION_CALL_REL)
+		return (void *)rel->eip;
+	else
+		return (void *)vmi_nop;
+}
+
+/*
+ * Helper macro for making the VMI paravirt-ops fill code readable.
+ * For unimplemented operations, fall back to default.
+ */
+#define para_fill(opname, vmicall)				\
+do {								\
+	reloc = call_vrom_long_func(vmi_rom, get_reloc,		\
+				    VMI_CALL_##vmicall);	\
+	if (rel->type != VMI_RELOCATION_NONE) {			\
+		BUG_ON(rel->type != VMI_RELOCATION_CALL_REL);	\
+		paravirt_ops.opname = (void *)rel->eip;		\
+	}							\
+} while (0)
+
+/*
+ * Activate the VMI interface and switch into paravirtualized mode
+ */
+static inline int __init activate_vmi(void)
+{
+	short kernel_cs;
+	u64 reloc;
+	const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
+
+	if (call_vrom_func(vmi_rom, vmi_init) != 0) {
+		printk(KERN_ERR "VMI ROM failed to initialize!");
+		return 0;
+	}
+	savesegment(cs, kernel_cs);
+
+	paravirt_ops.paravirt_enabled = 1;
+	paravirt_ops.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK;
+
+	paravirt_ops.patch = vmi_patch;
+	paravirt_ops.name = "vmi";
+
+	/*
+	 * Many of these operations are ABI compatible with VMI.
+	 * This means we can fill in the paravirt-ops with direct
+	 * pointers into the VMI ROM.  If the calling convention for
+	 * these operations changes, this code needs to be updated.
+	 *
+	 * Exceptions
+	 *  CPUID paravirt-op uses pointers, not the native ISA
+	 *  halt has no VMI equivalent; all VMI halts are "safe"
+	 *  no MSR support yet - just trap and emulate.  VMI uses the
+	 *    same ABI as the native ISA, but Linux wants exceptions
+	 *    from bogus MSR read / write handled
+	 *  rdpmc is not yet used in Linux
+	 */
+
+	/* CPUID is special, so very special */
+	reloc = call_vrom_long_func(vmi_rom, get_reloc,	VMI_CALL_CPUID);
+	if (rel->type != VMI_RELOCATION_NONE) {
+		BUG_ON(rel->type != VMI_RELOCATION_CALL_REL);
+		vmi_ops.cpuid = (void *)rel->eip;
+		paravirt_ops.cpuid = vmi_cpuid;
+	}
+
+	para_fill(clts, CLTS);
+	para_fill(get_debugreg, GetDR);
+	para_fill(set_debugreg, SetDR);
+	para_fill(read_cr0, GetCR0);
+	para_fill(read_cr2, GetCR2);
+	para_fill(read_cr3, GetCR3);
+	para_fill(read_cr4, GetCR4);
+	para_fill(write_cr0, SetCR0);
+	para_fill(write_cr2, SetCR2);
+	para_fill(write_cr3, SetCR3);
+	para_fill(write_cr4, SetCR4);
+	para_fill(save_fl, GetInterruptMask);
+	para_fill(restore_fl, SetInterruptMask);
+	para_fill(irq_disable, DisableInterrupts);
+	para_fill(irq_enable, EnableInterrupts);
+	/* irq_save_disable !!! sheer pain */
+	patch_offset(&irq_save_disable_callout[IRQ_PATCH_INT_MASK],
+		     (char *)paravirt_ops.save_fl);
+	patch_offset(&irq_save_disable_callout[IRQ_PATCH_DISABLE],
+		     (char *)paravirt_ops.irq_disable);
+#ifndef CONFIG_NO_IDLE_HZ
+	para_fill(safe_halt, Halt);
+#else
+	vmi_ops.halt = vmi_get_function(VMI_CALL_Halt);
+	paravirt_ops.safe_halt = vmi_safe_halt;
+#endif
+	para_fill(wbinvd, WBINVD);
+	/* paravirt_ops.read_msr = vmi_rdmsr */
+	/* paravirt_ops.write_msr = vmi_wrmsr */
+	para_fill(read_tsc, RDTSC);
+	/* paravirt_ops.rdpmc = vmi_rdpmc */
+
+	/* TR interface doesn't pass TR value */
+	reloc = call_vrom_long_func(vmi_rom, get_reloc,	VMI_CALL_SetTR);
+	if (rel->type != VMI_RELOCATION_NONE) {
+		BUG_ON(rel->type != VMI_RELOCATION_CALL_REL);
+		vmi_ops.set_tr = (void *)rel->eip;
+		paravirt_ops.load_tr_desc = vmi_set_tr;
+	}
+
+	/* LDT is special, too */
+	reloc = call_vrom_long_func(vmi_rom, get_reloc,	VMI_CALL_SetLDT);
+	if (rel->type != VMI_RELOCATION_NONE) {
+		BUG_ON(rel->type != VMI_RELOCATION_CALL_REL);
+		vmi_ops._set_ldt = (void *)rel->eip;
+		paravirt_ops.set_ldt = vmi_set_ldt;
+	}
+
+	para_fill(load_gdt, SetGDT);
+	para_fill(load_idt, SetIDT);
+	para_fill(store_gdt, GetGDT);
+	para_fill(store_idt, GetIDT);
+	para_fill(store_tr, GetTR);
+	paravirt_ops.load_tls = vmi_load_tls;
+	para_fill(write_ldt_entry, WriteLDTEntry);
+	para_fill(write_gdt_entry, WriteGDTEntry);
+	para_fill(write_idt_entry, WriteIDTEntry);
+	reloc = call_vrom_long_func(vmi_rom, get_reloc,
+				    VMI_CALL_UpdateKernelStack);
+	if (rel->type != VMI_RELOCATION_NONE) {
+		BUG_ON(rel->type != VMI_RELOCATION_CALL_REL);
+		vmi_ops.set_kernel_stack = (void *)rel->eip;
+		paravirt_ops.load_esp0 = vmi_load_esp0;
+	}
+
+	para_fill(set_iopl_mask, SetIOPLMask);
+	paravirt_ops.io_delay = (void *)vmi_nop;
+	if (!disable_nodelay) {
+		paravirt_ops.const_udelay = (void *)vmi_nop;
+	}
+
+	para_fill(set_lazy_mode, SetLazyMode);
+
+	reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_FlushTLB);
+	if (rel->type != VMI_RELOCATION_NONE) {
+		vmi_ops.flush_tlb = (void *)rel->eip;
+		paravirt_ops.flush_tlb_user = vmi_flush_tlb_user;
+		paravirt_ops.flush_tlb_kernel = vmi_flush_tlb_kernel;
+	}
+	para_fill(flush_tlb_single, InvalPage);
+
+	/*
+	 * Until a standard flag format can be agreed on, we need to
+	 * implement these as wrappers in Linux.  Get the VMI ROM
+	 * function pointers for the two backend calls.
+	 */
+#ifdef CONFIG_X86_PAE
+	vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxELong);
+	vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxELong);
+#else
+	vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxE);
+	vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxE);
+#endif
+	vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping);
+	vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage);
+	vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage);
+
+	paravirt_ops.alloc_pt = vmi_allocate_pt;
+	paravirt_ops.alloc_pd = vmi_allocate_pd;
+	paravirt_ops.alloc_pd_clone = vmi_allocate_pd_clone;
+	paravirt_ops.release_pt = vmi_release_pt;
+	paravirt_ops.release_pd = vmi_release_pd;
+	paravirt_ops.set_pte = vmi_set_pte;
+	paravirt_ops.set_pte_at = vmi_set_pte_at;
+	paravirt_ops.set_pmd = vmi_set_pmd;
+	paravirt_ops.pte_update = vmi_update_pte;
+	paravirt_ops.pte_update_defer = vmi_update_pte_defer;
+#ifdef CONFIG_X86_PAE
+	paravirt_ops.set_pte_atomic = vmi_set_pte_atomic;
+	paravirt_ops.set_pte_present = vmi_set_pte_present;
+	paravirt_ops.set_pud = vmi_set_pud;
+	paravirt_ops.pte_clear = vmi_pte_clear;
+	paravirt_ops.pmd_clear = vmi_pmd_clear;
+#endif
+	/*
+	 * These MUST always be patched.  Don't support indirect jumps
+	 * through these operations, as the VMI interface may use either
+	 * a jump or a call to get to these operations, depending on
+	 * the backend.  They are performance critical anyway, so requiring
+	 * a patch is not a big problem.
+	 */
+	paravirt_ops.irq_enable_sysexit = (void *)0xfeedbab0;
+	paravirt_ops.iret = (void *)0xbadbab0;
+
+#ifdef CONFIG_SMP
+	paravirt_ops.startup_ipi_hook = vmi_startup_ipi_hook;
+	vmi_ops.set_initial_ap_state = vmi_get_function(VMI_CALL_SetInitialAPState);
+#endif
+
+#ifdef CONFIG_X86_LOCAL_APIC
+	paravirt_ops.apic_read = vmi_get_function(VMI_CALL_APICRead);
+	paravirt_ops.apic_write = vmi_get_function(VMI_CALL_APICWrite);
+	paravirt_ops.apic_write_atomic = vmi_get_function(VMI_CALL_APICWrite);
+#endif
+
+	/*
+	 * Check for VMI timer functionality by probing for a cycle frequency method
+	 */
+	reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_GetCycleFrequency);
+	if (rel->type != VMI_RELOCATION_NONE) {
+		vmi_timer_ops.get_cycle_frequency = (void *)rel->eip;
+		vmi_timer_ops.get_cycle_counter =
+			vmi_get_function(VMI_CALL_GetCycleCounter);
+		vmi_timer_ops.get_wallclock =
+			vmi_get_function(VMI_CALL_GetWallclockTime);
+		vmi_timer_ops.wallclock_updated =
+			vmi_get_function(VMI_CALL_WallclockUpdated);
+		vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm);
+		vmi_timer_ops.cancel_alarm =
+			 vmi_get_function(VMI_CALL_CancelAlarm);
+		paravirt_ops.time_init = vmi_time_init;
+		paravirt_ops.get_wallclock = vmi_get_wallclock;
+		paravirt_ops.set_wallclock = vmi_set_wallclock;
+#ifdef CONFIG_X86_LOCAL_APIC
+		paravirt_ops.setup_boot_clock = vmi_timer_setup_boot_alarm;
+		paravirt_ops.setup_secondary_clock = vmi_timer_setup_secondary_alarm;
+#endif
+		custom_sched_clock = vmi_sched_clock;
+	}
+
+	/*
+	 * Alternative instruction rewriting doesn't happen soon enough
+	 * to convert VMI_IRET to a call instead of a jump; so we have
+	 * to do this before IRQs get reenabled.  Fortunately, it is
+	 * idempotent.
+	 */
+	apply_paravirt(__start_parainstructions, __stop_parainstructions);
+
+	vmi_bringup();
+
+	return 1;
+}
+
+#undef para_fill
+
+void __init vmi_init(void)
+{
+	unsigned long flags;
+
+	if (!vmi_rom)
+		probe_vmi_rom();
+	else
+		check_vmi_rom(vmi_rom);
+
+	/* In case probing for or validating the ROM failed, basil */
+	if (!vmi_rom)
+		return;
+
+	reserve_top_address(-vmi_rom->virtual_top);
+
+	local_irq_save(flags);
+	activate_vmi();
+#ifdef CONFIG_SMP
+	no_timer_check = 1;
+#endif
+	local_irq_restore(flags & X86_EFLAGS_IF);
+}
+
+static int __init parse_vmi(char *arg)
+{
+	if (!arg)
+		return -EINVAL;
+
+	if (!strcmp(arg, "disable_nodelay"))
+		disable_nodelay = 1;
+	else if (!strcmp(arg, "disable_pge")) {
+		clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
+		disable_pge = 1;
+	} else if (!strcmp(arg, "disable_pse")) {
+		clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
+		disable_pse = 1;
+	} else if (!strcmp(arg, "disable_sep")) {
+		clear_bit(X86_FEATURE_SEP, boot_cpu_data.x86_capability);
+		disable_sep = 1;
+	} else if (!strcmp(arg, "disable_tsc")) {
+		clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
+		disable_tsc = 1;
+	} else if (!strcmp(arg, "disable_mtrr")) {
+		clear_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability);
+		disable_mtrr = 1;
+	}
+	return 0;
+}
+
+early_param("vmi", parse_vmi);
diff --git a/arch/i386/kernel/vmitime.c b/arch/i386/kernel/vmitime.c
new file mode 100644
index 000000000000..76d2adcae5a3
--- /dev/null
+++ b/arch/i386/kernel/vmitime.c
@@ -0,0 +1,499 @@
+/*
+ * VMI paravirtual timer support routines.
+ *
+ * Copyright (C) 2005, VMware, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to dhecht@vmware.com
+ *
+ */
+
+/*
+ * Portions of this code from arch/i386/kernel/timers/timer_tsc.c.
+ * Portions of the CONFIG_NO_IDLE_HZ code from arch/s390/kernel/time.c.
+ * See comments there for proper credits.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/jiffies.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+#include <linux/rcupdate.h>
+#include <linux/clocksource.h>
+
+#include <asm/timer.h>
+#include <asm/io.h>
+#include <asm/apic.h>
+#include <asm/div64.h>
+#include <asm/timer.h>
+#include <asm/desc.h>
+
+#include <asm/vmi.h>
+#include <asm/vmi_time.h>
+
+#include <mach_timer.h>
+#include <io_ports.h>
+
+#ifdef CONFIG_X86_LOCAL_APIC
+#define VMI_ALARM_WIRING VMI_ALARM_WIRED_LVTT
+#else
+#define VMI_ALARM_WIRING VMI_ALARM_WIRED_IRQ0
+#endif
+
+/* Cached VMI operations */
+struct vmi_timer_ops vmi_timer_ops;
+
+#ifdef CONFIG_NO_IDLE_HZ
+
+/* /proc/sys/kernel/hz_timer state. */
+int sysctl_hz_timer;
+
+/* Some stats */
+static DEFINE_PER_CPU(unsigned long, vmi_idle_no_hz_irqs);
+static DEFINE_PER_CPU(unsigned long, vmi_idle_no_hz_jiffies);
+static DEFINE_PER_CPU(unsigned long, idle_start_jiffies);
+
+#endif /* CONFIG_NO_IDLE_HZ */
+
+/* Number of alarms per second. By default this is CONFIG_VMI_ALARM_HZ. */
+static int alarm_hz = CONFIG_VMI_ALARM_HZ;
+
+/* Cache of the value get_cycle_frequency / HZ. */
+static signed long long cycles_per_jiffy;
+
+/* Cache of the value get_cycle_frequency / alarm_hz. */
+static signed long long cycles_per_alarm;
+
+/* The number of cycles accounted for by the 'jiffies'/'xtime' count.
+ * Protected by xtime_lock. */
+static unsigned long long real_cycles_accounted_system;
+
+/* The number of cycles accounted for by update_process_times(), per cpu. */
+static DEFINE_PER_CPU(unsigned long long, process_times_cycles_accounted_cpu);
+
+/* The number of stolen cycles accounted, per cpu. */
+static DEFINE_PER_CPU(unsigned long long, stolen_cycles_accounted_cpu);
+
+/* Clock source. */
+static cycle_t read_real_cycles(void)
+{
+	return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL);
+}
+
+static cycle_t read_available_cycles(void)
+{
+	return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE);
+}
+
+#if 0
+static cycle_t read_stolen_cycles(void)
+{
+	return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_STOLEN);
+}
+#endif  /*  0  */
+
+static struct clocksource clocksource_vmi = {
+	.name			= "vmi-timer",
+	.rating			= 450,
+	.read			= read_real_cycles,
+	.mask			= CLOCKSOURCE_MASK(64),
+	.mult			= 0, /* to be set */
+	.shift			= 22,
+	.flags			= CLOCK_SOURCE_IS_CONTINUOUS,
+};
+
+
+/* Timer interrupt handler. */
+static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id);
+
+static struct irqaction vmi_timer_irq  = {
+	vmi_timer_interrupt,
+	SA_INTERRUPT,
+	CPU_MASK_NONE,
+	"VMI-alarm",
+	NULL,
+	NULL
+};
+
+/* Alarm rate */
+static int __init vmi_timer_alarm_rate_setup(char* str)
+{
+	int alarm_rate;
+	if (get_option(&str, &alarm_rate) == 1 && alarm_rate > 0) {
+		alarm_hz = alarm_rate;
+		printk(KERN_WARNING "VMI timer alarm HZ set to %d\n", alarm_hz);
+	}
+	return 1;
+}
+__setup("vmi_timer_alarm_hz=", vmi_timer_alarm_rate_setup);
+
+
+/* Initialization */
+static void vmi_get_wallclock_ts(struct timespec *ts)
+{
+	unsigned long long wallclock;
+	wallclock = vmi_timer_ops.get_wallclock(); // nsec units
+	ts->tv_nsec = do_div(wallclock, 1000000000);
+	ts->tv_sec = wallclock;
+}
+
+static void update_xtime_from_wallclock(void)
+{
+	struct timespec ts;
+	vmi_get_wallclock_ts(&ts);
+	do_settimeofday(&ts);
+}
+
+unsigned long vmi_get_wallclock(void)
+{
+	struct timespec ts;
+	vmi_get_wallclock_ts(&ts);
+	return ts.tv_sec;
+}
+
+int vmi_set_wallclock(unsigned long now)
+{
+	return -1;
+}
+
+unsigned long long vmi_sched_clock(void)
+{
+	return read_available_cycles();
+}
+
+void __init vmi_time_init(void)
+{
+	unsigned long long cycles_per_sec, cycles_per_msec;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	setup_irq(0, &vmi_timer_irq);
+#ifdef CONFIG_X86_LOCAL_APIC
+	set_intr_gate(LOCAL_TIMER_VECTOR, apic_vmi_timer_interrupt);
+#endif
+
+	no_sync_cmos_clock = 1;
+
+	vmi_get_wallclock_ts(&xtime);
+	set_normalized_timespec(&wall_to_monotonic,
+		-xtime.tv_sec, -xtime.tv_nsec);
+
+	real_cycles_accounted_system = read_real_cycles();
+	update_xtime_from_wallclock();
+	per_cpu(process_times_cycles_accounted_cpu, 0) = read_available_cycles();
+
+	cycles_per_sec = vmi_timer_ops.get_cycle_frequency();
+
+	cycles_per_jiffy = cycles_per_sec;
+	(void)do_div(cycles_per_jiffy, HZ);
+	cycles_per_alarm = cycles_per_sec;
+	(void)do_div(cycles_per_alarm, alarm_hz);
+	cycles_per_msec = cycles_per_sec;
+	(void)do_div(cycles_per_msec, 1000);
+	cpu_khz = cycles_per_msec;
+
+	printk(KERN_WARNING "VMI timer cycles/sec = %llu ; cycles/jiffy = %llu ;"
+	       "cycles/alarm = %llu\n", cycles_per_sec, cycles_per_jiffy,
+	       cycles_per_alarm);
+
+	clocksource_vmi.mult = clocksource_khz2mult(cycles_per_msec,
+						    clocksource_vmi.shift);
+	if (clocksource_register(&clocksource_vmi))
+		printk(KERN_WARNING "Error registering VMITIME clocksource.");
+
+	/* Disable PIT. */
+	outb_p(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */
+
+	/* schedule the alarm. do this in phase with process_times_cycles_accounted_cpu
+	 * reduce the latency calling update_process_times. */
+	vmi_timer_ops.set_alarm(
+		      VMI_ALARM_WIRED_IRQ0 | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE,
+		      per_cpu(process_times_cycles_accounted_cpu, 0) + cycles_per_alarm,
+		      cycles_per_alarm);
+
+	local_irq_restore(flags);
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+
+void __init vmi_timer_setup_boot_alarm(void)
+{
+	local_irq_disable();
+
+	/* Route the interrupt to the correct vector. */
+	apic_write_around(APIC_LVTT, LOCAL_TIMER_VECTOR);
+
+	/* Cancel the IRQ0 wired alarm, and setup the LVTT alarm. */
+	vmi_timer_ops.cancel_alarm(VMI_CYCLES_AVAILABLE);
+	vmi_timer_ops.set_alarm(
+		      VMI_ALARM_WIRED_LVTT | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE,
+		      per_cpu(process_times_cycles_accounted_cpu, 0) + cycles_per_alarm,
+		      cycles_per_alarm);
+	local_irq_enable();
+}
+
+/* Initialize the time accounting variables for an AP on an SMP system.
+ * Also, set the local alarm for the AP. */
+void __init vmi_timer_setup_secondary_alarm(void)
+{
+	int cpu = smp_processor_id();
+
+	/* Route the interrupt to the correct vector. */
+	apic_write_around(APIC_LVTT, LOCAL_TIMER_VECTOR);
+
+	per_cpu(process_times_cycles_accounted_cpu, cpu) = read_available_cycles();
+
+	vmi_timer_ops.set_alarm(
+		      VMI_ALARM_WIRED_LVTT | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE,
+		      per_cpu(process_times_cycles_accounted_cpu, cpu) + cycles_per_alarm,
+		      cycles_per_alarm);
+}
+
+#endif
+
+/* Update system wide (real) time accounting (e.g. jiffies, xtime). */
+static void vmi_account_real_cycles(unsigned long long cur_real_cycles)
+{
+	long long cycles_not_accounted;
+
+	write_seqlock(&xtime_lock);
+
+	cycles_not_accounted = cur_real_cycles - real_cycles_accounted_system;
+	while (cycles_not_accounted >= cycles_per_jiffy) {
+		/* systems wide jiffies and wallclock. */
+		do_timer(1);
+
+		cycles_not_accounted -= cycles_per_jiffy;
+		real_cycles_accounted_system += cycles_per_jiffy;
+	}
+
+	if (vmi_timer_ops.wallclock_updated())
+		update_xtime_from_wallclock();
+
+	write_sequnlock(&xtime_lock);
+}
+
+/* Update per-cpu process times. */
+static void vmi_account_process_times_cycles(struct pt_regs *regs, int cpu,
+					     unsigned long long cur_process_times_cycles)
+{
+	long long cycles_not_accounted;
+	cycles_not_accounted = cur_process_times_cycles -
+		per_cpu(process_times_cycles_accounted_cpu, cpu);
+
+	while (cycles_not_accounted >= cycles_per_jiffy) {
+		/* Account time to the current process.  This includes
+		 * calling into the scheduler to decrement the timeslice
+		 * and possibly reschedule.*/
+		update_process_times(user_mode(regs));
+		/* XXX handle /proc/profile multiplier.  */
+		profile_tick(CPU_PROFILING);
+
+		cycles_not_accounted -= cycles_per_jiffy;
+		per_cpu(process_times_cycles_accounted_cpu, cpu) += cycles_per_jiffy;
+	}
+}
+
+#ifdef CONFIG_NO_IDLE_HZ
+/* Update per-cpu idle times.  Used when a no-hz halt is ended. */
+static void vmi_account_no_hz_idle_cycles(int cpu,
+					  unsigned long long cur_process_times_cycles)
+{
+	long long cycles_not_accounted;
+	unsigned long no_idle_hz_jiffies = 0;
+
+	cycles_not_accounted = cur_process_times_cycles -
+		per_cpu(process_times_cycles_accounted_cpu, cpu);
+
+	while (cycles_not_accounted >= cycles_per_jiffy) {
+		no_idle_hz_jiffies++;
+		cycles_not_accounted -= cycles_per_jiffy;
+		per_cpu(process_times_cycles_accounted_cpu, cpu) += cycles_per_jiffy;
+	}
+	/* Account time to the idle process. */
+	account_steal_time(idle_task(cpu), jiffies_to_cputime(no_idle_hz_jiffies));
+}
+#endif
+
+/* Update per-cpu stolen time. */
+static void vmi_account_stolen_cycles(int cpu,
+				      unsigned long long cur_real_cycles,
+				      unsigned long long cur_avail_cycles)
+{
+	long long stolen_cycles_not_accounted;
+	unsigned long stolen_jiffies = 0;
+
+	if (cur_real_cycles < cur_avail_cycles)
+		return;
+
+	stolen_cycles_not_accounted = cur_real_cycles - cur_avail_cycles -
+		per_cpu(stolen_cycles_accounted_cpu, cpu);
+
+	while (stolen_cycles_not_accounted >= cycles_per_jiffy) {
+		stolen_jiffies++;
+		stolen_cycles_not_accounted -= cycles_per_jiffy;
+		per_cpu(stolen_cycles_accounted_cpu, cpu) += cycles_per_jiffy;
+	}
+	/* HACK: pass NULL to force time onto cpustat->steal. */
+	account_steal_time(NULL, jiffies_to_cputime(stolen_jiffies));
+}
+
+/* Body of either IRQ0 interrupt handler (UP no local-APIC) or
+ * local-APIC LVTT interrupt handler (UP & local-APIC or SMP). */
+static void vmi_local_timer_interrupt(int cpu)
+{
+	unsigned long long cur_real_cycles, cur_process_times_cycles;
+
+	cur_real_cycles = read_real_cycles();
+	cur_process_times_cycles = read_available_cycles();
+	/* Update system wide (real) time state (xtime, jiffies). */
+	vmi_account_real_cycles(cur_real_cycles);
+	/* Update per-cpu process times. */
+	vmi_account_process_times_cycles(get_irq_regs(), cpu, cur_process_times_cycles);
+        /* Update time stolen from this cpu by the hypervisor. */
+	vmi_account_stolen_cycles(cpu, cur_real_cycles, cur_process_times_cycles);
+}
+
+#ifdef CONFIG_NO_IDLE_HZ
+
+/* Must be called only from idle loop, with interrupts disabled. */
+int vmi_stop_hz_timer(void)
+{
+	/* Note that cpu_set, cpu_clear are (SMP safe) atomic on x86. */
+
+	unsigned long seq, next;
+	unsigned long long real_cycles_expiry;
+	int cpu = smp_processor_id();
+	int idle;
+
+	BUG_ON(!irqs_disabled());
+	if (sysctl_hz_timer != 0)
+		return 0;
+
+	cpu_set(cpu, nohz_cpu_mask);
+	smp_mb();
+	if (rcu_needs_cpu(cpu) || local_softirq_pending() ||
+	    (next = next_timer_interrupt(), time_before_eq(next, jiffies))) {
+		cpu_clear(cpu, nohz_cpu_mask);
+		next = jiffies;
+		idle = 0;
+	} else
+		idle = 1;
+
+	/* Convert jiffies to the real cycle counter. */
+	do {
+		seq = read_seqbegin(&xtime_lock);
+		real_cycles_expiry = real_cycles_accounted_system +
+			(long)(next - jiffies) * cycles_per_jiffy;
+	} while (read_seqretry(&xtime_lock, seq));
+
+	/* This cpu is going idle. Disable the periodic alarm. */
+	if (idle) {
+		vmi_timer_ops.cancel_alarm(VMI_CYCLES_AVAILABLE);
+		per_cpu(idle_start_jiffies, cpu) = jiffies;
+	}
+
+	/* Set the real time alarm to expire at the next event. */
+	vmi_timer_ops.set_alarm(
+		      VMI_ALARM_WIRING | VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL,
+		      real_cycles_expiry, 0);
+
+	return idle;
+}
+
+static void vmi_reenable_hz_timer(int cpu)
+{
+	/* For /proc/vmi/info idle_hz stat. */
+	per_cpu(vmi_idle_no_hz_jiffies, cpu) += jiffies - per_cpu(idle_start_jiffies, cpu);
+	per_cpu(vmi_idle_no_hz_irqs, cpu)++;
+
+	/* Don't bother explicitly cancelling the one-shot alarm -- at
+	 * worse we will receive a spurious timer interrupt. */
+	vmi_timer_ops.set_alarm(
+		      VMI_ALARM_WIRING | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE,
+		      per_cpu(process_times_cycles_accounted_cpu, cpu) + cycles_per_alarm,
+		      cycles_per_alarm);
+	/* Indicate this cpu is no longer nohz idle. */
+	cpu_clear(cpu, nohz_cpu_mask);
+}
+
+/* Called from interrupt handlers when (local) HZ timer is disabled. */
+void vmi_account_time_restart_hz_timer(void)
+{
+	unsigned long long cur_real_cycles, cur_process_times_cycles;
+	int cpu = smp_processor_id();
+
+	BUG_ON(!irqs_disabled());
+	/* Account the time during which the HZ timer was disabled. */
+	cur_real_cycles = read_real_cycles();
+	cur_process_times_cycles = read_available_cycles();
+	/* Update system wide (real) time state (xtime, jiffies). */
+	vmi_account_real_cycles(cur_real_cycles);
+	/* Update per-cpu idle times. */
+	vmi_account_no_hz_idle_cycles(cpu, cur_process_times_cycles);
+        /* Update time stolen from this cpu by the hypervisor. */
+	vmi_account_stolen_cycles(cpu, cur_real_cycles, cur_process_times_cycles);
+	/* Reenable the hz timer. */
+	vmi_reenable_hz_timer(cpu);
+}
+
+#endif /* CONFIG_NO_IDLE_HZ */
+
+/* UP (and no local-APIC) VMI-timer alarm interrupt handler.
+ * Handler for IRQ0. Not used when SMP or X86_LOCAL_APIC after
+ * APIC setup and setup_boot_vmi_alarm() is called.  */
+static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id)
+{
+	vmi_local_timer_interrupt(smp_processor_id());
+	return IRQ_HANDLED;
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+
+/* SMP VMI-timer alarm interrupt handler. Handler for LVTT vector.
+ * Also used in UP when CONFIG_X86_LOCAL_APIC.
+ * The wrapper code is from arch/i386/kernel/apic.c#smp_apic_timer_interrupt. */
+void smp_apic_vmi_timer_interrupt(struct pt_regs *regs)
+{
+	struct pt_regs *old_regs = set_irq_regs(regs);
+	int cpu = smp_processor_id();
+
+	/*
+	 * the NMI deadlock-detector uses this.
+	 */
+        per_cpu(irq_stat,cpu).apic_timer_irqs++;
+
+	/*
+	 * NOTE! We'd better ACK the irq immediately,
+	 * because timer handling can be slow.
+	 */
+	ack_APIC_irq();
+
+	/*
+	 * update_process_times() expects us to have done irq_enter().
+	 * Besides, if we don't timer interrupts ignore the global
+	 * interrupt lock, which is the WrongThing (tm) to do.
+	 */
+	irq_enter();
+	vmi_local_timer_interrupt(cpu);
+	irq_exit();
+	set_irq_regs(old_regs);
+}
+
+#endif  /* CONFIG_X86_LOCAL_APIC */
diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S
index a53c8b1854b5..ca51610955df 100644
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -37,9 +37,14 @@ SECTIONS
 {
   . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
   phys_startup_32 = startup_32 - LOAD_OFFSET;
+
+  .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
+  	_text = .;			/* Text and read-only data */
+	*(.text.head)
+  } :text = 0x9090
+
   /* read-only */
   .text : AT(ADDR(.text) - LOAD_OFFSET) {
-  	_text = .;			/* Text and read-only data */
 	*(.text)
 	SCHED_TEXT
 	LOCK_TEXT
@@ -181,12 +186,14 @@ SECTIONS
      from .altinstructions and .eh_frame */
   .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) }
   .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) }
+#if defined(CONFIG_BLK_DEV_INITRD)
   . = ALIGN(4096);
   .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
 	__initramfs_start = .;
 	*(.init.ramfs)
 	__initramfs_end = .;
   }
+#endif
   . = ALIGN(L1_CACHE_BYTES);
   .data.percpu  : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
 	__per_cpu_start = .;
diff --git a/arch/i386/mach-default/setup.c b/arch/i386/mach-default/setup.c
index cc2f519b2f7f..c78816210706 100644
--- a/arch/i386/mach-default/setup.c
+++ b/arch/i386/mach-default/setup.c
@@ -79,7 +79,12 @@ void __init trap_init_hook(void)
 {
 }
 
-static struct irqaction irq0  = { timer_interrupt, IRQF_DISABLED, CPU_MASK_NONE, "timer", NULL, NULL};
+static struct irqaction irq0  = {
+	.handler = timer_interrupt,
+	.flags = IRQF_DISABLED | IRQF_NOBALANCING,
+	.mask = CPU_MASK_NONE,
+	.name = "timer"
+};
 
 /**
  * time_init_hook - do any specific initialisations for the system timer.
@@ -90,6 +95,7 @@ static struct irqaction irq0  = { timer_interrupt, IRQF_DISABLED, CPU_MASK_NONE,
  **/
 void __init time_init_hook(void)
 {
+	irq0.mask = cpumask_of_cpu(0);
 	setup_irq(0, &irq0);
 }
 
diff --git a/arch/i386/mach-es7000/es7000.h b/arch/i386/mach-es7000/es7000.h
index 80566ca4a80a..c8d5aa132fa0 100644
--- a/arch/i386/mach-es7000/es7000.h
+++ b/arch/i386/mach-es7000/es7000.h
@@ -84,15 +84,6 @@ struct es7000_oem_table {
 };
 
 #ifdef CONFIG_ACPI
-struct acpi_table_sdt {
-	unsigned long pa;
-	unsigned long count;
-	struct {
-		unsigned long pa;
-		enum acpi_table_id id;
-		unsigned long size;
-	}	entry[50];
-};
 
 struct oem_table {
 	struct acpi_table_header Header;
diff --git a/arch/i386/mach-es7000/es7000plat.c b/arch/i386/mach-es7000/es7000plat.c
index 3d0fc853516d..9be6ceabf042 100644
--- a/arch/i386/mach-es7000/es7000plat.c
+++ b/arch/i386/mach-es7000/es7000plat.c
@@ -160,51 +160,14 @@ parse_unisys_oem (char *oemptr)
 int __init
 find_unisys_acpi_oem_table(unsigned long *oem_addr)
 {
-	struct acpi_table_rsdp		*rsdp = NULL;
-	unsigned long			rsdp_phys = 0;
-	struct acpi_table_header 	*header = NULL;
-	int				i;
-	struct acpi_table_sdt		sdt;
-
-	rsdp_phys = acpi_find_rsdp();
-	rsdp = __va(rsdp_phys);
-	if (rsdp->rsdt_address) {
-		struct acpi_table_rsdt	*mapped_rsdt = NULL;
-		sdt.pa = rsdp->rsdt_address;
-
-		header = (struct acpi_table_header *)
-			__acpi_map_table(sdt.pa, sizeof(struct acpi_table_header));
-		if (!header)
-			return -ENODEV;
-
-		sdt.count = (header->length - sizeof(struct acpi_table_header)) >> 3;
-		mapped_rsdt = (struct acpi_table_rsdt *)
-			__acpi_map_table(sdt.pa, header->length);
-		if (!mapped_rsdt)
-			return -ENODEV;
-
-		header = &mapped_rsdt->header;
-
-		for (i = 0; i < sdt.count; i++)
-			sdt.entry[i].pa = (unsigned long) mapped_rsdt->entry[i];
-	};
-	for (i = 0; i < sdt.count; i++) {
-
-		header = (struct acpi_table_header *)
-			__acpi_map_table(sdt.entry[i].pa,
-				sizeof(struct acpi_table_header));
-		if (!header)
-			continue;
-		if (!strncmp((char *) &header->signature, "OEM1", 4)) {
-			if (!strncmp((char *) &header->oem_id, "UNISYS", 6)) {
-				void *addr;
-				struct oem_table *t;
-				acpi_table_print(header, sdt.entry[i].pa);
-				t = (struct oem_table *) __acpi_map_table(sdt.entry[i].pa, header->length);
-				addr = (void *) __acpi_map_table(t->OEMTableAddr, t->OEMTableSize);
-				*oem_addr = (unsigned long) addr;
-				return 0;
-			}
+	struct acpi_table_header *header = NULL;
+	int i = 0;
+	while (ACPI_SUCCESS(acpi_get_table("OEM1", i++, &header))) {
+		if (!memcmp((char *) &header->oem_id, "UNISYS", 6)) {
+			struct oem_table *t = (struct oem_table *)header;
+			*oem_addr = (unsigned long)__acpi_map_table(t->OEMTableAddr,
+								    t->OEMTableSize);
+			return 0;
 		}
 	}
 	return -1;
diff --git a/arch/i386/math-emu/get_address.c b/arch/i386/math-emu/get_address.c
index 9819b705efa4..2e2c51a8bd3a 100644
--- a/arch/i386/math-emu/get_address.c
+++ b/arch/i386/math-emu/get_address.c
@@ -56,15 +56,14 @@ static int reg_offset_vm86[] = {
 #define VM86_REG_(x) (*(unsigned short *) \
 		      (reg_offset_vm86[((unsigned)x)]+(u_char *) FPU_info))
 
-/* These are dummy, fs and gs are not saved on the stack. */
-#define ___FS ___ds
+/* This dummy, gs is not saved on the stack. */
 #define ___GS ___ds
 
 static int reg_offset_pm[] = {
 	offsetof(struct info,___cs),
 	offsetof(struct info,___ds),
 	offsetof(struct info,___es),
-	offsetof(struct info,___FS),
+	offsetof(struct info,___fs),
 	offsetof(struct info,___GS),
 	offsetof(struct info,___ss),
 	offsetof(struct info,___ds)
@@ -169,13 +168,10 @@ static long pm_address(u_char FPU_modrm, u_char segment,
 
   switch ( segment )
     {
-      /* fs and gs aren't used by the kernel, so they still have their
-	 user-space values. */
-    case PREFIX_FS_-1:
-      /* N.B. - movl %seg, mem is a 2 byte write regardless of prefix */
-      savesegment(fs, addr->selector);
-      break;
+      /* gs isn't used by the kernel, so it still has its
+	 user-space value. */
     case PREFIX_GS_-1:
+      /* N.B. - movl %seg, mem is a 2 byte write regardless of prefix */
       savesegment(gs, addr->selector);
       break;
     default:
diff --git a/arch/i386/math-emu/status_w.h b/arch/i386/math-emu/status_w.h
index 78d7b7689dd6..59e73302aa60 100644
--- a/arch/i386/math-emu/status_w.h
+++ b/arch/i386/math-emu/status_w.h
@@ -48,9 +48,11 @@
 
 #define status_word() \
   ((partial_status & ~SW_Top & 0xffff) | ((top << SW_Top_Shift) & SW_Top))
-#define setcc(cc) ({ \
-  partial_status &= ~(SW_C0|SW_C1|SW_C2|SW_C3); \
-  partial_status |= (cc) & (SW_C0|SW_C1|SW_C2|SW_C3); })
+static inline void setcc(int cc)
+{
+	partial_status &= ~(SW_C0|SW_C1|SW_C2|SW_C3);
+	partial_status |= (cc) & (SW_C0|SW_C1|SW_C2|SW_C3);
+}
 
 #ifdef PECULIAR_486
    /* Default, this conveys no information, but an 80486 does it. */
diff --git a/arch/i386/mm/discontig.c b/arch/i386/mm/discontig.c
index e0c390d6ceb5..aa58720f6871 100644
--- a/arch/i386/mm/discontig.c
+++ b/arch/i386/mm/discontig.c
@@ -101,7 +101,6 @@ extern void find_max_pfn(void);
 extern void add_one_highpage_init(struct page *, int, int);
 
 extern struct e820map e820;
-extern unsigned long init_pg_tables_end;
 extern unsigned long highend_pfn, highstart_pfn;
 extern unsigned long max_low_pfn;
 extern unsigned long totalram_pages;
diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c
index aaaa4d225f7e..b8c4e259fc8b 100644
--- a/arch/i386/mm/fault.c
+++ b/arch/i386/mm/fault.c
@@ -46,43 +46,17 @@ int unregister_page_fault_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL_GPL(unregister_page_fault_notifier);
 
-static inline int notify_page_fault(enum die_val val, const char *str,
-			struct pt_regs *regs, long err, int trap, int sig)
+static inline int notify_page_fault(struct pt_regs *regs, long err)
 {
 	struct die_args args = {
 		.regs = regs,
-		.str = str,
+		.str = "page fault",
 		.err = err,
-		.trapnr = trap,
-		.signr = sig
+		.trapnr = 14,
+		.signr = SIGSEGV
 	};
-	return atomic_notifier_call_chain(&notify_page_fault_chain, val, &args);
-}
-
-/*
- * Unlock any spinlocks which will prevent us from getting the
- * message out 
- */
-void bust_spinlocks(int yes)
-{
-	int loglevel_save = console_loglevel;
-
-	if (yes) {
-		oops_in_progress = 1;
-		return;
-	}
-#ifdef CONFIG_VT
-	unblank_screen();
-#endif
-	oops_in_progress = 0;
-	/*
-	 * OK, the message is on the console.  Now we call printk()
-	 * without oops_in_progress set so that printk will give klogd
-	 * a poke.  Hold onto your hats...
-	 */
-	console_loglevel = 15;		/* NMI oopser may have shut the console up */
-	printk(" ");
-	console_loglevel = loglevel_save;
+	return atomic_notifier_call_chain(&notify_page_fault_chain,
+	                                  DIE_PAGE_FAULT, &args);
 }
 
 /*
@@ -353,8 +327,7 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs,
 	if (unlikely(address >= TASK_SIZE)) {
 		if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0)
 			return;
-		if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
-						SIGSEGV) == NOTIFY_STOP)
+		if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
 			return;
 		/*
 		 * Don't take the mm semaphore here. If we fixup a prefetch
@@ -363,8 +336,7 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs,
 		goto bad_area_nosemaphore;
 	}
 
-	if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
-					SIGSEGV) == NOTIFY_STOP)
+	if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
 		return;
 
 	/* It's safe to allow irq's after cr2 has been saved and the vmalloc
diff --git a/arch/i386/mm/highmem.c b/arch/i386/mm/highmem.c
index e0fa6cb655a8..bb2de1089add 100644
--- a/arch/i386/mm/highmem.c
+++ b/arch/i386/mm/highmem.c
@@ -33,13 +33,14 @@ void *kmap_atomic(struct page *page, enum km_type type)
 
 	/* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
 	pagefault_disable();
+
+	idx = type + KM_TYPE_NR*smp_processor_id();
+	BUG_ON(!pte_none(*(kmap_pte-idx)));
+
 	if (!PageHighMem(page))
 		return page_address(page);
 
-	idx = type + KM_TYPE_NR*smp_processor_id();
 	vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
-	if (!pte_none(*(kmap_pte-idx)))
-		BUG();
 	set_pte(kmap_pte-idx, mk_pte(page, kmap_prot));
 
 	return (void*) vaddr;
diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
index c5c5ea700cc7..ae436882af7a 100644
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -62,6 +62,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
 		
 #ifdef CONFIG_X86_PAE
 	pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
+	paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT);
 	set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
 	pud = pud_offset(pgd, 0);
 	if (pmd_table != pmd_offset(pud, 0)) 
@@ -82,6 +83,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
 {
 	if (pmd_none(*pmd)) {
 		pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
+		paravirt_alloc_pt(__pa(page_table) >> PAGE_SHIFT);
 		set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
 		if (page_table != pte_offset_kernel(pmd, 0))
 			BUG();	
@@ -345,6 +347,8 @@ static void __init pagetable_init (void)
 	/* Init entries of the first-level page table to the zero page */
 	for (i = 0; i < PTRS_PER_PGD; i++)
 		set_pgd(pgd_base + i, __pgd(__pa(empty_zero_page) | _PAGE_PRESENT));
+#else
+	paravirt_alloc_pd(__pa(swapper_pg_dir) >> PAGE_SHIFT);
 #endif
 
 	/* Enable PSE if available */
diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c
index ad91528bdc14..412ebbd8adb0 100644
--- a/arch/i386/mm/pageattr.c
+++ b/arch/i386/mm/pageattr.c
@@ -60,6 +60,7 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot,
 	address = __pa(address);
 	addr = address & LARGE_PAGE_MASK; 
 	pbase = (pte_t *)page_address(base);
+	paravirt_alloc_pt(page_to_pfn(base));
 	for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
                set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT,
                                           addr == address ? prot : ref_prot));
@@ -172,6 +173,7 @@ __change_page_attr(struct page *page, pgprot_t prot)
 	if (!PageReserved(kpte_page)) {
 		if (cpu_has_pse && (page_private(kpte_page) == 0)) {
 			ClearPagePrivate(kpte_page);
+			paravirt_release_pt(page_to_pfn(kpte_page));
 			list_add(&kpte_page->lru, &df_list);
 			revert_page(kpte_page, address);
 		}
@@ -224,7 +226,7 @@ void global_flush_tlb(void)
 	list_replace_init(&df_list, &l);
 	spin_unlock_irq(&cpa_lock);
 	if (!cpu_has_clflush)
-		flush_map(0);
+		flush_map(NULL);
 	list_for_each_entry_safe(pg, next, &l, lru) {
 		if (cpu_has_clflush)
 			flush_map(page_address(pg));
diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c
index f349eaf450b0..fa0cfbd551e1 100644
--- a/arch/i386/mm/pgtable.c
+++ b/arch/i386/mm/pgtable.c
@@ -171,6 +171,8 @@ void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
 void reserve_top_address(unsigned long reserve)
 {
 	BUG_ON(fixmaps > 0);
+	printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
+	       (int)-reserve);
 #ifdef CONFIG_COMPAT_VDSO
 	BUG_ON(reserve != 0);
 #else
@@ -248,9 +250,15 @@ void pgd_ctor(void *pgd, struct kmem_cache *cache, unsigned long unused)
 	clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
 			swapper_pg_dir + USER_PTRS_PER_PGD,
 			KERNEL_PGD_PTRS);
+
 	if (PTRS_PER_PMD > 1)
 		return;
 
+	/* must happen under lock */
+	paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
+			__pa(swapper_pg_dir) >> PAGE_SHIFT,
+			USER_PTRS_PER_PGD, PTRS_PER_PGD - USER_PTRS_PER_PGD);
+
 	pgd_list_add(pgd);
 	spin_unlock_irqrestore(&pgd_lock, flags);
 }
@@ -260,6 +268,7 @@ void pgd_dtor(void *pgd, struct kmem_cache *cache, unsigned long unused)
 {
 	unsigned long flags; /* can be called from interrupt context */
 
+	paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT);
 	spin_lock_irqsave(&pgd_lock, flags);
 	pgd_list_del(pgd);
 	spin_unlock_irqrestore(&pgd_lock, flags);
@@ -277,13 +286,18 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 		pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
 		if (!pmd)
 			goto out_oom;
+		paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT);
 		set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
 	}
 	return pgd;
 
 out_oom:
-	for (i--; i >= 0; i--)
-		kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
+	for (i--; i >= 0; i--) {
+		pgd_t pgdent = pgd[i];
+		void* pmd = (void *)__va(pgd_val(pgdent)-1);
+		paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
+		kmem_cache_free(pmd_cache, pmd);
+	}
 	kmem_cache_free(pgd_cache, pgd);
 	return NULL;
 }
@@ -294,8 +308,12 @@ void pgd_free(pgd_t *pgd)
 
 	/* in the PAE case user pgd entries are overwritten before usage */
 	if (PTRS_PER_PMD > 1)
-		for (i = 0; i < USER_PTRS_PER_PGD; ++i)
-			kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
+		for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
+			pgd_t pgdent = pgd[i];
+			void* pmd = (void *)__va(pgd_val(pgdent)-1);
+			paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
+			kmem_cache_free(pmd_cache, pmd);
+		}
 	/* in the non-PAE case, free_pgtables() clears user pgd entries */
 	kmem_cache_free(pgd_cache, pgd);
 }
diff --git a/arch/i386/oprofile/nmi_int.c b/arch/i386/oprofile/nmi_int.c
index 3700eef78743..8fda7be9dd4d 100644
--- a/arch/i386/oprofile/nmi_int.c
+++ b/arch/i386/oprofile/nmi_int.c
@@ -63,7 +63,7 @@ static struct sys_device device_oprofile = {
 };
 
 
-static int __init init_driverfs(void)
+static int __init init_sysfs(void)
 {
 	int error;
 	if (!(error = sysdev_class_register(&oprofile_sysclass)))
@@ -72,15 +72,15 @@ static int __init init_driverfs(void)
 }
 
 
-static void exit_driverfs(void)
+static void exit_sysfs(void)
 {
 	sysdev_unregister(&device_oprofile);
 	sysdev_class_unregister(&oprofile_sysclass);
 }
 
 #else
-#define init_driverfs() do { } while (0)
-#define exit_driverfs() do { } while (0)
+#define init_sysfs() do { } while (0)
+#define exit_sysfs() do { } while (0)
 #endif /* CONFIG_PM */
 
 static int profile_exceptions_notify(struct notifier_block *self,
@@ -385,7 +385,7 @@ static int __init ppro_init(char ** cpu_type)
 	return 1;
 }
 
-/* in order to get driverfs right */
+/* in order to get sysfs right */
 static int using_nmi;
 
 int __init op_nmi_init(struct oprofile_operations *ops)
@@ -440,7 +440,7 @@ int __init op_nmi_init(struct oprofile_operations *ops)
 			return -ENODEV;
 	}
 
-	init_driverfs();
+	init_sysfs();
 	using_nmi = 1;
 	ops->create_files = nmi_create_files;
 	ops->setup = nmi_setup;
@@ -456,5 +456,5 @@ int __init op_nmi_init(struct oprofile_operations *ops)
 void op_nmi_exit(void)
 {
 	if (using_nmi)
-		exit_driverfs();
+		exit_sysfs();
 }
diff --git a/arch/i386/oprofile/op_model_ppro.c b/arch/i386/oprofile/op_model_ppro.c
index ca2447e05e15..c554f52cb808 100644
--- a/arch/i386/oprofile/op_model_ppro.c
+++ b/arch/i386/oprofile/op_model_ppro.c
@@ -24,7 +24,8 @@
 
 #define CTR_IS_RESERVED(msrs,c) (msrs->counters[(c)].addr ? 1 : 0)
 #define CTR_READ(l,h,msrs,c) do {rdmsr(msrs->counters[(c)].addr, (l), (h));} while (0)
-#define CTR_WRITE(l,msrs,c) do {wrmsr(msrs->counters[(c)].addr, -(u32)(l), -1);} while (0)
+#define CTR_32BIT_WRITE(l,msrs,c)	\
+	do {wrmsr(msrs->counters[(c)].addr, -(u32)(l), 0);} while (0)
 #define CTR_OVERFLOWED(n) (!((n) & (1U<<31)))
 
 #define CTRL_IS_RESERVED(msrs,c) (msrs->controls[(c)].addr ? 1 : 0)
@@ -79,7 +80,7 @@ static void ppro_setup_ctrs(struct op_msrs const * const msrs)
 	for (i = 0; i < NUM_COUNTERS; ++i) {
 		if (unlikely(!CTR_IS_RESERVED(msrs,i)))
 			continue;
-		CTR_WRITE(1, msrs, i);
+		CTR_32BIT_WRITE(1, msrs, i);
 	}
 
 	/* enable active counters */
@@ -87,7 +88,7 @@ static void ppro_setup_ctrs(struct op_msrs const * const msrs)
 		if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs,i))) {
 			reset_value[i] = counter_config[i].count;
 
-			CTR_WRITE(counter_config[i].count, msrs, i);
+			CTR_32BIT_WRITE(counter_config[i].count, msrs, i);
 
 			CTRL_READ(low, high, msrs, i);
 			CTRL_CLEAR(low);
@@ -116,7 +117,7 @@ static int ppro_check_ctrs(struct pt_regs * const regs,
 		CTR_READ(low, high, msrs, i);
 		if (CTR_OVERFLOWED(low)) {
 			oprofile_add_sample(regs, i);
-			CTR_WRITE(reset_value[i], msrs, i);
+			CTR_32BIT_WRITE(reset_value[i], msrs, i);
 		}
 	}
 
diff --git a/arch/i386/pci/Makefile b/arch/i386/pci/Makefile
index 1594d2f55c8f..44650e03308b 100644
--- a/arch/i386/pci/Makefile
+++ b/arch/i386/pci/Makefile
@@ -1,7 +1,7 @@
 obj-y				:= i386.o init.o
 
 obj-$(CONFIG_PCI_BIOS)		+= pcbios.o
-obj-$(CONFIG_PCI_MMCONFIG)	+= mmconfig.o direct.o
+obj-$(CONFIG_PCI_MMCONFIG)	+= mmconfig.o direct.o mmconfig-shared.o
 obj-$(CONFIG_PCI_DIRECT)	+= direct.o
 
 pci-y				:= fixup.o
diff --git a/arch/i386/pci/common.c b/arch/i386/pci/common.c
index 53ca6e897984..1bb069372143 100644
--- a/arch/i386/pci/common.c
+++ b/arch/i386/pci/common.c
@@ -191,6 +191,94 @@ static struct dmi_system_id __devinitdata pciprobe_dmi_table[] = {
 			DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2950"),
 		},
 	},
+	{
+		.callback = set_bf_sort,
+		.ident = "HP ProLiant BL20p G3",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "HP"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL20p G3"),
+		},
+	},
+	{
+		.callback = set_bf_sort,
+		.ident = "HP ProLiant BL20p G4",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "HP"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL20p G4"),
+		},
+	},
+	{
+		.callback = set_bf_sort,
+		.ident = "HP ProLiant BL30p G1",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "HP"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL30p G1"),
+		},
+	},
+	{
+		.callback = set_bf_sort,
+		.ident = "HP ProLiant BL25p G1",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "HP"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL25p G1"),
+		},
+	},
+	{
+		.callback = set_bf_sort,
+		.ident = "HP ProLiant BL35p G1",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "HP"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL35p G1"),
+		},
+	},
+	{
+		.callback = set_bf_sort,
+		.ident = "HP ProLiant BL45p G1",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "HP"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL45p G1"),
+		},
+	},
+	{
+		.callback = set_bf_sort,
+		.ident = "HP ProLiant BL45p G2",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "HP"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL45p G2"),
+		},
+	},
+	{
+		.callback = set_bf_sort,
+		.ident = "HP ProLiant BL460c G1",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "HP"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL460c G1"),
+		},
+	},
+	{
+		.callback = set_bf_sort,
+		.ident = "HP ProLiant BL465c G1",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "HP"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL465c G1"),
+		},
+	},
+	{
+		.callback = set_bf_sort,
+		.ident = "HP ProLiant BL480c G1",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "HP"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL480c G1"),
+		},
+	},
+	{
+		.callback = set_bf_sort,
+		.ident = "HP ProLiant BL685c G1",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "HP"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL685c G1"),
+		},
+	},
 	{}
 };
 
diff --git a/arch/i386/pci/mmconfig-shared.c b/arch/i386/pci/mmconfig-shared.c
new file mode 100644
index 000000000000..747d8c63b0c4
--- /dev/null
+++ b/arch/i386/pci/mmconfig-shared.c
@@ -0,0 +1,264 @@
+/*
+ * mmconfig-shared.c - Low-level direct PCI config space access via
+ *                     MMCONFIG - common code between i386 and x86-64.
+ *
+ * This code does:
+ * - known chipset handling
+ * - ACPI decoding and validation
+ *
+ * Per-architecture code takes care of the mappings and accesses
+ * themselves.
+ */
+
+#include <linux/pci.h>
+#include <linux/init.h>
+#include <linux/acpi.h>
+#include <linux/bitmap.h>
+#include <asm/e820.h>
+
+#include "pci.h"
+
+/* aperture is up to 256MB but BIOS may reserve less */
+#define MMCONFIG_APER_MIN	(2 * 1024*1024)
+#define MMCONFIG_APER_MAX	(256 * 1024*1024)
+
+DECLARE_BITMAP(pci_mmcfg_fallback_slots, 32*PCI_MMCFG_MAX_CHECK_BUS);
+
+/* K8 systems have some devices (typically in the builtin northbridge)
+   that are only accessible using type1
+   Normally this can be expressed in the MCFG by not listing them
+   and assigning suitable _SEGs, but this isn't implemented in some BIOS.
+   Instead try to discover all devices on bus 0 that are unreachable using MM
+   and fallback for them. */
+static void __init unreachable_devices(void)
+{
+	int i, bus;
+	/* Use the max bus number from ACPI here? */
+	for (bus = 0; bus < PCI_MMCFG_MAX_CHECK_BUS; bus++) {
+		for (i = 0; i < 32; i++) {
+			unsigned int devfn = PCI_DEVFN(i, 0);
+			u32 val1, val2;
+
+			pci_conf1_read(0, bus, devfn, 0, 4, &val1);
+			if (val1 == 0xffffffff)
+				continue;
+
+			if (pci_mmcfg_arch_reachable(0, bus, devfn)) {
+				raw_pci_ops->read(0, bus, devfn, 0, 4, &val2);
+				if (val1 == val2)
+					continue;
+			}
+			set_bit(i + 32 * bus, pci_mmcfg_fallback_slots);
+			printk(KERN_NOTICE "PCI: No mmconfig possible on device"
+			       " %02x:%02x\n", bus, i);
+		}
+	}
+}
+
+static const char __init *pci_mmcfg_e7520(void)
+{
+	u32 win;
+	pci_conf1_read(0, 0, PCI_DEVFN(0,0), 0xce, 2, &win);
+
+	pci_mmcfg_config_num = 1;
+	pci_mmcfg_config = kzalloc(sizeof(pci_mmcfg_config[0]), GFP_KERNEL);
+	if (!pci_mmcfg_config)
+		return NULL;
+	pci_mmcfg_config[0].address = (win & 0xf000) << 16;
+	pci_mmcfg_config[0].pci_segment = 0;
+	pci_mmcfg_config[0].start_bus_number = 0;
+	pci_mmcfg_config[0].end_bus_number = 255;
+
+	return "Intel Corporation E7520 Memory Controller Hub";
+}
+
+static const char __init *pci_mmcfg_intel_945(void)
+{
+	u32 pciexbar, mask = 0, len = 0;
+
+	pci_mmcfg_config_num = 1;
+
+	pci_conf1_read(0, 0, PCI_DEVFN(0,0), 0x48, 4, &pciexbar);
+
+	/* Enable bit */
+	if (!(pciexbar & 1))
+		pci_mmcfg_config_num = 0;
+
+	/* Size bits */
+	switch ((pciexbar >> 1) & 3) {
+	case 0:
+		mask = 0xf0000000U;
+		len  = 0x10000000U;
+		break;
+	case 1:
+		mask = 0xf8000000U;
+		len  = 0x08000000U;
+		break;
+	case 2:
+		mask = 0xfc000000U;
+		len  = 0x04000000U;
+		break;
+	default:
+		pci_mmcfg_config_num = 0;
+	}
+
+	/* Errata #2, things break when not aligned on a 256Mb boundary */
+	/* Can only happen in 64M/128M mode */
+
+	if ((pciexbar & mask) & 0x0fffffffU)
+		pci_mmcfg_config_num = 0;
+
+	if (pci_mmcfg_config_num) {
+		pci_mmcfg_config = kzalloc(sizeof(pci_mmcfg_config[0]), GFP_KERNEL);
+		if (!pci_mmcfg_config)
+			return NULL;
+		pci_mmcfg_config[0].address = pciexbar & mask;
+		pci_mmcfg_config[0].pci_segment = 0;
+		pci_mmcfg_config[0].start_bus_number = 0;
+		pci_mmcfg_config[0].end_bus_number = (len >> 20) - 1;
+	}
+
+	return "Intel Corporation 945G/GZ/P/PL Express Memory Controller Hub";
+}
+
+struct pci_mmcfg_hostbridge_probe {
+	u32 vendor;
+	u32 device;
+	const char *(*probe)(void);
+};
+
+static struct pci_mmcfg_hostbridge_probe pci_mmcfg_probes[] __initdata = {
+	{ PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, pci_mmcfg_e7520 },
+	{ PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82945G_HB, pci_mmcfg_intel_945 },
+};
+
+static int __init pci_mmcfg_check_hostbridge(void)
+{
+	u32 l;
+	u16 vendor, device;
+	int i;
+	const char *name;
+
+	pci_conf1_read(0, 0, PCI_DEVFN(0,0), 0, 4, &l);
+	vendor = l & 0xffff;
+	device = (l >> 16) & 0xffff;
+
+	pci_mmcfg_config_num = 0;
+	pci_mmcfg_config = NULL;
+	name = NULL;
+
+	for (i = 0; !name && i < ARRAY_SIZE(pci_mmcfg_probes); i++) {
+		if (pci_mmcfg_probes[i].vendor == vendor &&
+		    pci_mmcfg_probes[i].device == device)
+			name = pci_mmcfg_probes[i].probe();
+	}
+
+	if (name) {
+		printk(KERN_INFO "PCI: Found %s %s MMCONFIG support.\n",
+		       name, pci_mmcfg_config_num ? "with" : "without");
+	}
+
+	return name != NULL;
+}
+
+static void __init pci_mmcfg_insert_resources(void)
+{
+#define PCI_MMCFG_RESOURCE_NAME_LEN 19
+	int i;
+	struct resource *res;
+	char *names;
+	unsigned num_buses;
+
+	res = kcalloc(PCI_MMCFG_RESOURCE_NAME_LEN + sizeof(*res),
+			pci_mmcfg_config_num, GFP_KERNEL);
+	if (!res) {
+		printk(KERN_ERR "PCI: Unable to allocate MMCONFIG resources\n");
+		return;
+	}
+
+	names = (void *)&res[pci_mmcfg_config_num];
+	for (i = 0; i < pci_mmcfg_config_num; i++, res++) {
+		struct acpi_mcfg_allocation *cfg = &pci_mmcfg_config[i];
+		num_buses = cfg->end_bus_number - cfg->start_bus_number + 1;
+		res->name = names;
+		snprintf(names, PCI_MMCFG_RESOURCE_NAME_LEN, "PCI MMCONFIG %u",
+			 cfg->pci_segment);
+		res->start = cfg->address;
+		res->end = res->start + (num_buses << 20) - 1;
+		res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+		insert_resource(&iomem_resource, res);
+		names += PCI_MMCFG_RESOURCE_NAME_LEN;
+	}
+}
+
+static void __init pci_mmcfg_reject_broken(int type)
+{
+	typeof(pci_mmcfg_config[0]) *cfg;
+
+	if ((pci_mmcfg_config_num == 0) ||
+	    (pci_mmcfg_config == NULL) ||
+	    (pci_mmcfg_config[0].address == 0))
+		return;
+
+	cfg = &pci_mmcfg_config[0];
+
+	/*
+	 * Handle more broken MCFG tables on Asus etc.
+	 * They only contain a single entry for bus 0-0.
+	 */
+	if (pci_mmcfg_config_num == 1 &&
+	    cfg->pci_segment == 0 &&
+	    (cfg->start_bus_number | cfg->end_bus_number) == 0) {
+		printk(KERN_ERR "PCI: start and end of bus number is 0. "
+		       "Rejected as broken MCFG.\n");
+		goto reject;
+	}
+
+	/*
+	 * Only do this check when type 1 works. If it doesn't work
+	 * assume we run on a Mac and always use MCFG
+	 */
+	if (type == 1 && !e820_all_mapped(cfg->address,
+					  cfg->address + MMCONFIG_APER_MIN,
+					  E820_RESERVED)) {
+		printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %Lx is not"
+		       " E820-reserved\n", cfg->address);
+		goto reject;
+	}
+	return;
+
+reject:
+	printk(KERN_ERR "PCI: Not using MMCONFIG.\n");
+	kfree(pci_mmcfg_config);
+	pci_mmcfg_config = NULL;
+	pci_mmcfg_config_num = 0;
+}
+
+void __init pci_mmcfg_init(int type)
+{
+	int known_bridge = 0;
+
+	if ((pci_probe & PCI_PROBE_MMCONF) == 0)
+		return;
+
+	if (type == 1 && pci_mmcfg_check_hostbridge())
+		known_bridge = 1;
+
+	if (!known_bridge) {
+		acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg);
+		pci_mmcfg_reject_broken(type);
+	}
+
+	if ((pci_mmcfg_config_num == 0) ||
+	    (pci_mmcfg_config == NULL) ||
+	    (pci_mmcfg_config[0].address == 0))
+		return;
+
+	if (pci_mmcfg_arch_init()) {
+		if (type == 1)
+			unreachable_devices();
+		if (known_bridge)
+			pci_mmcfg_insert_resources();
+		pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;
+	}
+}
diff --git a/arch/i386/pci/mmconfig.c b/arch/i386/pci/mmconfig.c
index e2616a266e13..bb1afd9e589d 100644
--- a/arch/i386/pci/mmconfig.c
+++ b/arch/i386/pci/mmconfig.c
@@ -15,55 +15,33 @@
 #include <asm/e820.h>
 #include "pci.h"
 
-/* aperture is up to 256MB but BIOS may reserve less */
-#define MMCONFIG_APER_MIN	(2 * 1024*1024)
-#define MMCONFIG_APER_MAX	(256 * 1024*1024)
-
 /* Assume systems with more busses have correct MCFG */
-#define MAX_CHECK_BUS 16
-
 #define mmcfg_virt_addr ((void __iomem *) fix_to_virt(FIX_PCIE_MCFG))
 
 /* The base address of the last MMCONFIG device accessed */
 static u32 mmcfg_last_accessed_device;
 static int mmcfg_last_accessed_cpu;
 
-static DECLARE_BITMAP(fallback_slots, MAX_CHECK_BUS*32);
-
 /*
  * Functions for accessing PCI configuration space with MMCONFIG accesses
  */
 static u32 get_base_addr(unsigned int seg, int bus, unsigned devfn)
 {
-	int cfg_num = -1;
-	struct acpi_table_mcfg_config *cfg;
+	struct acpi_mcfg_allocation *cfg;
+	int cfg_num;
 
-	if (seg == 0 && bus < MAX_CHECK_BUS &&
-	    test_bit(PCI_SLOT(devfn) + 32*bus, fallback_slots))
+	if (seg == 0 && bus < PCI_MMCFG_MAX_CHECK_BUS &&
+	    test_bit(PCI_SLOT(devfn) + 32*bus, pci_mmcfg_fallback_slots))
 		return 0;
 
-	while (1) {
-		++cfg_num;
-		if (cfg_num >= pci_mmcfg_config_num) {
-			break;
-		}
+	for (cfg_num = 0; cfg_num < pci_mmcfg_config_num; cfg_num++) {
 		cfg = &pci_mmcfg_config[cfg_num];
-		if (cfg->pci_segment_group_number != seg)
-			continue;
-		if ((cfg->start_bus_number <= bus) &&
+		if (cfg->pci_segment == seg &&
+		    (cfg->start_bus_number <= bus) &&
 		    (cfg->end_bus_number >= bus))
-			return cfg->base_address;
+			return cfg->address;
 	}
 
-	/* Handle more broken MCFG tables on Asus etc.
-	   They only contain a single entry for bus 0-0. Assume
- 	   this applies to all busses. */
-	cfg = &pci_mmcfg_config[0];
-	if (pci_mmcfg_config_num == 1 &&
-		cfg->pci_segment_group_number == 0 &&
-		(cfg->start_bus_number | cfg->end_bus_number) == 0)
-		return cfg->base_address;
-
 	/* Fall back to type 0 */
 	return 0;
 }
@@ -125,7 +103,7 @@ static int pci_mmcfg_write(unsigned int seg, unsigned int bus,
 	unsigned long flags;
 	u32 base;
 
-	if ((bus > 255) || (devfn > 255) || (reg > 4095)) 
+	if ((bus > 255) || (devfn > 255) || (reg > 4095))
 		return -EINVAL;
 
 	base = get_base_addr(seg, bus, devfn);
@@ -158,67 +136,15 @@ static struct pci_raw_ops pci_mmcfg = {
 	.write =	pci_mmcfg_write,
 };
 
-/* K8 systems have some devices (typically in the builtin northbridge)
-   that are only accessible using type1
-   Normally this can be expressed in the MCFG by not listing them
-   and assigning suitable _SEGs, but this isn't implemented in some BIOS.
-   Instead try to discover all devices on bus 0 that are unreachable using MM
-   and fallback for them. */
-static __init void unreachable_devices(void)
+int __init pci_mmcfg_arch_reachable(unsigned int seg, unsigned int bus,
+				    unsigned int devfn)
 {
-	int i, k;
-	unsigned long flags;
-
-	for (k = 0; k < MAX_CHECK_BUS; k++) {
-		for (i = 0; i < 32; i++) {
-			u32 val1;
-			u32 addr;
-
-			pci_conf1_read(0, k, PCI_DEVFN(i, 0), 0, 4, &val1);
-			if (val1 == 0xffffffff)
-				continue;
-
-			/* Locking probably not needed, but safer */
-			spin_lock_irqsave(&pci_config_lock, flags);
-			addr = get_base_addr(0, k, PCI_DEVFN(i, 0));
-			if (addr != 0)
-				pci_exp_set_dev_base(addr, k, PCI_DEVFN(i, 0));
-			if (addr == 0 ||
-			    readl((u32 __iomem *)mmcfg_virt_addr) != val1) {
-				set_bit(i + 32*k, fallback_slots);
-				printk(KERN_NOTICE
-			"PCI: No mmconfig possible on %x:%x\n", k, i);
-			}
-			spin_unlock_irqrestore(&pci_config_lock, flags);
-		}
-	}
+	return get_base_addr(seg, bus, devfn) != 0;
 }
 
-void __init pci_mmcfg_init(int type)
+int __init pci_mmcfg_arch_init(void)
 {
-	if ((pci_probe & PCI_PROBE_MMCONF) == 0)
-		return;
-
-	acpi_table_parse(ACPI_MCFG, acpi_parse_mcfg);
-	if ((pci_mmcfg_config_num == 0) ||
-	    (pci_mmcfg_config == NULL) ||
-	    (pci_mmcfg_config[0].base_address == 0))
-		return;
-
-	/* Only do this check when type 1 works. If it doesn't work
-	   assume we run on a Mac and always use MCFG */
-	if (type == 1 && !e820_all_mapped(pci_mmcfg_config[0].base_address,
-			pci_mmcfg_config[0].base_address + MMCONFIG_APER_MIN,
-			E820_RESERVED)) {
-		printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %x is not E820-reserved\n",
-				pci_mmcfg_config[0].base_address);
-		printk(KERN_ERR "PCI: Not using MMCONFIG.\n");
-		return;
-	}
-
 	printk(KERN_INFO "PCI: Using MMCONFIG\n");
 	raw_pci_ops = &pci_mmcfg;
-	pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;
-
-	unreachable_devices();
+	return 1;
 }
diff --git a/arch/i386/pci/pci.h b/arch/i386/pci/pci.h
index a0a25180b61a..e58bae2076ad 100644
--- a/arch/i386/pci/pci.h
+++ b/arch/i386/pci/pci.h
@@ -94,3 +94,13 @@ extern void pci_pcbios_init(void);
 extern void pci_mmcfg_init(int type);
 extern void pcibios_sort(void);
 
+/* pci-mmconfig.c */
+
+/* Verify the first 16 busses. We assume that systems with more busses
+   get MCFG right. */
+#define PCI_MMCFG_MAX_CHECK_BUS 16
+extern DECLARE_BITMAP(pci_mmcfg_fallback_slots, 32*PCI_MMCFG_MAX_CHECK_BUS);
+
+extern int __init pci_mmcfg_arch_reachable(unsigned int seg, unsigned int bus,
+					   unsigned int devfn);
+extern int __init pci_mmcfg_arch_init(void);