From 3dc68d9b58ae644cee8e218e3dcde0dceb5c47a3 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Wed, 2 May 2007 19:27:04 +0200
Subject: [PATCH] x86-64: revert x86_64-mm-add-genapic_force

This is obsoleted by new Ingo genapic patches.

Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Andi Kleen <ak@suse.de>
Cc: "Li, Shaohua" <shaohua.li@intel.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 include/asm-x86_64/genapic.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/asm-x86_64')

diff --git a/include/asm-x86_64/genapic.h b/include/asm-x86_64/genapic.h
index b80f4bb5f273..a0e9a4b93484 100644
--- a/include/asm-x86_64/genapic.h
+++ b/include/asm-x86_64/genapic.h
@@ -30,6 +30,6 @@ struct genapic {
 };
 
 
-extern struct genapic *genapic, *genapic_force, apic_flat;
+extern struct genapic *genapic;
 
 #endif
-- 
cgit v1.2.3


From a86f34b49f32b238d16b2e3bf6c9a5391a3f683f Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Wed, 2 May 2007 19:27:04 +0200
Subject: [PATCH] x86: revert
 x86_64-mm-fix-the-irqbalance-quirk-for-e7320-e7520-e7525

Obsoleted by Ingo's genapic stuff.

Cc: Ingo Molnar <mingo@elte.hu>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Andi Kleen <ak@suse.de>
Cc: "Li, Shaohua" <shaohua.li@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/i386/kernel/acpi/earlyquirk.c | 21 -----------------
 arch/i386/kernel/quirks.c          | 46 +++++++++-----------------------------
 arch/i386/kernel/smpboot.c         |  6 -----
 arch/x86_64/kernel/early-quirks.c  | 13 -----------
 arch/x86_64/kernel/smpboot.c       |  8 -------
 include/asm-i386/genapic.h         |  2 +-
 include/asm-i386/irq.h             |  2 --
 include/asm-x86_64/proto.h         |  1 -
 8 files changed, 12 insertions(+), 87 deletions(-)

(limited to 'include/asm-x86_64')

diff --git a/arch/i386/kernel/acpi/earlyquirk.c b/arch/i386/kernel/acpi/earlyquirk.c
index 8f7efd38254d..23f78efc577d 100644
--- a/arch/i386/kernel/acpi/earlyquirk.c
+++ b/arch/i386/kernel/acpi/earlyquirk.c
@@ -10,7 +10,6 @@
 #include <asm/pci-direct.h>
 #include <asm/acpi.h>
 #include <asm/apic.h>
-#include <asm/irq.h>
 
 #ifdef CONFIG_ACPI
 
@@ -48,24 +47,6 @@ static int __init check_bridge(int vendor, int device)
 	return 0;
 }
 
-static void check_intel(void)
-{
-	u16 vendor, device;
-
-	vendor = read_pci_config_16(0, 0, 0, PCI_VENDOR_ID);
-
-	if (vendor != PCI_VENDOR_ID_INTEL)
-		return;
-
-	device = read_pci_config_16(0, 0, 0, PCI_DEVICE_ID);
-#ifdef CONFIG_SMP
-	if (device == PCI_DEVICE_ID_INTEL_E7320_MCH ||
-	    device == PCI_DEVICE_ID_INTEL_E7520_MCH ||
-	    device == PCI_DEVICE_ID_INTEL_E7525_MCH)
-		quirk_intel_irqbalance();
-#endif
-}
-
 void __init check_acpi_pci(void)
 {
 	int num, slot, func;
@@ -77,8 +58,6 @@ void __init check_acpi_pci(void)
 	if (!early_pci_allowed())
 		return;
 
-	check_intel();
-
 	/* Poor man's PCI discovery */
 	for (num = 0; num < 32; num++) {
 		for (slot = 0; slot < 32; slot++) {
diff --git a/arch/i386/kernel/quirks.c b/arch/i386/kernel/quirks.c
index a01320a7b636..9f6ab1789bb0 100644
--- a/arch/i386/kernel/quirks.c
+++ b/arch/i386/kernel/quirks.c
@@ -3,23 +3,10 @@
  */
 #include <linux/pci.h>
 #include <linux/irq.h>
-#include <asm/pci-direct.h>
-#include <asm/genapic.h>
-#include <asm/cpu.h>
 
 #if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI)
-static void __devinit verify_quirk_intel_irqbalance(struct pci_dev *dev)
-{
-#ifdef CONFIG_X86_64
-	if (genapic !=  &apic_flat)
-		panic("APIC mode must be flat on this system\n");
-#elif defined(CONFIG_X86_GENERICARCH)
-	if (genapic != &apic_default)
-		panic("APIC mode must be default(flat) on this system. Use apic=default\n");
-#endif
-}
 
-void __init quirk_intel_irqbalance(void)
+static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
 {
 	u8 config, rev;
 	u32 word;
@@ -29,18 +16,18 @@ void __init quirk_intel_irqbalance(void)
 	 * based platforms.
 	 * Disable SW irqbalance/affinity on those platforms.
 	 */
-	rev = read_pci_config_byte(0, 0, 0, PCI_CLASS_REVISION);
+	pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
 	if (rev > 0x9)
 		return;
 
 	printk(KERN_INFO "Intel E7520/7320/7525 detected.");
 
-	/* enable access to config space */
-	config = read_pci_config_byte(0, 0, 0, 0xf4);
-	write_pci_config_byte(0, 0, 0, 0xf4, config|0x2);
+	/* enable access to config space*/
+	pci_read_config_byte(dev, 0xf4, &config);
+	pci_write_config_byte(dev, 0xf4, config|0x2);
 
 	/* read xTPR register */
-	word = read_pci_config_16(0, 0, 0x40, 0x4c);
+	raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
 
 	if (!(word & (1 << 13))) {
 		printk(KERN_INFO "Disabling irq balancing and affinity\n");
@@ -50,25 +37,14 @@ void __init quirk_intel_irqbalance(void)
 		noirqdebug_setup("");
 #ifdef CONFIG_PROC_FS
 		no_irq_affinity = 1;
-#endif
-#ifdef CONFIG_HOTPLUG_CPU
-		printk(KERN_INFO "Disabling cpu hotplug control\n");
-		enable_cpu_hotplug = 0;
-#endif
-#ifdef CONFIG_X86_64
-		/* force the genapic selection to flat mode so that
-		 * interrupts can be redirected to more than one CPU.
-		 */
-		genapic_force = &apic_flat;
 #endif
 	}
 
-	/* put back the original value for config space */
+	/* put back the original value for config space*/
 	if (!(config & 0x2))
-		write_pci_config_byte(0, 0, 0, 0xf4, config);
+		pci_write_config_byte(dev, 0xf4, config);
 }
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_E7320_MCH,  verify_quirk_intel_irqbalance);
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_E7525_MCH,  verify_quirk_intel_irqbalance);
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_E7520_MCH,  verify_quirk_intel_irqbalance);
-
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_E7320_MCH,	quirk_intel_irqbalance);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_E7525_MCH,	quirk_intel_irqbalance);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_E7520_MCH,	quirk_intel_irqbalance);
 #endif
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index 4ff55e675576..7b14e88b555f 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -54,7 +54,6 @@
 #include <asm/arch_hooks.h>
 #include <asm/nmi.h>
 #include <asm/pda.h>
-#include <asm/genapic.h>
 
 #include <mach_apic.h>
 #include <mach_wakecpu.h>
@@ -1319,11 +1318,6 @@ int __cpuinit __cpu_up(unsigned int cpu)
 		touch_nmi_watchdog();
 	}
 
-#ifdef CONFIG_X86_GENERICARCH
-	if (num_online_cpus() > 8 && genapic == &apic_default)
-		panic("Default flat APIC routing can't be used with > 8 cpus\n");
-#endif
-
 	return 0;
 }
 
diff --git a/arch/x86_64/kernel/early-quirks.c b/arch/x86_64/kernel/early-quirks.c
index fede55a53995..990d9c218a5d 100644
--- a/arch/x86_64/kernel/early-quirks.c
+++ b/arch/x86_64/kernel/early-quirks.c
@@ -71,18 +71,6 @@ static void __init ati_bugs(void)
 	}
 }
 
-static void intel_bugs(void)
-{
-	u16 device = read_pci_config_16(0, 0, 0, PCI_DEVICE_ID);
-
-#ifdef CONFIG_SMP
-	if (device == PCI_DEVICE_ID_INTEL_E7320_MCH ||
-	    device == PCI_DEVICE_ID_INTEL_E7520_MCH ||
-	    device == PCI_DEVICE_ID_INTEL_E7525_MCH)
-		quirk_intel_irqbalance();
-#endif
-}
-
 struct chipset {
 	u16 vendor;
 	void (*f)(void);
@@ -92,7 +80,6 @@ static struct chipset early_qrk[] __initdata = {
 	{ PCI_VENDOR_ID_NVIDIA, nvidia_bugs },
 	{ PCI_VENDOR_ID_VIA, via_bugs },
 	{ PCI_VENDOR_ID_ATI, ati_bugs },
-	{ PCI_VENDOR_ID_INTEL, intel_bugs},
 	{}
 };
 
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index cd4643a37022..14724be48bec 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -60,7 +60,6 @@
 #include <asm/irq.h>
 #include <asm/hw_irq.h>
 #include <asm/numa.h>
-#include <asm/genapic.h>
 
 /* Number of siblings per CPU package */
 int smp_num_siblings = 1;
@@ -965,13 +964,6 @@ int __cpuinit __cpu_up(unsigned int cpu)
 
 	while (!cpu_isset(cpu, cpu_online_map))
 		cpu_relax();
-
-	if (num_online_cpus() > 8 && genapic == &apic_flat) {
-		printk(KERN_WARNING
-		       "flat APIC routing can't be used with > 8 cpus\n");
-		BUG();
-	}
-
 	err = 0;
 
 	return err;
diff --git a/include/asm-i386/genapic.h b/include/asm-i386/genapic.h
index fd2be593b06e..8ffbb0f07457 100644
--- a/include/asm-i386/genapic.h
+++ b/include/asm-i386/genapic.h
@@ -122,6 +122,6 @@ struct genapic {
 	APICFUNC(phys_pkg_id) \
 	}
 
-extern struct genapic *genapic, apic_default;
+extern struct genapic *genapic;
 
 #endif
diff --git a/include/asm-i386/irq.h b/include/asm-i386/irq.h
index 11761cdaae19..9e15ce0006eb 100644
--- a/include/asm-i386/irq.h
+++ b/include/asm-i386/irq.h
@@ -37,8 +37,6 @@ static __inline__ int irq_canonicalize(int irq)
 extern int irqbalance_disable(char *str);
 #endif
 
-extern void quirk_intel_irqbalance(void);
-
 #ifdef CONFIG_HOTPLUG_CPU
 extern void fixup_irqs(cpumask_t map);
 #endif
diff --git a/include/asm-x86_64/proto.h b/include/asm-x86_64/proto.h
index b6e65a699f2a..6688cf9eb27b 100644
--- a/include/asm-x86_64/proto.h
+++ b/include/asm-x86_64/proto.h
@@ -82,7 +82,6 @@ extern void syscall32_cpu_init(void);
 extern void setup_node_bootmem(int nodeid, unsigned long start, unsigned long end);
 
 extern void early_quirks(void);
-extern void quirk_intel_irqbalance(void);
 extern void check_efer(void);
 
 extern int unhandled_signal(struct task_struct *tsk, int sig);
-- 
cgit v1.2.3


From 07c7c4744400f93a7c52b32159c31d823e1747a5 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 2 May 2007 19:27:04 +0200
Subject: [PATCH] x86-64: always use physical delivery mode on > 8 CPUs

Remove clustered APIC mode.  There's little point in the use of clustered APIC
mode, broadcasting is limited to within the cluster only, and chipsets have
bugs in this area as well.  So default to physical APIC mode when the CPU
count is large, and default to logical APIC mode when the CPU count is 8 or
smaller.

(this patch only removes the use of genapic_cluster and cleans up the
resulting genapic.c file - removal of all remaining traces of clustered
mode will be done by another patch.)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Andi Kleen <ak@suse.de>
Cc: "Li, Shaohua" <shaohua.li@intel.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---
 arch/x86_64/kernel/genapic.c | 71 ++++++++++----------------------------------
 include/asm-x86_64/genapic.h |  4 ++-
 2 files changed, 18 insertions(+), 57 deletions(-)

(limited to 'include/asm-x86_64')

diff --git a/arch/x86_64/kernel/genapic.c b/arch/x86_64/kernel/genapic.c
index 2f2b8fc6e2f3..025f26ebb8d7 100644
--- a/arch/x86_64/kernel/genapic.c
+++ b/arch/x86_64/kernel/genapic.c
@@ -11,26 +11,24 @@
 #include <linux/threads.h>
 #include <linux/cpumask.h>
 #include <linux/string.h>
+#include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/ctype.h>
 #include <linux/init.h>
-#include <linux/module.h>
 
 #include <asm/smp.h>
 #include <asm/ipi.h>
 
-#if defined(CONFIG_ACPI)
+#ifdef CONFIG_ACPI
 #include <acpi/acpi_bus.h>
 #endif
 
 /* which logical CPU number maps to which CPU (physical APIC ID) */
-u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
+u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly
+					= { [0 ... NR_CPUS-1] = BAD_APICID };
 EXPORT_SYMBOL(x86_cpu_to_apicid);
-u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
 
-extern struct genapic apic_cluster;
-extern struct genapic apic_flat;
-extern struct genapic apic_physflat;
+u8 x86_cpu_to_log_apicid[NR_CPUS]	= { [0 ... NR_CPUS-1] = BAD_APICID };
 
 struct genapic __read_mostly *genapic = &apic_flat;
 
@@ -39,76 +37,37 @@ struct genapic __read_mostly *genapic = &apic_flat;
  */
 void __init clustered_apic_check(void)
 {
-	int i;
-	u8 clusters, max_cluster;
+	unsigned int i, max_apic = 0;
 	u8 id;
-	u8 cluster_cnt[NUM_APIC_CLUSTERS];
-	int max_apic = 0;
 
 #ifdef CONFIG_ACPI
 	/*
-	 * Some x86_64 machines use physical APIC mode regardless of how many
-	 * procs/clusters are present (x86_64 ES7000 is an example).
+	 * Quirk: some x86_64 machines can only use physical APIC mode
+	 * regardless of how many processors are present (x86_64 ES7000
+	 * is an example).
 	 */
-	if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID)
-		if (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL) {
-			genapic = &apic_cluster;
-			goto print;
-		}
+	if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID &&
+			(acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL))
+		genapic = &apic_physflat;
 #endif
 
-	memset(cluster_cnt, 0, sizeof(cluster_cnt));
 	for (i = 0; i < NR_CPUS; i++) {
 		id = bios_cpu_apicid[i];
 		if (id == BAD_APICID)
 			continue;
 		if (id > max_apic)
 			max_apic = id;
-		cluster_cnt[APIC_CLUSTERID(id)]++;
 	}
 
-	/*
-	 * Don't use clustered mode on AMD platforms, default
-	 * to flat logical mode.
-	 */
- 	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
-		/*
-		 * Switch to physical flat mode if more than 8 APICs
-		 * (In the case of 8 CPUs APIC ID goes from 0 to 7):
-		 */
-		if (max_apic >= 8)
-			genapic = &apic_physflat;
- 		goto print;
- 	}
-
-	clusters = 0;
-	max_cluster = 0;
-
-	for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
-		if (cluster_cnt[i] > 0) {
-			++clusters;
-			if (cluster_cnt[i] > max_cluster)
-				max_cluster = cluster_cnt[i];
-		}
-	}
-
-	/*
-	 * If we have clusters <= 1 and CPUs <= 8 in cluster 0, then flat mode,
-	 * else if max_cluster <= 4 and cluster_cnt[15] == 0, clustered logical
-	 * else physical mode.
-	 * (We don't use lowest priority delivery + HW APIC IRQ steering, so
-	 * can ignore the clustered logical case and go straight to physical.)
-	 */
-	if (clusters <= 1 && max_cluster <= 8 && cluster_cnt[0] == max_cluster)
+	if (max_apic < 8)
 		genapic = &apic_flat;
 	else
-		genapic = &apic_cluster;
+		genapic = &apic_physflat;
 
-print:
 	printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
 }
 
-/* Same for both flat and clustered. */
+/* Same for both flat and physical. */
 
 void send_IPI_self(int vector)
 {
diff --git a/include/asm-x86_64/genapic.h b/include/asm-x86_64/genapic.h
index a0e9a4b93484..d7e516ccbaa4 100644
--- a/include/asm-x86_64/genapic.h
+++ b/include/asm-x86_64/genapic.h
@@ -29,7 +29,9 @@ struct genapic {
 	unsigned int (*phys_pkg_id)(int index_msb);
 };
 
-
 extern struct genapic *genapic;
 
+extern struct genapic apic_flat;
+extern struct genapic apic_physflat;
+
 #endif
-- 
cgit v1.2.3


From 3c43f03908de98fa8f7a9e8fc9411ebf4c2de298 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 2 May 2007 19:27:04 +0200
Subject: [PATCH] x86: default to physical mode on hotplug CPU kernels

Default to physical mode on hotplug CPU kernels.  Furher simplify and clean up
the APIC initialization code.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Andi Kleen <ak@suse.de>
Cc: "Li, Shaohua" <shaohua.li@intel.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---
 arch/i386/kernel/acpi/boot.c              |  2 +-
 arch/i386/kernel/mpparse.c                |  2 +-
 arch/x86_64/kernel/genapic.c              | 16 +++-------------
 arch/x86_64/kernel/mpparse.c              |  2 +-
 include/asm-i386/genapic.h                |  4 ++--
 include/asm-i386/mach-bigsmp/mach_apic.h  |  2 +-
 include/asm-i386/mach-default/mach_apic.h |  2 +-
 include/asm-i386/mach-es7000/mach_apic.h  |  2 +-
 include/asm-i386/mach-generic/mach_apic.h |  2 +-
 include/asm-i386/mach-numaq/mach_apic.h   |  2 +-
 include/asm-i386/mach-summit/mach_apic.h  |  2 +-
 include/asm-i386/mach-visws/mach_apic.h   |  2 +-
 include/asm-x86_64/apic.h                 |  2 +-
 13 files changed, 16 insertions(+), 26 deletions(-)

(limited to 'include/asm-x86_64')

diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c
index 9ea5b8ecc7e1..280898b045b2 100644
--- a/arch/i386/kernel/acpi/boot.c
+++ b/arch/i386/kernel/acpi/boot.c
@@ -874,7 +874,7 @@ static void __init acpi_process_madt(void)
 				acpi_ioapic = 1;
 
 				smp_found_config = 1;
-				clustered_apic_check();
+				setup_apic_routing();
 			}
 		}
 		if (error == -EINVAL) {
diff --git a/arch/i386/kernel/mpparse.c b/arch/i386/kernel/mpparse.c
index 4f5983c98669..0952eccd8f28 100644
--- a/arch/i386/kernel/mpparse.c
+++ b/arch/i386/kernel/mpparse.c
@@ -477,7 +477,7 @@ static int __init smp_read_mpc(struct mp_config_table *mpc)
 		}
 		++mpc_record;
 	}
-	clustered_apic_check();
+	setup_apic_routing();
 	if (!num_processors)
 		printk(KERN_ERR "SMP mptable: no processors registered!\n");
 	return num_processors;
diff --git a/arch/x86_64/kernel/genapic.c b/arch/x86_64/kernel/genapic.c
index 025f26ebb8d7..c08650a427e2 100644
--- a/arch/x86_64/kernel/genapic.c
+++ b/arch/x86_64/kernel/genapic.c
@@ -35,11 +35,8 @@ struct genapic __read_mostly *genapic = &apic_flat;
 /*
  * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
  */
-void __init clustered_apic_check(void)
+void __init setup_apic_routing(void)
 {
-	unsigned int i, max_apic = 0;
-	u8 id;
-
 #ifdef CONFIG_ACPI
 	/*
 	 * Quirk: some x86_64 machines can only use physical APIC mode
@@ -49,17 +46,10 @@ void __init clustered_apic_check(void)
 	if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID &&
 			(acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL))
 		genapic = &apic_physflat;
+	else
 #endif
 
-	for (i = 0; i < NR_CPUS; i++) {
-		id = bios_cpu_apicid[i];
-		if (id == BAD_APICID)
-			continue;
-		if (id > max_apic)
-			max_apic = id;
-	}
-
-	if (max_apic < 8)
+	if (cpus_weight(cpu_possible_map) <= 8)
 		genapic = &apic_flat;
 	else
 		genapic = &apic_physflat;
diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c
index 455aa0b932f0..d0dc4891599b 100644
--- a/arch/x86_64/kernel/mpparse.c
+++ b/arch/x86_64/kernel/mpparse.c
@@ -300,7 +300,7 @@ static int __init smp_read_mpc(struct mp_config_table *mpc)
 			}
 		}
 	}
-	clustered_apic_check();
+	setup_apic_routing();
 	if (!num_processors)
 		printk(KERN_ERR "MPTABLE: no processors registered!\n");
 	return num_processors;
diff --git a/include/asm-i386/genapic.h b/include/asm-i386/genapic.h
index 8ffbb0f07457..33e3ffe1766c 100644
--- a/include/asm-i386/genapic.h
+++ b/include/asm-i386/genapic.h
@@ -36,7 +36,7 @@ struct genapic {
 	void (*init_apic_ldr)(void);
 	physid_mask_t (*ioapic_phys_id_map)(physid_mask_t map);
 
-	void (*clustered_apic_check)(void);
+	void (*setup_apic_routing)(void);
 	int (*multi_timer_check)(int apic, int irq);
 	int (*apicid_to_node)(int logical_apicid); 
 	int (*cpu_to_logical_apicid)(int cpu);
@@ -99,7 +99,7 @@ struct genapic {
 	APICFUNC(check_apicid_present) \
 	APICFUNC(init_apic_ldr) \
 	APICFUNC(ioapic_phys_id_map) \
-	APICFUNC(clustered_apic_check) \
+	APICFUNC(setup_apic_routing) \
 	APICFUNC(multi_timer_check) \
 	APICFUNC(apicid_to_node) \
 	APICFUNC(cpu_to_logical_apicid) \
diff --git a/include/asm-i386/mach-bigsmp/mach_apic.h b/include/asm-i386/mach-bigsmp/mach_apic.h
index 18b19a773440..ebd319f838ab 100644
--- a/include/asm-i386/mach-bigsmp/mach_apic.h
+++ b/include/asm-i386/mach-bigsmp/mach_apic.h
@@ -71,7 +71,7 @@ static inline void init_apic_ldr(void)
 	apic_write_around(APIC_LDR, val);
 }
 
-static inline void clustered_apic_check(void)
+static inline void setup_apic_routing(void)
 {
 	printk("Enabling APIC mode:  %s.  Using %d I/O APICs\n",
 		"Physflat", nr_ioapics);
diff --git a/include/asm-i386/mach-default/mach_apic.h b/include/asm-i386/mach-default/mach_apic.h
index 3ef6292db780..6db1c3babe9a 100644
--- a/include/asm-i386/mach-default/mach_apic.h
+++ b/include/asm-i386/mach-default/mach_apic.h
@@ -54,7 +54,7 @@ static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_map)
 	return phys_map;
 }
 
-static inline void clustered_apic_check(void)
+static inline void setup_apic_routing(void)
 {
 	printk("Enabling APIC mode:  %s.  Using %d I/O APICs\n",
 					"Flat", nr_ioapics);
diff --git a/include/asm-i386/mach-es7000/mach_apic.h b/include/asm-i386/mach-es7000/mach_apic.h
index 26333685a7fb..8e8b3949173a 100644
--- a/include/asm-i386/mach-es7000/mach_apic.h
+++ b/include/asm-i386/mach-es7000/mach_apic.h
@@ -81,7 +81,7 @@ static inline void enable_apic_mode(void)
 }
 
 extern int apic_version [MAX_APICS];
-static inline void clustered_apic_check(void)
+static inline void setup_apic_routing(void)
 {
 	int apic = bios_cpu_apicid[smp_processor_id()];
 	printk("Enabling APIC mode:  %s.  Using %d I/O APICs, target cpus %lx\n",
diff --git a/include/asm-i386/mach-generic/mach_apic.h b/include/asm-i386/mach-generic/mach_apic.h
index d9dc039da94a..a236e7021528 100644
--- a/include/asm-i386/mach-generic/mach_apic.h
+++ b/include/asm-i386/mach-generic/mach_apic.h
@@ -13,7 +13,7 @@
 #define apic_id_registered (genapic->apic_id_registered)
 #define init_apic_ldr (genapic->init_apic_ldr)
 #define ioapic_phys_id_map (genapic->ioapic_phys_id_map)
-#define clustered_apic_check (genapic->clustered_apic_check) 
+#define setup_apic_routing (genapic->setup_apic_routing)
 #define multi_timer_check (genapic->multi_timer_check)
 #define apicid_to_node (genapic->apicid_to_node)
 #define cpu_to_logical_apicid (genapic->cpu_to_logical_apicid) 
diff --git a/include/asm-i386/mach-numaq/mach_apic.h b/include/asm-i386/mach-numaq/mach_apic.h
index 9d158095da82..5e5e7dd2692e 100644
--- a/include/asm-i386/mach-numaq/mach_apic.h
+++ b/include/asm-i386/mach-numaq/mach_apic.h
@@ -34,7 +34,7 @@ static inline void init_apic_ldr(void)
 	/* Already done in NUMA-Q firmware */
 }
 
-static inline void clustered_apic_check(void)
+static inline void setup_apic_routing(void)
 {
 	printk("Enabling APIC mode:  %s.  Using %d I/O APICs\n",
 		"NUMA-Q", nr_ioapics);
diff --git a/include/asm-i386/mach-summit/mach_apic.h b/include/asm-i386/mach-summit/mach_apic.h
index 43e5bd8f4a19..732f776aab8e 100644
--- a/include/asm-i386/mach-summit/mach_apic.h
+++ b/include/asm-i386/mach-summit/mach_apic.h
@@ -80,7 +80,7 @@ static inline int apic_id_registered(void)
 	return 1;
 }
 
-static inline void clustered_apic_check(void)
+static inline void setup_apic_routing(void)
 {
 	printk("Enabling APIC mode:  Summit.  Using %d I/O APICs\n",
 						nr_ioapics);
diff --git a/include/asm-i386/mach-visws/mach_apic.h b/include/asm-i386/mach-visws/mach_apic.h
index 18afe6b6fc4d..efac6f0d139f 100644
--- a/include/asm-i386/mach-visws/mach_apic.h
+++ b/include/asm-i386/mach-visws/mach_apic.h
@@ -47,7 +47,7 @@ static inline void summit_check(char *oem, char *productid)
 {
 }
 
-static inline void clustered_apic_check(void)
+static inline void setup_apic_routing(void)
 {
 }
 
diff --git a/include/asm-x86_64/apic.h b/include/asm-x86_64/apic.h
index 7cfb39cbd918..2f3b013595ab 100644
--- a/include/asm-x86_64/apic.h
+++ b/include/asm-x86_64/apic.h
@@ -83,7 +83,7 @@ extern void setup_secondary_APIC_clock (void);
 extern int APIC_init_uniprocessor (void);
 extern void disable_APIC_timer(void);
 extern void enable_APIC_timer(void);
-extern void clustered_apic_check(void);
+extern void setup_apic_routing(void);
 
 extern void setup_APIC_extened_lvt(unsigned char lvt_off, unsigned char vector,
 				   unsigned char msg_type, unsigned char mask);
-- 
cgit v1.2.3


From 00f1ea696702163b7411d2316264525996c66ed3 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Wed, 2 May 2007 19:27:04 +0200
Subject: [PATCH] x86: adjust inclusion of asm/fixmap.h

Move inclusion of asm/fixmap.h to where it is really used rather than
where it may have been used long ago (requires a few other adjustments
to includes due to previous implicit dependencies).

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/genapic.c      | 1 +
 arch/x86_64/kernel/genapic_flat.c | 1 +
 include/asm-i386/hpet.h           | 2 --
 include/asm-i386/kexec.h          | 5 -----
 include/asm-i386/pgalloc.h        | 1 -
 include/asm-i386/smp.h            | 7 ++-----
 include/asm-x86_64/ipi.h          | 4 +---
 include/asm-x86_64/pgalloc.h      | 1 -
 include/asm-x86_64/pgtable.h      | 1 -
 include/asm-x86_64/smp.h          | 3 +--
 10 files changed, 6 insertions(+), 20 deletions(-)

(limited to 'include/asm-x86_64')

diff --git a/arch/x86_64/kernel/genapic.c b/arch/x86_64/kernel/genapic.c
index c08650a427e2..47496a40e84f 100644
--- a/arch/x86_64/kernel/genapic.c
+++ b/arch/x86_64/kernel/genapic.c
@@ -18,6 +18,7 @@
 
 #include <asm/smp.h>
 #include <asm/ipi.h>
+#include <asm/genapic.h>
 
 #ifdef CONFIG_ACPI
 #include <acpi/acpi_bus.h>
diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c
index 7c01db8fa9d1..9e0a552f0e4a 100644
--- a/arch/x86_64/kernel/genapic_flat.c
+++ b/arch/x86_64/kernel/genapic_flat.c
@@ -16,6 +16,7 @@
 #include <linux/init.h>
 #include <asm/smp.h>
 #include <asm/ipi.h>
+#include <asm/genapic.h>
 
 static cpumask_t flat_target_cpus(void)
 {
diff --git a/include/asm-i386/hpet.h b/include/asm-i386/hpet.h
index fc03cf9de5c4..dddeedf504b7 100644
--- a/include/asm-i386/hpet.h
+++ b/include/asm-i386/hpet.h
@@ -28,8 +28,6 @@
 
 #include <linux/timex.h>
 
-#include <asm/fixmap.h>
-
 /*
  * Documentation on HPET can be found at:
  *      http://www.intel.com/ial/home/sp/pcmmspec.htm
diff --git a/include/asm-i386/kexec.h b/include/asm-i386/kexec.h
index 4dfc9f5ed031..c5b4ab95bdc2 100644
--- a/include/asm-i386/kexec.h
+++ b/include/asm-i386/kexec.h
@@ -21,7 +21,6 @@
 
 #ifndef __ASSEMBLY__
 
-#include <asm/fixmap.h>
 #include <asm/ptrace.h>
 #include <asm/string.h>
 
@@ -29,10 +28,6 @@
  * KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return.
  * I.e. Maximum page that is mapped directly into kernel memory,
  * and kmap is not required.
- *
- * Someone correct me if FIXADDR_START - PAGEOFFSET is not the correct
- * calculation for the amount of memory directly mappable into the
- * kernel memory space.
  */
 
 /* Maximum physical address we can use pages from */
diff --git a/include/asm-i386/pgalloc.h b/include/asm-i386/pgalloc.h
index c8dc2d0141a7..47430175b75f 100644
--- a/include/asm-i386/pgalloc.h
+++ b/include/asm-i386/pgalloc.h
@@ -1,7 +1,6 @@
 #ifndef _I386_PGALLOC_H
 #define _I386_PGALLOC_H
 
-#include <asm/fixmap.h>
 #include <linux/threads.h>
 #include <linux/mm.h>		/* for struct page */
 
diff --git a/include/asm-i386/smp.h b/include/asm-i386/smp.h
index 6bf0033a301c..9cab1531c613 100644
--- a/include/asm-i386/smp.h
+++ b/include/asm-i386/smp.h
@@ -11,16 +11,13 @@
 #include <asm/pda.h>
 #endif
 
-#ifdef CONFIG_X86_LOCAL_APIC
-#ifndef __ASSEMBLY__
-#include <asm/fixmap.h>
+#if defined(CONFIG_X86_LOCAL_APIC) && !defined(__ASSEMBLY__)
 #include <asm/bitops.h>
 #include <asm/mpspec.h>
+#include <asm/apic.h>
 #ifdef CONFIG_X86_IO_APIC
 #include <asm/io_apic.h>
 #endif
-#include <asm/apic.h>
-#endif
 #endif
 
 #define BAD_APICID 0xFFu
diff --git a/include/asm-x86_64/ipi.h b/include/asm-x86_64/ipi.h
index 2a5c162b7d92..ffa6f1517f1a 100644
--- a/include/asm-x86_64/ipi.h
+++ b/include/asm-x86_64/ipi.h
@@ -18,10 +18,8 @@
  * Subject to the GNU Public License, v.2
  */
 
-#include <asm/fixmap.h>
 #include <asm/hw_irq.h>
-#include <asm/apicdef.h>
-#include <asm/genapic.h>
+#include <asm/apic.h>
 
 /*
  * the following functions deal with sending IPIs between CPUs.
diff --git a/include/asm-x86_64/pgalloc.h b/include/asm-x86_64/pgalloc.h
index 4e28b6060a5e..31d497171969 100644
--- a/include/asm-x86_64/pgalloc.h
+++ b/include/asm-x86_64/pgalloc.h
@@ -1,7 +1,6 @@
 #ifndef _X86_64_PGALLOC_H
 #define _X86_64_PGALLOC_H
 
-#include <asm/fixmap.h>
 #include <asm/pda.h>
 #include <linux/threads.h>
 #include <linux/mm.h>
diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h
index 730bd6028416..5957361782fe 100644
--- a/include/asm-x86_64/pgtable.h
+++ b/include/asm-x86_64/pgtable.h
@@ -6,7 +6,6 @@
  * the x86-64 page table tree.
  */
 #include <asm/processor.h>
-#include <asm/fixmap.h>
 #include <asm/bitops.h>
 #include <linux/threads.h>
 #include <asm/pda.h>
diff --git a/include/asm-x86_64/smp.h b/include/asm-x86_64/smp.h
index de592a408c07..f4236d7789aa 100644
--- a/include/asm-x86_64/smp.h
+++ b/include/asm-x86_64/smp.h
@@ -10,10 +10,9 @@
 #include <linux/init.h>
 extern int disable_apic;
 
-#include <asm/fixmap.h>
 #include <asm/mpspec.h>
-#include <asm/io_apic.h>
 #include <asm/apic.h>
+#include <asm/io_apic.h>
 #include <asm/thread_info.h>
 
 #ifdef CONFIG_SMP
-- 
cgit v1.2.3


From b0354795c9c8fef2fadf8f867586c78efd9a1dc9 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Wed, 2 May 2007 19:27:04 +0200
Subject: [PATCH] x86-64: adjust inclusion of asm/vsyscall32.h

Avoid including asm/vsyscall32.h in virtually every source file.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/ia32/syscall32.c | 1 +
 include/asm-x86_64/fixmap.h  | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/asm-x86_64')

diff --git a/arch/x86_64/ia32/syscall32.c b/arch/x86_64/ia32/syscall32.c
index 568ff0df89e7..fc4419ff0355 100644
--- a/arch/x86_64/ia32/syscall32.c
+++ b/arch/x86_64/ia32/syscall32.c
@@ -13,6 +13,7 @@
 #include <asm/proto.h>
 #include <asm/tlbflush.h>
 #include <asm/ia32_unistd.h>
+#include <asm/vsyscall32.h>
 
 extern unsigned char syscall32_syscall[], syscall32_syscall_end[];
 extern unsigned char syscall32_sysenter[], syscall32_sysenter_end[];
diff --git a/include/asm-x86_64/fixmap.h b/include/asm-x86_64/fixmap.h
index 1b620db5b9e3..e90e1677531b 100644
--- a/include/asm-x86_64/fixmap.h
+++ b/include/asm-x86_64/fixmap.h
@@ -15,7 +15,6 @@
 #include <asm/apicdef.h>
 #include <asm/page.h>
 #include <asm/vsyscall.h>
-#include <asm/vsyscall32.h>
 
 /*
  * Here we define all the compile-time 'special' virtual
-- 
cgit v1.2.3


From 9964cf7d776600724ef5f1b33303ceadc588b8ba Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Wed, 2 May 2007 19:27:05 +0200
Subject: [PATCH] x86: consolidate smp_send_stop()

Synchronize i386's smp_send_stop() with x86-64's in only try-locking
the call lock to prevent deadlocks when called from panic().
In both version, disable interrupts before clearing the CPU off the
online map to eliminate races with IRQ handlers inspecting this map.
Also in both versions, save/restore interrupts rather than disabling/
enabling them.
On x86-64, eliminate one function used here by folding it into its
single caller, convert to static, and rename for consistency with i386
(lkcd may like this).

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/i386/kernel/smp.c   | 68 +++++++++++++++++++++++++++---------------------
 arch/x86_64/kernel/smp.c | 28 +++++++-------------
 include/asm-x86_64/smp.h |  1 -
 3 files changed, 48 insertions(+), 49 deletions(-)

(limited to 'include/asm-x86_64')

diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c
index 0e8977871b1f..0cd459baad68 100644
--- a/arch/i386/kernel/smp.c
+++ b/arch/i386/kernel/smp.c
@@ -515,35 +515,14 @@ void unlock_ipi_call_lock(void)
 
 static struct call_data_struct *call_data;
 
-/**
- * smp_call_function(): Run a function on all other CPUs.
- * @func: The function to run. This must be fast and non-blocking.
- * @info: An arbitrary pointer to pass to the function.
- * @nonatomic: currently unused.
- * @wait: If true, wait (atomically) until function has completed on other CPUs.
- *
- * Returns 0 on success, else a negative status code. Does not return until
- * remote CPUs are nearly ready to execute <<func>> or are or have executed.
- *
- * You must not call this function with disabled interrupts or from a
- * hardware interrupt handler or from a bottom half handler.
- */
-int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
-			int wait)
+static void __smp_call_function(void (*func) (void *info), void *info,
+				int nonatomic, int wait)
 {
 	struct call_data_struct data;
-	int cpus;
-
-	/* Holding any lock stops cpus from going down. */
-	spin_lock(&call_lock);
-	cpus = num_online_cpus() - 1;
-	if (!cpus) {
-		spin_unlock(&call_lock);
-		return 0;
-	}
+	int cpus = num_online_cpus() - 1;
 
-	/* Can deadlock when called with interrupts disabled */
-	WARN_ON(irqs_disabled());
+	if (!cpus)
+		return;
 
 	data.func = func;
 	data.info = info;
@@ -565,6 +544,30 @@ int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
 	if (wait)
 		while (atomic_read(&data.finished) != cpus)
 			cpu_relax();
+}
+
+/**
+ * smp_call_function(): Run a function on all other CPUs.
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @nonatomic: currently unused.
+ * @wait: If true, wait (atomically) until function has completed on other CPUs.
+ *
+ * Returns 0 on success, else a negative status code. Does not return until
+ * remote CPUs are nearly ready to execute <<func>> or are or have executed.
+ *
+ * You must not call this function with disabled interrupts or from a
+ * hardware interrupt handler or from a bottom half handler.
+ */
+int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
+			int wait)
+{
+	/* Can deadlock when called with interrupts disabled */
+	WARN_ON(irqs_disabled());
+
+	/* Holding any lock stops cpus from going down. */
+	spin_lock(&call_lock);
+	__smp_call_function(func, info, nonatomic, wait);
 	spin_unlock(&call_lock);
 
 	return 0;
@@ -573,11 +576,11 @@ EXPORT_SYMBOL(smp_call_function);
 
 static void stop_this_cpu (void * dummy)
 {
+	local_irq_disable();
 	/*
 	 * Remove this CPU:
 	 */
 	cpu_clear(smp_processor_id(), cpu_online_map);
-	local_irq_disable();
 	disable_local_APIC();
 	if (cpu_data[smp_processor_id()].hlt_works_ok)
 		for(;;) halt();
@@ -590,11 +593,16 @@ static void stop_this_cpu (void * dummy)
 
 void smp_send_stop(void)
 {
-	smp_call_function(stop_this_cpu, NULL, 1, 0);
+	/* Don't deadlock on the call lock in panic */
+	int nolock = !spin_trylock(&call_lock);
+	unsigned long flags;
 
-	local_irq_disable();
+	local_irq_save(flags);
+	__smp_call_function(stop_this_cpu, NULL, 0, 0);
+	if (!nolock)
+		spin_unlock(&call_lock);
 	disable_local_APIC();
-	local_irq_enable();
+	local_irq_restore(flags);
 }
 
 /*
diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c
index af1ec4d23cf8..bd1d123947ce 100644
--- a/arch/x86_64/kernel/smp.c
+++ b/arch/x86_64/kernel/smp.c
@@ -452,42 +452,34 @@ int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
 }
 EXPORT_SYMBOL(smp_call_function);
 
-void smp_stop_cpu(void)
+static void stop_this_cpu(void *dummy)
 {
-	unsigned long flags;
+	local_irq_disable();
 	/*
 	 * Remove this CPU:
 	 */
 	cpu_clear(smp_processor_id(), cpu_online_map);
-	local_irq_save(flags);
 	disable_local_APIC();
-	local_irq_restore(flags);
-}
-
-static void smp_really_stop_cpu(void *dummy)
-{
-	smp_stop_cpu(); 
 	for (;;) 
 		halt();
 } 
 
 void smp_send_stop(void)
 {
-	int nolock = 0;
+	int nolock;
+	unsigned long flags;
+
 	if (reboot_force)
 		return;
+
 	/* Don't deadlock on the call lock in panic */
-	if (!spin_trylock(&call_lock)) {
-		/* ignore locking because we have panicked anyways */
-		nolock = 1;
-	}
-	__smp_call_function(smp_really_stop_cpu, NULL, 0, 0);
+	nolock = !spin_trylock(&call_lock);
+	local_irq_save(flags);
+	__smp_call_function(stop_this_cpu, NULL, 0, 0);
 	if (!nolock)
 		spin_unlock(&call_lock);
-
-	local_irq_disable();
 	disable_local_APIC();
-	local_irq_enable();
+	local_irq_restore(flags);
 }
 
 /*
diff --git a/include/asm-x86_64/smp.h b/include/asm-x86_64/smp.h
index f4236d7789aa..d5704421456b 100644
--- a/include/asm-x86_64/smp.h
+++ b/include/asm-x86_64/smp.h
@@ -37,7 +37,6 @@ extern void lock_ipi_call_lock(void);
 extern void unlock_ipi_call_lock(void);
 extern int smp_num_siblings;
 extern void smp_send_reschedule(int cpu);
-void smp_stop_cpu(void);
 
 extern cpumask_t cpu_sibling_map[NR_CPUS];
 extern cpumask_t cpu_core_map[NR_CPUS];
-- 
cgit v1.2.3


From 803d80f65038f77c4681a0d7708e9d693e68aaa8 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Wed, 2 May 2007 19:27:05 +0200
Subject: [PATCH] x86-64: Some cleanup in time.c

Move prototypes into header files
Remove unneeded includes.

Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/time.c  | 5 +----
 include/asm-x86_64/proto.h | 2 ++
 2 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'include/asm-x86_64')

diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index 75d73a9aa9ff..811b8f987b50 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -39,13 +39,10 @@
 #include <asm/proto.h>
 #include <asm/hpet.h>
 #include <asm/sections.h>
-#include <linux/cpufreq.h>
 #include <linux/hpet.h>
 #include <asm/apic.h>
 #include <asm/hpet.h>
-
-extern void i8254_timer_resume(void);
-extern int using_apic_timer;
+#include <asm/mpspec.h>
 
 static char *timename = NULL;
 
diff --git a/include/asm-x86_64/proto.h b/include/asm-x86_64/proto.h
index 6688cf9eb27b..f64949fae61a 100644
--- a/include/asm-x86_64/proto.h
+++ b/include/asm-x86_64/proto.h
@@ -122,6 +122,8 @@ extern void smp_local_timer_interrupt(void);
 
 long do_arch_prctl(struct task_struct *task, int code, unsigned long addr);
 
+void i8254_timer_resume(void);
+
 #define round_up(x,y) (((x) + (y) - 1) & ~((y)-1))
 #define round_down(x,y) ((x) & ~((y)-1))
 
-- 
cgit v1.2.3


From f9d09645d6157fefa18ff75930737060c8092ddb Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Wed, 2 May 2007 19:27:06 +0200
Subject: [PATCH] x86-64: Remove unused set_seg_base

The set_seg_base function isn't used anywhere (2.6.21-rc3-git1)

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 include/asm-x86_64/desc.h | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'include/asm-x86_64')

diff --git a/include/asm-x86_64/desc.h b/include/asm-x86_64/desc.h
index 913d6ac00033..7726e74db536 100644
--- a/include/asm-x86_64/desc.h
+++ b/include/asm-x86_64/desc.h
@@ -107,16 +107,6 @@ static inline void set_ldt_desc(unsigned cpu, void *addr, int size)
 			      DESC_LDT, size * 8 - 1);
 }
 
-static inline void set_seg_base(unsigned cpu, int entry, void *base)
-{ 
-	struct desc_struct *d = &cpu_gdt(cpu)[entry];
-	u32 addr = (u32)(u64)base;
-	BUG_ON((u64)base >> 32); 
-	d->base0 = addr & 0xffff;
-	d->base1 = (addr >> 16) & 0xff;
-	d->base2 = (addr >> 24) & 0xff;
-} 
-
 #define LDT_entry_a(info) \
 	((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
 /* Don't allow setting of the lm bit. It is useless anyways because 
-- 
cgit v1.2.3


From fbc16f2c2a0e16dbd75ac85d3b6db97f92b642ba Mon Sep 17 00:00:00 2001
From: Glauber de Oliveira Costa <gcosta@redhat.com>
Date: Wed, 2 May 2007 19:27:06 +0200
Subject: [PATCH] x86-64: Remove duplicated code for reading control registers

On Tue, Mar 13, 2007 at 05:33:09AM -0700, Randy.Dunlap wrote:
> On Tue, 13 Mar 2007, Glauber de Oliveira Costa wrote:
>
> > Tiny cleanup:
> >
> > In x86_64, the same functions for reading cr3 and writing cr{3,4} are
> > defined in tlbflush.h and system.h, whith just a name change.
> > The only difference is the clobbering of memory, which seems a safe, and
> > even needed change for the write_cr4. This patch removes the duplicate.
> > write_cr3() is moved to system.h for consistency.
>
> missing patch.....
>
thanks. Attached now

--
Glauber de Oliveira Costa
Red Hat Inc.
"Free as in Freedom"

Signed-off-by: Andi Kleen <ak@suse.de>
---
 include/asm-x86_64/system.h   |  7 ++++++-
 include/asm-x86_64/tlbflush.h | 33 +++++----------------------------
 2 files changed, 11 insertions(+), 29 deletions(-)

(limited to 'include/asm-x86_64')

diff --git a/include/asm-x86_64/system.h b/include/asm-x86_64/system.h
index bd376bc8c4ab..213b7fe5d998 100644
--- a/include/asm-x86_64/system.h
+++ b/include/asm-x86_64/system.h
@@ -89,6 +89,11 @@ static inline unsigned long read_cr3(void)
 	return cr3;
 } 
 
+static inline void write_cr3(unsigned long val)
+{
+	asm volatile("movq %0,%%cr3" :: "r" (val) : "memory");
+}
+
 static inline unsigned long read_cr4(void)
 { 
 	unsigned long cr4;
@@ -98,7 +103,7 @@ static inline unsigned long read_cr4(void)
 
 static inline void write_cr4(unsigned long val)
 { 
-	asm volatile("movq %0,%%cr4" :: "r" (val));
+	asm volatile("movq %0,%%cr4" :: "r" (val) : "memory");
 } 
 
 #define stts() write_cr0(8 | read_cr0())
diff --git a/include/asm-x86_64/tlbflush.h b/include/asm-x86_64/tlbflush.h
index 983bd296c81a..512401b8725f 100644
--- a/include/asm-x86_64/tlbflush.h
+++ b/include/asm-x86_64/tlbflush.h
@@ -3,41 +3,18 @@
 
 #include <linux/mm.h>
 #include <asm/processor.h>
-
-static inline unsigned long get_cr3(void)
-{
-	unsigned long cr3;
-	asm volatile("mov %%cr3,%0" : "=r" (cr3));
-	return cr3;
-}
-
-static inline void set_cr3(unsigned long cr3)
-{
-	asm volatile("mov %0,%%cr3" :: "r" (cr3) : "memory");
-}
+#include <asm/system.h>
 
 static inline void __flush_tlb(void)
 {
-	set_cr3(get_cr3());
-}
-
-static inline unsigned long get_cr4(void)
-{
-	unsigned long cr4;
-	asm volatile("mov %%cr4,%0" : "=r" (cr4));
-	return cr4;
-}
-
-static inline void set_cr4(unsigned long cr4)
-{
-	asm volatile("mov %0,%%cr4" :: "r" (cr4) : "memory");
+	write_cr3(read_cr3());
 }
 
 static inline void __flush_tlb_all(void)
 {
-	unsigned long cr4 = get_cr4();
-	set_cr4(cr4 & ~X86_CR4_PGE);	/* clear PGE */
-	set_cr4(cr4);			/* write old PGE again and flush TLBs */
+	unsigned long cr4 = read_cr4();
+	write_cr4(cr4 & ~X86_CR4_PGE);	/* clear PGE */
+	write_cr4(cr4);			/* write old PGE again and flush TLBs */
 }
 
 #define __flush_tlb_one(addr) \
-- 
cgit v1.2.3


From 6b37f5a20c0e5c334c010a587058354215433e92 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 2 May 2007 19:27:06 +0200
Subject: [PATCH] x86-64: fix cpu MHz reporting on constant_tsc cpus

This patch fixes the reporting of cpu_mhz in /proc/cpuinfo on CPUs with
a constant TSC rate and a kernel with disabled cpufreq.

Signed-off-by: Mark Langsdorf <mark.langsdorf@amd.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Andi Kleen <ak@suse.de>

 arch/x86_64/kernel/apic.c     |    2 -
 arch/x86_64/kernel/time.c     |   58 +++++++++++++++++++++++++++++++++++++++---
 arch/x86_64/kernel/tsc.c      |   12 +++++---
 arch/x86_64/kernel/tsc_sync.c |    2 -
 include/asm-x86_64/proto.h    |    1
 5 files changed, 65 insertions(+), 10 deletions(-)
---
 arch/x86_64/kernel/apic.c     |  2 +-
 arch/x86_64/kernel/time.c     | 58 ++++++++++++++++++++++++++++++++++++++++---
 arch/x86_64/kernel/tsc.c      | 12 +++++----
 arch/x86_64/kernel/tsc_sync.c |  2 +-
 include/asm-x86_64/proto.h    |  1 +
 5 files changed, 65 insertions(+), 10 deletions(-)

(limited to 'include/asm-x86_64')

diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c
index bd3e45d47c37..3421f21b6c70 100644
--- a/arch/x86_64/kernel/apic.c
+++ b/arch/x86_64/kernel/apic.c
@@ -843,7 +843,7 @@ static int __init calibrate_APIC_clock(void)
 		} while ((tsc - tsc_start) < TICK_COUNT &&
 				(apic - apic_start) < TICK_COUNT);
 
-		result = (apic_start - apic) * 1000L * cpu_khz /
+		result = (apic_start - apic) * 1000L * tsc_khz /
 					(tsc - tsc_start);
 	}
 	printk("result %d\n", result);
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index 811b8f987b50..5f862e216a42 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -43,6 +43,7 @@
 #include <asm/apic.h>
 #include <asm/hpet.h>
 #include <asm/mpspec.h>
+#include <asm/nmi.h>
 
 static char *timename = NULL;
 
@@ -249,6 +250,51 @@ static unsigned long get_cmos_time(void)
 	return mktime(year, mon, day, hour, min, sec);
 }
 
+/* calibrate_cpu is used on systems with fixed rate TSCs to determine
+ * processor frequency */
+#define TICK_COUNT 100000000
+static unsigned int __init tsc_calibrate_cpu_khz(void)
+{
+       int tsc_start, tsc_now;
+       int i, no_ctr_free;
+       unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0;
+       unsigned long flags;
+
+       for (i = 0; i < 4; i++)
+               if (avail_to_resrv_perfctr_nmi_bit(i))
+                       break;
+       no_ctr_free = (i == 4);
+       if (no_ctr_free) {
+               i = 3;
+               rdmsrl(MSR_K7_EVNTSEL3, evntsel3);
+               wrmsrl(MSR_K7_EVNTSEL3, 0);
+               rdmsrl(MSR_K7_PERFCTR3, pmc3);
+       } else {
+               reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i);
+               reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
+       }
+       local_irq_save(flags);
+       /* start meauring cycles, incrementing from 0 */
+       wrmsrl(MSR_K7_PERFCTR0 + i, 0);
+       wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76);
+       rdtscl(tsc_start);
+       do {
+               rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now);
+               tsc_now = get_cycles_sync();
+       } while ((tsc_now - tsc_start) < TICK_COUNT);
+
+       local_irq_restore(flags);
+       if (no_ctr_free) {
+               wrmsrl(MSR_K7_EVNTSEL3, 0);
+               wrmsrl(MSR_K7_PERFCTR3, pmc3);
+               wrmsrl(MSR_K7_EVNTSEL3, evntsel3);
+       } else {
+               release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
+               release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
+       }
+
+       return pmc_now * tsc_khz / (tsc_now - tsc_start);
+}
 
 /*
  * pit_calibrate_tsc() uses the speaker output (channel 2) of
@@ -336,14 +382,20 @@ void __init time_init(void)
 	if (hpet_use_timer) {
 		/* set tick_nsec to use the proper rate for HPET */
 	  	tick_nsec = TICK_NSEC_HPET;
-		cpu_khz = hpet_calibrate_tsc();
+		tsc_khz = hpet_calibrate_tsc();
 		timename = "HPET";
 	} else {
 		pit_init();
-		cpu_khz = pit_calibrate_tsc();
+		tsc_khz = pit_calibrate_tsc();
 		timename = "PIT";
 	}
 
+	cpu_khz = tsc_khz;
+	if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) &&
+		boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
+		boot_cpu_data.x86 == 16)
+		cpu_khz = tsc_calibrate_cpu_khz();
+
 	if (unsynchronized_tsc())
 		mark_tsc_unstable();
 
@@ -352,7 +404,7 @@ void __init time_init(void)
 	else
 		vgetcpu_mode = VGETCPU_LSL;
 
-	set_cyc2ns_scale(cpu_khz);
+	set_cyc2ns_scale(tsc_khz);
 	printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n",
 		cpu_khz / 1000, cpu_khz % 1000);
 	init_tsc_clocksource();
diff --git a/arch/x86_64/kernel/tsc.c b/arch/x86_64/kernel/tsc.c
index 1a0edbbffaa0..5c84992c676d 100644
--- a/arch/x86_64/kernel/tsc.c
+++ b/arch/x86_64/kernel/tsc.c
@@ -13,6 +13,8 @@ static int notsc __initdata = 0;
 
 unsigned int cpu_khz;		/* TSC clocks / usec, not used here */
 EXPORT_SYMBOL(cpu_khz);
+unsigned int tsc_khz;
+EXPORT_SYMBOL(tsc_khz);
 
 static unsigned int cyc2ns_scale __read_mostly;
 
@@ -77,7 +79,7 @@ static void handle_cpufreq_delayed_get(struct work_struct *v)
 static unsigned int  ref_freq = 0;
 static unsigned long loops_per_jiffy_ref = 0;
 
-static unsigned long cpu_khz_ref = 0;
+static unsigned long tsc_khz_ref = 0;
 
 static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
 				 void *data)
@@ -99,7 +101,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
 	if (!ref_freq) {
 		ref_freq = freq->old;
 		loops_per_jiffy_ref = *lpj;
-		cpu_khz_ref = cpu_khz;
+		tsc_khz_ref = tsc_khz;
 	}
 	if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) ||
 		(val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
@@ -107,12 +109,12 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
 		*lpj =
 		cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
 
-		cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new);
+		tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
 		if (!(freq->flags & CPUFREQ_CONST_LOOPS))
 			mark_tsc_unstable();
 	}
 
-	set_cyc2ns_scale(cpu_khz_ref);
+	set_cyc2ns_scale(tsc_khz_ref);
 
 	return 0;
 }
@@ -213,7 +215,7 @@ EXPORT_SYMBOL_GPL(mark_tsc_unstable);
 void __init init_tsc_clocksource(void)
 {
 	if (!notsc) {
-		clocksource_tsc.mult = clocksource_khz2mult(cpu_khz,
+		clocksource_tsc.mult = clocksource_khz2mult(tsc_khz,
 							clocksource_tsc.shift);
 		if (check_tsc_unstable())
 			clocksource_tsc.rating = 0;
diff --git a/arch/x86_64/kernel/tsc_sync.c b/arch/x86_64/kernel/tsc_sync.c
index 014f0db45dfa..72d444dede9b 100644
--- a/arch/x86_64/kernel/tsc_sync.c
+++ b/arch/x86_64/kernel/tsc_sync.c
@@ -50,7 +50,7 @@ static __cpuinit void check_tsc_warp(void)
 	/*
 	 * The measurement runs for 20 msecs:
 	 */
-	end = start + cpu_khz * 20ULL;
+	end = start + tsc_khz * 20ULL;
 	now = start;
 
 	for (i = 0; ; i++) {
diff --git a/include/asm-x86_64/proto.h b/include/asm-x86_64/proto.h
index f64949fae61a..78427021d94a 100644
--- a/include/asm-x86_64/proto.h
+++ b/include/asm-x86_64/proto.h
@@ -92,6 +92,7 @@ extern unsigned long table_start, table_end;
 
 extern int exception_trace;
 extern unsigned cpu_khz;
+extern unsigned tsc_khz;
 
 extern void no_iommu_init(void);
 extern int force_iommu, no_iommu;
-- 
cgit v1.2.3


From e65845045588806fa5c8df8a4f4253516515a5e3 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@linux-foundation.org>
Date: Wed, 2 May 2007 19:27:06 +0200
Subject: [PATCH] x86-64: dma_ops as const

The dma_ops structure can be const since it never changes
after boot.

Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/pci-calgary.c | 2 +-
 arch/x86_64/kernel/pci-gart.c    | 2 +-
 arch/x86_64/kernel/pci-nommu.c   | 2 +-
 arch/x86_64/kernel/pci-swiotlb.c | 2 +-
 arch/x86_64/mm/init.c            | 2 +-
 include/asm-x86_64/dma-mapping.h | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/asm-x86_64')

diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c
index 04480c3b68f5..5bd20b542c1e 100644
--- a/arch/x86_64/kernel/pci-calgary.c
+++ b/arch/x86_64/kernel/pci-calgary.c
@@ -507,7 +507,7 @@ error:
 	return ret;
 }
 
-static struct dma_mapping_ops calgary_dma_ops = {
+static const struct dma_mapping_ops calgary_dma_ops = {
 	.alloc_coherent = calgary_alloc_coherent,
 	.map_single = calgary_map_single,
 	.unmap_single = calgary_unmap_single,
diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c
index 0bae862e9a55..0a762e10f2be 100644
--- a/arch/x86_64/kernel/pci-gart.c
+++ b/arch/x86_64/kernel/pci-gart.c
@@ -556,7 +556,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
 
 extern int agp_amd64_init(void);
 
-static struct dma_mapping_ops gart_dma_ops = {
+static const struct dma_mapping_ops gart_dma_ops = {
 	.mapping_error = NULL,
 	.map_single = gart_map_single,
 	.map_simple = gart_map_simple,
diff --git a/arch/x86_64/kernel/pci-nommu.c b/arch/x86_64/kernel/pci-nommu.c
index df09ab05a1bd..6dade0c867cc 100644
--- a/arch/x86_64/kernel/pci-nommu.c
+++ b/arch/x86_64/kernel/pci-nommu.c
@@ -79,7 +79,7 @@ void nommu_unmap_sg(struct device *dev, struct scatterlist *sg,
 {
 }
 
-struct dma_mapping_ops nommu_dma_ops = {
+const struct dma_mapping_ops nommu_dma_ops = {
 	.map_single = nommu_map_single,
 	.unmap_single = nommu_unmap_single,
 	.map_sg = nommu_map_sg,
diff --git a/arch/x86_64/kernel/pci-swiotlb.c b/arch/x86_64/kernel/pci-swiotlb.c
index eb18be5a6569..4b4569abc60c 100644
--- a/arch/x86_64/kernel/pci-swiotlb.c
+++ b/arch/x86_64/kernel/pci-swiotlb.c
@@ -12,7 +12,7 @@
 int swiotlb __read_mostly;
 EXPORT_SYMBOL(swiotlb);
 
-struct dma_mapping_ops swiotlb_dma_ops = {
+const struct dma_mapping_ops swiotlb_dma_ops = {
 	.mapping_error = swiotlb_dma_mapping_error,
 	.alloc_coherent = swiotlb_alloc_coherent,
 	.free_coherent = swiotlb_free_coherent,
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index ec31534eb104..5ca61731f550 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -46,7 +46,7 @@
 #define Dprintk(x...)
 #endif
 
-struct dma_mapping_ops* dma_ops;
+const struct dma_mapping_ops* dma_ops;
 EXPORT_SYMBOL(dma_ops);
 
 static unsigned long dma_reserve __initdata;
diff --git a/include/asm-x86_64/dma-mapping.h b/include/asm-x86_64/dma-mapping.h
index d2af227f06d0..6897e2a436e5 100644
--- a/include/asm-x86_64/dma-mapping.h
+++ b/include/asm-x86_64/dma-mapping.h
@@ -52,7 +52,7 @@ struct dma_mapping_ops {
 };
 
 extern dma_addr_t bad_dma_address;
-extern struct dma_mapping_ops* dma_ops;
+extern const struct dma_mapping_ops* dma_ops;
 extern int iommu_merge;
 
 static inline int dma_mapping_error(dma_addr_t dma_addr)
-- 
cgit v1.2.3


From 9d291e787b2b71d1b57e5fbb24ba9c70e748ed84 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@in.ibm.com>
Date: Wed, 2 May 2007 19:27:06 +0200
Subject: [PATCH] x86-64: Assembly safe page.h and pgtable.h

This patch makes pgtable.h and page.h safe to include
in assembly files like head.S.  Allowing us to use
symbolic constants instead of hard coded numbers when
refering to the page tables.

This patch copies asm-sparc64/const.h to asm-x86_64 to
get a definition of _AC() a very convinient macro that
allows us to force the type when we are compiling the
code in C and to drop all of the type information when
we are using the constant in assembly.  Previously this
was done with multiple definition of the same constant.
const.h was modified slightly so that it works when given
CONFIG options as arguments.

This patch adds #ifndef __ASSEMBLY__ ... #endif
and _AC(1,UL) where appropriate so the assembler won't
choke on the header files.  Otherwise nothing
should have changed.

AK: added const.h to exported headers to fix headers_check

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 include/asm-x86_64/Kbuild    |  1 +
 include/asm-x86_64/const.h   | 20 ++++++++++++++++++++
 include/asm-x86_64/page.h    | 28 ++++++++++------------------
 include/asm-x86_64/pgtable.h | 33 +++++++++++++++++++++------------
 4 files changed, 52 insertions(+), 30 deletions(-)
 create mode 100644 include/asm-x86_64/const.h

(limited to 'include/asm-x86_64')

diff --git a/include/asm-x86_64/Kbuild b/include/asm-x86_64/Kbuild
index ebd7117782a6..242296ede3dd 100644
--- a/include/asm-x86_64/Kbuild
+++ b/include/asm-x86_64/Kbuild
@@ -18,3 +18,4 @@ header-y += vsyscall32.h
 unifdef-y += mce.h
 unifdef-y += mtrr.h
 unifdef-y += vsyscall.h
+unifdef-y += const.h
diff --git a/include/asm-x86_64/const.h b/include/asm-x86_64/const.h
new file mode 100644
index 000000000000..54fb08f3db9b
--- /dev/null
+++ b/include/asm-x86_64/const.h
@@ -0,0 +1,20 @@
+/* const.h: Macros for dealing with constants.  */
+
+#ifndef _X86_64_CONST_H
+#define _X86_64_CONST_H
+
+/* Some constant macros are used in both assembler and
+ * C code.  Therefore we cannot annotate them always with
+ * 'UL' and other type specificers unilaterally.  We
+ * use the following macros to deal with this.
+ */
+
+#ifdef __ASSEMBLY__
+#define _AC(X,Y)	X
+#else
+#define __AC(X,Y)	(X##Y)
+#define _AC(X,Y)	__AC(X,Y)
+#endif
+
+
+#endif /* !(_X86_64_CONST_H) */
diff --git a/include/asm-x86_64/page.h b/include/asm-x86_64/page.h
index 10f346165cab..d554b94485df 100644
--- a/include/asm-x86_64/page.h
+++ b/include/asm-x86_64/page.h
@@ -1,14 +1,11 @@
 #ifndef _X86_64_PAGE_H
 #define _X86_64_PAGE_H
 
+#include <asm/const.h>
 
 /* PAGE_SHIFT determines the page size */
 #define PAGE_SHIFT	12
-#ifdef __ASSEMBLY__
-#define PAGE_SIZE	(0x1 << PAGE_SHIFT)
-#else
-#define PAGE_SIZE	(1UL << PAGE_SHIFT)
-#endif
+#define PAGE_SIZE	(_AC(1,UL) << PAGE_SHIFT)
 #define PAGE_MASK	(~(PAGE_SIZE-1))
 #define PHYSICAL_PAGE_MASK	(~(PAGE_SIZE-1) & __PHYSICAL_MASK)
 
@@ -33,10 +30,10 @@
 #define N_EXCEPTION_STACKS 5  /* hw limit: 7 */
 
 #define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1))
-#define LARGE_PAGE_SIZE (1UL << PMD_SHIFT)
+#define LARGE_PAGE_SIZE (_AC(1,UL) << PMD_SHIFT)
 
 #define HPAGE_SHIFT PMD_SHIFT
-#define HPAGE_SIZE	((1UL) << HPAGE_SHIFT)
+#define HPAGE_SIZE	(_AC(1,UL) << HPAGE_SHIFT)
 #define HPAGE_MASK	(~(HPAGE_SIZE - 1))
 #define HUGETLB_PAGE_ORDER	(HPAGE_SHIFT - PAGE_SHIFT)
 
@@ -76,29 +73,24 @@ typedef struct { unsigned long pgprot; } pgprot_t;
 #define __pgd(x) ((pgd_t) { (x) } )
 #define __pgprot(x)	((pgprot_t) { (x) } )
 
-#define __PHYSICAL_START	((unsigned long)CONFIG_PHYSICAL_START)
-#define __START_KERNEL		(__START_KERNEL_map + __PHYSICAL_START)
-#define __START_KERNEL_map	0xffffffff80000000UL
-#define __PAGE_OFFSET           0xffff810000000000UL
+#endif /* !__ASSEMBLY__ */
 
-#else
 #define __PHYSICAL_START	CONFIG_PHYSICAL_START
 #define __START_KERNEL		(__START_KERNEL_map + __PHYSICAL_START)
 #define __START_KERNEL_map	0xffffffff80000000
 #define __PAGE_OFFSET           0xffff810000000000
-#endif /* !__ASSEMBLY__ */
 
 /* to align the pointer to the (next) page boundary */
 #define PAGE_ALIGN(addr)	(((addr)+PAGE_SIZE-1)&PAGE_MASK)
 
 /* See Documentation/x86_64/mm.txt for a description of the memory map. */
 #define __PHYSICAL_MASK_SHIFT	46
-#define __PHYSICAL_MASK		((1UL << __PHYSICAL_MASK_SHIFT) - 1)
+#define __PHYSICAL_MASK		((_AC(1,UL) << __PHYSICAL_MASK_SHIFT) - 1)
 #define __VIRTUAL_MASK_SHIFT	48
-#define __VIRTUAL_MASK		((1UL << __VIRTUAL_MASK_SHIFT) - 1)
+#define __VIRTUAL_MASK		((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - 1)
 
-#define KERNEL_TEXT_SIZE  (40UL*1024*1024)
-#define KERNEL_TEXT_START 0xffffffff80000000UL 
+#define KERNEL_TEXT_SIZE  (40*1024*1024)
+#define KERNEL_TEXT_START 0xffffffff80000000
 
 #ifndef __ASSEMBLY__
 
@@ -106,7 +98,7 @@ typedef struct { unsigned long pgprot; } pgprot_t;
 
 #endif /* __ASSEMBLY__ */
 
-#define PAGE_OFFSET		((unsigned long)__PAGE_OFFSET)
+#define PAGE_OFFSET		__PAGE_OFFSET
 
 /* Note: __pa(&symbol_visible_to_c) should be always replaced with __pa_symbol.
    Otherwise you risk miscompilation. */ 
diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h
index 5957361782fe..c514deb658a3 100644
--- a/include/asm-x86_64/pgtable.h
+++ b/include/asm-x86_64/pgtable.h
@@ -1,6 +1,9 @@
 #ifndef _X86_64_PGTABLE_H
 #define _X86_64_PGTABLE_H
 
+#include <asm/const.h>
+#ifndef __ASSEMBLY__
+
 /*
  * This file contains the functions and defines necessary to modify and use
  * the x86-64 page table tree.
@@ -30,6 +33,8 @@ extern void clear_kernel_mapping(unsigned long addr, unsigned long size);
 extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
 #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
 
+#endif /* !__ASSEMBLY__ */
+
 /*
  * PGDIR_SHIFT determines what a top-level page table entry can map
  */
@@ -54,6 +59,8 @@ extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
  */
 #define PTRS_PER_PTE	512
 
+#ifndef __ASSEMBLY__
+
 #define pte_ERROR(e) \
 	printk("%s:%d: bad pte %p(%016lx).\n", __FILE__, __LINE__, &(e), pte_val(e))
 #define pmd_ERROR(e) \
@@ -117,22 +124,23 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long
 
 #define pte_pgprot(a)	(__pgprot((a).pte & ~PHYSICAL_PAGE_MASK))
 
-#define PMD_SIZE	(1UL << PMD_SHIFT)
+#endif /* !__ASSEMBLY__ */
+
+#define PMD_SIZE	(_AC(1,UL) << PMD_SHIFT)
 #define PMD_MASK	(~(PMD_SIZE-1))
-#define PUD_SIZE	(1UL << PUD_SHIFT)
+#define PUD_SIZE	(_AC(1,UL) << PUD_SHIFT)
 #define PUD_MASK	(~(PUD_SIZE-1))
-#define PGDIR_SIZE	(1UL << PGDIR_SHIFT)
+#define PGDIR_SIZE	(_AC(1,UL) << PGDIR_SHIFT)
 #define PGDIR_MASK	(~(PGDIR_SIZE-1))
 
 #define USER_PTRS_PER_PGD	((TASK_SIZE-1)/PGDIR_SIZE+1)
 #define FIRST_USER_ADDRESS	0
 
-#ifndef __ASSEMBLY__
-#define MAXMEM		 0x3fffffffffffUL
-#define VMALLOC_START    0xffffc20000000000UL
-#define VMALLOC_END      0xffffe1ffffffffffUL
-#define MODULES_VADDR    0xffffffff88000000UL
-#define MODULES_END      0xfffffffffff00000UL
+#define MAXMEM		 0x3fffffffffff
+#define VMALLOC_START    0xffffc20000000000
+#define VMALLOC_END      0xffffe1ffffffffff
+#define MODULES_VADDR    0xffffffff88000000
+#define MODULES_END      0xfffffffffff00000
 #define MODULES_LEN   (MODULES_END - MODULES_VADDR)
 
 #define _PAGE_BIT_PRESENT	0
@@ -158,7 +166,7 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long
 #define _PAGE_GLOBAL	0x100	/* Global TLB entry */
 
 #define _PAGE_PROTNONE	0x080	/* If not present */
-#define _PAGE_NX        (1UL<<_PAGE_BIT_NX)
+#define _PAGE_NX        (_AC(1,UL)<<_PAGE_BIT_NX)
 
 #define _PAGE_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
 #define _KERNPG_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
@@ -220,6 +228,8 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long
 #define __S110	PAGE_SHARED_EXEC
 #define __S111	PAGE_SHARED_EXEC
 
+#ifndef __ASSEMBLY__
+
 static inline unsigned long pgd_bad(pgd_t pgd)
 {
 	return pgd_val(pgd) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER);
@@ -405,8 +415,6 @@ extern spinlock_t pgd_lock;
 extern struct page *pgd_list;
 void vmalloc_sync_all(void);
 
-#endif /* !__ASSEMBLY__ */
-
 extern int kern_addr_valid(unsigned long addr); 
 
 #define io_remap_pfn_range(vma, vaddr, pfn, size, prot)		\
@@ -436,5 +444,6 @@ extern int kern_addr_valid(unsigned long addr);
 #define __HAVE_ARCH_PTEP_SET_WRPROTECT
 #define __HAVE_ARCH_PTE_SAME
 #include <asm-generic/pgtable.h>
+#endif /* !__ASSEMBLY__ */
 
 #endif /* _X86_64_PGTABLE_H */
-- 
cgit v1.2.3


From 67dcbb6bc6537aea92a2466bfc75f015b00e465e Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@in.ibm.com>
Date: Wed, 2 May 2007 19:27:06 +0200
Subject: [PATCH] x86-64: Clean up the early boot page table

- Merge physmem_pgt and ident_pgt, removing physmem_pgt.  The merge
  is broken as soon as mm/init.c:init_memory_mapping is run.
- As physmem_pgt is gone don't export it in pgtable.h.
- Use defines from pgtable.h for page permissions.
- Fix the physical memory identity mapping so it is at the correct
  address.
- Remove the physical memory mapping from wakeup_level4_pgt it
  is at the wrong address so we can't possibly be usinging it.
- Simply NEXT_PAGE the work to calculate the phys_ alias
  of the labels was very cool.  Unfortuantely it was a brittle
  special purpose hack that makes maitenance more difficult.
  Instead just use label - __START_KERNEL_map like we do
  everywhere else in assembly.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/head.S    | 61 ++++++++++++++++++++------------------------
 include/asm-x86_64/pgtable.h |  1 -
 2 files changed, 28 insertions(+), 34 deletions(-)

(limited to 'include/asm-x86_64')

diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S
index 118c6088198a..23b45e79a27c 100644
--- a/arch/x86_64/kernel/head.S
+++ b/arch/x86_64/kernel/head.S
@@ -13,6 +13,7 @@
 #include <linux/init.h>
 #include <asm/desc.h>
 #include <asm/segment.h>
+#include <asm/pgtable.h>
 #include <asm/page.h>
 #include <asm/msr.h>
 #include <asm/cache.h>
@@ -260,52 +261,48 @@ ljumpvector:
 ENTRY(stext)
 ENTRY(_stext)
 
-	$page = 0
 #define NEXT_PAGE(name) \
-	$page = $page + 1; \
-	.org $page * 0x1000; \
-	phys_/**/name = $page * 0x1000 + __PHYSICAL_START; \
+	.balign	PAGE_SIZE; \
 ENTRY(name)
 
+/* Automate the creation of 1 to 1 mapping pmd entries */
+#define PMDS(START, PERM, COUNT)		\
+	i = 0 ;					\
+	.rept (COUNT) ;				\
+	.quad	(START) + (i << 21) + (PERM) ;	\
+	i = i + 1 ;				\
+	.endr
+
 NEXT_PAGE(init_level4_pgt)
 	/* This gets initialized in x86_64_start_kernel */
 	.fill	512,8,0
 
 NEXT_PAGE(level3_ident_pgt)
-	.quad	phys_level2_ident_pgt | 0x007
+	.quad	level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
 	.fill	511,8,0
 
 NEXT_PAGE(level3_kernel_pgt)
 	.fill	510,8,0
 	/* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
-	.quad	phys_level2_kernel_pgt | 0x007
+	.quad	level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
 	.fill	1,8,0
 
 NEXT_PAGE(level2_ident_pgt)
-	/* 40MB for bootup. 	*/
-	i = 0
-	.rept 20
-	.quad	i << 21 | 0x083
-	i = i + 1
-	.endr
-	.fill	492,8,0
+	/* Since I easily can, map the first 1G.
+	 * Don't set NX because code runs from these pages.
+	 */
+	PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD)
 	
 NEXT_PAGE(level2_kernel_pgt)
 	/* 40MB kernel mapping. The kernel code cannot be bigger than that.
 	   When you change this change KERNEL_TEXT_SIZE in page.h too. */
 	/* (2^48-(2*1024*1024*1024)-((2^39)*511)-((2^30)*510)) = 0 */
-	i = 0
-	.rept 20
-	.quad	i << 21 | 0x183
-	i = i + 1
-	.endr
+	PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC|_PAGE_GLOBAL,
+		KERNEL_TEXT_SIZE/PMD_SIZE)
 	/* Module mapping starts here */
-	.fill	492,8,0
-
-NEXT_PAGE(level3_physmem_pgt)
-	.quad	phys_level2_kernel_pgt | 0x007	/* so that __va works even before pagetable_init */
-	.fill	511,8,0
+	.fill	(PTRS_PER_PMD - (KERNEL_TEXT_SIZE/PMD_SIZE)),8,0
 
+#undef PMDS
 #undef NEXT_PAGE
 
 	.data
@@ -313,12 +310,10 @@ NEXT_PAGE(level3_physmem_pgt)
 #ifdef CONFIG_ACPI_SLEEP
 	.align PAGE_SIZE
 ENTRY(wakeup_level4_pgt)
-	.quad	phys_level3_ident_pgt | 0x007
-	.fill	255,8,0
-	.quad	phys_level3_physmem_pgt | 0x007
-	.fill	254,8,0
+	.quad	level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+	.fill	510,8,0
 	/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
-	.quad	phys_level3_kernel_pgt | 0x007
+	.quad	level3_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
 #endif
 
 #ifndef CONFIG_HOTPLUG_CPU
@@ -332,12 +327,12 @@ ENTRY(wakeup_level4_pgt)
 	 */
 	.align PAGE_SIZE
 ENTRY(boot_level4_pgt)
-	.quad	phys_level3_ident_pgt | 0x007
-	.fill	255,8,0
-	.quad	phys_level3_physmem_pgt | 0x007
-	.fill	254,8,0
+	.quad	level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+	.fill	257,8,0
+	.quad	level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+	.fill	252,8,0
 	/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
-	.quad	phys_level3_kernel_pgt | 0x007
+	.quad	level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
 
 	.data
 
diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h
index c514deb658a3..5a5d43b3ef5d 100644
--- a/include/asm-x86_64/pgtable.h
+++ b/include/asm-x86_64/pgtable.h
@@ -14,7 +14,6 @@
 #include <asm/pda.h>
 
 extern pud_t level3_kernel_pgt[512];
-extern pud_t level3_physmem_pgt[512];
 extern pud_t level3_ident_pgt[512];
 extern pmd_t level2_kernel_pgt[512];
 extern pgd_t init_level4_pgt[];
-- 
cgit v1.2.3


From 30f472895401fbe8e64f861a2569bc9acb098741 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@in.ibm.com>
Date: Wed, 2 May 2007 19:27:07 +0200
Subject: [PATCH] x86-64: cleanup segments

Move __KERNEL32_CS up into the unused gdt entry.  __KERNEL32_CS is
used when entering the kernel so putting it first is useful when
trying to keep boot gdt sizes to a minimum.

Set the accessed bit on all gdt entries.  We don't care
so there is no need for the cpu to burn the extra cycles,
and it potentially allows the pages to be immutable.  Plus
it is confusing when debugging and your gdt entries mysteriously
change.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/head.S    | 12 ++++++------
 include/asm-x86_64/segment.h |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/asm-x86_64')

diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S
index 23b45e79a27c..2b2e2c51e532 100644
--- a/arch/x86_64/kernel/head.S
+++ b/arch/x86_64/kernel/head.S
@@ -362,13 +362,13 @@ gdt:
 	
 ENTRY(cpu_gdt_table)
 	.quad	0x0000000000000000	/* NULL descriptor */
+	.quad	0x00cf9b000000ffff	/* __KERNEL32_CS */
+	.quad	0x00af9b000000ffff	/* __KERNEL_CS */
+	.quad	0x00cf93000000ffff	/* __KERNEL_DS */
+	.quad	0x00cffb000000ffff	/* __USER32_CS */
+	.quad	0x00cff3000000ffff	/* __USER_DS, __USER32_DS  */
+	.quad	0x00affb000000ffff	/* __USER_CS */
 	.quad	0x0			/* unused */
-	.quad	0x00af9a000000ffff	/* __KERNEL_CS */
-	.quad	0x00cf92000000ffff	/* __KERNEL_DS */
-	.quad	0x00cffa000000ffff	/* __USER32_CS */
-	.quad	0x00cff2000000ffff	/* __USER_DS, __USER32_DS  */		
-	.quad	0x00affa000000ffff	/* __USER_CS */
-	.quad	0x00cf9a000000ffff	/* __KERNEL32_CS */
 	.quad	0,0			/* TSS */
 	.quad	0,0			/* LDT */
 	.quad   0,0,0			/* three TLS descriptors */ 
diff --git a/include/asm-x86_64/segment.h b/include/asm-x86_64/segment.h
index 334ddcdd8f92..adf2bf1e187c 100644
--- a/include/asm-x86_64/segment.h
+++ b/include/asm-x86_64/segment.h
@@ -6,7 +6,7 @@
 #define __KERNEL_CS	0x10
 #define __KERNEL_DS	0x18
 
-#define __KERNEL32_CS   0x38
+#define __KERNEL32_CS   0x08
 
 /* 
  * we cannot use the same code segment descriptor for user and kernel
-- 
cgit v1.2.3


From 3c321bceb4a626639ab43a5a24d884930e511826 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@in.ibm.com>
Date: Wed, 2 May 2007 19:27:07 +0200
Subject: [PATCH] x86-64: Add EFER to the register set saved by
 save_processor_state

EFER varies like %cr4 depending on the cpu capabilities, and which cpu
capabilities we want to make use of.  So save/restore it make certain
we have the same EFER value when we are done.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/suspend.c | 3 ++-
 include/asm-x86_64/suspend.h | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/asm-x86_64')

diff --git a/arch/x86_64/kernel/suspend.c b/arch/x86_64/kernel/suspend.c
index 91f7e678bae7..fe865ea4df52 100644
--- a/arch/x86_64/kernel/suspend.c
+++ b/arch/x86_64/kernel/suspend.c
@@ -33,7 +33,6 @@ void __save_processor_state(struct saved_context *ctxt)
 	asm volatile ("str %0"  : "=m" (ctxt->tr));
 
 	/* XMM0..XMM15 should be handled by kernel_fpu_begin(). */
-	/* EFER should be constant for kernel version, no need to handle it. */
 	/*
 	 * segment registers
 	 */
@@ -50,6 +49,7 @@ void __save_processor_state(struct saved_context *ctxt)
 	/*
 	 * control registers 
 	 */
+	rdmsrl(MSR_EFER, ctxt->efer);
 	asm volatile ("movq %%cr0, %0" : "=r" (ctxt->cr0));
 	asm volatile ("movq %%cr2, %0" : "=r" (ctxt->cr2));
 	asm volatile ("movq %%cr3, %0" : "=r" (ctxt->cr3));
@@ -75,6 +75,7 @@ void __restore_processor_state(struct saved_context *ctxt)
 	/*
 	 * control registers
 	 */
+	wrmsrl(MSR_EFER, ctxt->efer);
 	asm volatile ("movq %0, %%cr8" :: "r" (ctxt->cr8));
 	asm volatile ("movq %0, %%cr4" :: "r" (ctxt->cr4));
 	asm volatile ("movq %0, %%cr3" :: "r" (ctxt->cr3));
diff --git a/include/asm-x86_64/suspend.h b/include/asm-x86_64/suspend.h
index bc7f81715e5e..a42306c220d6 100644
--- a/include/asm-x86_64/suspend.h
+++ b/include/asm-x86_64/suspend.h
@@ -17,6 +17,7 @@ struct saved_context {
   	u16 ds, es, fs, gs, ss;
 	unsigned long gs_base, gs_kernel_base, fs_base;
 	unsigned long cr0, cr2, cr3, cr4, cr8;
+	unsigned long efer;
 	u16 gdt_pad;
 	u16 gdt_limit;
 	unsigned long gdt_base;
-- 
cgit v1.2.3


From 7db681d7e4038ad205b5face5cf7f7815633e1b5 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@in.ibm.com>
Date: Wed, 2 May 2007 19:27:07 +0200
Subject: [PATCH] x86-64: wakeup.S rename registers to reflect right names

o Use appropriate names for 64bit regsiters.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/acpi/wakeup.S | 36 ++++++++++++++++++------------------
 include/asm-x86_64/suspend.h     | 12 ++++++------
 2 files changed, 24 insertions(+), 24 deletions(-)

(limited to 'include/asm-x86_64')

diff --git a/arch/x86_64/kernel/acpi/wakeup.S b/arch/x86_64/kernel/acpi/wakeup.S
index 6ece70e91a32..17dbeff64eef 100644
--- a/arch/x86_64/kernel/acpi/wakeup.S
+++ b/arch/x86_64/kernel/acpi/wakeup.S
@@ -211,16 +211,16 @@ wakeup_long64:
 	movw	%ax, %es
 	movw	%ax, %fs
 	movw	%ax, %gs
-	movq	saved_esp, %rsp
+	movq	saved_rsp, %rsp
 
 	movw	$0x0e00 + 'x', %ds:(0xb8018)
-	movq	saved_ebx, %rbx
-	movq	saved_edi, %rdi
-	movq	saved_esi, %rsi
-	movq	saved_ebp, %rbp
+	movq	saved_rbx, %rbx
+	movq	saved_rdi, %rdi
+	movq	saved_rsi, %rsi
+	movq	saved_rbp, %rbp
 
 	movw	$0x0e00 + '!', %ds:(0xb801a)
-	movq	saved_eip, %rax
+	movq	saved_rip, %rax
 	jmp	*%rax
 
 .code32
@@ -408,13 +408,13 @@ do_suspend_lowlevel:
 	movq %r15, saved_context_r15(%rip)
 	pushfq ; popq saved_context_eflags(%rip)
 
-	movq	$.L97, saved_eip(%rip)
+	movq	$.L97, saved_rip(%rip)
 
-	movq %rsp,saved_esp
-	movq %rbp,saved_ebp
-	movq %rbx,saved_ebx
-	movq %rdi,saved_edi
-	movq %rsi,saved_esi
+	movq %rsp,saved_rsp
+	movq %rbp,saved_rbp
+	movq %rbx,saved_rbx
+	movq %rdi,saved_rdi
+	movq %rsi,saved_rsi
 
 	addq	$8, %rsp
 	movl	$3, %edi
@@ -461,12 +461,12 @@ do_suspend_lowlevel:
 	
 .data
 ALIGN
-ENTRY(saved_ebp)	.quad	0
-ENTRY(saved_esi)	.quad	0
-ENTRY(saved_edi)	.quad	0
-ENTRY(saved_ebx)	.quad	0
+ENTRY(saved_rbp)	.quad	0
+ENTRY(saved_rsi)	.quad	0
+ENTRY(saved_rdi)	.quad	0
+ENTRY(saved_rbx)	.quad	0
 
-ENTRY(saved_eip)	.quad	0
-ENTRY(saved_esp)	.quad	0
+ENTRY(saved_rip)	.quad	0
+ENTRY(saved_rsp)	.quad	0
 
 ENTRY(saved_magic)	.quad	0
diff --git a/include/asm-x86_64/suspend.h b/include/asm-x86_64/suspend.h
index a42306c220d6..9c3f8de90d2d 100644
--- a/include/asm-x86_64/suspend.h
+++ b/include/asm-x86_64/suspend.h
@@ -45,12 +45,12 @@ extern unsigned long saved_context_eflags;
 extern void fix_processor_context(void);
 
 #ifdef CONFIG_ACPI_SLEEP
-extern unsigned long saved_eip;
-extern unsigned long saved_esp;
-extern unsigned long saved_ebp;
-extern unsigned long saved_ebx;
-extern unsigned long saved_esi;
-extern unsigned long saved_edi;
+extern unsigned long saved_rip;
+extern unsigned long saved_rsp;
+extern unsigned long saved_rbp;
+extern unsigned long saved_rbx;
+extern unsigned long saved_rsi;
+extern unsigned long saved_rdi;
 
 /* routines for saving/restoring kernel state */
 extern int acpi_save_state_mem(void);
-- 
cgit v1.2.3


From cfd243d4af7c7f8f52f5cb99d3932d9074b039ff Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@in.ibm.com>
Date: Wed, 2 May 2007 19:27:07 +0200
Subject: [PATCH] x86-64: Remove the identity mapping as early as possible

With the rewrite of the SMP trampoline and the early page
allocator there is nothing that needs identity mapped pages,
once we start executing C code.

So add zap_identity_mappings into head64.c and remove
zap_low_mappings() from much later in the code.  The functions
 are subtly different thus the name change.

This also kills boot_level4_pgt which was from an earlier
attempt to move the identity mappings as early as possible,
and is now no longer needed.  Essentially I have replaced
boot_level4_pgt with trampoline_level4_pgt in trampoline.S

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/head.S    | 39 ++++++++++++++-------------------------
 arch/x86_64/kernel/head64.c  | 17 +++++++++++------
 arch/x86_64/kernel/setup.c   |  2 --
 arch/x86_64/kernel/setup64.c |  1 -
 arch/x86_64/mm/init.c        | 24 ------------------------
 include/asm-x86_64/pgtable.h |  1 -
 include/asm-x86_64/proto.h   |  2 --
 7 files changed, 25 insertions(+), 61 deletions(-)

(limited to 'include/asm-x86_64')

diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S
index 926aa2197aaa..c211e52f1333 100644
--- a/arch/x86_64/kernel/head.S
+++ b/arch/x86_64/kernel/head.S
@@ -71,7 +71,7 @@ startup_32:
 	movl	%eax, %cr4
 
 	/* Setup early boot stage 4 level pagetables */
-	movl	$(boot_level4_pgt - __START_KERNEL_map), %eax
+	movl	$(init_level4_pgt - __START_KERNEL_map), %eax
 	movl	%eax, %cr3
 
 	/* Setup EFER (Extended Feature Enable Register) */
@@ -115,7 +115,7 @@ ENTRY(secondary_startup_64)
 	movq	%rax, %cr4
 
 	/* Setup early boot stage 4 level pagetables. */
-	movq	$(boot_level4_pgt - __START_KERNEL_map), %rax
+	movq	$(init_level4_pgt - __START_KERNEL_map), %rax
 	movq	%rax, %cr3
 
 	/* Check if nx is implemented */
@@ -274,9 +274,19 @@ ENTRY(name)
 	i = i + 1 ;				\
 	.endr
 
+	/*
+	 * This default setting generates an ident mapping at address 0x100000
+	 * and a mapping for the kernel that precisely maps virtual address
+	 * 0xffffffff80000000 to physical address 0x000000. (always using
+	 * 2Mbyte large pages provided by PAE mode)
+	 */
 NEXT_PAGE(init_level4_pgt)
-	/* This gets initialized in x86_64_start_kernel */
-	.fill	512,8,0
+	.quad	level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+	.fill	257,8,0
+	.quad	level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+	.fill	252,8,0
+	/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
+	.quad	level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
 
 NEXT_PAGE(level3_ident_pgt)
 	.quad	level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
@@ -307,27 +317,6 @@ NEXT_PAGE(level2_kernel_pgt)
 #undef NEXT_PAGE
 
 	.data
-
-#ifndef CONFIG_HOTPLUG_CPU
-	__INITDATA
-#endif
-	/*
-	 * This default setting generates an ident mapping at address 0x100000
-	 * and a mapping for the kernel that precisely maps virtual address
-	 * 0xffffffff80000000 to physical address 0x000000. (always using
-	 * 2Mbyte large pages provided by PAE mode)
-	 */
-	.align PAGE_SIZE
-ENTRY(boot_level4_pgt)
-	.quad	level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
-	.fill	257,8,0
-	.quad	level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
-	.fill	252,8,0
-	/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
-	.quad	level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
-
-	.data
-
 	.align 16
 	.globl cpu_gdt_descr
 cpu_gdt_descr:
diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c
index 5c529c1e3d6c..6c34bdd22e26 100644
--- a/arch/x86_64/kernel/head64.c
+++ b/arch/x86_64/kernel/head64.c
@@ -18,8 +18,16 @@
 #include <asm/setup.h>
 #include <asm/desc.h>
 #include <asm/pgtable.h>
+#include <asm/tlbflush.h>
 #include <asm/sections.h>
 
+static void __init zap_identity_mappings(void)
+{
+	pgd_t *pgd = pgd_offset_k(0UL);
+	pgd_clear(pgd);
+	__flush_tlb();
+}
+
 /* Don't add a printk in there. printk relies on the PDA which is not initialized 
    yet. */
 static void __init clear_bss(void)
@@ -57,18 +65,15 @@ void __init x86_64_start_kernel(char * real_mode_data)
 	/* clear bss before set_intr_gate with early_idt_handler */
 	clear_bss();
 
+	/* Make NULL pointers segfault */
+	zap_identity_mappings();
+
 	for (i = 0; i < IDT_ENTRIES; i++)
 		set_intr_gate(i, early_idt_handler);
 	asm volatile("lidt %0" :: "m" (idt_descr));
 
 	early_printk("Kernel alive\n");
 
-	/*
-	 * switch to init_level4_pgt from boot_level4_pgt
-	 */
-	memcpy(init_level4_pgt, boot_level4_pgt, PTRS_PER_PGD*sizeof(pgd_t));
-	asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
-
  	for (i = 0; i < NR_CPUS; i++)
  		cpu_pda(i) = &boot_cpu_pda[i];
 
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 65e2bc551a2b..0e2b8df0ea64 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -274,8 +274,6 @@ void __init setup_arch(char **cmdline_p)
 
 	dmi_scan_machine();
 
-	zap_low_mappings(0);
-
 #ifdef CONFIG_ACPI
 	/*
 	 * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c
index 6a70b55f719d..53064a9a365f 100644
--- a/arch/x86_64/kernel/setup64.c
+++ b/arch/x86_64/kernel/setup64.c
@@ -201,7 +201,6 @@ void __cpuinit cpu_init (void)
 	/* CPU 0 is initialised in head64.c */
 	if (cpu != 0) {
 		pda_init(cpu);
-		zap_low_mappings(cpu);
 	} else 
 		estacks = boot_exception_stacks; 
 
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index 4ab3d40aac90..b0a607892183 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -378,21 +378,6 @@ void __meminit init_memory_mapping(unsigned long start, unsigned long end)
 	__flush_tlb_all();
 }
 
-void __cpuinit zap_low_mappings(int cpu)
-{
-	if (cpu == 0) {
-		pgd_t *pgd = pgd_offset_k(0UL);
-		pgd_clear(pgd);
-	} else {
-		/*
-		 * For AP's, zap the low identity mappings by changing the cr3
-		 * to init_level4_pgt and doing local flush tlb all
-		 */
-		asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
-	}
-	__flush_tlb_all();
-}
-
 #ifndef CONFIG_NUMA
 void __init paging_init(void)
 {
@@ -569,15 +554,6 @@ void __init mem_init(void)
 		reservedpages << (PAGE_SHIFT-10),
 		datasize >> 10,
 		initsize >> 10);
-
-#ifdef CONFIG_SMP
-	/*
-	 * Sync boot_level4_pgt mappings with the init_level4_pgt
-	 * except for the low identity mappings which are already zapped
-	 * in init_level4_pgt. This sync-up is essential for AP's bringup
-	 */
-	memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t));
-#endif
 }
 
 void free_init_pages(char *what, unsigned long begin, unsigned long end)
diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h
index 5a5d43b3ef5d..703f0243f27a 100644
--- a/include/asm-x86_64/pgtable.h
+++ b/include/asm-x86_64/pgtable.h
@@ -17,7 +17,6 @@ extern pud_t level3_kernel_pgt[512];
 extern pud_t level3_ident_pgt[512];
 extern pmd_t level2_kernel_pgt[512];
 extern pgd_t init_level4_pgt[];
-extern pgd_t boot_level4_pgt[];
 extern unsigned long __supported_pte_mask;
 
 #define swapper_pg_dir init_level4_pgt
diff --git a/include/asm-x86_64/proto.h b/include/asm-x86_64/proto.h
index 78427021d94a..3f8f285138d2 100644
--- a/include/asm-x86_64/proto.h
+++ b/include/asm-x86_64/proto.h
@@ -11,8 +11,6 @@ struct pt_regs;
 extern void start_kernel(void);
 extern void pda_init(int); 
 
-extern void zap_low_mappings(int cpu);
-
 extern void early_idt_handler(void);
 
 extern void mcheck_init(struct cpuinfo_x86 *c);
-- 
cgit v1.2.3


From 0dbf7028c0c1f266c9631139450a1502d3cd457e Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@in.ibm.com>
Date: Wed, 2 May 2007 19:27:07 +0200
Subject: [PATCH] x86: __pa and __pa_symbol address space separation

Currently __pa_symbol is for use with symbols in the kernel address
map and __pa is for use with pointers into the physical memory map.
But the code is implemented so you can usually interchange the two.

__pa which is much more common can be implemented much more cheaply
if it is it doesn't have to worry about any other kernel address
spaces.  This is especially true with a relocatable kernel as
__pa_symbol needs to peform an extra variable read to resolve
the address.

There is a third macro that is added for the vsyscall data
__pa_vsymbol for finding the physical addesses of vsyscall pages.

Most of this patch is simply sorting through the references to
__pa or __pa_symbol and using the proper one.  A little of
it is continuing to use a physical address when we have it
instead of recalculating it several times.

swapper_pgd is now NULL.  leave_mm now uses init_mm.pgd
and init_mm.pgd is initialized at boot (instead of compile time)
to the physmem virtual mapping of init_level4_pgd.  The
physical address changed.

Except for the for EMPTY_ZERO page all of the remaining references
to __pa_symbol appear to be during kernel initialization.  So this
should reduce the cost of __pa in the common case, even on a relocated
kernel.

As this is technically a semantic change we need to be on the lookout
for anything I missed.  But it works for me (tm).

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/i386/kernel/alternative.c     |  4 ++--
 arch/i386/mm/init.c                | 15 ++++++++-------
 arch/x86_64/kernel/machine_kexec.c | 14 +++++++-------
 arch/x86_64/kernel/setup.c         |  9 +++++----
 arch/x86_64/kernel/smp.c           |  2 +-
 arch/x86_64/kernel/vsyscall.c      |  9 +++++++--
 arch/x86_64/mm/init.c              | 21 +++++++++++----------
 arch/x86_64/mm/pageattr.c          | 16 ++++++++--------
 include/asm-x86_64/page.h          |  6 ++----
 include/asm-x86_64/pgtable.h       |  4 ++--
 10 files changed, 53 insertions(+), 47 deletions(-)

(limited to 'include/asm-x86_64')

diff --git a/arch/i386/kernel/alternative.c b/arch/i386/kernel/alternative.c
index 426f59b0106b..a27c8d347364 100644
--- a/arch/i386/kernel/alternative.c
+++ b/arch/i386/kernel/alternative.c
@@ -402,8 +402,8 @@ void __init alternative_instructions(void)
 						_text, _etext);
 		}
 		free_init_pages("SMP alternatives",
-				(unsigned long)__smp_alt_begin,
-				(unsigned long)__smp_alt_end);
+				__pa_symbol(&__smp_alt_begin),
+				__pa_symbol(&__smp_alt_end));
 	} else {
 		alternatives_smp_save(__smp_alt_instructions,
 				      __smp_alt_instructions_end);
diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
index ae436882af7a..23be1b0aafa4 100644
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -774,10 +774,11 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
 	unsigned long addr;
 
 	for (addr = begin; addr < end; addr += PAGE_SIZE) {
-		ClearPageReserved(virt_to_page(addr));
-		init_page_count(virt_to_page(addr));
-		memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
-		free_page(addr);
+		struct page *page = pfn_to_page(addr >> PAGE_SHIFT);
+		ClearPageReserved(page);
+		init_page_count(page);
+		memset(page_address(page), POISON_FREE_INITMEM, PAGE_SIZE);
+		__free_page(page);
 		totalram_pages++;
 	}
 	printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
@@ -786,14 +787,14 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
 void free_initmem(void)
 {
 	free_init_pages("unused kernel memory",
-			(unsigned long)(&__init_begin),
-			(unsigned long)(&__init_end));
+			__pa_symbol(&__init_begin),
+			__pa_symbol(&__init_end));
 }
 
 #ifdef CONFIG_BLK_DEV_INITRD
 void free_initrd_mem(unsigned long start, unsigned long end)
 {
-	free_init_pages("initrd memory", start, end);
+	free_init_pages("initrd memory", __pa(start), __pa(end));
 }
 #endif
 
diff --git a/arch/x86_64/kernel/machine_kexec.c b/arch/x86_64/kernel/machine_kexec.c
index 0497e3bd5bff..a8bb33c1a8f2 100644
--- a/arch/x86_64/kernel/machine_kexec.c
+++ b/arch/x86_64/kernel/machine_kexec.c
@@ -191,19 +191,19 @@ NORET_TYPE void machine_kexec(struct kimage *image)
 
 	page_list[PA_CONTROL_PAGE] = __pa(control_page);
 	page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
-	page_list[PA_PGD] = __pa(kexec_pgd);
+	page_list[PA_PGD] = __pa_symbol(&kexec_pgd);
 	page_list[VA_PGD] = (unsigned long)kexec_pgd;
-	page_list[PA_PUD_0] = __pa(kexec_pud0);
+	page_list[PA_PUD_0] = __pa_symbol(&kexec_pud0);
 	page_list[VA_PUD_0] = (unsigned long)kexec_pud0;
-	page_list[PA_PMD_0] = __pa(kexec_pmd0);
+	page_list[PA_PMD_0] = __pa_symbol(&kexec_pmd0);
 	page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
-	page_list[PA_PTE_0] = __pa(kexec_pte0);
+	page_list[PA_PTE_0] = __pa_symbol(&kexec_pte0);
 	page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
-	page_list[PA_PUD_1] = __pa(kexec_pud1);
+	page_list[PA_PUD_1] = __pa_symbol(&kexec_pud1);
 	page_list[VA_PUD_1] = (unsigned long)kexec_pud1;
-	page_list[PA_PMD_1] = __pa(kexec_pmd1);
+	page_list[PA_PMD_1] = __pa_symbol(&kexec_pmd1);
 	page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
-	page_list[PA_PTE_1] = __pa(kexec_pte1);
+	page_list[PA_PTE_1] = __pa_symbol(&kexec_pte1);
 	page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
 
 	page_list[PA_TABLE_PAGE] =
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 0e2b8df0ea64..b9bdfc1b54f8 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -243,11 +243,12 @@ void __init setup_arch(char **cmdline_p)
 	init_mm.end_code = (unsigned long) &_etext;
 	init_mm.end_data = (unsigned long) &_edata;
 	init_mm.brk = (unsigned long) &_end;
+	init_mm.pgd = __va(__pa_symbol(&init_level4_pgt));
 
-	code_resource.start = virt_to_phys(&_text);
-	code_resource.end = virt_to_phys(&_etext)-1;
-	data_resource.start = virt_to_phys(&_etext);
-	data_resource.end = virt_to_phys(&_edata)-1;
+	code_resource.start = __pa_symbol(&_text);
+	code_resource.end = __pa_symbol(&_etext)-1;
+	data_resource.start = __pa_symbol(&_etext);
+	data_resource.end = __pa_symbol(&_edata)-1;
 
 	early_identify_cpu(&boot_cpu_data);
 
diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c
index bd1d123947ce..22abae4e9f39 100644
--- a/arch/x86_64/kernel/smp.c
+++ b/arch/x86_64/kernel/smp.c
@@ -76,7 +76,7 @@ static inline void leave_mm(int cpu)
 	if (read_pda(mmu_state) == TLBSTATE_OK)
 		BUG();
 	cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask);
-	load_cr3(swapper_pg_dir);
+	load_cr3(init_mm.pgd);
 }
 
 /*
diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c
index b43c698cf7d3..d14cbb3e0ebe 100644
--- a/arch/x86_64/kernel/vsyscall.c
+++ b/arch/x86_64/kernel/vsyscall.c
@@ -45,6 +45,11 @@
 
 #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
 #define __syscall_clobber "r11","rcx","memory"
+#define __pa_vsymbol(x)			\
+	({unsigned long v;  		\
+	extern char __vsyscall_0; 	\
+	  asm("" : "=r" (v) : "0" (x)); \
+	  ((v - VSYSCALL_FIRST_PAGE) + __pa_symbol(&__vsyscall_0)); })
 
 struct vsyscall_gtod_data_t {
 	seqlock_t lock;
@@ -224,10 +229,10 @@ static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
 		return ret;
 	/* gcc has some trouble with __va(__pa()), so just do it this
 	   way. */
-	map1 = ioremap(__pa_symbol(&vsysc1), 2);
+	map1 = ioremap(__pa_vsymbol(&vsysc1), 2);
 	if (!map1)
 		return -ENOMEM;
-	map2 = ioremap(__pa_symbol(&vsysc2), 2);
+	map2 = ioremap(__pa_vsymbol(&vsysc2), 2);
 	if (!map2) {
 		ret = -ENOMEM;
 		goto out;
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index b0a607892183..69e22d3c9238 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -565,11 +565,11 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
 
 	printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
 	for (addr = begin; addr < end; addr += PAGE_SIZE) {
-		ClearPageReserved(virt_to_page(addr));
-		init_page_count(virt_to_page(addr));
-		memset((void *)(addr & ~(PAGE_SIZE-1)),
-			POISON_FREE_INITMEM, PAGE_SIZE);
-		free_page(addr);
+		struct page *page = pfn_to_page(addr >> PAGE_SHIFT);
+		ClearPageReserved(page);
+		init_page_count(page);
+		memset(page_address(page), POISON_FREE_INITMEM, PAGE_SIZE);
+		__free_page(page);
 		totalram_pages++;
 	}
 }
@@ -579,17 +579,18 @@ void free_initmem(void)
 	memset(__initdata_begin, POISON_FREE_INITDATA,
 		__initdata_end - __initdata_begin);
 	free_init_pages("unused kernel memory",
-			(unsigned long)(&__init_begin),
-			(unsigned long)(&__init_end));
+			__pa_symbol(&__init_begin),
+			__pa_symbol(&__init_end));
 }
 
 #ifdef CONFIG_DEBUG_RODATA
 
 void mark_rodata_ro(void)
 {
-	unsigned long addr = (unsigned long)__start_rodata;
+	unsigned long addr = (unsigned long)__va(__pa_symbol(&__start_rodata));
+	unsigned long end  = (unsigned long)__va(__pa_symbol(&__end_rodata));
 
-	for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE)
+	for (; addr < end; addr += PAGE_SIZE)
 		change_page_attr_addr(addr, 1, PAGE_KERNEL_RO);
 
 	printk ("Write protecting the kernel read-only data: %luk\n",
@@ -608,7 +609,7 @@ void mark_rodata_ro(void)
 #ifdef CONFIG_BLK_DEV_INITRD
 void free_initrd_mem(unsigned long start, unsigned long end)
 {
-	free_init_pages("initrd memory", start, end);
+	free_init_pages("initrd memory", __pa(start), __pa(end));
 }
 #endif
 
diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c
index 081409aa3452..76ee90a5abe0 100644
--- a/arch/x86_64/mm/pageattr.c
+++ b/arch/x86_64/mm/pageattr.c
@@ -51,7 +51,6 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot,
 	SetPagePrivate(base);
 	page_private(base) = 0;
 
-	address = __pa(address);
 	addr = address & LARGE_PAGE_MASK; 
 	pbase = (pte_t *)page_address(base);
 	for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
@@ -101,13 +100,12 @@ static inline void save_page(struct page *fpage)
  * No more special protections in this 2/4MB area - revert to a
  * large page again. 
  */
-static void revert_page(unsigned long address, pgprot_t ref_prot)
+static void revert_page(unsigned long address, unsigned long pfn, pgprot_t ref_prot)
 {
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t large_pte;
-	unsigned long pfn;
 
 	pgd = pgd_offset_k(address);
 	BUG_ON(pgd_none(*pgd));
@@ -115,7 +113,6 @@ static void revert_page(unsigned long address, pgprot_t ref_prot)
 	BUG_ON(pud_none(*pud));
 	pmd = pmd_offset(pud, address);
 	BUG_ON(pmd_val(*pmd) & _PAGE_PSE);
-	pfn = (__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT;
 	large_pte = pfn_pte(pfn, ref_prot);
 	large_pte = pte_mkhuge(large_pte);
 	set_pte((pte_t *)pmd, large_pte);
@@ -141,7 +138,8 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
  			 */
 			struct page *split;
 			ref_prot2 = pte_pgprot(pte_clrhuge(*kpte));
-			split = split_large_page(address, prot, ref_prot2);
+			split = split_large_page(pfn << PAGE_SHIFT, prot,
+							ref_prot2);
 			if (!split)
 				return -ENOMEM;
 			set_pte(kpte, mk_pte(split, ref_prot2));
@@ -160,7 +158,7 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
 
 	if (page_private(kpte_page) == 0) {
 		save_page(kpte_page);
-		revert_page(address, ref_prot);
+		revert_page(address, pfn, ref_prot);
  	}
 	return 0;
 } 
@@ -180,6 +178,7 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
  */
 int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot)
 {
+	unsigned long phys_base_pfn = __pa_symbol(__START_KERNEL_map) >> PAGE_SHIFT;
 	int err = 0; 
 	int i; 
 
@@ -192,10 +191,11 @@ int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot)
 			break; 
 		/* Handle kernel mapping too which aliases part of the
 		 * lowmem */
-		if (__pa(address) < KERNEL_TEXT_SIZE) {
+		if ((pfn >= phys_base_pfn) &&
+			((pfn - phys_base_pfn) < (KERNEL_TEXT_SIZE >> PAGE_SHIFT))) {
 			unsigned long addr2;
 			pgprot_t prot2;
-			addr2 = __START_KERNEL_map + __pa(address);
+			addr2 = __START_KERNEL_map + ((pfn - phys_base_pfn) << PAGE_SHIFT);
 			/* Make sure the kernel mappings stay executable */
 			prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot)));
 			err = __change_page_attr(addr2, pfn, prot2,
diff --git a/include/asm-x86_64/page.h b/include/asm-x86_64/page.h
index d554b94485df..4974433bbf34 100644
--- a/include/asm-x86_64/page.h
+++ b/include/asm-x86_64/page.h
@@ -102,17 +102,15 @@ typedef struct { unsigned long pgprot; } pgprot_t;
 
 /* Note: __pa(&symbol_visible_to_c) should be always replaced with __pa_symbol.
    Otherwise you risk miscompilation. */ 
-#define __pa(x)			(((unsigned long)(x)>=__START_KERNEL_map)?(unsigned long)(x) - (unsigned long)__START_KERNEL_map:(unsigned long)(x) - PAGE_OFFSET)
+#define __pa(x)			((unsigned long)(x) - PAGE_OFFSET)
 /* __pa_symbol should be used for C visible symbols.
    This seems to be the official gcc blessed way to do such arithmetic. */ 
 #define __pa_symbol(x)		\
 	({unsigned long v;  \
 	  asm("" : "=r" (v) : "0" (x)); \
-	  __pa(v); })
+	  (v - __START_KERNEL_map); })
 
 #define __va(x)			((void *)((unsigned long)(x)+PAGE_OFFSET))
-#define __boot_va(x)		__va(x)
-#define __boot_pa(x)		__pa(x)
 #ifdef CONFIG_FLATMEM
 #define pfn_valid(pfn)		((pfn) < end_pfn)
 #endif
diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h
index 703f0243f27a..c1865e38c7b7 100644
--- a/include/asm-x86_64/pgtable.h
+++ b/include/asm-x86_64/pgtable.h
@@ -19,7 +19,7 @@ extern pmd_t level2_kernel_pgt[512];
 extern pgd_t init_level4_pgt[];
 extern unsigned long __supported_pte_mask;
 
-#define swapper_pg_dir init_level4_pgt
+#define swapper_pg_dir ((pgd_t *)NULL)
 
 extern void paging_init(void);
 extern void clear_kernel_mapping(unsigned long addr, unsigned long size);
@@ -29,7 +29,7 @@ extern void clear_kernel_mapping(unsigned long addr, unsigned long size);
  * for zero-mapped memory areas etc..
  */
 extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
-#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
+#define ZERO_PAGE(vaddr) (pfn_to_page(__pa_symbol(&empty_zero_page) >> PAGE_SHIFT))
 
 #endif /* !__ASSEMBLY__ */
 
-- 
cgit v1.2.3


From 1ab60e0f72f71ec54831e525a3e1154f1c092408 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@in.ibm.com>
Date: Wed, 2 May 2007 19:27:07 +0200
Subject: [PATCH] x86-64: Relocatable Kernel Support

This patch modifies the x86_64 kernel so that it can be loaded and run
at any 2M aligned address, below 512G.  The technique used is to
compile the decompressor with -fPIC and modify it so the decompressor
is fully relocatable.  For the main kernel the page tables are
modified so the kernel remains at the same virtual address.  In
addition a variable phys_base is kept that holds the physical address
the kernel is loaded at.  __pa_symbol is modified to add that when
we take the address of a kernel symbol.

When loaded with a normal bootloader the decompressor will decompress
the kernel to 2M and it will run there.  This both ensures the
relocation code is always working, and makes it easier to use 2M
pages for the kernel and the cpu.

AK: changed to not make RELOCATABLE default in Kconfig

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/Kconfig                     |  49 ++++-
 arch/x86_64/boot/compressed/Makefile    |  12 +-
 arch/x86_64/boot/compressed/head.S      | 322 +++++++++++++++++++++++---------
 arch/x86_64/boot/compressed/misc.c      | 247 ++++++++++++------------
 arch/x86_64/boot/compressed/vmlinux.lds |  44 +++++
 arch/x86_64/boot/compressed/vmlinux.scr |   9 +-
 arch/x86_64/kernel/head.S               | 233 +++++++++++++----------
 arch/x86_64/kernel/suspend_asm.S        |   7 +-
 include/asm-x86_64/page.h               |   6 +-
 9 files changed, 597 insertions(+), 332 deletions(-)
 create mode 100644 arch/x86_64/boot/compressed/vmlinux.lds

(limited to 'include/asm-x86_64')

diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index b3dbf11eb82c..715632026073 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -565,23 +565,56 @@ config CRASH_DUMP
 	  PHYSICAL_START.
           For more details see Documentation/kdump/kdump.txt
 
+config RELOCATABLE
+	bool "Build a relocatable kernel(EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	help
+	  Builds a relocatable kernel. This enables loading and running
+	  a kernel binary from a different physical address than it has
+	  been compiled for.
+
+	  One use is for the kexec on panic case where the recovery kernel
+	  must live at a different physical address than the primary
+	  kernel.
+
+	  Note: If CONFIG_RELOCATABLE=y, then kernel run from the address
+	  it has been loaded at and compile time physical address
+	  (CONFIG_PHYSICAL_START) is ignored.
+
 config PHYSICAL_START
 	hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP)
-	default "0x1000000" if CRASH_DUMP
 	default "0x200000"
 	help
-	  This gives the physical address where the kernel is loaded. Normally
-	  for regular kernels this value is 0x200000 (2MB). But in the case
-	  of kexec on panic the fail safe kernel needs to run at a different
-	  address than the panic-ed kernel. This option is used to set the load
-	  address for kernels used to capture crash dump on being kexec'ed
-	  after panic. The default value for crash dump kernels is
-	  0x1000000 (16MB). This can also be set based on the "X" value as
+	  This gives the physical address where the kernel is loaded. It
+	  should be aligned to 2MB boundary.
+
+	  If kernel is a not relocatable (CONFIG_RELOCATABLE=n) then
+	  bzImage will decompress itself to above physical address and
+	  run from there. Otherwise, bzImage will run from the address where
+	  it has been loaded by the boot loader and will ignore above physical
+	  address.
+
+	  In normal kdump cases one does not have to set/change this option
+	  as now bzImage can be compiled as a completely relocatable image
+	  (CONFIG_RELOCATABLE=y) and be used to load and run from a different
+	  address. This option is mainly useful for the folks who don't want
+	  to use a bzImage for capturing the crash dump and want to use a
+	  vmlinux instead.
+
+	  So if you are using bzImage for capturing the crash dump, leave
+	  the value here unchanged to 0x200000 and set CONFIG_RELOCATABLE=y.
+	  Otherwise if you plan to use vmlinux for capturing the crash dump
+	  change this value to start of the reserved region (Typically 16MB
+	  0x1000000). In other words, it can be set based on the "X" value as
 	  specified in the "crashkernel=YM@XM" command line boot parameter
 	  passed to the panic-ed kernel. Typically this parameter is set as
 	  crashkernel=64M@16M. Please take a look at
 	  Documentation/kdump/kdump.txt for more details about crash dumps.
 
+	  Usage of bzImage for capturing the crash dump is advantageous as
+	  one does not have to build two kernels. Same kernel can be used
+	  as production kernel and capture kernel.
+
 	  Don't change this unless you know what you are doing.
 
 config SECCOMP
diff --git a/arch/x86_64/boot/compressed/Makefile b/arch/x86_64/boot/compressed/Makefile
index e70fa6e1da08..705a3e33d7e1 100644
--- a/arch/x86_64/boot/compressed/Makefile
+++ b/arch/x86_64/boot/compressed/Makefile
@@ -8,16 +8,14 @@
 
 targets		:= vmlinux vmlinux.bin vmlinux.bin.gz head.o misc.o piggy.o
 EXTRA_AFLAGS	:= -traditional
-AFLAGS		:= $(subst -m64,-m32,$(AFLAGS))
 
 # cannot use EXTRA_CFLAGS because base CFLAGS contains -mkernel which conflicts with
 # -m32
-CFLAGS := -m32 -D__KERNEL__ -Iinclude -O2  -fno-strict-aliasing
-LDFLAGS := -m elf_i386
+CFLAGS := -m64 -D__KERNEL__ -Iinclude -O2  -fno-strict-aliasing -fPIC -mcmodel=small -fno-builtin
+LDFLAGS := -m elf_x86_64
 
-LDFLAGS_vmlinux := -Ttext $(IMAGE_OFFSET) -e startup_32 -m elf_i386
-
-$(obj)/vmlinux: $(obj)/head.o $(obj)/misc.o $(obj)/piggy.o FORCE
+LDFLAGS_vmlinux := -T
+$(obj)/vmlinux: $(src)/vmlinux.lds $(obj)/head.o $(obj)/misc.o $(obj)/piggy.o FORCE
 	$(call if_changed,ld)
 	@:
 
@@ -27,7 +25,7 @@ $(obj)/vmlinux.bin: vmlinux FORCE
 $(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
 	$(call if_changed,gzip)
 
-LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T
+LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T
 
 $(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE
 	$(call if_changed,ld)
diff --git a/arch/x86_64/boot/compressed/head.S b/arch/x86_64/boot/compressed/head.S
index 6f55565e4d42..c353a9266ea4 100644
--- a/arch/x86_64/boot/compressed/head.S
+++ b/arch/x86_64/boot/compressed/head.S
@@ -26,116 +26,262 @@
 
 #include <linux/linkage.h>
 #include <asm/segment.h>
+#include <asm/pgtable.h>
 #include <asm/page.h>
+#include <asm/msr.h>
 
+.section ".text.head"
 	.code32
 	.globl startup_32
-	
+
 startup_32:
 	cld
 	cli
-	movl $(__KERNEL_DS),%eax
-	movl %eax,%ds
-	movl %eax,%es
-	movl %eax,%fs
-	movl %eax,%gs
-
-	lss stack_start,%esp
-	xorl %eax,%eax
-1:	incl %eax		# check that A20 really IS enabled
-	movl %eax,0x000000	# loop forever if it isn't
-	cmpl %eax,0x100000
-	je 1b
+	movl	$(__KERNEL_DS), %eax
+	movl	%eax, %ds
+	movl	%eax, %es
+	movl	%eax, %ss
+
+/* Calculate the delta between where we were compiled to run
+ * at and where we were actually loaded at.  This can only be done
+ * with a short local call on x86.  Nothing  else will tell us what
+ * address we are running at.  The reserved chunk of the real-mode
+ * data at 0x34-0x3f are used as the stack for this calculation.
+ * Only 4 bytes are needed.
+ */
+	leal	0x40(%esi), %esp
+	call	1f
+1:	popl	%ebp
+	subl	$1b, %ebp
+
+/* Compute the delta between where we were compiled to run at
+ * and where the code will actually run at.
+ */
+/* %ebp contains the address we are loaded at by the boot loader and %ebx
+ * contains the address where we should move the kernel image temporarily
+ * for safe in-place decompression.
+ */
+
+#ifdef CONFIG_RELOCATABLE
+	movl	%ebp, %ebx
+	addl	$(LARGE_PAGE_SIZE -1), %ebx
+	andl	$LARGE_PAGE_MASK, %ebx
+#else
+	movl	$CONFIG_PHYSICAL_START, %ebx
+#endif
+
+	/* Replace the compressed data size with the uncompressed size */
+	subl	input_len(%ebp), %ebx
+	movl	output_len(%ebp), %eax
+	addl	%eax, %ebx
+	/* Add 8 bytes for every 32K input block */
+	shrl	$12, %eax
+	addl	%eax, %ebx
+	/* Add 32K + 18 bytes of extra slack and align on a 4K boundary */
+	addl	$(32768 + 18 + 4095), %ebx
+	andl	$~4095, %ebx
 
 /*
- * Initialize eflags.  Some BIOS's leave bits like NT set.  This would
- * confuse the debugger if this code is traced.
- * XXX - best to initialize before switching to protected mode.
+ * Prepare for entering 64 bit mode
  */
-	pushl $0
-	popfl
+
+	/* Load new GDT with the 64bit segments using 32bit descriptor */
+	leal	gdt(%ebp), %eax
+	movl	%eax, gdt+2(%ebp)
+	lgdt	gdt(%ebp)
+
+	/* Enable PAE mode */
+	xorl	%eax, %eax
+	orl	$(1 << 5), %eax
+	movl	%eax, %cr4
+
+ /*
+  * Build early 4G boot pagetable
+  */
+	/* Initialize Page tables to 0*/
+	leal	pgtable(%ebx), %edi
+	xorl	%eax, %eax
+	movl	$((4096*6)/4), %ecx
+	rep	stosl
+
+	/* Build Level 4 */
+	leal	pgtable + 0(%ebx), %edi
+	leal	0x1007 (%edi), %eax
+	movl	%eax, 0(%edi)
+
+	/* Build Level 3 */
+	leal	pgtable + 0x1000(%ebx), %edi
+	leal	0x1007(%edi), %eax
+	movl	$4, %ecx
+1:	movl	%eax, 0x00(%edi)
+	addl	$0x00001000, %eax
+	addl	$8, %edi
+	decl	%ecx
+	jnz	1b
+
+	/* Build Level 2 */
+	leal	pgtable + 0x2000(%ebx), %edi
+	movl	$0x00000183, %eax
+	movl	$2048, %ecx
+1:	movl	%eax, 0(%edi)
+	addl	$0x00200000, %eax
+	addl	$8, %edi
+	decl	%ecx
+	jnz	1b
+
+	/* Enable the boot page tables */
+	leal	pgtable(%ebx), %eax
+	movl	%eax, %cr3
+
+	/* Enable Long mode in EFER (Extended Feature Enable Register) */
+	movl	$MSR_EFER, %ecx
+	rdmsr
+	btsl	$_EFER_LME, %eax
+	wrmsr
+
+	/* Setup for the jump to 64bit mode
+	 *
+	 * When the jump is performend we will be in long mode but
+	 * in 32bit compatibility mode with EFER.LME = 1, CS.L = 0, CS.D = 1
+	 * (and in turn EFER.LMA = 1).	To jump into 64bit mode we use
+	 * the new gdt/idt that has __KERNEL_CS with CS.L = 1.
+	 * We place all of the values on our mini stack so lret can
+	 * used to perform that far jump.
+	 */
+	pushl	$__KERNEL_CS
+	leal	startup_64(%ebp), %eax
+	pushl	%eax
+
+	/* Enter paged protected Mode, activating Long Mode */
+	movl	$0x80000001, %eax /* Enable Paging and Protected mode */
+	movl	%eax, %cr0
+
+	/* Jump from 32bit compatibility mode into 64bit mode. */
+	lret
+
+	/* Be careful here startup_64 needs to be at a predictable
+	 * address so I can export it in an ELF header.  Bootloaders
+	 * should look at the ELF header to find this address, as
+	 * it may change in the future.
+	 */
+	.code64
+	.org 0x100
+ENTRY(startup_64)
+	/* We come here either from startup_32 or directly from a
+	 * 64bit bootloader.  If we come here from a bootloader we depend on
+	 * an identity mapped page table being provied that maps our
+	 * entire text+data+bss and hopefully all of memory.
+	 */
+
+	/* Setup data segments. */
+	xorl	%eax, %eax
+	movl	%eax, %ds
+	movl	%eax, %es
+	movl	%eax, %ss
+
+	/* Compute the decompressed kernel start address.  It is where
+	 * we were loaded at aligned to a 2M boundary. %rbp contains the
+	 * decompressed kernel start address.
+	 *
+	 * If it is a relocatable kernel then decompress and run the kernel
+	 * from load address aligned to 2MB addr, otherwise decompress and
+	 * run the kernel from CONFIG_PHYSICAL_START
+	 */
+
+	/* Start with the delta to where the kernel will run at. */
+#ifdef CONFIG_RELOCATABLE
+	leaq	startup_32(%rip) /* - $startup_32 */, %rbp
+	addq	$(LARGE_PAGE_SIZE - 1), %rbp
+	andq	$LARGE_PAGE_MASK, %rbp
+	movq	%rbp, %rbx
+#else
+	movq	$CONFIG_PHYSICAL_START, %rbp
+	movq	%rbp, %rbx
+#endif
+
+	/* Replace the compressed data size with the uncompressed size */
+	movl	input_len(%rip), %eax
+	subq	%rax, %rbx
+	movl	output_len(%rip), %eax
+	addq	%rax, %rbx
+	/* Add 8 bytes for every 32K input block */
+	shrq	$12, %rax
+	addq	%rax, %rbx
+	/* Add 32K + 18 bytes of extra slack and align on a 4K boundary */
+	addq	$(32768 + 18 + 4095), %rbx
+	andq	$~4095, %rbx
+
+/* Copy the compressed kernel to the end of our buffer
+ * where decompression in place becomes safe.
+ */
+	leaq	_end(%rip), %r8
+	leaq	_end(%rbx), %r9
+	movq	$_end /* - $startup_32 */, %rcx
+1:	subq	$8, %r8
+	subq	$8, %r9
+	movq	0(%r8), %rax
+	movq	%rax, 0(%r9)
+	subq	$8, %rcx
+	jnz	1b
+
+/*
+ * Jump to the relocated address.
+ */
+	leaq	relocated(%rbx), %rax
+	jmp	*%rax
+
+.section ".text"
+relocated:
+
 /*
  * Clear BSS
  */
-	xorl %eax,%eax
-	movl $_edata,%edi
-	movl $_end,%ecx
-	subl %edi,%ecx
+	xorq	%rax, %rax
+	leaq    _edata(%rbx), %rdi
+	leaq    _end(%rbx), %rcx
+	subq	%rdi, %rcx
 	cld
 	rep
 	stosb
+
+	/* Setup the stack */
+	leaq	user_stack_end(%rip), %rsp
+
+	/* zero EFLAGS after setting rsp */
+	pushq	$0
+	popfq
+
 /*
  * Do the decompression, and jump to the new kernel..
  */
-	subl $16,%esp	# place for structure on the stack
-	movl %esp,%eax
-	pushl %esi	# real mode pointer as second arg
-	pushl %eax	# address of structure as first arg
-	call decompress_kernel
-	orl  %eax,%eax 
-	jnz  3f
-	addl $8,%esp
-	xorl %ebx,%ebx
-	ljmp $(__KERNEL_CS), $__PHYSICAL_START
+	pushq	%rsi			# Save the real mode argument
+	movq	%rsi, %rdi		# real mode address
+	leaq	_heap(%rip), %rsi	# _heap
+	leaq	input_data(%rip), %rdx  # input_data
+	movl	input_len(%rip), %eax
+	movq	%rax, %rcx		# input_len
+	movq	%rbp, %r8		# output
+	call	decompress_kernel
+	popq	%rsi
 
-/*
- * We come here, if we were loaded high.
- * We need to move the move-in-place routine down to 0x1000
- * and then start it with the buffer addresses in registers,
- * which we got from the stack.
- */
-3:
-	movl %esi,%ebx	
-	movl $move_routine_start,%esi
-	movl $0x1000,%edi
-	movl $move_routine_end,%ecx
-	subl %esi,%ecx
-	addl $3,%ecx
-	shrl $2,%ecx
-	cld
-	rep
-	movsl
-
-	popl %esi	# discard the address
-	addl $4,%esp	# real mode pointer
-	popl %esi	# low_buffer_start
-	popl %ecx	# lcount
-	popl %edx	# high_buffer_start
-	popl %eax	# hcount
-	movl $__PHYSICAL_START,%edi
-	cli		# make sure we don't get interrupted
-	ljmp $(__KERNEL_CS), $0x1000 # and jump to the move routine
 
 /*
- * Routine (template) for moving the decompressed kernel in place,
- * if we were high loaded. This _must_ PIC-code !
+ * Jump to the decompressed kernel.
  */
-move_routine_start:
-	movl %ecx,%ebp
-	shrl $2,%ecx
-	rep
-	movsl
-	movl %ebp,%ecx
-	andl $3,%ecx
-	rep
-	movsb
-	movl %edx,%esi
-	movl %eax,%ecx	# NOTE: rep movsb won't move if %ecx == 0
-	addl $3,%ecx
-	shrl $2,%ecx
-	rep
-	movsl
-	movl %ebx,%esi	# Restore setup pointer
-	xorl %ebx,%ebx
-	ljmp $(__KERNEL_CS), $__PHYSICAL_START
-move_routine_end:
+	jmp	*%rbp
 
-
-/* Stack for uncompression */ 	
-	.align 32
-user_stack:	 	
+	.data
+gdt:
+	.word	gdt_end - gdt
+	.long	gdt
+	.word	0
+	.quad	0x0000000000000000	/* NULL descriptor */
+	.quad	0x00af9a000000ffff	/* __KERNEL_CS */
+	.quad	0x00cf92000000ffff	/* __KERNEL_DS */
+gdt_end:
+	.bss
+/* Stack for uncompression */
+	.balign 4
+user_stack:
 	.fill 4096,4,0
-stack_start:	
-	.long user_stack+4096
-	.word __KERNEL_DS
-
+user_stack_end:
diff --git a/arch/x86_64/boot/compressed/misc.c b/arch/x86_64/boot/compressed/misc.c
index 3755b2e394d0..fee54dbf1749 100644
--- a/arch/x86_64/boot/compressed/misc.c
+++ b/arch/x86_64/boot/compressed/misc.c
@@ -9,10 +9,95 @@
  * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
  */
 
+#define _LINUX_STRING_H_ 1
+#define __LINUX_BITMAP_H 1
+
+#include <linux/linkage.h>
 #include <linux/screen_info.h>
 #include <asm/io.h>
 #include <asm/page.h>
 
+/* WARNING!!
+ * This code is compiled with -fPIC and it is relocated dynamically
+ * at run time, but no relocation processing is performed.
+ * This means that it is not safe to place pointers in static structures.
+ */
+
+/*
+ * Getting to provable safe in place decompression is hard.
+ * Worst case behaviours need to be analized.
+ * Background information:
+ *
+ * The file layout is:
+ *    magic[2]
+ *    method[1]
+ *    flags[1]
+ *    timestamp[4]
+ *    extraflags[1]
+ *    os[1]
+ *    compressed data blocks[N]
+ *    crc[4] orig_len[4]
+ *
+ * resulting in 18 bytes of non compressed data overhead.
+ *
+ * Files divided into blocks
+ * 1 bit (last block flag)
+ * 2 bits (block type)
+ *
+ * 1 block occurs every 32K -1 bytes or when there 50% compression has been achieved.
+ * The smallest block type encoding is always used.
+ *
+ * stored:
+ *    32 bits length in bytes.
+ *
+ * fixed:
+ *    magic fixed tree.
+ *    symbols.
+ *
+ * dynamic:
+ *    dynamic tree encoding.
+ *    symbols.
+ *
+ *
+ * The buffer for decompression in place is the length of the
+ * uncompressed data, plus a small amount extra to keep the algorithm safe.
+ * The compressed data is placed at the end of the buffer.  The output
+ * pointer is placed at the start of the buffer and the input pointer
+ * is placed where the compressed data starts.  Problems will occur
+ * when the output pointer overruns the input pointer.
+ *
+ * The output pointer can only overrun the input pointer if the input
+ * pointer is moving faster than the output pointer.  A condition only
+ * triggered by data whose compressed form is larger than the uncompressed
+ * form.
+ *
+ * The worst case at the block level is a growth of the compressed data
+ * of 5 bytes per 32767 bytes.
+ *
+ * The worst case internal to a compressed block is very hard to figure.
+ * The worst case can at least be boundined by having one bit that represents
+ * 32764 bytes and then all of the rest of the bytes representing the very
+ * very last byte.
+ *
+ * All of which is enough to compute an amount of extra data that is required
+ * to be safe.  To avoid problems at the block level allocating 5 extra bytes
+ * per 32767 bytes of data is sufficient.  To avoind problems internal to a block
+ * adding an extra 32767 bytes (the worst case uncompressed block size) is
+ * sufficient, to ensure that in the worst case the decompressed data for
+ * block will stop the byte before the compressed data for a block begins.
+ * To avoid problems with the compressed data's meta information an extra 18
+ * bytes are needed.  Leading to the formula:
+ *
+ * extra_bytes = (uncompressed_size >> 12) + 32768 + 18 + decompressor_size.
+ *
+ * Adding 8 bytes per 32K is a bit excessive but much easier to calculate.
+ * Adding 32768 instead of 32767 just makes for round numbers.
+ * Adding the decompressor_size is necessary as it musht live after all
+ * of the data as well.  Last I measured the decompressor is about 14K.
+ * 10K of actuall data and 4K of bss.
+ *
+ */
+
 /*
  * gzip declarations
  */
@@ -28,15 +113,20 @@ typedef unsigned char  uch;
 typedef unsigned short ush;
 typedef unsigned long  ulg;
 
-#define WSIZE 0x8000		/* Window size must be at least 32k, */
-				/* and a power of two */
+#define WSIZE 0x80000000	/* Window size must be at least 32k,
+				 * and a power of two
+				 * We don't actually have a window just
+				 * a huge output buffer so I report
+				 * a 2G windows size, as that should
+				 * always be larger than our output buffer.
+				 */
 
-static uch *inbuf;	     /* input buffer */
-static uch window[WSIZE];    /* Sliding window buffer */
+static uch *inbuf;	/* input buffer */
+static uch *window;	/* Sliding window buffer, (and final output buffer) */
 
-static unsigned insize = 0;  /* valid bytes in inbuf */
-static unsigned inptr = 0;   /* index of next byte to be processed in inbuf */
-static unsigned outcnt = 0;  /* bytes in output buffer */
+static unsigned insize;  /* valid bytes in inbuf */
+static unsigned inptr;   /* index of next byte to be processed in inbuf */
+static unsigned outcnt;  /* bytes in output buffer */
 
 /* gzip flag byte */
 #define ASCII_FLAG   0x01 /* bit 0 set: file probably ASCII text */
@@ -87,8 +177,6 @@ extern unsigned char input_data[];
 extern int input_len;
 
 static long bytes_out = 0;
-static uch *output_data;
-static unsigned long output_ptr = 0;
 
 static void *malloc(int size);
 static void free(void *where);
@@ -98,17 +186,10 @@ static void *memcpy(void *dest, const void *src, unsigned n);
 
 static void putstr(const char *);
 
-extern int end;
-static long free_mem_ptr = (long)&end;
+static long free_mem_ptr;
 static long free_mem_end_ptr;
 
-#define INPLACE_MOVE_ROUTINE  0x1000
-#define LOW_BUFFER_START      0x2000
-#define LOW_BUFFER_MAX       0x90000
-#define HEAP_SIZE             0x3000
-static unsigned int low_buffer_end, low_buffer_size;
-static int high_loaded =0;
-static uch *high_buffer_start /* = (uch *)(((ulg)&end) + HEAP_SIZE)*/;
+#define HEAP_SIZE             0x6000
 
 static char *vidmem = (char *)0xb8000;
 static int vidport;
@@ -218,58 +299,31 @@ static void* memcpy(void* dest, const void* src, unsigned n)
  */
 static int fill_inbuf(void)
 {
-	if (insize != 0) {
-		error("ran out of input data");
-	}
-
-	inbuf = input_data;
-	insize = input_len;
-	inptr = 1;
-	return inbuf[0];
+	error("ran out of input data");
+	return 0;
 }
 
 /* ===========================================================================
  * Write the output window window[0..outcnt-1] and update crc and bytes_out.
  * (Used for the decompressed data only.)
  */
-static void flush_window_low(void)
-{
-    ulg c = crc;         /* temporary variable */
-    unsigned n;
-    uch *in, *out, ch;
-    
-    in = window;
-    out = &output_data[output_ptr]; 
-    for (n = 0; n < outcnt; n++) {
-	    ch = *out++ = *in++;
-	    c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8);
-    }
-    crc = c;
-    bytes_out += (ulg)outcnt;
-    output_ptr += (ulg)outcnt;
-    outcnt = 0;
-}
-
-static void flush_window_high(void)
-{
-    ulg c = crc;         /* temporary variable */
-    unsigned n;
-    uch *in,  ch;
-    in = window;
-    for (n = 0; n < outcnt; n++) {
-	ch = *output_data++ = *in++;
-	if ((ulg)output_data == low_buffer_end) output_data=high_buffer_start;
-	c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8);
-    }
-    crc = c;
-    bytes_out += (ulg)outcnt;
-    outcnt = 0;
-}
-
 static void flush_window(void)
 {
-	if (high_loaded) flush_window_high();
-	else flush_window_low();
+	/* With my window equal to my output buffer
+	 * I only need to compute the crc here.
+	 */
+	ulg c = crc;         /* temporary variable */
+	unsigned n;
+	uch *in, ch;
+
+	in = window;
+	for (n = 0; n < outcnt; n++) {
+		ch = *in++;
+		c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8);
+	}
+	crc = c;
+	bytes_out += (ulg)outcnt;
+	outcnt = 0;
 }
 
 static void error(char *x)
@@ -281,57 +335,8 @@ static void error(char *x)
 	while(1);	/* Halt */
 }
 
-static void setup_normal_output_buffer(void)
-{
-#ifdef STANDARD_MEMORY_BIOS_CALL
-	if (RM_EXT_MEM_K < 1024) error("Less than 2MB of memory");
-#else
-	if ((RM_ALT_MEM_K > RM_EXT_MEM_K ? RM_ALT_MEM_K : RM_EXT_MEM_K) < 1024) error("Less than 2MB of memory");
-#endif
-	output_data = (unsigned char *)__PHYSICAL_START; /* Normally Points to 1M */
-	free_mem_end_ptr = (long)real_mode;
-}
-
-struct moveparams {
-	uch *low_buffer_start;  int lcount;
-	uch *high_buffer_start; int hcount;
-};
-
-static void setup_output_buffer_if_we_run_high(struct moveparams *mv)
-{
-	high_buffer_start = (uch *)(((ulg)&end) + HEAP_SIZE);
-#ifdef STANDARD_MEMORY_BIOS_CALL
-	if (RM_EXT_MEM_K < (3*1024)) error("Less than 4MB of memory");
-#else
-	if ((RM_ALT_MEM_K > RM_EXT_MEM_K ? RM_ALT_MEM_K : RM_EXT_MEM_K) < (3*1024)) error("Less than 4MB of memory");
-#endif	
-	mv->low_buffer_start = output_data = (unsigned char *)LOW_BUFFER_START;
-	low_buffer_end = ((unsigned int)real_mode > LOW_BUFFER_MAX
-	  ? LOW_BUFFER_MAX : (unsigned int)real_mode) & ~0xfff;
-	low_buffer_size = low_buffer_end - LOW_BUFFER_START;
-	high_loaded = 1;
-	free_mem_end_ptr = (long)high_buffer_start;
-	if ( (__PHYSICAL_START + low_buffer_size) > ((ulg)high_buffer_start)) {
-		high_buffer_start = (uch *)(__PHYSICAL_START + low_buffer_size);
-		mv->hcount = 0; /* say: we need not to move high_buffer */
-	}
-	else mv->hcount = -1;
-	mv->high_buffer_start = high_buffer_start;
-}
-
-static void close_output_buffer_if_we_run_high(struct moveparams *mv)
-{
-	if (bytes_out > low_buffer_size) {
-		mv->lcount = low_buffer_size;
-		if (mv->hcount)
-			mv->hcount = bytes_out - low_buffer_size;
-	} else {
-		mv->lcount = bytes_out;
-		mv->hcount = 0;
-	}
-}
-
-int decompress_kernel(struct moveparams *mv, void *rmode)
+asmlinkage void decompress_kernel(void *rmode, unsigned long heap,
+	uch *input_data, unsigned long input_len, uch *output)
 {
 	real_mode = rmode;
 
@@ -346,13 +351,21 @@ int decompress_kernel(struct moveparams *mv, void *rmode)
 	lines = RM_SCREEN_INFO.orig_video_lines;
 	cols = RM_SCREEN_INFO.orig_video_cols;
 
-	if (free_mem_ptr < 0x100000) setup_normal_output_buffer();
-	else setup_output_buffer_if_we_run_high(mv);
+	window = output;  		/* Output buffer (Normally at 1M) */
+	free_mem_ptr     = heap;	/* Heap  */
+	free_mem_end_ptr = heap + HEAP_SIZE;
+	inbuf  = input_data;		/* Input buffer */
+	insize = input_len;
+	inptr  = 0;
+
+	if ((ulg)output & 0x1fffffUL)
+		error("Destination address not 2M aligned");
+	if ((ulg)output >= 0xffffffffffUL)
+		error("Destination address too large");
 
 	makecrc();
 	putstr(".\nDecompressing Linux...");
 	gunzip();
 	putstr("done.\nBooting the kernel.\n");
-	if (high_loaded) close_output_buffer_if_we_run_high(mv);
-	return high_loaded;
+	return;
 }
diff --git a/arch/x86_64/boot/compressed/vmlinux.lds b/arch/x86_64/boot/compressed/vmlinux.lds
new file mode 100644
index 000000000000..94c13e557fb4
--- /dev/null
+++ b/arch/x86_64/boot/compressed/vmlinux.lds
@@ -0,0 +1,44 @@
+OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
+OUTPUT_ARCH(i386:x86-64)
+ENTRY(startup_64)
+SECTIONS
+{
+	/* Be careful parts of head.S assume startup_32 is at
+ 	 * address 0.
+	 */
+	. = 0;
+	.text :	{
+		_head = . ;
+		*(.text.head)
+		_ehead = . ;
+		*(.text.compressed)
+		_text = .; 	/* Text */
+		*(.text)
+		*(.text.*)
+		_etext = . ;
+	}
+	.rodata : {
+		_rodata = . ;
+		*(.rodata)	 /* read-only data */
+		*(.rodata.*)
+		_erodata = . ;
+	}
+	.data :	{
+		_data = . ;
+		*(.data)
+		*(.data.*)
+		_edata = . ;
+	}
+	.bss : {
+		_bss = . ;
+		*(.bss)
+		*(.bss.*)
+		*(COMMON)
+		. = ALIGN(8);
+		_end = . ;
+		. = ALIGN(4096);
+		pgtable = . ;
+		. = . + 4096 * 6;
+		_heap = .;
+	}
+}
diff --git a/arch/x86_64/boot/compressed/vmlinux.scr b/arch/x86_64/boot/compressed/vmlinux.scr
index 1ed9d791f863..bd1429ce193e 100644
--- a/arch/x86_64/boot/compressed/vmlinux.scr
+++ b/arch/x86_64/boot/compressed/vmlinux.scr
@@ -1,9 +1,10 @@
 SECTIONS
 {
-  .data : { 
+  .text.compressed : {
 	input_len = .;
-	LONG(input_data_end - input_data) input_data = .; 
-	*(.data) 
-	input_data_end = .; 
+	LONG(input_data_end - input_data) input_data = .;
+	*(.data)
+	output_len = . - 4;
+	input_data_end = .;
 	}
 }
diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S
index c211e52f1333..36aa98a6d15c 100644
--- a/arch/x86_64/kernel/head.S
+++ b/arch/x86_64/kernel/head.S
@@ -5,6 +5,7 @@
  *  Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
  *  Copyright (C) 2000 Karsten Keil <kkeil@suse.de>
  *  Copyright (C) 2001,2002 Andi Kleen <ak@suse.de>
+ *  Copyright (C) 2005 Eric Biederman <ebiederm@xmission.com>
  */
 
 
@@ -17,95 +18,127 @@
 #include <asm/page.h>
 #include <asm/msr.h>
 #include <asm/cache.h>
-	
+
 /* we are not able to switch in one step to the final KERNEL ADRESS SPACE
- * because we need identity-mapped pages on setup so define __START_KERNEL to
- * 0x100000 for this stage
- * 
+ * because we need identity-mapped pages.
+ *
  */
 
 	.text
 	.section .bootstrap.text
-	.code32
-	.globl startup_32
-/* %bx:	 1 if coming from smp trampoline on secondary cpu */ 
-startup_32:
-	
+	.code64
+	.globl startup_64
+startup_64:
+
 	/*
-	 * At this point the CPU runs in 32bit protected mode (CS.D = 1) with
-	 * paging disabled and the point of this file is to switch to 64bit
-	 * long mode with a kernel mapping for kerneland to jump into the
-	 * kernel virtual addresses.
- 	 * There is no stack until we set one up.
+	 * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
+	 * and someone has loaded an identity mapped page table
+	 * for us.  These identity mapped page tables map all of the
+	 * kernel pages and possibly all of memory.
+	 *
+	 * %esi holds a physical pointer to real_mode_data.
+	 *
+	 * We come here either directly from a 64bit bootloader, or from
+	 * arch/x86_64/boot/compressed/head.S.
+	 *
+	 * We only come here initially at boot nothing else comes here.
+	 *
+	 * Since we may be loaded at an address different from what we were
+	 * compiled to run at we first fixup the physical addresses in our page
+	 * tables and then reload them.
 	 */
 
-	/* Initialize the %ds segment register */
-	movl $__KERNEL_DS,%eax
-	movl %eax,%ds
-
-	/* Load new GDT with the 64bit segments using 32bit descriptor */
-	lgdt	pGDT32 - __START_KERNEL_map
-
-	/* If the CPU doesn't support CPUID this will double fault.
-	 * Unfortunately it is hard to check for CPUID without a stack. 
+	/* Compute the delta between the address I am compiled to run at and the
+	 * address I am actually running at.
 	 */
-	
-	/* Check if extended functions are implemented */		
-	movl	$0x80000000, %eax
-	cpuid
-	cmpl	$0x80000000, %eax
-	jbe	no_long_mode
-	/* Check if long mode is implemented */
-	mov	$0x80000001, %eax
-	cpuid
-	btl	$29, %edx
-	jnc	no_long_mode
-
-	/*
-	 * Prepare for entering 64bits mode
+	leaq	_text(%rip), %rbp
+	subq	$_text - __START_KERNEL_map, %rbp
+
+	/* Is the address not 2M aligned? */
+	movq	%rbp, %rax
+	andl	$~LARGE_PAGE_MASK, %eax
+	testl	%eax, %eax
+	jnz	bad_address
+
+	/* Is the address too large? */
+	leaq	_text(%rip), %rdx
+	movq	$PGDIR_SIZE, %rax
+	cmpq	%rax, %rdx
+	jae	bad_address
+
+	/* Fixup the physical addresses in the page table
 	 */
+	addq	%rbp, init_level4_pgt + 0(%rip)
+	addq	%rbp, init_level4_pgt + (258*8)(%rip)
+	addq	%rbp, init_level4_pgt + (511*8)(%rip)
+
+	addq	%rbp, level3_ident_pgt + 0(%rip)
+	addq	%rbp, level3_kernel_pgt + (510*8)(%rip)
+
+	/* Add an Identity mapping if I am above 1G */
+	leaq	_text(%rip), %rdi
+	andq	$LARGE_PAGE_MASK, %rdi
+
+	movq	%rdi, %rax
+	shrq	$PUD_SHIFT, %rax
+	andq	$(PTRS_PER_PUD - 1), %rax
+	jz	ident_complete
+
+	leaq	(level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
+	leaq	level3_ident_pgt(%rip), %rbx
+	movq	%rdx, 0(%rbx, %rax, 8)
+
+	movq	%rdi, %rax
+	shrq	$PMD_SHIFT, %rax
+	andq	$(PTRS_PER_PMD - 1), %rax
+	leaq	__PAGE_KERNEL_LARGE_EXEC(%rdi), %rdx
+	leaq	level2_spare_pgt(%rip), %rbx
+	movq	%rdx, 0(%rbx, %rax, 8)
+ident_complete:
+
+	/* Fixup the kernel text+data virtual addresses
+	 */
+	leaq	level2_kernel_pgt(%rip), %rdi
+	leaq	4096(%rdi), %r8
+	/* See if it is a valid page table entry */
+1:	testq	$1, 0(%rdi)
+	jz	2f
+	addq	%rbp, 0(%rdi)
+	/* Go to the next page */
+2:	addq	$8, %rdi
+	cmp	%r8, %rdi
+	jne	1b
+
+	/* Fixup phys_base */
+	addq	%rbp, phys_base(%rip)
 
-	/* Enable PAE mode */
-	xorl	%eax, %eax
-	btsl	$5, %eax
-	movl	%eax, %cr4
-
-	/* Setup early boot stage 4 level pagetables */
-	movl	$(init_level4_pgt - __START_KERNEL_map), %eax
-	movl	%eax, %cr3
-
-	/* Setup EFER (Extended Feature Enable Register) */
-	movl	$MSR_EFER, %ecx
-	rdmsr
-
-	/* Enable Long Mode */
-	btsl	$_EFER_LME, %eax
-				
-	/* Make changes effective */
-	wrmsr
+#ifdef CONFIG_SMP
+	addq	%rbp, trampoline_level4_pgt + 0(%rip)
+	addq	%rbp, trampoline_level4_pgt + (511*8)(%rip)
+#endif
+#ifdef CONFIG_ACPI_SLEEP
+	addq	%rbp, wakeup_level4_pgt + 0(%rip)
+	addq	%rbp, wakeup_level4_pgt + (511*8)(%rip)
+#endif
 
-	xorl	%eax, %eax
-	btsl	$31, %eax			/* Enable paging and in turn activate Long Mode */
-	btsl	$0, %eax			/* Enable protected mode */
-	/* Make changes effective */
-	movl	%eax, %cr0
-	/*
-	 * At this point we're in long mode but in 32bit compatibility mode
-	 * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn
-	 * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we use
-	 * the new gdt/idt that has __KERNEL_CS with CS.L = 1.
+	/* Due to ENTRY(), sometimes the empty space gets filled with
+	 * zeros. Better take a jmp than relying on empty space being
+	 * filled with 0x90 (nop)
 	 */
-	ljmp	$__KERNEL_CS, $(startup_64 - __START_KERNEL_map)
-
-	.code64
-	.org 0x100	
-	.globl startup_64
-startup_64:
+	jmp secondary_startup_64
 ENTRY(secondary_startup_64)
-	/* We come here either from startup_32
-	 * or directly from a 64bit bootloader.
-	 * Since we may have come directly from a bootloader we
-	 * reload the page tables here.
+	/*
+	 * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
+	 * and someone has loaded a mapped page table.
+	 *
+	 * %esi holds a physical pointer to real_mode_data.
+	 *
+	 * We come here either from startup_64 (using physical addresses)
+	 * or from trampoline.S (using virtual addresses).
+	 *
+	 * Using virtual addresses from trampoline.S removes the need
+	 * to have any identity mapped pages in the kernel page table
+	 * after the boot processor executes this code.
 	 */
 
 	/* Enable PAE mode and PGE */
@@ -116,8 +149,14 @@ ENTRY(secondary_startup_64)
 
 	/* Setup early boot stage 4 level pagetables. */
 	movq	$(init_level4_pgt - __START_KERNEL_map), %rax
+	addq	phys_base(%rip), %rax
 	movq	%rax, %cr3
 
+	/* Ensure I am executing from virtual addresses */
+	movq	$1f, %rax
+	jmp	*%rax
+1:
+
 	/* Check if nx is implemented */
 	movl	$0x80000001, %eax
 	cpuid
@@ -126,17 +165,11 @@ ENTRY(secondary_startup_64)
 	/* Setup EFER (Extended Feature Enable Register) */
 	movl	$MSR_EFER, %ecx
 	rdmsr
-
-	/* Enable System Call */
-	btsl	$_EFER_SCE, %eax
-
-	/* No Execute supported? */
-	btl	$20,%edi
+	btsl	$_EFER_SCE, %eax	/* Enable System Call */
+	btl	$20,%edi		/* No Execute supported? */
 	jnc     1f
 	btsl	$_EFER_NX, %eax
-1:
-	/* Make changes effective */
-	wrmsr
+1:	wrmsr				/* Make changes effective */
 
 	/* Setup cr0 */
 #define CR0_PM				1		/* protected mode */
@@ -163,7 +196,7 @@ ENTRY(secondary_startup_64)
 	 * addresses where we're currently running on. We have to do that here
 	 * because in 32bit we couldn't load a 64bit linear address.
 	 */
-	lgdt	cpu_gdt_descr
+	lgdt	cpu_gdt_descr(%rip)
 
 	/* set up data segments. actually 0 would do too */
 	movl $__KERNEL_DS,%eax
@@ -214,6 +247,9 @@ initial_code:
 init_rsp:
 	.quad  init_thread_union+THREAD_SIZE-8
 
+bad_address:
+	jmp bad_address
+
 ENTRY(early_idt_handler)
 	cmpl $2,early_recursion_flag(%rip)
 	jz  1f
@@ -242,23 +278,7 @@ early_idt_msg:
 early_idt_ripmsg:
 	.asciz "RIP %s\n"
 
-.code32
-ENTRY(no_long_mode)
-	/* This isn't an x86-64 CPU so hang */
-1:
-	jmp	1b
-
-.org 0xf00
-	.globl pGDT32
-pGDT32:
-	.word	gdt_end-cpu_gdt_table-1
-	.long	cpu_gdt_table-__START_KERNEL_map
-
-.org 0xf10	
-ljumpvector:
-	.long	startup_64-__START_KERNEL_map
-	.word	__KERNEL_CS
-
+.balign PAGE_SIZE
 ENTRY(stext)
 ENTRY(_stext)
 
@@ -303,7 +323,7 @@ NEXT_PAGE(level2_ident_pgt)
 	 * Don't set NX because code runs from these pages.
 	 */
 	PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD)
-	
+
 NEXT_PAGE(level2_kernel_pgt)
 	/* 40MB kernel mapping. The kernel code cannot be bigger than that.
 	   When you change this change KERNEL_TEXT_SIZE in page.h too. */
@@ -313,6 +333,9 @@ NEXT_PAGE(level2_kernel_pgt)
 	/* Module mapping starts here */
 	.fill	(PTRS_PER_PMD - (KERNEL_TEXT_SIZE/PMD_SIZE)),8,0
 
+NEXT_PAGE(level2_spare_pgt)
+	.fill   512,8,0
+
 #undef PMDS
 #undef NEXT_PAGE
 
@@ -330,6 +353,10 @@ gdt:
 	.endr
 #endif
 
+ENTRY(phys_base)
+	/* This must match the first entry in level2_kernel_pgt */
+	.quad   0x0000000000000000
+
 /* We need valid kernel segments for data and code in long mode too
  * IRET will check the segment types  kkeil 2000/10/28
  * Also sysret mandates a special GDT layout 
diff --git a/arch/x86_64/kernel/suspend_asm.S b/arch/x86_64/kernel/suspend_asm.S
index bfbe00763c68..16d183f67bc1 100644
--- a/arch/x86_64/kernel/suspend_asm.S
+++ b/arch/x86_64/kernel/suspend_asm.S
@@ -71,9 +71,10 @@ loop:
 	jmp	loop
 done:
 	/* go back to the original page tables */
-	leaq	init_level4_pgt(%rip), %rax
-	subq	$__START_KERNEL_map, %rax
-	movq	%rax, %cr3
+	movq    $(init_level4_pgt - __START_KERNEL_map), %rax
+	addq    phys_base(%rip), %rax
+	movq    %rax, %cr3
+
 	/* Flush TLB, including "global" things (vmalloc) */
 	movq	mmu_cr4_features(%rip), %rax
 	movq	%rax, %rdx
diff --git a/include/asm-x86_64/page.h b/include/asm-x86_64/page.h
index 4974433bbf34..40a24d0df090 100644
--- a/include/asm-x86_64/page.h
+++ b/include/asm-x86_64/page.h
@@ -61,6 +61,8 @@ typedef struct { unsigned long pgd; } pgd_t;
 
 typedef struct { unsigned long pgprot; } pgprot_t;
 
+extern unsigned long phys_base;
+
 #define pte_val(x)	((x).pte)
 #define pmd_val(x)	((x).pmd)
 #define pud_val(x)	((x).pud)
@@ -101,14 +103,14 @@ typedef struct { unsigned long pgprot; } pgprot_t;
 #define PAGE_OFFSET		__PAGE_OFFSET
 
 /* Note: __pa(&symbol_visible_to_c) should be always replaced with __pa_symbol.
-   Otherwise you risk miscompilation. */ 
+   Otherwise you risk miscompilation. */
 #define __pa(x)			((unsigned long)(x) - PAGE_OFFSET)
 /* __pa_symbol should be used for C visible symbols.
    This seems to be the official gcc blessed way to do such arithmetic. */ 
 #define __pa_symbol(x)		\
 	({unsigned long v;  \
 	  asm("" : "=r" (v) : "0" (x)); \
-	  (v - __START_KERNEL_map); })
+	  ((v - __START_KERNEL_map) + phys_base); })
 
 #define __va(x)			((void *)((unsigned long)(x)+PAGE_OFFSET))
 #ifdef CONFIG_FLATMEM
-- 
cgit v1.2.3


From 6a50a664ca0cfd2a487525f10cec3ff4d570b5e8 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@in.ibm.com>
Date: Wed, 2 May 2007 19:27:08 +0200
Subject: [PATCH] x86-64: build-time checking

o X86_64 kernel should run from 2MB aligned address for two reasons.
	- Performance.
	- For relocatable kernels, page tables are updated based on difference
	  between compile time address and load time physical address.
	  This difference should be multiple of 2MB as kernel text and data
	  is mapped using 2MB pages and PMD should be pointing to a 2MB
	  aligned address. Life is simpler if both compile time and load time
	  kernel addresses are 2MB aligned.

o Flag the error at compile time if one is trying to build a kernel which
  does not meet alignment restrictions.

Signed-off-by: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/x86_64/boot/compressed/misc.c | 2 +-
 arch/x86_64/kernel/head64.c        | 7 +++++++
 include/asm-x86_64/page.h          | 1 +
 3 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'include/asm-x86_64')

diff --git a/arch/x86_64/boot/compressed/misc.c b/arch/x86_64/boot/compressed/misc.c
index fee54dbf1749..fed1167159c3 100644
--- a/arch/x86_64/boot/compressed/misc.c
+++ b/arch/x86_64/boot/compressed/misc.c
@@ -358,7 +358,7 @@ asmlinkage void decompress_kernel(void *rmode, unsigned long heap,
 	insize = input_len;
 	inptr  = 0;
 
-	if ((ulg)output & 0x1fffffUL)
+	if ((ulg)output & (__KERNEL_ALIGN - 1))
 		error("Destination address not 2M aligned");
 	if ((ulg)output >= 0xffffffffffUL)
 		error("Destination address too large");
diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c
index 6c34bdd22e26..213d90e04755 100644
--- a/arch/x86_64/kernel/head64.c
+++ b/arch/x86_64/kernel/head64.c
@@ -62,6 +62,13 @@ void __init x86_64_start_kernel(char * real_mode_data)
 {
 	int i;
 
+	/*
+	 * Make sure kernel is aligned to 2MB address. Catching it at compile
+	 * time is better. Change your config file and compile the kernel
+	 * for a 2MB aligned address (CONFIG_PHYSICAL_START)
+	 */
+	BUILD_BUG_ON(CONFIG_PHYSICAL_START & (__KERNEL_ALIGN - 1));
+
 	/* clear bss before set_intr_gate with early_idt_handler */
 	clear_bss();
 
diff --git a/include/asm-x86_64/page.h b/include/asm-x86_64/page.h
index 40a24d0df090..b17fc16ec2eb 100644
--- a/include/asm-x86_64/page.h
+++ b/include/asm-x86_64/page.h
@@ -78,6 +78,7 @@ extern unsigned long phys_base;
 #endif /* !__ASSEMBLY__ */
 
 #define __PHYSICAL_START	CONFIG_PHYSICAL_START
+#define __KERNEL_ALIGN		0x200000
 #define __START_KERNEL		(__START_KERNEL_map + __PHYSICAL_START)
 #define __START_KERNEL_map	0xffffffff80000000
 #define __PAGE_OFFSET           0xffff810000000000
-- 
cgit v1.2.3


From 5a90cf205c922707ffed2d8f87cefd942e96b0ba Mon Sep 17 00:00:00 2001
From: john stultz <johnstul@us.ibm.com>
Date: Wed, 2 May 2007 19:27:08 +0200
Subject: [PATCH] x86: Log reason why TSC was marked unstable

Change mark_tsc_unstable() so it takes a string argument, which holds the
reason the TSC was marked unstable.

This is then displayed the first time mark_tsc_unstable is called.

This should help us better debug why the TSC was marked unstable on certain
systems and allow us to make sure we're not being overly paranoid when
throwing out this troublesome clocksource.

Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/i386/kernel/cpu/cyrix.c                | 2 +-
 arch/i386/kernel/tsc.c                      | 5 +++--
 arch/x86_64/kernel/time.c                   | 2 +-
 arch/x86_64/kernel/tsc.c                    | 5 +++--
 arch/x86_64/kernel/tsc_sync.c               | 2 +-
 drivers/acpi/processor_idle.c               | 4 ++--
 include/asm-i386/mach-summit/mach_mpparse.h | 4 ++--
 include/asm-i386/tsc.h                      | 2 +-
 include/asm-x86_64/timex.h                  | 2 +-
 9 files changed, 15 insertions(+), 13 deletions(-)

(limited to 'include/asm-x86_64')

diff --git a/arch/i386/kernel/cpu/cyrix.c b/arch/i386/kernel/cpu/cyrix.c
index f0badfdd4e45..e77f8e1cf7aa 100644
--- a/arch/i386/kernel/cpu/cyrix.c
+++ b/arch/i386/kernel/cpu/cyrix.c
@@ -279,7 +279,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
 		 */  
 		if (vendor == PCI_VENDOR_ID_CYRIX &&
 	 (device == PCI_DEVICE_ID_CYRIX_5510 || device == PCI_DEVICE_ID_CYRIX_5520))
-			mark_tsc_unstable();
+			mark_tsc_unstable("cyrix 5510/5520 detected");
 	}
 #endif
 		c->x86_cache_size=16;	/* Yep 16K integrated cache thats it */
diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c
index 6cb8f5336732..755209dc93e1 100644
--- a/arch/i386/kernel/tsc.c
+++ b/arch/i386/kernel/tsc.c
@@ -233,7 +233,7 @@ time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data)
 				 * TSC based sched_clock turns
 				 * to junk w/ cpufreq
 				 */
-				mark_tsc_unstable();
+				mark_tsc_unstable("cpufreq changes");
 			}
 		}
 	}
@@ -281,11 +281,12 @@ static struct clocksource clocksource_tsc = {
 				  CLOCK_SOURCE_MUST_VERIFY,
 };
 
-void mark_tsc_unstable(void)
+void mark_tsc_unstable(char *reason)
 {
 	if (!tsc_unstable) {
 		tsc_unstable = 1;
 		tsc_enabled = 0;
+		printk("Marking TSC unstable due to: %s.\n", reason);
 		/* Can be called before registration */
 		if (clocksource_tsc.mult)
 			clocksource_change_rating(&clocksource_tsc, 0);
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index 5f862e216a42..91c9066a380d 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -397,7 +397,7 @@ void __init time_init(void)
 		cpu_khz = tsc_calibrate_cpu_khz();
 
 	if (unsynchronized_tsc())
-		mark_tsc_unstable();
+		mark_tsc_unstable("TSCs unsynchronized");
 
 	if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP))
 		vgetcpu_mode = VGETCPU_RDTSCP;
diff --git a/arch/x86_64/kernel/tsc.c b/arch/x86_64/kernel/tsc.c
index 5c84992c676d..48f9a8e6aa91 100644
--- a/arch/x86_64/kernel/tsc.c
+++ b/arch/x86_64/kernel/tsc.c
@@ -111,7 +111,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
 
 		tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
 		if (!(freq->flags & CPUFREQ_CONST_LOOPS))
-			mark_tsc_unstable();
+			mark_tsc_unstable("cpufreq changes");
 	}
 
 	set_cyc2ns_scale(tsc_khz_ref);
@@ -199,10 +199,11 @@ static struct clocksource clocksource_tsc = {
 	.vread			= vread_tsc,
 };
 
-void mark_tsc_unstable(void)
+void mark_tsc_unstable(char *reason)
 {
 	if (!tsc_unstable) {
 		tsc_unstable = 1;
+		printk("Marking TSC unstable due to %s\n", reason);
 		/* Change only the rating, when not registered */
 		if (clocksource_tsc.mult)
 			clocksource_change_rating(&clocksource_tsc, 0);
diff --git a/arch/x86_64/kernel/tsc_sync.c b/arch/x86_64/kernel/tsc_sync.c
index 72d444dede9b..355f5f506c81 100644
--- a/arch/x86_64/kernel/tsc_sync.c
+++ b/arch/x86_64/kernel/tsc_sync.c
@@ -138,7 +138,7 @@ void __cpuinit check_tsc_sync_source(int cpu)
 		printk("\n");
 		printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs,"
 				    " turning off TSC clock.\n", max_warp);
-		mark_tsc_unstable();
+		mark_tsc_unstable("check_tsc_sync_source failed");
 		nr_warps = 0;
 		max_warp = 0;
 		last_tsc = 0;
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index ae0654cd11ea..ee5759bef945 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -475,7 +475,7 @@ static void acpi_processor_idle(void)
 
 #ifdef CONFIG_GENERIC_TIME
 		/* TSC halts in C2, so notify users */
-		mark_tsc_unstable();
+		mark_tsc_unstable("possible TSC halt in C2");
 #endif
 		/* Re-enable interrupts */
 		local_irq_enable();
@@ -517,7 +517,7 @@ static void acpi_processor_idle(void)
 
 #ifdef CONFIG_GENERIC_TIME
 		/* TSC halts in C3, so notify users */
-		mark_tsc_unstable();
+		mark_tsc_unstable("TSC halts in C3");
 #endif
 		/* Re-enable interrupts */
 		local_irq_enable();
diff --git a/include/asm-i386/mach-summit/mach_mpparse.h b/include/asm-i386/mach-summit/mach_mpparse.h
index 94268399170d..c2520539d934 100644
--- a/include/asm-i386/mach-summit/mach_mpparse.h
+++ b/include/asm-i386/mach-summit/mach_mpparse.h
@@ -30,7 +30,7 @@ static inline int mps_oem_check(struct mp_config_table *mpc, char *oem,
 			(!strncmp(productid, "VIGIL SMP", 9) 
 			 || !strncmp(productid, "EXA", 3)
 			 || !strncmp(productid, "RUTHLESS SMP", 12))){
-		mark_tsc_unstable();
+		mark_tsc_unstable("Summit based system");
 		use_cyclone = 1; /*enable cyclone-timer*/
 		setup_summit();
 		return 1;
@@ -44,7 +44,7 @@ static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 	if (!strncmp(oem_id, "IBM", 3) &&
 	    (!strncmp(oem_table_id, "SERVIGIL", 8)
 	     || !strncmp(oem_table_id, "EXA", 3))){
-		mark_tsc_unstable();
+		mark_tsc_unstable("Summit based system");
 		use_cyclone = 1; /*enable cyclone-timer*/
 		setup_summit();
 		return 1;
diff --git a/include/asm-i386/tsc.h b/include/asm-i386/tsc.h
index 84016ff481b9..346976632e15 100644
--- a/include/asm-i386/tsc.h
+++ b/include/asm-i386/tsc.h
@@ -53,7 +53,7 @@ static __always_inline cycles_t get_cycles_sync(void)
 }
 
 extern void tsc_init(void);
-extern void mark_tsc_unstable(void);
+extern void mark_tsc_unstable(char *reason);
 extern int unsynchronized_tsc(void);
 extern void init_tsc_clocksource(void);
 
diff --git a/include/asm-x86_64/timex.h b/include/asm-x86_64/timex.h
index 8c6808a3fba4..f6527e1b6c1c 100644
--- a/include/asm-x86_64/timex.h
+++ b/include/asm-x86_64/timex.h
@@ -27,6 +27,6 @@ extern int read_current_timer(unsigned long *timer_value);
 #define NS_SCALE        10 /* 2^10, carefully chosen */
 #define US_SCALE        32 /* 2^32, arbitralrily chosen */
 
-extern void mark_tsc_unstable(void);
+extern void mark_tsc_unstable(char *msg);
 extern void set_cyc2ns_scale(unsigned long khz);
 #endif
-- 
cgit v1.2.3


From 8b8ca80e192b10eecc01fc44a2902510af86f73b Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Wed, 2 May 2007 19:27:09 +0200
Subject: [PATCH] x86-64: configurable fake numa node sizes

Extends the numa=fake x86_64 command-line option to allow for configurable
node sizes.  These nodes can be used in conjunction with cpusets for coarse
memory resource management.

The old command-line option is still supported:
  numa=fake=32	gives 32 fake NUMA nodes, ignoring the NUMA setup of the
		actual machine.

But now you may configure your system for the node sizes of your choice:
  numa=fake=2*512,1024,2*256
		gives two 512M nodes, one 1024M node, two 256M nodes, and
		the rest of system memory to a sixth node.

The existing hash function is maintained to support the various node sizes
that are possible with this implementation.

Each node of the same size receives roughly the same amount of available
pages, regardless of any reserved memory with its address range.  The total
available pages on the system is calculated and divided by the number of equal
nodes to allocate.  These nodes are then dynamically allocated and their
borders extended until such time as their number of available pages reaches
the required size.

Configurable node sizes are recommended when used in conjunction with cpusets
for memory control because it eliminates the overhead associated with scanning
the zonelists of many smaller full nodes on page_alloc().

Cc: Andi Kleen <ak@suse.de>
Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Paul Jackson <pj@sgi.com>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/x86_64/boot-options.txt |   8 +-
 arch/x86_64/mm/numa.c                 | 261 ++++++++++++++++++++--------------
 include/asm-x86_64/mmzone.h           |   2 +-
 3 files changed, 161 insertions(+), 110 deletions(-)

(limited to 'include/asm-x86_64')

diff --git a/Documentation/x86_64/boot-options.txt b/Documentation/x86_64/boot-options.txt
index 85f51e5a749f..7500aad95f3c 100644
--- a/Documentation/x86_64/boot-options.txt
+++ b/Documentation/x86_64/boot-options.txt
@@ -149,7 +149,13 @@ NUMA
 
   numa=noacpi   Don't parse the SRAT table for NUMA setup
 
-  numa=fake=X   Fake X nodes and ignore NUMA setup of the actual machine.
+  numa=fake=CMDLINE
+		If a number, fakes CMDLINE nodes and ignores NUMA setup of the
+		actual machine.  Otherwise, system memory is configured
+		depending on the sizes and coefficients listed.  For example:
+			numa=fake=2*512,1024,4*256
+		gives two 512M nodes, a 1024M node, and four 256M nodes.  The
+		remaining system RAM is allocated to an additional node.
 
   numa=hotadd=percent
 		Only allow hotadd memory to preallocate page structures upto
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 41b8fb069924..c55936bc6be6 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -273,125 +273,172 @@ void __init numa_init_array(void)
 
 #ifdef CONFIG_NUMA_EMU
 /* Numa emulation */
-int numa_fake __initdata = 0;
+#define E820_ADDR_HOLE_SIZE(start, end)					\
+	(e820_hole_size((start) >> PAGE_SHIFT, (end) >> PAGE_SHIFT) <<	\
+	PAGE_SHIFT)
+char *cmdline __initdata;
 
 /*
- * This function is used to find out if the start and end correspond to
- * different zones.
+ * Setups up nid to range from addr to addr + size.  If the end boundary is
+ * greater than max_addr, then max_addr is used instead.  The return value is 0
+ * if there is additional memory left for allocation past addr and -1 otherwise.
+ * addr is adjusted to be at the end of the node.
  */
-int zone_cross_over(unsigned long start, unsigned long end)
+static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
+				   u64 size, u64 max_addr)
 {
-	if ((start < (MAX_DMA32_PFN << PAGE_SHIFT)) &&
-			(end >= (MAX_DMA32_PFN << PAGE_SHIFT)))
-		return 1;
-	return 0;
+	int ret = 0;
+	nodes[nid].start = *addr;
+	*addr += size;
+	if (*addr >= max_addr) {
+		*addr = max_addr;
+		ret = -1;
+	}
+	nodes[nid].end = *addr;
+	node_set_online(nid);
+	printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
+	       nodes[nid].start, nodes[nid].end,
+	       (nodes[nid].end - nodes[nid].start) >> 20);
+	return ret;
 }
 
-static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
+/*
+ * Splits num_nodes nodes up equally starting at node_start.  The return value
+ * is the number of nodes split up and addr is adjusted to be at the end of the
+ * last node allocated.
+ */
+static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
+				      u64 max_addr, int node_start,
+				      int num_nodes)
 {
- 	int i, big;
- 	struct bootnode nodes[MAX_NUMNODES];
- 	unsigned long sz, old_sz;
-	unsigned long hole_size;
-	unsigned long start, end;
-	unsigned long max_addr = (end_pfn << PAGE_SHIFT);
-
-	start = (start_pfn << PAGE_SHIFT);
-	hole_size = e820_hole_size(start, max_addr);
-	sz = (max_addr - start - hole_size) / numa_fake;
-
- 	/* Kludge needed for the hash function */
-
-	old_sz = sz;
-	/*
-	 * Round down to the nearest FAKE_NODE_MIN_SIZE.
-	 */
-	sz &= FAKE_NODE_MIN_HASH_MASK;
+	unsigned int big;
+	u64 size;
+	int i;
 
+	if (num_nodes <= 0)
+		return -1;
+	if (num_nodes > MAX_NUMNODES)
+		num_nodes = MAX_NUMNODES;
+	size = (max_addr - *addr - E820_ADDR_HOLE_SIZE(*addr, max_addr)) /
+	       num_nodes;
 	/*
-	 * We ensure that each node is at least 64MB big.  Smaller than this
-	 * size can cause VM hiccups.
+	 * Calculate the number of big nodes that can be allocated as a result
+	 * of consolidating the leftovers.
 	 */
-	if (sz == 0) {
-		printk(KERN_INFO "Not enough memory for %d nodes.  Reducing "
-				"the number of nodes\n", numa_fake);
-		numa_fake = (max_addr - start - hole_size) / FAKE_NODE_MIN_SIZE;
-		printk(KERN_INFO "Number of fake nodes will be = %d\n",
-				numa_fake);
-		sz = FAKE_NODE_MIN_SIZE;
+	big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) /
+	      FAKE_NODE_MIN_SIZE;
+
+	/* Round down to nearest FAKE_NODE_MIN_SIZE. */
+	size &= FAKE_NODE_MIN_HASH_MASK;
+	if (!size) {
+		printk(KERN_ERR "Not enough memory for each node.  "
+		       "NUMA emulation disabled.\n");
+		return -1;
 	}
-	/*
-	 * Find out how many nodes can get an extra NODE_MIN_SIZE granule.
-	 * This logic ensures the extra memory gets distributed among as many
-	 * nodes as possible (as compared to one single node getting all that
-	 * extra memory.
-	 */
-	big = ((old_sz - sz) * numa_fake) / FAKE_NODE_MIN_SIZE;
-	printk(KERN_INFO "Fake node Size: %luMB hole_size: %luMB big nodes: "
-			"%d\n",
-			(sz >> 20), (hole_size >> 20), big);
- 	memset(&nodes,0,sizeof(nodes));
-	end = start;
- 	for (i = 0; i < numa_fake; i++) {
-		/*
-		 * In case we are not able to allocate enough memory for all
-		 * the nodes, we reduce the number of fake nodes.
-		 */
-		if (end >= max_addr) {
-			numa_fake = i - 1;
-			break;
-		}
- 		start = nodes[i].start = end;
-		/*
-		 * Final node can have all the remaining memory.
-		 */
- 		if (i == numa_fake-1)
- 			sz = max_addr - start;
- 		end = nodes[i].start + sz;
-		/*
-		 * Fir "big" number of nodes get extra granule.
-		 */
+
+	for (i = node_start; i < num_nodes + node_start; i++) {
+		u64 end = *addr + size;
 		if (i < big)
 			end += FAKE_NODE_MIN_SIZE;
 		/*
-		 * Iterate over the range to ensure that this node gets at
-		 * least sz amount of RAM (excluding holes)
+		 * The final node can have the remaining system RAM.  Other
+		 * nodes receive roughly the same amount of available pages.
 		 */
-		while ((end - start - e820_hole_size(start, end)) < sz) {
-			end += FAKE_NODE_MIN_SIZE;
-			if (end >= max_addr)
-				break;
+		if (i == num_nodes + node_start - 1)
+			end = max_addr;
+		else
+			while (end - *addr - E820_ADDR_HOLE_SIZE(*addr, end) <
+			       size) {
+				end += FAKE_NODE_MIN_SIZE;
+				if (end > max_addr) {
+					end = max_addr;
+					break;
+				}
+			}
+		if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0)
+			break;
+	}
+	return i - node_start + 1;
+}
+
+/*
+ * Sets up the system RAM area from start_pfn to end_pfn according to the
+ * numa=fake command-line option.
+ */
+static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
+{
+	struct bootnode nodes[MAX_NUMNODES];
+	u64 addr = start_pfn << PAGE_SHIFT;
+	u64 max_addr = end_pfn << PAGE_SHIFT;
+	unsigned int coeff;
+	unsigned int num = 0;
+	int num_nodes = 0;
+	u64 size;
+	int i;
+
+	memset(&nodes, 0, sizeof(nodes));
+	/*
+	 * If the numa=fake command-line is just a single number N, split the
+	 * system RAM into N fake nodes.
+	 */
+	if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
+		num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0,
+						simple_strtol(cmdline, NULL, 0));
+		if (num_nodes < 0)
+			return num_nodes;
+		goto out;
+	}
+
+	/* Parse the command line. */
+	for (coeff = 1; ; cmdline++) {
+		if (*cmdline && isdigit(*cmdline)) {
+			num = num * 10 + *cmdline - '0';
+			continue;
 		}
-		/*
-		 * Look at the next node to make sure there is some real memory
-		 * to map.  Bad things happen when the only memory present
-		 * in a zone on a fake node is IO hole.
-		 */
-		while (e820_hole_size(end, end + FAKE_NODE_MIN_SIZE) > 0) {
-			if (zone_cross_over(start, end + sz)) {
-				end = (MAX_DMA32_PFN << PAGE_SHIFT);
-				break;
+		if (*cmdline == '*')
+			coeff = num;
+		if (!*cmdline || *cmdline == ',') {
+			/*
+			 * Round down to the nearest FAKE_NODE_MIN_SIZE.
+			 * Command-line coefficients are in megabytes.
+			 */
+			size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
+			if (size) {
+				for (i = 0; i < coeff; i++, num_nodes++)
+					if (setup_node_range(num_nodes, nodes,
+						&addr, size, max_addr) < 0)
+						goto done;
+				coeff = 1;
 			}
-			if (end >= max_addr)
-				break;
-			end += FAKE_NODE_MIN_SIZE;
 		}
-		if (end > max_addr)
-			end = max_addr;
-		nodes[i].end = end;
- 		printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
- 		       i,
- 		       nodes[i].start, nodes[i].end,
- 		       (nodes[i].end - nodes[i].start) >> 20);
-		node_set_online(i);
- 	}
- 	memnode_shift = compute_hash_shift(nodes, numa_fake);
- 	if (memnode_shift < 0) {
- 		memnode_shift = 0;
- 		printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
- 		return -1;
- 	}
- 	for_each_online_node(i) {
+		if (!*cmdline)
+			break;
+		num = 0;
+	}
+done:
+	if (!num_nodes)
+		return -1;
+	/* Fill remainder of system RAM with a final node, if appropriate. */
+	if (addr < max_addr) {
+		setup_node_range(num_nodes, nodes, &addr, max_addr - addr,
+				 max_addr);
+		num_nodes++;
+	}
+out:
+	memnode_shift = compute_hash_shift(nodes, num_nodes);
+	if (memnode_shift < 0) {
+		memnode_shift = 0;
+		printk(KERN_ERR "No NUMA hash function found.  NUMA emulation "
+		       "disabled.\n");
+		return -1;
+	}
+
+	/*
+	 * We need to vacate all active ranges that may have been registered by
+	 * SRAT.
+	 */
+	remove_all_active_ranges();
+	for_each_online_node(i) {
 		e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
 						nodes[i].end >> PAGE_SHIFT);
  		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
@@ -399,14 +446,15 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
  	numa_init_array();
  	return 0;
 }
-#endif
+#undef E820_ADDR_HOLE_SIZE
+#endif /* CONFIG_NUMA_EMU */
 
 void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
 { 
 	int i;
 
 #ifdef CONFIG_NUMA_EMU
-	if (numa_fake && !numa_emulation(start_pfn, end_pfn))
+	if (cmdline && !numa_emulation(start_pfn, end_pfn))
  		return;
 #endif
 
@@ -486,11 +534,8 @@ static __init int numa_setup(char *opt)
 	if (!strncmp(opt,"off",3))
 		numa_off = 1;
 #ifdef CONFIG_NUMA_EMU
-	if(!strncmp(opt, "fake=", 5)) {
-		numa_fake = simple_strtoul(opt+5,NULL,0); ;
-		if (numa_fake >= MAX_NUMNODES)
-			numa_fake = MAX_NUMNODES;
-	}
+	if (!strncmp(opt, "fake=", 5))
+		cmdline = opt + 5;
 #endif
 #ifdef CONFIG_ACPI_NUMA
  	if (!strncmp(opt,"noacpi",6))
diff --git a/include/asm-x86_64/mmzone.h b/include/asm-x86_64/mmzone.h
index fb558fb1d211..19a89377b123 100644
--- a/include/asm-x86_64/mmzone.h
+++ b/include/asm-x86_64/mmzone.h
@@ -49,7 +49,7 @@ extern int pfn_valid(unsigned long pfn);
 
 #ifdef CONFIG_NUMA_EMU
 #define FAKE_NODE_MIN_SIZE	(64*1024*1024)
-#define FAKE_NODE_MIN_HASH_MASK	(~(FAKE_NODE_MIN_SIZE - 1ul))
+#define FAKE_NODE_MIN_HASH_MASK	(~(FAKE_NODE_MIN_SIZE - 1uL))
 #endif
 
 #endif
-- 
cgit v1.2.3


From eab0c72aecd7982b2c848f7d493ba379efcef15e Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Wed, 2 May 2007 19:27:09 +0200
Subject: [PATCH] x86-64: Introduce load_TLS to the "for" loop.

GCC (4.1 at least) unrolls it anyway, but I can't believe this code
was ever justifiable.  (I've also submitted a patch which cleans up
i386, which is even uglier).

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/asm-x86_64/desc.h | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

(limited to 'include/asm-x86_64')

diff --git a/include/asm-x86_64/desc.h b/include/asm-x86_64/desc.h
index 7726e74db536..ac991b5ca0fd 100644
--- a/include/asm-x86_64/desc.h
+++ b/include/asm-x86_64/desc.h
@@ -135,16 +135,13 @@ static inline void set_ldt_desc(unsigned cpu, void *addr, int size)
 	(info)->useable		== 0	&& \
 	(info)->lm		== 0)
 
-#if TLS_SIZE != 24
-# error update this code.
-#endif
-
 static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
 {
+	unsigned int i;
 	u64 *gdt = (u64 *)(cpu_gdt(cpu) + GDT_ENTRY_TLS_MIN);
-	gdt[0] = t->tls_array[0];
-	gdt[1] = t->tls_array[1];
-	gdt[2] = t->tls_array[2];
+
+	for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
+		gdt[i] = t->tls_array[i];
 } 
 
 /*
-- 
cgit v1.2.3


From 2bff73830c3df5f575d3bc21bf19df1a10bf7091 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Wed, 2 May 2007 19:27:10 +0200
Subject: [PATCH] x86-64: use lru instead of page->index and page->private for
 pgd lists management.

x86_64 currently simulates a list using the index and private fields of the
page struct.  Seems that the code was inherited from i386.  But x86_64 does
not use the slab to allocate pgds and pmds etc.  So the lru field is not
used by the slab and therefore available.

This patch uses standard list operations on page->lru to realize pgd
tracking.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/x86_64/mm/fault.c       |  5 ++---
 include/asm-x86_64/pgalloc.h | 14 +++-----------
 include/asm-x86_64/pgtable.h |  2 +-
 3 files changed, 6 insertions(+), 15 deletions(-)

(limited to 'include/asm-x86_64')

diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c
index 6ada7231f3ab..de99dba2c515 100644
--- a/arch/x86_64/mm/fault.c
+++ b/arch/x86_64/mm/fault.c
@@ -585,7 +585,7 @@ do_sigbus:
 }
 
 DEFINE_SPINLOCK(pgd_lock);
-struct page *pgd_list;
+LIST_HEAD(pgd_list);
 
 void vmalloc_sync_all(void)
 {
@@ -605,8 +605,7 @@ void vmalloc_sync_all(void)
 			if (pgd_none(*pgd_ref))
 				continue;
 			spin_lock(&pgd_lock);
-			for (page = pgd_list; page;
-			     page = (struct page *)page->index) {
+			list_for_each_entry(page, &pgd_list, lru) {
 				pgd_t *pgd;
 				pgd = (pgd_t *)page_address(page) + pgd_index(address);
 				if (pgd_none(*pgd))
diff --git a/include/asm-x86_64/pgalloc.h b/include/asm-x86_64/pgalloc.h
index 31d497171969..8bb564687860 100644
--- a/include/asm-x86_64/pgalloc.h
+++ b/include/asm-x86_64/pgalloc.h
@@ -44,24 +44,16 @@ static inline void pgd_list_add(pgd_t *pgd)
 	struct page *page = virt_to_page(pgd);
 
 	spin_lock(&pgd_lock);
-	page->index = (pgoff_t)pgd_list;
-	if (pgd_list)
-		pgd_list->private = (unsigned long)&page->index;
-	pgd_list = page;
-	page->private = (unsigned long)&pgd_list;
+	list_add(&page->lru, &pgd_list);
 	spin_unlock(&pgd_lock);
 }
 
 static inline void pgd_list_del(pgd_t *pgd)
 {
-	struct page *next, **pprev, *page = virt_to_page(pgd);
+	struct page *page = virt_to_page(pgd);
 
 	spin_lock(&pgd_lock);
-	next = (struct page *)page->index;
-	pprev = (struct page **)page->private;
-	*pprev = next;
-	if (next)
-		next->private = (unsigned long)pprev;
+	list_del(&page->lru);
 	spin_unlock(&pgd_lock);
 }
 
diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h
index c1865e38c7b7..599993f6ba84 100644
--- a/include/asm-x86_64/pgtable.h
+++ b/include/asm-x86_64/pgtable.h
@@ -410,7 +410,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
 #define __swp_entry_to_pte(x)		((pte_t) { (x).val })
 
 extern spinlock_t pgd_lock;
-extern struct page *pgd_list;
+extern struct list_head pgd_list;
 void vmalloc_sync_all(void);
 
 extern int kern_addr_valid(unsigned long addr); 
-- 
cgit v1.2.3


From ca906e42312781c38b7a9625109fc65b937ca56c Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Wed, 2 May 2007 19:27:10 +0200
Subject: [PATCH] x86: sys_ioperm() prototype cleanup

- there's no reason for duplicating the prototype from
  include/linux/syscalls.h in include/asm-x86_64/unistd.h
- every file should #include the headers containing the prototypes for
  it's global functions

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/i386/kernel/ioport.c   | 1 +
 arch/x86_64/kernel/ioport.c | 1 +
 include/asm-x86_64/unistd.h | 1 -
 3 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/asm-x86_64')

diff --git a/arch/i386/kernel/ioport.c b/arch/i386/kernel/ioport.c
index 498e8bc197d5..1b4530e6cd82 100644
--- a/arch/i386/kernel/ioport.c
+++ b/arch/i386/kernel/ioport.c
@@ -16,6 +16,7 @@
 #include <linux/stddef.h>
 #include <linux/slab.h>
 #include <linux/thread_info.h>
+#include <linux/syscalls.h>
 
 /* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
 static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
diff --git a/arch/x86_64/kernel/ioport.c b/arch/x86_64/kernel/ioport.c
index 745b1f0f494e..387d347b0e07 100644
--- a/arch/x86_64/kernel/ioport.c
+++ b/arch/x86_64/kernel/ioport.c
@@ -16,6 +16,7 @@
 #include <linux/stddef.h>
 #include <linux/slab.h>
 #include <linux/thread_info.h>
+#include <linux/syscalls.h>
 
 /* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
 static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index c5f596e71faa..576b29732d3c 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -655,7 +655,6 @@ __SYSCALL(__NR_move_pages, sys_move_pages)
 #include <asm/ptrace.h>
 
 asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs);
-asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on);
 struct sigaction;
 asmlinkage long sys_rt_sigaction(int sig,
 				const struct sigaction __user *act,
-- 
cgit v1.2.3


From b00742d399513a4100c24cc2accefdc1bb1e0b15 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Wed, 2 May 2007 19:27:11 +0200
Subject: [PATCH] x86-64: Account for module percpu space separately from
 kernel percpu

Rather than using a single constant PERCPU_ENOUGH_ROOM, compute it as
the sum of kernel_percpu + PERCPU_MODULE_RESERVE.  This is now common
to all architectures; if an architecture wants to set
PERCPU_ENOUGH_ROOM to something special, then it may do so (ia64 is
the only one which does).

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Andi Kleen <ak@suse.de>
---
 include/asm-alpha/percpu.h   | 14 --------------
 include/asm-sparc64/percpu.h | 10 ----------
 include/asm-x86_64/percpu.h  | 10 ----------
 include/linux/percpu.h       |  9 ++++++++-
 kernel/module.c              |  2 +-
 5 files changed, 9 insertions(+), 36 deletions(-)

(limited to 'include/asm-x86_64')

diff --git a/include/asm-alpha/percpu.h b/include/asm-alpha/percpu.h
index 651ebb141b24..48348fe34c19 100644
--- a/include/asm-alpha/percpu.h
+++ b/include/asm-alpha/percpu.h
@@ -1,20 +1,6 @@
 #ifndef __ALPHA_PERCPU_H
 #define __ALPHA_PERCPU_H
 
-/*
- * Increase the per cpu area for Alpha so that
- * modules using percpu area can load.
- */
-#ifdef CONFIG_MODULES
-# define PERCPU_MODULE_RESERVE 8192
-#else
-# define PERCPU_MODULE_RESERVE 0
-#endif
-
-#define PERCPU_ENOUGH_ROOM \
-	(ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES) + \
-	 PERCPU_MODULE_RESERVE)
-
 #include <asm-generic/percpu.h>
 
 #endif /* __ALPHA_PERCPU_H */
diff --git a/include/asm-sparc64/percpu.h b/include/asm-sparc64/percpu.h
index 0d3df76aa47f..ced8cbde046d 100644
--- a/include/asm-sparc64/percpu.h
+++ b/include/asm-sparc64/percpu.h
@@ -5,16 +5,6 @@
 
 #ifdef CONFIG_SMP
 
-#ifdef CONFIG_MODULES
-# define PERCPU_MODULE_RESERVE 8192
-#else
-# define PERCPU_MODULE_RESERVE 0
-#endif
-
-#define PERCPU_ENOUGH_ROOM \
-	(ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES) + \
-	 PERCPU_MODULE_RESERVE)
-
 extern void setup_per_cpu_areas(void);
 
 extern unsigned long __per_cpu_base;
diff --git a/include/asm-x86_64/percpu.h b/include/asm-x86_64/percpu.h
index 5ed0ef340842..c6fbb67eac90 100644
--- a/include/asm-x86_64/percpu.h
+++ b/include/asm-x86_64/percpu.h
@@ -11,16 +11,6 @@
 
 #include <asm/pda.h>
 
-#ifdef CONFIG_MODULES
-# define PERCPU_MODULE_RESERVE 8192
-#else
-# define PERCPU_MODULE_RESERVE 0
-#endif
-
-#define PERCPU_ENOUGH_ROOM \
-	(ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES) + \
-	 PERCPU_MODULE_RESERVE)
-
 #define __per_cpu_offset(cpu) (cpu_pda(cpu)->data_offset)
 #define __my_cpu_offset() read_pda(data_offset)
 
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 600e3d387ffc..b72be2f79e6a 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -11,9 +11,16 @@
 
 /* Enough to cover all DEFINE_PER_CPUs in kernel, including modules. */
 #ifndef PERCPU_ENOUGH_ROOM
-#define PERCPU_ENOUGH_ROOM 32768
+#ifdef CONFIG_MODULES
+#define PERCPU_MODULE_RESERVE	8192
+#else
+#define PERCPU_MODULE_RESERVE	0
 #endif
 
+#define PERCPU_ENOUGH_ROOM						\
+	(__per_cpu_end - __per_cpu_start + PERCPU_MODULE_RESERVE)
+#endif	/* PERCPU_ENOUGH_ROOM */
+
 /*
  * Must be an lvalue. Since @var must be a simple identifier,
  * we force a syntax error here if it isn't.
diff --git a/kernel/module.c b/kernel/module.c
index 9da5af668a20..cf49ca25fcce 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -430,7 +430,7 @@ static int percpu_modinit(void)
 	pcpu_size = kmalloc(sizeof(pcpu_size[0]) * pcpu_num_allocated,
 			    GFP_KERNEL);
 	/* Static in-kernel percpu data (used). */
-	pcpu_size[0] = -ALIGN(__per_cpu_end-__per_cpu_start, SMP_CACHE_BYTES);
+	pcpu_size[0] = -(__per_cpu_end-__per_cpu_start);
 	/* Free room. */
 	pcpu_size[1] = PERCPU_ENOUGH_ROOM + pcpu_size[0];
 	if (pcpu_size[1] < 0) {
-- 
cgit v1.2.3


From 5d02d7ae73ac9446f20bbf604b04a74637178b35 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Wed, 2 May 2007 19:27:11 +0200
Subject: [PATCH] x86-64: Use X86_EFLAGS_IF in x86-64/irqflags.h.

As per i386 patch: move X86_EFLAGS_IF et al out to a new header:
processor-flags.h, so we can include it from irqflags.h and use it in
raw_irqs_disabled_flags().

As a side-effect, we could now use these flags in .S files.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 include/asm-x86_64/irqflags.h        |  9 +++++----
 include/asm-x86_64/processor-flags.h | 26 ++++++++++++++++++++++++++
 include/asm-x86_64/processor.h       | 22 +---------------------
 3 files changed, 32 insertions(+), 25 deletions(-)
 create mode 100644 include/asm-x86_64/processor-flags.h

(limited to 'include/asm-x86_64')

diff --git a/include/asm-x86_64/irqflags.h b/include/asm-x86_64/irqflags.h
index cce6937e87c0..86e70fe23659 100644
--- a/include/asm-x86_64/irqflags.h
+++ b/include/asm-x86_64/irqflags.h
@@ -9,6 +9,7 @@
  */
 #ifndef _ASM_IRQFLAGS_H
 #define _ASM_IRQFLAGS_H
+#include <asm/processor-flags.h>
 
 #ifndef __ASSEMBLY__
 /*
@@ -53,19 +54,19 @@ static inline void raw_local_irq_disable(void)
 {
 	unsigned long flags = __raw_local_save_flags();
 
-	raw_local_irq_restore((flags & ~(1 << 9)) | (1 << 18));
+	raw_local_irq_restore((flags & ~X86_EFLAGS_IF) | X86_EFLAGS_AC);
 }
 
 static inline void raw_local_irq_enable(void)
 {
 	unsigned long flags = __raw_local_save_flags();
 
-	raw_local_irq_restore((flags | (1 << 9)) & ~(1 << 18));
+	raw_local_irq_restore((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC));
 }
 
 static inline int raw_irqs_disabled_flags(unsigned long flags)
 {
-	return !(flags & (1<<9)) || (flags & (1 << 18));
+	return !(flags & X86_EFLAGS_IF) || (flags & X86_EFLAGS_AC);
 }
 
 #else /* CONFIG_X86_VSMP */
@@ -82,7 +83,7 @@ static inline void raw_local_irq_enable(void)
 
 static inline int raw_irqs_disabled_flags(unsigned long flags)
 {
-	return !(flags & (1 << 9));
+	return !(flags & X86_EFLAGS_IF);
 }
 
 #endif
diff --git a/include/asm-x86_64/processor-flags.h b/include/asm-x86_64/processor-flags.h
new file mode 100644
index 000000000000..806112fb7985
--- /dev/null
+++ b/include/asm-x86_64/processor-flags.h
@@ -0,0 +1,26 @@
+#ifndef __ASM_X86_64_PROCESSOR_FLAGS_H
+#define __ASM_X86_64_PROCESSOR_FLAGS_H
+/* Various flags defined: can be included from assembler. */
+
+/*
+ * EFLAGS bits
+ */
+#define X86_EFLAGS_CF	0x00000001 /* Carry Flag */
+#define X86_EFLAGS_PF	0x00000004 /* Parity Flag */
+#define X86_EFLAGS_AF	0x00000010 /* Auxillary carry Flag */
+#define X86_EFLAGS_ZF	0x00000040 /* Zero Flag */
+#define X86_EFLAGS_SF	0x00000080 /* Sign Flag */
+#define X86_EFLAGS_TF	0x00000100 /* Trap Flag */
+#define X86_EFLAGS_IF	0x00000200 /* Interrupt Flag */
+#define X86_EFLAGS_DF	0x00000400 /* Direction Flag */
+#define X86_EFLAGS_OF	0x00000800 /* Overflow Flag */
+#define X86_EFLAGS_IOPL	0x00003000 /* IOPL mask */
+#define X86_EFLAGS_NT	0x00004000 /* Nested Task */
+#define X86_EFLAGS_RF	0x00010000 /* Resume Flag */
+#define X86_EFLAGS_VM	0x00020000 /* Virtual Mode */
+#define X86_EFLAGS_AC	0x00040000 /* Alignment Check */
+#define X86_EFLAGS_VIF	0x00080000 /* Virtual Interrupt Flag */
+#define X86_EFLAGS_VIP	0x00100000 /* Virtual Interrupt Pending */
+#define X86_EFLAGS_ID	0x00200000 /* CPUID detection flag */
+
+#endif /* __ASM_X86_64_PROCESSOR_FLAGS_H */
diff --git a/include/asm-x86_64/processor.h b/include/asm-x86_64/processor.h
index 76552d72804c..2c1497de0b09 100644
--- a/include/asm-x86_64/processor.h
+++ b/include/asm-x86_64/processor.h
@@ -20,6 +20,7 @@
 #include <asm/percpu.h>
 #include <linux/personality.h>
 #include <linux/cpumask.h>
+#include <asm/processor-flags.h>
 
 #define TF_MASK		0x00000100
 #define IF_MASK		0x00000200
@@ -102,27 +103,6 @@ extern void print_cpu_info(struct cpuinfo_x86 *);
 extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
 extern unsigned short num_cache_leaves;
 
-/*
- * EFLAGS bits
- */
-#define X86_EFLAGS_CF	0x00000001 /* Carry Flag */
-#define X86_EFLAGS_PF	0x00000004 /* Parity Flag */
-#define X86_EFLAGS_AF	0x00000010 /* Auxillary carry Flag */
-#define X86_EFLAGS_ZF	0x00000040 /* Zero Flag */
-#define X86_EFLAGS_SF	0x00000080 /* Sign Flag */
-#define X86_EFLAGS_TF	0x00000100 /* Trap Flag */
-#define X86_EFLAGS_IF	0x00000200 /* Interrupt Flag */
-#define X86_EFLAGS_DF	0x00000400 /* Direction Flag */
-#define X86_EFLAGS_OF	0x00000800 /* Overflow Flag */
-#define X86_EFLAGS_IOPL	0x00003000 /* IOPL mask */
-#define X86_EFLAGS_NT	0x00004000 /* Nested Task */
-#define X86_EFLAGS_RF	0x00010000 /* Resume Flag */
-#define X86_EFLAGS_VM	0x00020000 /* Virtual Mode */
-#define X86_EFLAGS_AC	0x00040000 /* Alignment Check */
-#define X86_EFLAGS_VIF	0x00080000 /* Virtual Interrupt Flag */
-#define X86_EFLAGS_VIP	0x00100000 /* Virtual Interrupt Pending */
-#define X86_EFLAGS_ID	0x00200000 /* CPUID detection flag */
-
 /*
  * Intel CPU features in CR4
  */
-- 
cgit v1.2.3


From bbf30a1650be396b5467f769f4fbee715f16ec36 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Wed, 2 May 2007 19:27:12 +0200
Subject: [PATCH] x86-64: fix arithmetic in comment

The xmm space on x86_64 is 256 bytes.

Signed-off-by: Avi Kivity <avi@qumranet.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 include/asm-x86_64/processor.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/asm-x86_64')

diff --git a/include/asm-x86_64/processor.h b/include/asm-x86_64/processor.h
index 2c1497de0b09..6a117349a5de 100644
--- a/include/asm-x86_64/processor.h
+++ b/include/asm-x86_64/processor.h
@@ -181,7 +181,7 @@ struct i387_fxsave_struct {
 	u32	mxcsr;
 	u32	mxcsr_mask;
 	u32	st_space[32];	/* 8*16 bytes for each FP-reg = 128 bytes */
-	u32	xmm_space[64];	/* 16*16 bytes for each XMM-reg = 128 bytes */
+	u32	xmm_space[64];	/* 16*16 bytes for each XMM-reg = 256 bytes */
 	u32	padding[24];
 } __attribute__ ((aligned (16)));
 
-- 
cgit v1.2.3


From c169859d6dfc7471ef9f2dbd720936e17906a084 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Wed, 2 May 2007 19:27:12 +0200
Subject: [PATCH] x86-64: Clean up asm-x86_64/bugs.h

Most of asm-x86_64/bugs.h is code which should be in a C file, so put it there.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86_64/kernel/Makefile      |  2 +-
 arch/x86_64/kernel/bugs.c        | 28 ++++++++++++++++++++++++++++
 include/asm-x86_64/alternative.h |  1 +
 include/asm-x86_64/bugs.h        | 30 ++++--------------------------
 4 files changed, 34 insertions(+), 27 deletions(-)
 create mode 100644 arch/x86_64/kernel/bugs.c

(limited to 'include/asm-x86_64')

diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
index 6879b4f01e88..a613e13d0e75 100644
--- a/arch/x86_64/kernel/Makefile
+++ b/arch/x86_64/kernel/Makefile
@@ -8,7 +8,7 @@ obj-y	:= process.o signal.o entry.o traps.o irq.o \
 		ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \
 		x8664_ksyms.o i387.o syscall.o vsyscall.o \
 		setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o \
-		pci-dma.o pci-nommu.o alternative.o hpet.o tsc.o
+		pci-dma.o pci-nommu.o alternative.o hpet.o tsc.o bugs.o
 
 obj-$(CONFIG_STACKTRACE)	+= stacktrace.o
 obj-$(CONFIG_X86_MCE)		+= mce.o therm_throt.o
diff --git a/arch/x86_64/kernel/bugs.c b/arch/x86_64/kernel/bugs.c
new file mode 100644
index 000000000000..131e541e3f7b
--- /dev/null
+++ b/arch/x86_64/kernel/bugs.c
@@ -0,0 +1,28 @@
+/*
+ *  arch/x86_64/kernel/bugs.c
+ *
+ *  Copyright (C) 1994  Linus Torvalds
+ *  Copyright (C) 2000  SuSE
+ *
+ * This is included by init/main.c to check for architecture-dependent bugs.
+ *
+ * Needs:
+ *	void check_bugs(void);
+ */
+
+#include <linux/kernel.h>
+#include <asm/alternative.h>
+#include <asm/processor.h>
+#include <asm/i387.h>
+#include <asm/msr.h>
+#include <asm/pda.h>
+
+void __init check_bugs(void)
+{
+	identify_cpu(&boot_cpu_data);
+#if !defined(CONFIG_SMP)
+	printk("CPU: ");
+	print_cpu_info(&boot_cpu_data);
+#endif
+	alternative_instructions();
+}
diff --git a/include/asm-x86_64/alternative.h b/include/asm-x86_64/alternative.h
index a6657b4f3e0e..67ebea3cc481 100644
--- a/include/asm-x86_64/alternative.h
+++ b/include/asm-x86_64/alternative.h
@@ -16,6 +16,7 @@ struct alt_instr {
 	u8  pad[5];
 };
 
+extern void alternative_instructions(void);
 extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
 
 struct module;
diff --git a/include/asm-x86_64/bugs.h b/include/asm-x86_64/bugs.h
index d86c5dd689fa..b33dc04d8f42 100644
--- a/include/asm-x86_64/bugs.h
+++ b/include/asm-x86_64/bugs.h
@@ -1,28 +1,6 @@
-/*
- *  include/asm-x86_64/bugs.h
- *
- *  Copyright (C) 1994  Linus Torvalds
- *  Copyright (C) 2000  SuSE
- *
- * This is included by init/main.c to check for architecture-dependent bugs.
- *
- * Needs:
- *	void check_bugs(void);
- */
+#ifndef _ASM_X86_64_BUGS_H
+#define _ASM_X86_64_BUGS_H
 
-#include <asm/processor.h>
-#include <asm/i387.h>
-#include <asm/msr.h>
-#include <asm/pda.h>
+void check_bugs(void);
 
-extern void alternative_instructions(void);
-
-static void __init check_bugs(void)
-{
-	identify_cpu(&boot_cpu_data);
-#if !defined(CONFIG_SMP)
-	printk("CPU: ");
-	print_cpu_info(&boot_cpu_data);
-#endif
-	alternative_instructions(); 
-}
+#endif	/* _ASM_X86_64_BUGS_H */
-- 
cgit v1.2.3


From f039b754714a422959027cb18bb33760eb8153f0 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Wed, 2 May 2007 19:27:12 +0200
Subject: [PATCH] x86: Don't use MWAIT on AMD Family 10

It doesn't put the CPU into deeper sleep states, so it's better to use the standard
idle loop to save power. But allow to reenable it anyways for benchmarking.

I also removed the obsolete idle=halt on i386

Cc: andreas.herrmann@amd.com

Signed-off-by: Andi Kleen <ak@suse.de>
---
 Documentation/kernel-parameters.txt | 11 +++++++++--
 arch/i386/kernel/cpu/amd.c          |  5 +++++
 arch/i386/kernel/process.c          | 17 ++++++++---------
 arch/x86_64/kernel/process.c        | 12 +++++++-----
 arch/x86_64/kernel/setup.c          |  6 ++++++
 include/asm-i386/processor.h        |  2 ++
 include/asm-x86_64/proto.h          |  2 ++
 7 files changed, 39 insertions(+), 16 deletions(-)

(limited to 'include/asm-x86_64')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 4287696f18dd..94ce0d20253d 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -695,8 +695,15 @@ and is between 256 and 4096 characters. It is defined in the file
 	idebus=		[HW] (E)IDE subsystem - VLB/PCI bus speed
 			See Documentation/ide.txt.
 
-	idle=		[HW]
-			Format: idle=poll or idle=halt
+	idle=		[X86]
+			Format: idle=poll or idle=mwait
+			Poll forces a polling idle loop that can slightly improves the performance
+			of waking up a idle CPU, but will use a lot of power and make the system
+			run hot. Not recommended.
+			idle=mwait. On systems which support MONITOR/MWAIT but the kernel chose
+			to not use it because it doesn't save as much power as a normal idle
+			loop use the MONITOR/MWAIT idle loop anyways. Performance should be the same
+			as idle=poll.
 
 	ignore_loglevel	[KNL]
 			Ignore loglevel setting - this will print /all/
diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c
index 2d47db482972..197cda62caa3 100644
--- a/arch/i386/kernel/cpu/amd.c
+++ b/arch/i386/kernel/cpu/amd.c
@@ -53,6 +53,8 @@ static __cpuinit int amd_apic_timer_broken(void)
 	return 0;
 }
 
+int force_mwait __cpuinitdata;
+
 static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 {
 	u32 l, h;
@@ -275,6 +277,9 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 
 	if (amd_apic_timer_broken())
 		set_bit(X86_FEATURE_LAPIC_TIMER_BROKEN, c->x86_capability);
+
+	if (c->x86 == 0x10 && !force_mwait)
+		clear_bit(X86_FEATURE_MWAIT, c->x86_capability);
 }
 
 static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 * c, unsigned int size)
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index 393a67d5d943..7e8e129b3d7d 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -272,25 +272,24 @@ void __devinit select_idle_routine(const struct cpuinfo_x86 *c)
 	}
 }
 
-static int __init idle_setup (char *str)
+static int __init idle_setup(char *str)
 {
-	if (!strncmp(str, "poll", 4)) {
+	if (!strcmp(str, "poll")) {
 		printk("using polling idle threads.\n");
 		pm_idle = poll_idle;
 #ifdef CONFIG_X86_SMP
 		if (smp_num_siblings > 1)
 			printk("WARNING: polling idle and HT enabled, performance may degrade.\n");
 #endif
-	} else if (!strncmp(str, "halt", 4)) {
-		printk("using halt in idle threads.\n");
-		pm_idle = default_idle;
-	}
+	} else if (!strcmp(str, "mwait"))
+		force_mwait = 1;
+	else
+		return -1;
 
 	boot_option_idle_override = 1;
-	return 1;
+	return 0;
 }
-
-__setup("idle=", idle_setup);
+early_param("idle", idle_setup);
 
 void show_regs(struct pt_regs * regs)
 {
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index d8d5ccc245c8..4f21765078b7 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -288,16 +288,18 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
 
 static int __init idle_setup (char *str)
 {
-	if (!strncmp(str, "poll", 4)) {
+	if (!strcmp(str, "poll")) {
 		printk("using polling idle threads.\n");
 		pm_idle = poll_idle;
-	}
+	} else if (!strcmp(str, "mwait"))
+		force_mwait = 1;
+	else
+		return -1;
 
 	boot_option_idle_override = 1;
-	return 1;
+	return 0;
 }
-
-__setup("idle=", idle_setup);
+early_param("idle", idle_setup);
 
 /* Prints also some state that isn't saved in the pt_regs */ 
 void __show_regs(struct pt_regs * regs)
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 0a1d539149df..db30b5bcef61 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -79,6 +79,8 @@ int bootloader_type;
 
 unsigned long saved_video_mode;
 
+int force_mwait __cpuinitdata;
+
 /* 
  * Early DMI memory
  */
@@ -604,6 +606,10 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 
 	/* RDTSC can be speculated around */
 	clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
+
+	/* Family 10 doesn't support C states in MWAIT so don't use it */
+	if (c->x86 == 0x10 && !force_mwait)
+		clear_bit(X86_FEATURE_MWAIT, &c->x86_capability);
 }
 
 static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h
index 9d895cc2f312..882d3f8fbbac 100644
--- a/include/asm-i386/processor.h
+++ b/include/asm-i386/processor.h
@@ -779,4 +779,6 @@ extern int sysenter_setup(void);
 extern void cpu_set_gdt(int);
 extern void cpu_init(void);
 
+extern int force_mwait;
+
 #endif /* __ASM_I386_PROCESSOR_H */
diff --git a/include/asm-x86_64/proto.h b/include/asm-x86_64/proto.h
index 3f8f285138d2..98063bcb3b33 100644
--- a/include/asm-x86_64/proto.h
+++ b/include/asm-x86_64/proto.h
@@ -119,6 +119,8 @@ extern int gsi_irq_sharing(int gsi);
 
 extern void smp_local_timer_interrupt(void);
 
+extern int force_mwait;
+
 long do_arch_prctl(struct task_struct *task, int code, unsigned long addr);
 
 void i8254_timer_resume(void);
-- 
cgit v1.2.3


From 4bc5aa91fb1e544ad37805520030a0d9fc6e11d3 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Wed, 2 May 2007 19:27:12 +0200
Subject: [PATCH] x86: Clean up x86 control register and MSR macros (corrected)

This patch is based on Rusty's recent cleanup of the EFLAGS-related
macros; it extends the same kind of cleanup to control registers and
MSRs.

It also unifies these between i386 and x86-64; at least with regards
to MSRs, the two had definitely gotten out of sync.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 include/asm-i386/Kbuild              |   2 +
 include/asm-i386/msr-index.h         | 273 ++++++++++++++++++++++++++++++++++
 include/asm-i386/msr.h               | 237 +-----------------------------
 include/asm-i386/processor-flags.h   |  65 +++++++++
 include/asm-i386/processor.h         |  35 -----
 include/asm-x86_64/Kbuild            |   3 +-
 include/asm-x86_64/msr-index.h       |   1 +
 include/asm-x86_64/msr.h             | 274 +----------------------------------
 include/asm-x86_64/processor-flags.h |  27 +---
 include/asm-x86_64/processor.h       |  31 ----
 10 files changed, 356 insertions(+), 592 deletions(-)
 create mode 100644 include/asm-i386/msr-index.h
 create mode 100644 include/asm-x86_64/msr-index.h

(limited to 'include/asm-x86_64')

diff --git a/include/asm-i386/Kbuild b/include/asm-i386/Kbuild
index 5ae93afc67e1..cbf6e8f1087b 100644
--- a/include/asm-i386/Kbuild
+++ b/include/asm-i386/Kbuild
@@ -3,8 +3,10 @@ include include/asm-generic/Kbuild.asm
 header-y += boot.h
 header-y += debugreg.h
 header-y += ldt.h
+header-y += msr-index.h
 header-y += ptrace-abi.h
 header-y += ucontext.h
 
+unifdef-y += msr.h
 unifdef-y += mtrr.h
 unifdef-y += vm86.h
diff --git a/include/asm-i386/msr-index.h b/include/asm-i386/msr-index.h
new file mode 100644
index 000000000000..f1190802283d
--- /dev/null
+++ b/include/asm-i386/msr-index.h
@@ -0,0 +1,273 @@
+#ifndef __ASM_MSR_INDEX_H
+#define __ASM_MSR_INDEX_H
+
+/* CPU model specific register (MSR) numbers */
+
+/* x86-64 specific MSRs */
+#define MSR_EFER		0xc0000080 /* extended feature register */
+#define MSR_STAR		0xc0000081 /* legacy mode SYSCALL target */
+#define MSR_LSTAR		0xc0000082 /* long mode SYSCALL target */
+#define MSR_CSTAR		0xc0000083 /* compat mode SYSCALL target */
+#define MSR_SYSCALL_MASK	0xc0000084 /* EFLAGS mask for syscall */
+#define MSR_FS_BASE		0xc0000100 /* 64bit FS base */
+#define MSR_GS_BASE		0xc0000101 /* 64bit GS base */
+#define MSR_KERNEL_GS_BASE	0xc0000102 /* SwapGS GS shadow */
+
+/* EFER bits: */
+#define _EFER_SCE		0  /* SYSCALL/SYSRET */
+#define _EFER_LME		8  /* Long mode enable */
+#define _EFER_LMA		10 /* Long mode active (read-only) */
+#define _EFER_NX		11 /* No execute enable */
+
+#define EFER_SCE		(1<<_EFER_SCE)
+#define EFER_LME		(1<<_EFER_LME)
+#define EFER_LMA		(1<<_EFER_LMA)
+#define EFER_NX			(1<<_EFER_NX)
+
+/* Intel MSRs. Some also available on other CPUs */
+#define MSR_IA32_PERFCTR0		0x000000c1
+#define MSR_IA32_PERFCTR1		0x000000c2
+#define MSR_FSB_FREQ			0x000000cd
+
+#define MSR_MTRRcap			0x000000fe
+#define MSR_IA32_BBL_CR_CTL		0x00000119
+
+#define MSR_IA32_SYSENTER_CS		0x00000174
+#define MSR_IA32_SYSENTER_ESP		0x00000175
+#define MSR_IA32_SYSENTER_EIP		0x00000176
+
+#define MSR_IA32_MCG_CAP		0x00000179
+#define MSR_IA32_MCG_STATUS		0x0000017a
+#define MSR_IA32_MCG_CTL		0x0000017b
+
+#define MSR_IA32_PEBS_ENABLE		0x000003f1
+#define MSR_IA32_DS_AREA		0x00000600
+#define MSR_IA32_PERF_CAPABILITIES	0x00000345
+
+#define MSR_MTRRfix64K_00000		0x00000250
+#define MSR_MTRRfix16K_80000		0x00000258
+#define MSR_MTRRfix16K_A0000		0x00000259
+#define MSR_MTRRfix4K_C0000		0x00000268
+#define MSR_MTRRfix4K_C8000		0x00000269
+#define MSR_MTRRfix4K_D0000		0x0000026a
+#define MSR_MTRRfix4K_D8000		0x0000026b
+#define MSR_MTRRfix4K_E0000		0x0000026c
+#define MSR_MTRRfix4K_E8000		0x0000026d
+#define MSR_MTRRfix4K_F0000		0x0000026e
+#define MSR_MTRRfix4K_F8000		0x0000026f
+#define MSR_MTRRdefType			0x000002ff
+
+#define MSR_IA32_DEBUGCTLMSR		0x000001d9
+#define MSR_IA32_LASTBRANCHFROMIP	0x000001db
+#define MSR_IA32_LASTBRANCHTOIP		0x000001dc
+#define MSR_IA32_LASTINTFROMIP		0x000001dd
+#define MSR_IA32_LASTINTTOIP		0x000001de
+
+#define MSR_IA32_MC0_CTL		0x00000400
+#define MSR_IA32_MC0_STATUS		0x00000401
+#define MSR_IA32_MC0_ADDR		0x00000402
+#define MSR_IA32_MC0_MISC		0x00000403
+
+#define MSR_P6_PERFCTR0			0x000000c1
+#define MSR_P6_PERFCTR1			0x000000c2
+#define MSR_P6_EVNTSEL0			0x00000186
+#define MSR_P6_EVNTSEL1			0x00000187
+
+/* K7/K8 MSRs. Not complete. See the architecture manual for a more
+   complete list. */
+#define MSR_K7_EVNTSEL0			0xc0010000
+#define MSR_K7_PERFCTR0			0xc0010004
+#define MSR_K7_EVNTSEL1			0xc0010001
+#define MSR_K7_PERFCTR1			0xc0010005
+#define MSR_K7_EVNTSEL2			0xc0010002
+#define MSR_K7_PERFCTR2			0xc0010006
+#define MSR_K7_EVNTSEL3			0xc0010003
+#define MSR_K7_PERFCTR3			0xc0010007
+#define MSR_K8_TOP_MEM1			0xc001001a
+#define MSR_K7_CLK_CTL			0xc001001b
+#define MSR_K8_TOP_MEM2			0xc001001d
+#define MSR_K8_SYSCFG			0xc0010010
+#define MSR_K7_HWCR			0xc0010015
+#define MSR_K8_HWCR			0xc0010015
+#define MSR_K7_FID_VID_CTL		0xc0010041
+#define MSR_K7_FID_VID_STATUS		0xc0010042
+#define MSR_K8_ENABLE_C1E		0xc0010055
+
+/* K6 MSRs */
+#define MSR_K6_EFER			0xc0000080
+#define MSR_K6_STAR			0xc0000081
+#define MSR_K6_WHCR			0xc0000082
+#define MSR_K6_UWCCR			0xc0000085
+#define MSR_K6_EPMR			0xc0000086
+#define MSR_K6_PSOR			0xc0000087
+#define MSR_K6_PFIR			0xc0000088
+
+/* Centaur-Hauls/IDT defined MSRs. */
+#define MSR_IDT_FCR1			0x00000107
+#define MSR_IDT_FCR2			0x00000108
+#define MSR_IDT_FCR3			0x00000109
+#define MSR_IDT_FCR4			0x0000010a
+
+#define MSR_IDT_MCR0			0x00000110
+#define MSR_IDT_MCR1			0x00000111
+#define MSR_IDT_MCR2			0x00000112
+#define MSR_IDT_MCR3			0x00000113
+#define MSR_IDT_MCR4			0x00000114
+#define MSR_IDT_MCR5			0x00000115
+#define MSR_IDT_MCR6			0x00000116
+#define MSR_IDT_MCR7			0x00000117
+#define MSR_IDT_MCR_CTRL		0x00000120
+
+/* VIA Cyrix defined MSRs*/
+#define MSR_VIA_FCR			0x00001107
+#define MSR_VIA_LONGHAUL		0x0000110a
+#define MSR_VIA_RNG			0x0000110b
+#define MSR_VIA_BCR2			0x00001147
+
+/* Transmeta defined MSRs */
+#define MSR_TMTA_LONGRUN_CTRL		0x80868010
+#define MSR_TMTA_LONGRUN_FLAGS		0x80868011
+#define MSR_TMTA_LRTI_READOUT		0x80868018
+#define MSR_TMTA_LRTI_VOLT_MHZ		0x8086801a
+
+/* Intel defined MSRs. */
+#define MSR_IA32_P5_MC_ADDR		0x00000000
+#define MSR_IA32_P5_MC_TYPE		0x00000001
+#define MSR_IA32_TSC			0x00000010
+#define MSR_IA32_PLATFORM_ID		0x00000017
+#define MSR_IA32_EBL_CR_POWERON		0x0000002a
+
+#define MSR_IA32_APICBASE		0x0000001b
+#define MSR_IA32_APICBASE_BSP		(1<<8)
+#define MSR_IA32_APICBASE_ENABLE	(1<<11)
+#define MSR_IA32_APICBASE_BASE		(0xfffff<<12)
+
+#define MSR_IA32_UCODE_WRITE		0x00000079
+#define MSR_IA32_UCODE_REV		0x0000008b
+
+#define MSR_IA32_PERF_STATUS		0x00000198
+#define MSR_IA32_PERF_CTL		0x00000199
+
+#define MSR_IA32_MPERF			0x000000e7
+#define MSR_IA32_APERF			0x000000e8
+
+#define MSR_IA32_THERM_CONTROL		0x0000019a
+#define MSR_IA32_THERM_INTERRUPT	0x0000019b
+#define MSR_IA32_THERM_STATUS		0x0000019c
+#define MSR_IA32_MISC_ENABLE		0x000001a0
+
+/* Intel Model 6 */
+#define MSR_P6_EVNTSEL0			0x00000186
+#define MSR_P6_EVNTSEL1			0x00000187
+
+/* P4/Xeon+ specific */
+#define MSR_IA32_MCG_EAX		0x00000180
+#define MSR_IA32_MCG_EBX		0x00000181
+#define MSR_IA32_MCG_ECX		0x00000182
+#define MSR_IA32_MCG_EDX		0x00000183
+#define MSR_IA32_MCG_ESI		0x00000184
+#define MSR_IA32_MCG_EDI		0x00000185
+#define MSR_IA32_MCG_EBP		0x00000186
+#define MSR_IA32_MCG_ESP		0x00000187
+#define MSR_IA32_MCG_EFLAGS		0x00000188
+#define MSR_IA32_MCG_EIP		0x00000189
+#define MSR_IA32_MCG_RESERVED		0x0000018a
+
+/* Pentium IV performance counter MSRs */
+#define MSR_P4_BPU_PERFCTR0		0x00000300
+#define MSR_P4_BPU_PERFCTR1		0x00000301
+#define MSR_P4_BPU_PERFCTR2		0x00000302
+#define MSR_P4_BPU_PERFCTR3		0x00000303
+#define MSR_P4_MS_PERFCTR0		0x00000304
+#define MSR_P4_MS_PERFCTR1		0x00000305
+#define MSR_P4_MS_PERFCTR2		0x00000306
+#define MSR_P4_MS_PERFCTR3		0x00000307
+#define MSR_P4_FLAME_PERFCTR0		0x00000308
+#define MSR_P4_FLAME_PERFCTR1		0x00000309
+#define MSR_P4_FLAME_PERFCTR2		0x0000030a
+#define MSR_P4_FLAME_PERFCTR3		0x0000030b
+#define MSR_P4_IQ_PERFCTR0		0x0000030c
+#define MSR_P4_IQ_PERFCTR1		0x0000030d
+#define MSR_P4_IQ_PERFCTR2		0x0000030e
+#define MSR_P4_IQ_PERFCTR3		0x0000030f
+#define MSR_P4_IQ_PERFCTR4		0x00000310
+#define MSR_P4_IQ_PERFCTR5		0x00000311
+#define MSR_P4_BPU_CCCR0		0x00000360
+#define MSR_P4_BPU_CCCR1		0x00000361
+#define MSR_P4_BPU_CCCR2		0x00000362
+#define MSR_P4_BPU_CCCR3		0x00000363
+#define MSR_P4_MS_CCCR0			0x00000364
+#define MSR_P4_MS_CCCR1			0x00000365
+#define MSR_P4_MS_CCCR2			0x00000366
+#define MSR_P4_MS_CCCR3			0x00000367
+#define MSR_P4_FLAME_CCCR0		0x00000368
+#define MSR_P4_FLAME_CCCR1		0x00000369
+#define MSR_P4_FLAME_CCCR2		0x0000036a
+#define MSR_P4_FLAME_CCCR3		0x0000036b
+#define MSR_P4_IQ_CCCR0			0x0000036c
+#define MSR_P4_IQ_CCCR1			0x0000036d
+#define MSR_P4_IQ_CCCR2			0x0000036e
+#define MSR_P4_IQ_CCCR3			0x0000036f
+#define MSR_P4_IQ_CCCR4			0x00000370
+#define MSR_P4_IQ_CCCR5			0x00000371
+#define MSR_P4_ALF_ESCR0		0x000003ca
+#define MSR_P4_ALF_ESCR1		0x000003cb
+#define MSR_P4_BPU_ESCR0		0x000003b2
+#define MSR_P4_BPU_ESCR1		0x000003b3
+#define MSR_P4_BSU_ESCR0		0x000003a0
+#define MSR_P4_BSU_ESCR1		0x000003a1
+#define MSR_P4_CRU_ESCR0		0x000003b8
+#define MSR_P4_CRU_ESCR1		0x000003b9
+#define MSR_P4_CRU_ESCR2		0x000003cc
+#define MSR_P4_CRU_ESCR3		0x000003cd
+#define MSR_P4_CRU_ESCR4		0x000003e0
+#define MSR_P4_CRU_ESCR5		0x000003e1
+#define MSR_P4_DAC_ESCR0		0x000003a8
+#define MSR_P4_DAC_ESCR1		0x000003a9
+#define MSR_P4_FIRM_ESCR0		0x000003a4
+#define MSR_P4_FIRM_ESCR1		0x000003a5
+#define MSR_P4_FLAME_ESCR0		0x000003a6
+#define MSR_P4_FLAME_ESCR1		0x000003a7
+#define MSR_P4_FSB_ESCR0		0x000003a2
+#define MSR_P4_FSB_ESCR1		0x000003a3
+#define MSR_P4_IQ_ESCR0			0x000003ba
+#define MSR_P4_IQ_ESCR1			0x000003bb
+#define MSR_P4_IS_ESCR0			0x000003b4
+#define MSR_P4_IS_ESCR1			0x000003b5
+#define MSR_P4_ITLB_ESCR0		0x000003b6
+#define MSR_P4_ITLB_ESCR1		0x000003b7
+#define MSR_P4_IX_ESCR0			0x000003c8
+#define MSR_P4_IX_ESCR1			0x000003c9
+#define MSR_P4_MOB_ESCR0		0x000003aa
+#define MSR_P4_MOB_ESCR1		0x000003ab
+#define MSR_P4_MS_ESCR0			0x000003c0
+#define MSR_P4_MS_ESCR1			0x000003c1
+#define MSR_P4_PMH_ESCR0		0x000003ac
+#define MSR_P4_PMH_ESCR1		0x000003ad
+#define MSR_P4_RAT_ESCR0		0x000003bc
+#define MSR_P4_RAT_ESCR1		0x000003bd
+#define MSR_P4_SAAT_ESCR0		0x000003ae
+#define MSR_P4_SAAT_ESCR1		0x000003af
+#define MSR_P4_SSU_ESCR0		0x000003be
+#define MSR_P4_SSU_ESCR1		0x000003bf /* guess: not in manual */
+
+#define MSR_P4_TBPU_ESCR0		0x000003c2
+#define MSR_P4_TBPU_ESCR1		0x000003c3
+#define MSR_P4_TC_ESCR0			0x000003c4
+#define MSR_P4_TC_ESCR1			0x000003c5
+#define MSR_P4_U2L_ESCR0		0x000003b0
+#define MSR_P4_U2L_ESCR1		0x000003b1
+
+/* Intel Core-based CPU performance counters */
+#define MSR_CORE_PERF_FIXED_CTR0	0x00000309
+#define MSR_CORE_PERF_FIXED_CTR1	0x0000030a
+#define MSR_CORE_PERF_FIXED_CTR2	0x0000030b
+#define MSR_CORE_PERF_FIXED_CTR_CTRL	0x0000038d
+#define MSR_CORE_PERF_GLOBAL_STATUS	0x0000038e
+#define MSR_CORE_PERF_GLOBAL_CTRL	0x0000038f
+#define MSR_CORE_PERF_GLOBAL_OVF_CTRL	0x00000390
+
+/* Geode defined MSRs */
+#define MSR_GEODE_BUSCONT_CONF0		0x00001900
+
+#endif /* __ASM_MSR_INDEX_H */
diff --git a/include/asm-i386/msr.h b/include/asm-i386/msr.h
index 00acaa8b36bb..9559894c7658 100644
--- a/include/asm-i386/msr.h
+++ b/include/asm-i386/msr.h
@@ -1,6 +1,11 @@
 #ifndef __ASM_MSR_H
 #define __ASM_MSR_H
 
+#include <asm/msr-index.h>
+
+#ifdef __KERNEL__
+#ifndef __ASSEMBLY__
+
 #include <asm/errno.h>
 
 static inline unsigned long long native_read_msr(unsigned int msr)
@@ -153,234 +158,6 @@ static inline void wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
 	wrmsr(msr_no, l, h);
 }
 #endif  /*  CONFIG_SMP  */
-
-/* symbolic names for some interesting MSRs */
-/* Intel defined MSRs. */
-#define MSR_IA32_P5_MC_ADDR		0
-#define MSR_IA32_P5_MC_TYPE		1
-#define MSR_IA32_PLATFORM_ID		0x17
-#define MSR_IA32_EBL_CR_POWERON		0x2a
-
-#define MSR_IA32_APICBASE		0x1b
-#define MSR_IA32_APICBASE_BSP		(1<<8)
-#define MSR_IA32_APICBASE_ENABLE	(1<<11)
-#define MSR_IA32_APICBASE_BASE		(0xfffff<<12)
-
-#define MSR_IA32_UCODE_WRITE		0x79
-#define MSR_IA32_UCODE_REV		0x8b
-
-#define MSR_P6_PERFCTR0		0xc1
-#define MSR_P6_PERFCTR1		0xc2
-#define MSR_FSB_FREQ		0xcd
-
-
-#define MSR_IA32_BBL_CR_CTL		0x119
-
-#define MSR_IA32_SYSENTER_CS		0x174
-#define MSR_IA32_SYSENTER_ESP		0x175
-#define MSR_IA32_SYSENTER_EIP		0x176
-
-#define MSR_IA32_MCG_CAP		0x179
-#define MSR_IA32_MCG_STATUS		0x17a
-#define MSR_IA32_MCG_CTL		0x17b
-
-/* P4/Xeon+ specific */
-#define MSR_IA32_MCG_EAX		0x180
-#define MSR_IA32_MCG_EBX		0x181
-#define MSR_IA32_MCG_ECX		0x182
-#define MSR_IA32_MCG_EDX		0x183
-#define MSR_IA32_MCG_ESI		0x184
-#define MSR_IA32_MCG_EDI		0x185
-#define MSR_IA32_MCG_EBP		0x186
-#define MSR_IA32_MCG_ESP		0x187
-#define MSR_IA32_MCG_EFLAGS		0x188
-#define MSR_IA32_MCG_EIP		0x189
-#define MSR_IA32_MCG_RESERVED		0x18A
-
-#define MSR_P6_EVNTSEL0			0x186
-#define MSR_P6_EVNTSEL1			0x187
-
-#define MSR_IA32_PERF_STATUS		0x198
-#define MSR_IA32_PERF_CTL		0x199
-
-#define MSR_IA32_MPERF			0xE7
-#define MSR_IA32_APERF			0xE8
-
-#define MSR_IA32_THERM_CONTROL		0x19a
-#define MSR_IA32_THERM_INTERRUPT	0x19b
-#define MSR_IA32_THERM_STATUS		0x19c
-#define MSR_IA32_MISC_ENABLE		0x1a0
-
-#define MSR_IA32_DEBUGCTLMSR		0x1d9
-#define MSR_IA32_LASTBRANCHFROMIP	0x1db
-#define MSR_IA32_LASTBRANCHTOIP		0x1dc
-#define MSR_IA32_LASTINTFROMIP		0x1dd
-#define MSR_IA32_LASTINTTOIP		0x1de
-
-#define MSR_IA32_MC0_CTL		0x400
-#define MSR_IA32_MC0_STATUS		0x401
-#define MSR_IA32_MC0_ADDR		0x402
-#define MSR_IA32_MC0_MISC		0x403
-
-#define MSR_IA32_PEBS_ENABLE		0x3f1
-#define MSR_IA32_DS_AREA		0x600
-#define MSR_IA32_PERF_CAPABILITIES	0x345
-
-/* Pentium IV performance counter MSRs */
-#define MSR_P4_BPU_PERFCTR0 		0x300
-#define MSR_P4_BPU_PERFCTR1 		0x301
-#define MSR_P4_BPU_PERFCTR2 		0x302
-#define MSR_P4_BPU_PERFCTR3 		0x303
-#define MSR_P4_MS_PERFCTR0 		0x304
-#define MSR_P4_MS_PERFCTR1 		0x305
-#define MSR_P4_MS_PERFCTR2 		0x306
-#define MSR_P4_MS_PERFCTR3 		0x307
-#define MSR_P4_FLAME_PERFCTR0 		0x308
-#define MSR_P4_FLAME_PERFCTR1 		0x309
-#define MSR_P4_FLAME_PERFCTR2 		0x30a
-#define MSR_P4_FLAME_PERFCTR3 		0x30b
-#define MSR_P4_IQ_PERFCTR0 		0x30c
-#define MSR_P4_IQ_PERFCTR1 		0x30d
-#define MSR_P4_IQ_PERFCTR2 		0x30e
-#define MSR_P4_IQ_PERFCTR3 		0x30f
-#define MSR_P4_IQ_PERFCTR4 		0x310
-#define MSR_P4_IQ_PERFCTR5 		0x311
-#define MSR_P4_BPU_CCCR0 		0x360
-#define MSR_P4_BPU_CCCR1 		0x361
-#define MSR_P4_BPU_CCCR2 		0x362
-#define MSR_P4_BPU_CCCR3 		0x363
-#define MSR_P4_MS_CCCR0 		0x364
-#define MSR_P4_MS_CCCR1 		0x365
-#define MSR_P4_MS_CCCR2 		0x366
-#define MSR_P4_MS_CCCR3 		0x367
-#define MSR_P4_FLAME_CCCR0 		0x368
-#define MSR_P4_FLAME_CCCR1 		0x369
-#define MSR_P4_FLAME_CCCR2 		0x36a
-#define MSR_P4_FLAME_CCCR3 		0x36b
-#define MSR_P4_IQ_CCCR0 		0x36c
-#define MSR_P4_IQ_CCCR1 		0x36d
-#define MSR_P4_IQ_CCCR2 		0x36e
-#define MSR_P4_IQ_CCCR3 		0x36f
-#define MSR_P4_IQ_CCCR4 		0x370
-#define MSR_P4_IQ_CCCR5 		0x371
-#define MSR_P4_ALF_ESCR0 		0x3ca
-#define MSR_P4_ALF_ESCR1 		0x3cb
-#define MSR_P4_BPU_ESCR0 		0x3b2
-#define MSR_P4_BPU_ESCR1 		0x3b3
-#define MSR_P4_BSU_ESCR0 		0x3a0
-#define MSR_P4_BSU_ESCR1 		0x3a1
-#define MSR_P4_CRU_ESCR0 		0x3b8
-#define MSR_P4_CRU_ESCR1 		0x3b9
-#define MSR_P4_CRU_ESCR2 		0x3cc
-#define MSR_P4_CRU_ESCR3 		0x3cd
-#define MSR_P4_CRU_ESCR4 		0x3e0
-#define MSR_P4_CRU_ESCR5 		0x3e1
-#define MSR_P4_DAC_ESCR0 		0x3a8
-#define MSR_P4_DAC_ESCR1 		0x3a9
-#define MSR_P4_FIRM_ESCR0 		0x3a4
-#define MSR_P4_FIRM_ESCR1 		0x3a5
-#define MSR_P4_FLAME_ESCR0 		0x3a6
-#define MSR_P4_FLAME_ESCR1 		0x3a7
-#define MSR_P4_FSB_ESCR0 		0x3a2
-#define MSR_P4_FSB_ESCR1 		0x3a3
-#define MSR_P4_IQ_ESCR0 		0x3ba
-#define MSR_P4_IQ_ESCR1 		0x3bb
-#define MSR_P4_IS_ESCR0 		0x3b4
-#define MSR_P4_IS_ESCR1 		0x3b5
-#define MSR_P4_ITLB_ESCR0 		0x3b6
-#define MSR_P4_ITLB_ESCR1 		0x3b7
-#define MSR_P4_IX_ESCR0 		0x3c8
-#define MSR_P4_IX_ESCR1 		0x3c9
-#define MSR_P4_MOB_ESCR0 		0x3aa
-#define MSR_P4_MOB_ESCR1 		0x3ab
-#define MSR_P4_MS_ESCR0 		0x3c0
-#define MSR_P4_MS_ESCR1 		0x3c1
-#define MSR_P4_PMH_ESCR0 		0x3ac
-#define MSR_P4_PMH_ESCR1 		0x3ad
-#define MSR_P4_RAT_ESCR0 		0x3bc
-#define MSR_P4_RAT_ESCR1 		0x3bd
-#define MSR_P4_SAAT_ESCR0 		0x3ae
-#define MSR_P4_SAAT_ESCR1 		0x3af
-#define MSR_P4_SSU_ESCR0 		0x3be
-#define MSR_P4_SSU_ESCR1 		0x3bf    /* guess: not defined in manual */
-#define MSR_P4_TBPU_ESCR0 		0x3c2
-#define MSR_P4_TBPU_ESCR1 		0x3c3
-#define MSR_P4_TC_ESCR0 		0x3c4
-#define MSR_P4_TC_ESCR1 		0x3c5
-#define MSR_P4_U2L_ESCR0 		0x3b0
-#define MSR_P4_U2L_ESCR1 		0x3b1
-
-/* AMD Defined MSRs */
-#define MSR_K6_EFER			0xC0000080
-#define MSR_K6_STAR			0xC0000081
-#define MSR_K6_WHCR			0xC0000082
-#define MSR_K6_UWCCR			0xC0000085
-#define MSR_K6_EPMR			0xC0000086
-#define MSR_K6_PSOR			0xC0000087
-#define MSR_K6_PFIR			0xC0000088
-
-#define MSR_K7_EVNTSEL0			0xC0010000
-#define MSR_K7_EVNTSEL1			0xC0010001
-#define MSR_K7_EVNTSEL2			0xC0010002
-#define MSR_K7_EVNTSEL3			0xC0010003
-#define MSR_K7_PERFCTR0			0xC0010004
-#define MSR_K7_PERFCTR1			0xC0010005
-#define MSR_K7_PERFCTR2			0xC0010006
-#define MSR_K7_PERFCTR3			0xC0010007
-#define MSR_K7_HWCR			0xC0010015
-#define MSR_K7_CLK_CTL			0xC001001b
-#define MSR_K7_FID_VID_CTL		0xC0010041
-#define MSR_K7_FID_VID_STATUS		0xC0010042
-
-#define MSR_K8_ENABLE_C1E		0xC0010055
-
-/* extended feature register */
-#define MSR_EFER 			0xc0000080
-
-/* EFER bits: */
-
-/* Execute Disable enable */
-#define _EFER_NX			11
-#define EFER_NX				(1<<_EFER_NX)
-
-/* Centaur-Hauls/IDT defined MSRs. */
-#define MSR_IDT_FCR1			0x107
-#define MSR_IDT_FCR2			0x108
-#define MSR_IDT_FCR3			0x109
-#define MSR_IDT_FCR4			0x10a
-
-#define MSR_IDT_MCR0			0x110
-#define MSR_IDT_MCR1			0x111
-#define MSR_IDT_MCR2			0x112
-#define MSR_IDT_MCR3			0x113
-#define MSR_IDT_MCR4			0x114
-#define MSR_IDT_MCR5			0x115
-#define MSR_IDT_MCR6			0x116
-#define MSR_IDT_MCR7			0x117
-#define MSR_IDT_MCR_CTRL		0x120
-
-/* VIA Cyrix defined MSRs*/
-#define MSR_VIA_FCR			0x1107
-#define MSR_VIA_LONGHAUL		0x110a
-#define MSR_VIA_RNG			0x110b
-#define MSR_VIA_BCR2			0x1147
-
-/* Transmeta defined MSRs */
-#define MSR_TMTA_LONGRUN_CTRL		0x80868010
-#define MSR_TMTA_LONGRUN_FLAGS		0x80868011
-#define MSR_TMTA_LRTI_READOUT		0x80868018
-#define MSR_TMTA_LRTI_VOLT_MHZ		0x8086801a
-
-/* Intel Core-based CPU performance counters */
-#define MSR_CORE_PERF_FIXED_CTR0	0x309
-#define MSR_CORE_PERF_FIXED_CTR1	0x30a
-#define MSR_CORE_PERF_FIXED_CTR2	0x30b
-#define MSR_CORE_PERF_FIXED_CTR_CTRL	0x38d
-#define MSR_CORE_PERF_GLOBAL_STATUS	0x38e
-#define MSR_CORE_PERF_GLOBAL_CTRL	0x38f
-#define MSR_CORE_PERF_GLOBAL_OVF_CTRL	0x390
-
-/* Geode defined MSRs */
-#define MSR_GEODE_BUSCONT_CONF0         0x1900
-
+#endif
+#endif
 #endif /* __ASM_MSR_H */
diff --git a/include/asm-i386/processor-flags.h b/include/asm-i386/processor-flags.h
index b4711c222e2b..5404e90edd57 100644
--- a/include/asm-i386/processor-flags.h
+++ b/include/asm-i386/processor-flags.h
@@ -23,4 +23,69 @@
 #define X86_EFLAGS_VIP	0x00100000 /* Virtual Interrupt Pending */
 #define X86_EFLAGS_ID	0x00200000 /* CPUID detection flag */
 
+/*
+ * Basic CPU control in CR0
+ */
+#define X86_CR0_PE	0x00000001 /* Protection Enable */
+#define X86_CR0_MP	0x00000002 /* Monitor Coprocessor */
+#define X86_CR0_EM	0x00000004 /* Emulation */
+#define X86_CR0_TS	0x00000008 /* Task Switched */
+#define X86_CR0_ET	0x00000010 /* Extension Type */
+#define X86_CR0_NE	0x00000020 /* Numeric Error */
+#define X86_CR0_WP	0x00010000 /* Write Protect */
+#define X86_CR0_AM	0x00040000 /* Alignment Mask */
+#define X86_CR0_NW	0x20000000 /* Not Write-through */
+#define X86_CR0_CD	0x40000000 /* Cache Disable */
+#define X86_CR0_PG	0x80000000 /* Paging */
+
+/*
+ * Paging options in CR3
+ */
+#define X86_CR3_PWT	0x00000008 /* Page Write Through */
+#define X86_CR3_PCD	0x00000010 /* Page Cache Disable */
+
+/*
+ * Intel CPU features in CR4
+ */
+#define X86_CR4_VME	0x00000001 /* enable vm86 extensions */
+#define X86_CR4_PVI	0x00000002 /* virtual interrupts flag enable */
+#define X86_CR4_TSD	0x00000004 /* disable time stamp at ipl 3 */
+#define X86_CR4_DE	0x00000008 /* enable debugging extensions */
+#define X86_CR4_PSE	0x00000010 /* enable page size extensions */
+#define X86_CR4_PAE	0x00000020 /* enable physical address extensions */
+#define X86_CR4_MCE	0x00000040 /* Machine check enable */
+#define X86_CR4_PGE	0x00000080 /* enable global pages */
+#define X86_CR4_PCE	0x00000100 /* enable performance counters at ipl 3 */
+#define X86_CR4_OSFXSR	0x00000200 /* enable fast FPU save and restore */
+#define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */
+#define X86_CR4_VMXE	0x00002000 /* enable VMX virtualization */
+
+/*
+ * x86-64 Task Priority Register, CR8
+ */
+#define X86_CR8_TPR	0x00000007 /* task priority register */
+
+/*
+ * AMD and Transmeta use MSRs for configuration; see <asm/msr-index.h>
+ */
+
+/*
+ *      NSC/Cyrix CPU configuration register indexes
+ */
+#define CX86_PCR0	0x20
+#define CX86_GCR	0xb8
+#define CX86_CCR0	0xc0
+#define CX86_CCR1	0xc1
+#define CX86_CCR2	0xc2
+#define CX86_CCR3	0xc3
+#define CX86_CCR4	0xe8
+#define CX86_CCR5	0xe9
+#define CX86_CCR6	0xea
+#define CX86_CCR7	0xeb
+#define CX86_PCR1	0xf0
+#define CX86_DIR0	0xfe
+#define CX86_DIR1	0xff
+#define CX86_ARR_BASE	0xc4
+#define CX86_RCR_BASE	0xdc
+
 #endif	/* __ASM_I386_PROCESSOR_FLAGS_H */
diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h
index 882d3f8fbbac..77e263267aa6 100644
--- a/include/asm-i386/processor.h
+++ b/include/asm-i386/processor.h
@@ -142,21 +142,6 @@ static inline void native_cpuid(unsigned int *eax, unsigned int *ebx,
 
 #define load_cr3(pgdir) write_cr3(__pa(pgdir))
 
-/*
- * Intel CPU features in CR4
- */
-#define X86_CR4_VME		0x0001	/* enable vm86 extensions */
-#define X86_CR4_PVI		0x0002	/* virtual interrupts flag enable */
-#define X86_CR4_TSD		0x0004	/* disable time stamp at ipl 3 */
-#define X86_CR4_DE		0x0008	/* enable debugging extensions */
-#define X86_CR4_PSE		0x0010	/* enable page size extensions */
-#define X86_CR4_PAE		0x0020	/* enable physical address extensions */
-#define X86_CR4_MCE		0x0040	/* Machine check enable */
-#define X86_CR4_PGE		0x0080	/* enable global pages */
-#define X86_CR4_PCE		0x0100	/* enable performance counters at ipl 3 */
-#define X86_CR4_OSFXSR		0x0200	/* enable fast FPU save and restore */
-#define X86_CR4_OSXMMEXCPT	0x0400	/* enable unmasked SSE exceptions */
-
 /*
  * Save the cr4 feature set we're using (ie
  * Pentium 4MB enable and PPro Global page
@@ -183,26 +168,6 @@ static inline void clear_in_cr4 (unsigned long mask)
 	write_cr4(cr4);
 }
 
-/*
- *      NSC/Cyrix CPU configuration register indexes
- */
-
-#define CX86_PCR0 0x20
-#define CX86_GCR  0xb8
-#define CX86_CCR0 0xc0
-#define CX86_CCR1 0xc1
-#define CX86_CCR2 0xc2
-#define CX86_CCR3 0xc3
-#define CX86_CCR4 0xe8
-#define CX86_CCR5 0xe9
-#define CX86_CCR6 0xea
-#define CX86_CCR7 0xeb
-#define CX86_PCR1 0xf0
-#define CX86_DIR0 0xfe
-#define CX86_DIR1 0xff
-#define CX86_ARR_BASE 0xc4
-#define CX86_RCR_BASE 0xdc
-
 /*
  *      NSC/Cyrix CPU indexed register access macros
  */
diff --git a/include/asm-x86_64/Kbuild b/include/asm-x86_64/Kbuild
index 242296ede3dd..89ad1fc27c8b 100644
--- a/include/asm-x86_64/Kbuild
+++ b/include/asm-x86_64/Kbuild
@@ -8,7 +8,7 @@ header-y += boot.h
 header-y += bootsetup.h
 header-y += debugreg.h
 header-y += ldt.h
-header-y += msr.h
+header-y += msr-index.h
 header-y += prctl.h
 header-y += ptrace-abi.h
 header-y += sigcontext32.h
@@ -16,6 +16,7 @@ header-y += ucontext.h
 header-y += vsyscall32.h
 
 unifdef-y += mce.h
+unifdef-y += msr.h
 unifdef-y += mtrr.h
 unifdef-y += vsyscall.h
 unifdef-y += const.h
diff --git a/include/asm-x86_64/msr-index.h b/include/asm-x86_64/msr-index.h
new file mode 100644
index 000000000000..d77a63f1ddf2
--- /dev/null
+++ b/include/asm-x86_64/msr-index.h
@@ -0,0 +1 @@
+#include <asm-i386/msr-index.h>
diff --git a/include/asm-x86_64/msr.h b/include/asm-x86_64/msr.h
index 902f9a58617e..a524f0325673 100644
--- a/include/asm-x86_64/msr.h
+++ b/include/asm-x86_64/msr.h
@@ -1,6 +1,8 @@
 #ifndef X86_64_MSR_H
 #define X86_64_MSR_H 1
 
+#include <asm/msr-index.h>
+
 #ifndef __ASSEMBLY__
 /*
  * Access to machine-specific registers (available on 586 and better only)
@@ -157,9 +159,6 @@ static inline unsigned int cpuid_edx(unsigned int op)
 	return edx;
 }
 
-#define MSR_IA32_UCODE_WRITE		0x79
-#define MSR_IA32_UCODE_REV		0x8b
-
 #ifdef CONFIG_SMP
 void rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
 void wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
@@ -172,269 +171,6 @@ static inline void wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
 {
 	wrmsr(msr_no, l, h);
 }
-#endif  /*  CONFIG_SMP  */
-
-#endif
-
-/* AMD/K8 specific MSRs */ 
-#define MSR_EFER 0xc0000080		/* extended feature register */
-#define MSR_STAR 0xc0000081		/* legacy mode SYSCALL target */
-#define MSR_LSTAR 0xc0000082 		/* long mode SYSCALL target */
-#define MSR_CSTAR 0xc0000083		/* compatibility mode SYSCALL target */
-#define MSR_SYSCALL_MASK 0xc0000084	/* EFLAGS mask for syscall */
-#define MSR_FS_BASE 0xc0000100		/* 64bit FS base */
-#define MSR_GS_BASE 0xc0000101		/* 64bit GS base */
-#define MSR_KERNEL_GS_BASE  0xc0000102	/* SwapGS GS shadow (or USER_GS from kernel) */ 
-/* EFER bits: */ 
-#define _EFER_SCE 0  /* SYSCALL/SYSRET */
-#define _EFER_LME 8  /* Long mode enable */
-#define _EFER_LMA 10 /* Long mode active (read-only) */
-#define _EFER_NX 11  /* No execute enable */
-
-#define EFER_SCE (1<<_EFER_SCE)
-#define EFER_LME (1<<_EFER_LME)
-#define EFER_LMA (1<<_EFER_LMA)
-#define EFER_NX (1<<_EFER_NX)
-
-/* Intel MSRs. Some also available on other CPUs */
-#define MSR_IA32_TSC		0x10
-#define MSR_IA32_PLATFORM_ID	0x17
-
-#define MSR_IA32_PERFCTR0      0xc1
-#define MSR_IA32_PERFCTR1      0xc2
-#define MSR_FSB_FREQ		0xcd
-
-#define MSR_MTRRcap		0x0fe
-#define MSR_IA32_BBL_CR_CTL        0x119
-
-#define MSR_IA32_SYSENTER_CS	0x174
-#define MSR_IA32_SYSENTER_ESP	0x175
-#define MSR_IA32_SYSENTER_EIP	0x176
-
-#define MSR_IA32_MCG_CAP       0x179
-#define MSR_IA32_MCG_STATUS        0x17a
-#define MSR_IA32_MCG_CTL       0x17b
-
-#define MSR_IA32_EVNTSEL0      0x186
-#define MSR_IA32_EVNTSEL1      0x187
-
-#define MSR_IA32_DEBUGCTLMSR       0x1d9
-#define MSR_IA32_LASTBRANCHFROMIP  0x1db
-#define MSR_IA32_LASTBRANCHTOIP        0x1dc
-#define MSR_IA32_LASTINTFROMIP     0x1dd
-#define MSR_IA32_LASTINTTOIP       0x1de
-
-#define MSR_IA32_PEBS_ENABLE		0x3f1
-#define MSR_IA32_DS_AREA		0x600
-#define MSR_IA32_PERF_CAPABILITIES	0x345
-
-#define MSR_MTRRfix64K_00000	0x250
-#define MSR_MTRRfix16K_80000	0x258
-#define MSR_MTRRfix16K_A0000	0x259
-#define MSR_MTRRfix4K_C0000	0x268
-#define MSR_MTRRfix4K_C8000	0x269
-#define MSR_MTRRfix4K_D0000	0x26a
-#define MSR_MTRRfix4K_D8000	0x26b
-#define MSR_MTRRfix4K_E0000	0x26c
-#define MSR_MTRRfix4K_E8000	0x26d
-#define MSR_MTRRfix4K_F0000	0x26e
-#define MSR_MTRRfix4K_F8000	0x26f
-#define MSR_MTRRdefType		0x2ff
-
-#define MSR_IA32_MC0_CTL       0x400
-#define MSR_IA32_MC0_STATUS        0x401
-#define MSR_IA32_MC0_ADDR      0x402
-#define MSR_IA32_MC0_MISC      0x403
-
-#define MSR_P6_PERFCTR0			0xc1
-#define MSR_P6_PERFCTR1			0xc2
-#define MSR_P6_EVNTSEL0			0x186
-#define MSR_P6_EVNTSEL1			0x187
-
-/* K7/K8 MSRs. Not complete. See the architecture manual for a more complete list. */
-#define MSR_K7_EVNTSEL0            0xC0010000
-#define MSR_K7_PERFCTR0            0xC0010004
-#define MSR_K7_EVNTSEL1            0xC0010001
-#define MSR_K7_PERFCTR1            0xC0010005
-#define MSR_K7_EVNTSEL2            0xC0010002
-#define MSR_K7_PERFCTR2            0xC0010006
-#define MSR_K7_EVNTSEL3            0xC0010003
-#define MSR_K7_PERFCTR3            0xC0010007
-#define MSR_K8_TOP_MEM1		   0xC001001A
-#define MSR_K8_TOP_MEM2		   0xC001001D
-#define MSR_K8_SYSCFG		   0xC0010010
-#define MSR_K8_HWCR		   0xC0010015
-
-/* K6 MSRs */
-#define MSR_K6_EFER			0xC0000080
-#define MSR_K6_STAR			0xC0000081
-#define MSR_K6_WHCR			0xC0000082
-#define MSR_K6_UWCCR			0xC0000085
-#define MSR_K6_PSOR			0xC0000087
-#define MSR_K6_PFIR			0xC0000088
-
-/* Centaur-Hauls/IDT defined MSRs. */
-#define MSR_IDT_FCR1			0x107
-#define MSR_IDT_FCR2			0x108
-#define MSR_IDT_FCR3			0x109
-#define MSR_IDT_FCR4			0x10a
-
-#define MSR_IDT_MCR0			0x110
-#define MSR_IDT_MCR1			0x111
-#define MSR_IDT_MCR2			0x112
-#define MSR_IDT_MCR3			0x113
-#define MSR_IDT_MCR4			0x114
-#define MSR_IDT_MCR5			0x115
-#define MSR_IDT_MCR6			0x116
-#define MSR_IDT_MCR7			0x117
-#define MSR_IDT_MCR_CTRL		0x120
-
-/* VIA Cyrix defined MSRs*/
-#define MSR_VIA_FCR			0x1107
-#define MSR_VIA_LONGHAUL		0x110a
-#define MSR_VIA_RNG			0x110b
-#define MSR_VIA_BCR2			0x1147
-
-/* Intel defined MSRs. */
-#define MSR_IA32_P5_MC_ADDR		0
-#define MSR_IA32_P5_MC_TYPE		1
-#define MSR_IA32_PLATFORM_ID		0x17
-#define MSR_IA32_EBL_CR_POWERON		0x2a
-
-#define MSR_IA32_APICBASE               0x1b
-#define MSR_IA32_APICBASE_BSP           (1<<8)
-#define MSR_IA32_APICBASE_ENABLE        (1<<11)
-#define MSR_IA32_APICBASE_BASE          (0xfffff<<12)
-
-/* P4/Xeon+ specific */
-#define MSR_IA32_MCG_EAX		0x180
-#define MSR_IA32_MCG_EBX		0x181
-#define MSR_IA32_MCG_ECX		0x182
-#define MSR_IA32_MCG_EDX		0x183
-#define MSR_IA32_MCG_ESI		0x184
-#define MSR_IA32_MCG_EDI		0x185
-#define MSR_IA32_MCG_EBP		0x186
-#define MSR_IA32_MCG_ESP		0x187
-#define MSR_IA32_MCG_EFLAGS		0x188
-#define MSR_IA32_MCG_EIP		0x189
-#define MSR_IA32_MCG_RESERVED		0x18A
-
-#define MSR_P6_EVNTSEL0			0x186
-#define MSR_P6_EVNTSEL1			0x187
-
-#define MSR_IA32_PERF_STATUS		0x198
-#define MSR_IA32_PERF_CTL		0x199
-
-#define MSR_IA32_MPERF			0xE7
-#define MSR_IA32_APERF			0xE8
-
-#define MSR_IA32_THERM_CONTROL		0x19a
-#define MSR_IA32_THERM_INTERRUPT	0x19b
-#define MSR_IA32_THERM_STATUS		0x19c
-#define MSR_IA32_MISC_ENABLE		0x1a0
-
-#define MSR_IA32_DEBUGCTLMSR		0x1d9
-#define MSR_IA32_LASTBRANCHFROMIP	0x1db
-#define MSR_IA32_LASTBRANCHTOIP		0x1dc
-#define MSR_IA32_LASTINTFROMIP		0x1dd
-#define MSR_IA32_LASTINTTOIP		0x1de
-
-#define MSR_IA32_MC0_CTL		0x400
-#define MSR_IA32_MC0_STATUS		0x401
-#define MSR_IA32_MC0_ADDR		0x402
-#define MSR_IA32_MC0_MISC		0x403
-
-/* Pentium IV performance counter MSRs */
-#define MSR_P4_BPU_PERFCTR0 		0x300
-#define MSR_P4_BPU_PERFCTR1 		0x301
-#define MSR_P4_BPU_PERFCTR2 		0x302
-#define MSR_P4_BPU_PERFCTR3 		0x303
-#define MSR_P4_MS_PERFCTR0 		0x304
-#define MSR_P4_MS_PERFCTR1 		0x305
-#define MSR_P4_MS_PERFCTR2 		0x306
-#define MSR_P4_MS_PERFCTR3 		0x307
-#define MSR_P4_FLAME_PERFCTR0 		0x308
-#define MSR_P4_FLAME_PERFCTR1 		0x309
-#define MSR_P4_FLAME_PERFCTR2 		0x30a
-#define MSR_P4_FLAME_PERFCTR3 		0x30b
-#define MSR_P4_IQ_PERFCTR0 		0x30c
-#define MSR_P4_IQ_PERFCTR1 		0x30d
-#define MSR_P4_IQ_PERFCTR2 		0x30e
-#define MSR_P4_IQ_PERFCTR3 		0x30f
-#define MSR_P4_IQ_PERFCTR4 		0x310
-#define MSR_P4_IQ_PERFCTR5 		0x311
-#define MSR_P4_BPU_CCCR0 		0x360
-#define MSR_P4_BPU_CCCR1 		0x361
-#define MSR_P4_BPU_CCCR2 		0x362
-#define MSR_P4_BPU_CCCR3 		0x363
-#define MSR_P4_MS_CCCR0 		0x364
-#define MSR_P4_MS_CCCR1 		0x365
-#define MSR_P4_MS_CCCR2 		0x366
-#define MSR_P4_MS_CCCR3 		0x367
-#define MSR_P4_FLAME_CCCR0 		0x368
-#define MSR_P4_FLAME_CCCR1 		0x369
-#define MSR_P4_FLAME_CCCR2 		0x36a
-#define MSR_P4_FLAME_CCCR3 		0x36b
-#define MSR_P4_IQ_CCCR0 		0x36c
-#define MSR_P4_IQ_CCCR1 		0x36d
-#define MSR_P4_IQ_CCCR2 		0x36e
-#define MSR_P4_IQ_CCCR3 		0x36f
-#define MSR_P4_IQ_CCCR4 		0x370
-#define MSR_P4_IQ_CCCR5 		0x371
-#define MSR_P4_ALF_ESCR0 		0x3ca
-#define MSR_P4_ALF_ESCR1 		0x3cb
-#define MSR_P4_BPU_ESCR0 		0x3b2
-#define MSR_P4_BPU_ESCR1 		0x3b3
-#define MSR_P4_BSU_ESCR0 		0x3a0
-#define MSR_P4_BSU_ESCR1 		0x3a1
-#define MSR_P4_CRU_ESCR0 		0x3b8
-#define MSR_P4_CRU_ESCR1 		0x3b9
-#define MSR_P4_CRU_ESCR2 		0x3cc
-#define MSR_P4_CRU_ESCR3 		0x3cd
-#define MSR_P4_CRU_ESCR4 		0x3e0
-#define MSR_P4_CRU_ESCR5 		0x3e1
-#define MSR_P4_DAC_ESCR0 		0x3a8
-#define MSR_P4_DAC_ESCR1 		0x3a9
-#define MSR_P4_FIRM_ESCR0 		0x3a4
-#define MSR_P4_FIRM_ESCR1 		0x3a5
-#define MSR_P4_FLAME_ESCR0 		0x3a6
-#define MSR_P4_FLAME_ESCR1 		0x3a7
-#define MSR_P4_FSB_ESCR0 		0x3a2
-#define MSR_P4_FSB_ESCR1 		0x3a3
-#define MSR_P4_IQ_ESCR0 		0x3ba
-#define MSR_P4_IQ_ESCR1 		0x3bb
-#define MSR_P4_IS_ESCR0 		0x3b4
-#define MSR_P4_IS_ESCR1 		0x3b5
-#define MSR_P4_ITLB_ESCR0 		0x3b6
-#define MSR_P4_ITLB_ESCR1 		0x3b7
-#define MSR_P4_IX_ESCR0 		0x3c8
-#define MSR_P4_IX_ESCR1 		0x3c9
-#define MSR_P4_MOB_ESCR0 		0x3aa
-#define MSR_P4_MOB_ESCR1 		0x3ab
-#define MSR_P4_MS_ESCR0 		0x3c0
-#define MSR_P4_MS_ESCR1 		0x3c1
-#define MSR_P4_PMH_ESCR0 		0x3ac
-#define MSR_P4_PMH_ESCR1 		0x3ad
-#define MSR_P4_RAT_ESCR0 		0x3bc
-#define MSR_P4_RAT_ESCR1 		0x3bd
-#define MSR_P4_SAAT_ESCR0 		0x3ae
-#define MSR_P4_SAAT_ESCR1 		0x3af
-#define MSR_P4_SSU_ESCR0 		0x3be
-#define MSR_P4_SSU_ESCR1 		0x3bf    /* guess: not defined in manual */
-#define MSR_P4_TBPU_ESCR0 		0x3c2
-#define MSR_P4_TBPU_ESCR1 		0x3c3
-#define MSR_P4_TC_ESCR0 		0x3c4
-#define MSR_P4_TC_ESCR1 		0x3c5
-#define MSR_P4_U2L_ESCR0 		0x3b0
-#define MSR_P4_U2L_ESCR1 		0x3b1
-
-/* Intel Core-based CPU performance counters */
-#define MSR_CORE_PERF_FIXED_CTR0	0x309
-#define MSR_CORE_PERF_FIXED_CTR1	0x30a
-#define MSR_CORE_PERF_FIXED_CTR2	0x30b
-#define MSR_CORE_PERF_FIXED_CTR_CTRL	0x38d
-#define MSR_CORE_PERF_GLOBAL_STATUS	0x38e
-#define MSR_CORE_PERF_GLOBAL_CTRL	0x38f
-#define MSR_CORE_PERF_GLOBAL_OVF_CTRL	0x390
-
-#endif
+#endif  /* CONFIG_SMP */
+#endif /* __ASSEMBLY__ */
+#endif	/* X86_64_MSR_H */
diff --git a/include/asm-x86_64/processor-flags.h b/include/asm-x86_64/processor-flags.h
index 806112fb7985..ec99a57b2c6a 100644
--- a/include/asm-x86_64/processor-flags.h
+++ b/include/asm-x86_64/processor-flags.h
@@ -1,26 +1 @@
-#ifndef __ASM_X86_64_PROCESSOR_FLAGS_H
-#define __ASM_X86_64_PROCESSOR_FLAGS_H
-/* Various flags defined: can be included from assembler. */
-
-/*
- * EFLAGS bits
- */
-#define X86_EFLAGS_CF	0x00000001 /* Carry Flag */
-#define X86_EFLAGS_PF	0x00000004 /* Parity Flag */
-#define X86_EFLAGS_AF	0x00000010 /* Auxillary carry Flag */
-#define X86_EFLAGS_ZF	0x00000040 /* Zero Flag */
-#define X86_EFLAGS_SF	0x00000080 /* Sign Flag */
-#define X86_EFLAGS_TF	0x00000100 /* Trap Flag */
-#define X86_EFLAGS_IF	0x00000200 /* Interrupt Flag */
-#define X86_EFLAGS_DF	0x00000400 /* Direction Flag */
-#define X86_EFLAGS_OF	0x00000800 /* Overflow Flag */
-#define X86_EFLAGS_IOPL	0x00003000 /* IOPL mask */
-#define X86_EFLAGS_NT	0x00004000 /* Nested Task */
-#define X86_EFLAGS_RF	0x00010000 /* Resume Flag */
-#define X86_EFLAGS_VM	0x00020000 /* Virtual Mode */
-#define X86_EFLAGS_AC	0x00040000 /* Alignment Check */
-#define X86_EFLAGS_VIF	0x00080000 /* Virtual Interrupt Flag */
-#define X86_EFLAGS_VIP	0x00100000 /* Virtual Interrupt Pending */
-#define X86_EFLAGS_ID	0x00200000 /* CPUID detection flag */
-
-#endif /* __ASM_X86_64_PROCESSOR_FLAGS_H */
+#include <asm-i386/processor-flags.h>
diff --git a/include/asm-x86_64/processor.h b/include/asm-x86_64/processor.h
index 6a117349a5de..461ffe4c1fcc 100644
--- a/include/asm-x86_64/processor.h
+++ b/include/asm-x86_64/processor.h
@@ -103,21 +103,6 @@ extern void print_cpu_info(struct cpuinfo_x86 *);
 extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
 extern unsigned short num_cache_leaves;
 
-/*
- * Intel CPU features in CR4
- */
-#define X86_CR4_VME		0x0001	/* enable vm86 extensions */
-#define X86_CR4_PVI		0x0002	/* virtual interrupts flag enable */
-#define X86_CR4_TSD		0x0004	/* disable time stamp at ipl 3 */
-#define X86_CR4_DE		0x0008	/* enable debugging extensions */
-#define X86_CR4_PSE		0x0010	/* enable page size extensions */
-#define X86_CR4_PAE		0x0020	/* enable physical address extensions */
-#define X86_CR4_MCE		0x0040	/* Machine check enable */
-#define X86_CR4_PGE		0x0080	/* enable global pages */
-#define X86_CR4_PCE		0x0100	/* enable performance counters at ipl 3 */
-#define X86_CR4_OSFXSR		0x0200	/* enable fast FPU save and restore */
-#define X86_CR4_OSXMMEXCPT	0x0400	/* enable unmasked SSE exceptions */
-
 /*
  * Save the cr4 feature set we're using (ie
  * Pentium 4MB enable and PPro Global page
@@ -406,22 +391,6 @@ static inline void prefetchw(void *x)
 
 #define cpu_relax()   rep_nop()
 
-/*
- *      NSC/Cyrix CPU configuration register indexes
- */
-#define CX86_CCR0 0xc0
-#define CX86_CCR1 0xc1
-#define CX86_CCR2 0xc2
-#define CX86_CCR3 0xc3
-#define CX86_CCR4 0xe8
-#define CX86_CCR5 0xe9
-#define CX86_CCR6 0xea
-#define CX86_CCR7 0xeb
-#define CX86_DIR0 0xfe
-#define CX86_DIR1 0xff
-#define CX86_ARR_BASE 0xc4
-#define CX86_RCR_BASE 0xdc
-
 /*
  *      NSC/Cyrix CPU indexed register access macros
  */
-- 
cgit v1.2.3


From d6dd61c831226f9cd7750885da04d360d6455101 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Wed, 2 May 2007 19:27:14 +0200
Subject: [PATCH] x86: PARAVIRT: add hooks to intercept mm creation and
 destruction

Add hooks to allow a paravirt implementation to track the lifetime of
an mm.  Paravirtualization requires three hooks, but only two are
needed in common code.  They are:

arch_dup_mmap, which is called when a new mmap is created at fork

arch_exit_mmap, which is called when the last process reference to an
  mm is dropped, which typically happens on exit and exec.

The third hook is activate_mm, which is called from the arch-specific
activate_mm() macro/function, and so doesn't need stub versions for
other architectures.  It's called when an mm is first used.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: linux-arch@vger.kernel.org
Cc: James Bottomley <James.Bottomley@SteelEye.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
---
 arch/i386/kernel/paravirt.c         |  4 ++++
 include/asm-alpha/mmu_context.h     |  1 +
 include/asm-arm/mmu_context.h       |  1 +
 include/asm-arm26/mmu_context.h     |  2 ++
 include/asm-avr32/mmu_context.h     |  1 +
 include/asm-cris/mmu_context.h      |  2 ++
 include/asm-frv/mmu_context.h       |  1 +
 include/asm-generic/mm_hooks.h      | 18 ++++++++++++++++++
 include/asm-h8300/mmu_context.h     |  1 +
 include/asm-i386/mmu_context.h      | 17 +++++++++++++++--
 include/asm-i386/paravirt.h         | 23 +++++++++++++++++++++++
 include/asm-ia64/mmu_context.h      |  1 +
 include/asm-m32r/mmu_context.h      |  1 +
 include/asm-m68k/mmu_context.h      |  1 +
 include/asm-m68knommu/mmu_context.h |  1 +
 include/asm-mips/mmu_context.h      |  1 +
 include/asm-parisc/mmu_context.h    |  1 +
 include/asm-powerpc/mmu_context.h   |  1 +
 include/asm-ppc/mmu_context.h       |  1 +
 include/asm-s390/mmu_context.h      |  2 ++
 include/asm-sh/mmu_context.h        |  1 +
 include/asm-sh64/mmu_context.h      |  2 +-
 include/asm-sparc/mmu_context.h     |  2 ++
 include/asm-sparc64/mmu_context.h   |  1 +
 include/asm-um/mmu_context.h        |  2 ++
 include/asm-v850/mmu_context.h      |  2 ++
 include/asm-x86_64/mmu_context.h    |  1 +
 include/asm-xtensa/mmu_context.h    |  1 +
 kernel/fork.c                       |  2 ++
 mm/mmap.c                           |  4 ++++
 30 files changed, 96 insertions(+), 3 deletions(-)
 create mode 100644 include/asm-generic/mm_hooks.h

(limited to 'include/asm-x86_64')

diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c
index 2040a831d5b3..54cc14d99740 100644
--- a/arch/i386/kernel/paravirt.c
+++ b/arch/i386/kernel/paravirt.c
@@ -237,6 +237,10 @@ struct paravirt_ops paravirt_ops = {
 	.irq_enable_sysexit = native_irq_enable_sysexit,
 	.iret = native_iret,
 
+	.dup_mmap = paravirt_nop,
+	.exit_mmap = paravirt_nop,
+	.activate_mm = paravirt_nop,
+
 	.startup_ipi_hook = paravirt_nop,
 };
 
diff --git a/include/asm-alpha/mmu_context.h b/include/asm-alpha/mmu_context.h
index fe249e9d3360..0bd7bd2ccb90 100644
--- a/include/asm-alpha/mmu_context.h
+++ b/include/asm-alpha/mmu_context.h
@@ -10,6 +10,7 @@
 #include <asm/system.h>
 #include <asm/machvec.h>
 #include <asm/compiler.h>
+#include <asm-generic/mm_hooks.h>
 
 /*
  * Force a context reload. This is needed when we change the page
diff --git a/include/asm-arm/mmu_context.h b/include/asm-arm/mmu_context.h
index d1a65b1edcaa..f8755c818b54 100644
--- a/include/asm-arm/mmu_context.h
+++ b/include/asm-arm/mmu_context.h
@@ -16,6 +16,7 @@
 #include <linux/compiler.h>
 #include <asm/cacheflush.h>
 #include <asm/proc-fns.h>
+#include <asm-generic/mm_hooks.h>
 
 void __check_kvm_seq(struct mm_struct *mm);
 
diff --git a/include/asm-arm26/mmu_context.h b/include/asm-arm26/mmu_context.h
index 1a929bfe5c3a..16c821f81b8d 100644
--- a/include/asm-arm26/mmu_context.h
+++ b/include/asm-arm26/mmu_context.h
@@ -13,6 +13,8 @@
 #ifndef __ASM_ARM_MMU_CONTEXT_H
 #define __ASM_ARM_MMU_CONTEXT_H
 
+#include <asm-generic/mm_hooks.h>
+
 #define init_new_context(tsk,mm)	0
 #define destroy_context(mm)		do { } while(0)
 
diff --git a/include/asm-avr32/mmu_context.h b/include/asm-avr32/mmu_context.h
index 31add1ae8089..c37c391faef6 100644
--- a/include/asm-avr32/mmu_context.h
+++ b/include/asm-avr32/mmu_context.h
@@ -15,6 +15,7 @@
 #include <asm/tlbflush.h>
 #include <asm/pgalloc.h>
 #include <asm/sysreg.h>
+#include <asm-generic/mm_hooks.h>
 
 /*
  * The MMU "context" consists of two things:
diff --git a/include/asm-cris/mmu_context.h b/include/asm-cris/mmu_context.h
index e6e659dc757b..72ba08dcfd18 100644
--- a/include/asm-cris/mmu_context.h
+++ b/include/asm-cris/mmu_context.h
@@ -1,6 +1,8 @@
 #ifndef __CRIS_MMU_CONTEXT_H
 #define __CRIS_MMU_CONTEXT_H
 
+#include <asm-generic/mm_hooks.h>
+
 extern int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
 extern void get_mmu_context(struct mm_struct *mm);
 extern void destroy_context(struct mm_struct *mm);
diff --git a/include/asm-frv/mmu_context.h b/include/asm-frv/mmu_context.h
index 72edcaaccd5d..c7daa395156a 100644
--- a/include/asm-frv/mmu_context.h
+++ b/include/asm-frv/mmu_context.h
@@ -15,6 +15,7 @@
 #include <asm/setup.h>
 #include <asm/page.h>
 #include <asm/pgalloc.h>
+#include <asm-generic/mm_hooks.h>
 
 static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 {
diff --git a/include/asm-generic/mm_hooks.h b/include/asm-generic/mm_hooks.h
new file mode 100644
index 000000000000..67dea8123683
--- /dev/null
+++ b/include/asm-generic/mm_hooks.h
@@ -0,0 +1,18 @@
+/*
+ * Define generic no-op hooks for arch_dup_mmap and arch_exit_mmap, to
+ * be included in asm-FOO/mmu_context.h for any arch FOO which doesn't
+ * need to hook these.
+ */
+#ifndef _ASM_GENERIC_MM_HOOKS_H
+#define _ASM_GENERIC_MM_HOOKS_H
+
+static inline void arch_dup_mmap(struct mm_struct *oldmm,
+				 struct mm_struct *mm)
+{
+}
+
+static inline void arch_exit_mmap(struct mm_struct *mm)
+{
+}
+
+#endif	/* _ASM_GENERIC_MM_HOOKS_H */
diff --git a/include/asm-h8300/mmu_context.h b/include/asm-h8300/mmu_context.h
index 5c165f7bee0e..f44b730da54d 100644
--- a/include/asm-h8300/mmu_context.h
+++ b/include/asm-h8300/mmu_context.h
@@ -4,6 +4,7 @@
 #include <asm/setup.h>
 #include <asm/page.h>
 #include <asm/pgalloc.h>
+#include <asm-generic/mm_hooks.h>
 
 static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 {
diff --git a/include/asm-i386/mmu_context.h b/include/asm-i386/mmu_context.h
index e6aa30f8de5b..8198d1cca1f3 100644
--- a/include/asm-i386/mmu_context.h
+++ b/include/asm-i386/mmu_context.h
@@ -5,6 +5,16 @@
 #include <asm/atomic.h>
 #include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
+#include <asm/paravirt.h>
+#ifndef CONFIG_PARAVIRT
+#include <asm-generic/mm_hooks.h>
+
+static inline void paravirt_activate_mm(struct mm_struct *prev,
+					struct mm_struct *next)
+{
+}
+#endif	/* !CONFIG_PARAVIRT */
+
 
 /*
  * Used for LDT copy/destruction.
@@ -65,7 +75,10 @@ static inline void switch_mm(struct mm_struct *prev,
 #define deactivate_mm(tsk, mm)			\
 	asm("movl %0,%%gs": :"r" (0));
 
-#define activate_mm(prev, next) \
-	switch_mm((prev),(next),NULL)
+#define activate_mm(prev, next)				\
+	do {						\
+		paravirt_activate_mm(prev, next);	\
+		switch_mm((prev),(next),NULL);		\
+	} while(0);
 
 #endif
diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h
index f93599dc7756..61c03f1e0c29 100644
--- a/include/asm-i386/paravirt.h
+++ b/include/asm-i386/paravirt.h
@@ -119,6 +119,12 @@ struct paravirt_ops
 
 	void (*io_delay)(void);
 
+	void (*activate_mm)(struct mm_struct *prev,
+			    struct mm_struct *next);
+	void (*dup_mmap)(struct mm_struct *oldmm,
+			 struct mm_struct *mm);
+	void (*exit_mmap)(struct mm_struct *mm);
+
 #ifdef CONFIG_X86_LOCAL_APIC
 	void (*apic_write)(unsigned long reg, unsigned long v);
 	void (*apic_write_atomic)(unsigned long reg, unsigned long v);
@@ -395,6 +401,23 @@ static inline void startup_ipi_hook(int phys_apicid, unsigned long start_eip,
 }
 #endif
 
+static inline void paravirt_activate_mm(struct mm_struct *prev,
+					struct mm_struct *next)
+{
+	paravirt_ops.activate_mm(prev, next);
+}
+
+static inline void arch_dup_mmap(struct mm_struct *oldmm,
+				 struct mm_struct *mm)
+{
+	paravirt_ops.dup_mmap(oldmm, mm);
+}
+
+static inline void arch_exit_mmap(struct mm_struct *mm)
+{
+	paravirt_ops.exit_mmap(mm);
+}
+
 #define __flush_tlb() paravirt_ops.flush_tlb_user()
 #define __flush_tlb_global() paravirt_ops.flush_tlb_kernel()
 #define __flush_tlb_single(addr) paravirt_ops.flush_tlb_single(addr)
diff --git a/include/asm-ia64/mmu_context.h b/include/asm-ia64/mmu_context.h
index b5c65081a3aa..cef2400983fa 100644
--- a/include/asm-ia64/mmu_context.h
+++ b/include/asm-ia64/mmu_context.h
@@ -29,6 +29,7 @@
 #include <linux/spinlock.h>
 
 #include <asm/processor.h>
+#include <asm-generic/mm_hooks.h>
 
 struct ia64_ctx {
 	spinlock_t lock;
diff --git a/include/asm-m32r/mmu_context.h b/include/asm-m32r/mmu_context.h
index 1f40d4a0acf1..91909e5dd9d0 100644
--- a/include/asm-m32r/mmu_context.h
+++ b/include/asm-m32r/mmu_context.h
@@ -15,6 +15,7 @@
 #include <asm/pgalloc.h>
 #include <asm/mmu.h>
 #include <asm/tlbflush.h>
+#include <asm-generic/mm_hooks.h>
 
 /*
  * Cache of MMU context last used.
diff --git a/include/asm-m68k/mmu_context.h b/include/asm-m68k/mmu_context.h
index 231d11bd8e32..894dacbcee14 100644
--- a/include/asm-m68k/mmu_context.h
+++ b/include/asm-m68k/mmu_context.h
@@ -1,6 +1,7 @@
 #ifndef __M68K_MMU_CONTEXT_H
 #define __M68K_MMU_CONTEXT_H
 
+#include <asm-generic/mm_hooks.h>
 
 static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 {
diff --git a/include/asm-m68knommu/mmu_context.h b/include/asm-m68knommu/mmu_context.h
index 6c077d3a2572..9ccee4278c97 100644
--- a/include/asm-m68knommu/mmu_context.h
+++ b/include/asm-m68knommu/mmu_context.h
@@ -4,6 +4,7 @@
 #include <asm/setup.h>
 #include <asm/page.h>
 #include <asm/pgalloc.h>
+#include <asm-generic/mm_hooks.h>
 
 static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 {
diff --git a/include/asm-mips/mmu_context.h b/include/asm-mips/mmu_context.h
index fe065d6070ca..65024ffd7879 100644
--- a/include/asm-mips/mmu_context.h
+++ b/include/asm-mips/mmu_context.h
@@ -20,6 +20,7 @@
 #include <asm/mipsmtregs.h>
 #include <asm/smtc.h>
 #endif /* SMTC */
+#include <asm-generic/mm_hooks.h>
 
 /*
  * For the fast tlb miss handlers, we keep a per cpu array of pointers
diff --git a/include/asm-parisc/mmu_context.h b/include/asm-parisc/mmu_context.h
index 9c05836239a2..bad690298f0c 100644
--- a/include/asm-parisc/mmu_context.h
+++ b/include/asm-parisc/mmu_context.h
@@ -5,6 +5,7 @@
 #include <asm/atomic.h>
 #include <asm/pgalloc.h>
 #include <asm/pgtable.h>
+#include <asm-generic/mm_hooks.h>
 
 static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 {
diff --git a/include/asm-powerpc/mmu_context.h b/include/asm-powerpc/mmu_context.h
index 083ac917bd29..c0d7795e3d25 100644
--- a/include/asm-powerpc/mmu_context.h
+++ b/include/asm-powerpc/mmu_context.h
@@ -10,6 +10,7 @@
 #include <linux/mm.h>	
 #include <asm/mmu.h>	
 #include <asm/cputable.h>
+#include <asm-generic/mm_hooks.h>
 
 /*
  * Copyright (C) 2001 PPC 64 Team, IBM Corp
diff --git a/include/asm-ppc/mmu_context.h b/include/asm-ppc/mmu_context.h
index 2bc8589cc451..a6441a063e5d 100644
--- a/include/asm-ppc/mmu_context.h
+++ b/include/asm-ppc/mmu_context.h
@@ -6,6 +6,7 @@
 #include <asm/bitops.h>
 #include <asm/mmu.h>
 #include <asm/cputable.h>
+#include <asm-generic/mm_hooks.h>
 
 /*
  * On 32-bit PowerPC 6xx/7xx/7xxx CPUs, we use a set of 16 VSIDs
diff --git a/include/asm-s390/mmu_context.h b/include/asm-s390/mmu_context.h
index 1d21da220d49..501cb9b06314 100644
--- a/include/asm-s390/mmu_context.h
+++ b/include/asm-s390/mmu_context.h
@@ -10,6 +10,8 @@
 #define __S390_MMU_CONTEXT_H
 
 #include <asm/pgalloc.h>
+#include <asm-generic/mm_hooks.h>
+
 /*
  * get a new mmu context.. S390 don't know about contexts.
  */
diff --git a/include/asm-sh/mmu_context.h b/include/asm-sh/mmu_context.h
index 342024425b7d..01acaaae9751 100644
--- a/include/asm-sh/mmu_context.h
+++ b/include/asm-sh/mmu_context.h
@@ -12,6 +12,7 @@
 #include <asm/tlbflush.h>
 #include <asm/uaccess.h>
 #include <asm/io.h>
+#include <asm-generic/mm_hooks.h>
 
 /*
  * The MMU "context" consists of two things:
diff --git a/include/asm-sh64/mmu_context.h b/include/asm-sh64/mmu_context.h
index 8c860dab2d0e..507bf72bb8e1 100644
--- a/include/asm-sh64/mmu_context.h
+++ b/include/asm-sh64/mmu_context.h
@@ -27,7 +27,7 @@
 extern unsigned long mmu_context_cache;
 
 #include <asm/page.h>
-
+#include <asm-generic/mm_hooks.h>
 
 /* Current mm's pgd */
 extern pgd_t *mmu_pdtp_cache;
diff --git a/include/asm-sparc/mmu_context.h b/include/asm-sparc/mmu_context.h
index ed1e01d04d21..671a997b9e69 100644
--- a/include/asm-sparc/mmu_context.h
+++ b/include/asm-sparc/mmu_context.h
@@ -5,6 +5,8 @@
 
 #ifndef __ASSEMBLY__
 
+#include <asm-generic/mm_hooks.h>
+
 static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 {
 }
diff --git a/include/asm-sparc64/mmu_context.h b/include/asm-sparc64/mmu_context.h
index 2337eb487719..8d129032013e 100644
--- a/include/asm-sparc64/mmu_context.h
+++ b/include/asm-sparc64/mmu_context.h
@@ -9,6 +9,7 @@
 #include <linux/spinlock.h>
 #include <asm/system.h>
 #include <asm/spitfire.h>
+#include <asm-generic/mm_hooks.h>
 
 static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 {
diff --git a/include/asm-um/mmu_context.h b/include/asm-um/mmu_context.h
index f709c784bf12..9aa4b44e8cc1 100644
--- a/include/asm-um/mmu_context.h
+++ b/include/asm-um/mmu_context.h
@@ -6,6 +6,8 @@
 #ifndef __UM_MMU_CONTEXT_H
 #define __UM_MMU_CONTEXT_H
 
+#include <asm-generic/mm_hooks.h>
+
 #include "linux/sched.h"
 #include "choose-mode.h"
 #include "um_mmu.h"
diff --git a/include/asm-v850/mmu_context.h b/include/asm-v850/mmu_context.h
index f521c8050d3c..01daacd5474e 100644
--- a/include/asm-v850/mmu_context.h
+++ b/include/asm-v850/mmu_context.h
@@ -1,6 +1,8 @@
 #ifndef __V850_MMU_CONTEXT_H__
 #define __V850_MMU_CONTEXT_H__
 
+#include <asm-generic/mm_hooks.h>
+
 #define destroy_context(mm)		((void)0)
 #define init_new_context(tsk,mm)	0
 #define switch_mm(prev,next,tsk)	((void)0)
diff --git a/include/asm-x86_64/mmu_context.h b/include/asm-x86_64/mmu_context.h
index af03b9f852d6..0cce83a78378 100644
--- a/include/asm-x86_64/mmu_context.h
+++ b/include/asm-x86_64/mmu_context.h
@@ -7,6 +7,7 @@
 #include <asm/pda.h>
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
+#include <asm-generic/mm_hooks.h>
 
 /*
  * possibly do the LDT unload here?
diff --git a/include/asm-xtensa/mmu_context.h b/include/asm-xtensa/mmu_context.h
index f14851f086c3..92f948392ebc 100644
--- a/include/asm-xtensa/mmu_context.h
+++ b/include/asm-xtensa/mmu_context.h
@@ -18,6 +18,7 @@
 #include <asm/pgtable.h>
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
+#include <asm-generic/mm_hooks.h>
 
 #define XCHAL_MMU_ASID_BITS	8
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 6af959c034d8..ffccefb28b6a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -286,6 +286,8 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 		if (retval)
 			goto out;
 	}
+	/* a new mm has just been created */
+	arch_dup_mmap(oldmm, mm);
 	retval = 0;
 out:
 	up_write(&mm->mmap_sem);
diff --git a/mm/mmap.c b/mm/mmap.c
index 84f997da78d7..88da687bde89 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -29,6 +29,7 @@
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
 #include <asm/tlb.h>
+#include <asm/mmu_context.h>
 
 #ifndef arch_mmap_check
 #define arch_mmap_check(addr, len, flags)	(0)
@@ -1979,6 +1980,9 @@ void exit_mmap(struct mm_struct *mm)
 	unsigned long nr_accounted = 0;
 	unsigned long end;
 
+	/* mm's last user has gone, and its about to be pulled down */
+	arch_exit_mmap(mm);
+
 	lru_add_drain();
 	flush_cache_mm(mm);
 	tlb = tlb_gather_mmu(mm, 1);
-- 
cgit v1.2.3


From 441d40dca024deb305a5e3d5003e8cd9d364d10f Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Wed, 2 May 2007 19:27:16 +0200
Subject: [PATCH] x86: PARAVIRT: Jeremy Fitzhardinge <jeremy@goop.org>

The other symbols used to delineate the alt-instructions sections have the
form __foo/__foo_end.  Rename parainstructions to match.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/i386/kernel/alternative.c   |  2 +-
 arch/i386/kernel/vmi.c           | 10 +++-------
 arch/i386/kernel/vmlinux.lds.S   |  4 ++--
 include/asm-i386/alternative.h   |  4 ++--
 include/asm-x86_64/alternative.h |  4 ++--
 5 files changed, 10 insertions(+), 14 deletions(-)

(limited to 'include/asm-x86_64')

diff --git a/arch/i386/kernel/alternative.c b/arch/i386/kernel/alternative.c
index 080a59d56ea6..e5cec6685cc5 100644
--- a/arch/i386/kernel/alternative.c
+++ b/arch/i386/kernel/alternative.c
@@ -399,6 +399,6 @@ void __init alternative_instructions(void)
 		alternatives_smp_switch(0);
 	}
 #endif
- 	apply_paravirt(__start_parainstructions, __stop_parainstructions);
+ 	apply_paravirt(__parainstructions, __parainstructions_end);
 	local_irq_restore(flags);
 }
diff --git a/arch/i386/kernel/vmi.c b/arch/i386/kernel/vmi.c
index 0fae15dee765..c8726c424b35 100644
--- a/arch/i386/kernel/vmi.c
+++ b/arch/i386/kernel/vmi.c
@@ -73,10 +73,6 @@ static struct {
   	void (*set_lazy_mode)(int mode);
 } vmi_ops;
 
-/* XXX move this to alternative.h */
-extern struct paravirt_patch __start_parainstructions[],
-	__stop_parainstructions[];
-
 /* Cached VMI operations */
 struct vmi_timer_ops vmi_timer_ops;
 
@@ -548,9 +544,9 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
 }
 #endif
 
-static void vmi_set_lazy_mode(int mode)
+static void vmi_set_lazy_mode(enum paravirt_lazy_mode mode)
 {
-	static DEFINE_PER_CPU(int, lazy_mode);
+	static DEFINE_PER_CPU(enum paravirt_lazy_mode, lazy_mode);
 
 	if (!vmi_ops.set_lazy_mode)
 		return;
@@ -912,7 +908,7 @@ static inline int __init activate_vmi(void)
 	 * to do this before IRQs get reenabled.  Fortunately, it is
 	 * idempotent.
 	 */
-	apply_paravirt(__start_parainstructions, __stop_parainstructions);
+	apply_paravirt(__parainstructions, __parainstructions_end);
 
 	vmi_bringup();
 
diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S
index d125784ddf5e..568caca87715 100644
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -166,9 +166,9 @@ SECTIONS
   }
   . = ALIGN(4);
   .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
-  	__start_parainstructions = .;
+  	__parainstructions = .;
 	*(.parainstructions)
-  	__stop_parainstructions = .;
+  	__parainstructions_end = .;
   }
   /* .exit.text is discard at runtime, not link time, to deal with references
      from .altinstructions and .eh_frame */
diff --git a/include/asm-i386/alternative.h b/include/asm-i386/alternative.h
index 5b59d07e9d29..277467329583 100644
--- a/include/asm-i386/alternative.h
+++ b/include/asm-i386/alternative.h
@@ -124,8 +124,8 @@ static inline void
 apply_paravirt(struct paravirt_patch_site *start,
 	       struct paravirt_patch_site *end)
 {}
-#define __start_parainstructions NULL
-#define __stop_parainstructions NULL
+#define __parainstructions	NULL
+#define __parainstructions_end	NULL
 #endif
 
 #endif /* _I386_ALTERNATIVE_H */
diff --git a/include/asm-x86_64/alternative.h b/include/asm-x86_64/alternative.h
index 67ebea3cc481..a09fe85c268e 100644
--- a/include/asm-x86_64/alternative.h
+++ b/include/asm-x86_64/alternative.h
@@ -142,8 +142,8 @@ void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end);
 static inline void
 apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end)
 {}
-#define __start_parainstructions NULL
-#define __stop_parainstructions NULL
+#define __parainstructions NULL
+#define __parainstructions_end NULL
 #endif
 
 #endif /* _X86_64_ALTERNATIVE_H */
-- 
cgit v1.2.3


From 856f44ff4af6e57fdc39a8b2bec498c88438bd27 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Wed, 2 May 2007 19:27:17 +0200
Subject: [PATCH] x86-64: Move mtrr prototypes from proto.h to mtrr.h

Signed-off-by: Andi Kleen <ak@suse.de>
---
 include/asm-x86_64/mtrr.h  | 8 ++++++++
 include/asm-x86_64/proto.h | 7 -------
 2 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'include/asm-x86_64')

diff --git a/include/asm-x86_64/mtrr.h b/include/asm-x86_64/mtrr.h
index d6135b2549bf..66809eca98cf 100644
--- a/include/asm-x86_64/mtrr.h
+++ b/include/asm-x86_64/mtrr.h
@@ -135,6 +135,14 @@ struct mtrr_gentry32
 
 #endif /* CONFIG_COMPAT */
 
+#ifdef CONFIG_MTRR
+extern void mtrr_ap_init(void);
+extern void mtrr_bp_init(void);
+#else
+#define mtrr_ap_init() do {} while (0)
+#define mtrr_bp_init() do {} while (0)
+#endif
+
 #endif /* __KERNEL__ */
 
 #endif  /*  _LINUX_MTRR_H  */
diff --git a/include/asm-x86_64/proto.h b/include/asm-x86_64/proto.h
index 98063bcb3b33..85255db1e82d 100644
--- a/include/asm-x86_64/proto.h
+++ b/include/asm-x86_64/proto.h
@@ -14,13 +14,6 @@ extern void pda_init(int);
 extern void early_idt_handler(void);
 
 extern void mcheck_init(struct cpuinfo_x86 *c);
-#ifdef CONFIG_MTRR
-extern void mtrr_ap_init(void);
-extern void mtrr_bp_init(void);
-#else
-#define mtrr_ap_init() do {} while (0)
-#define mtrr_bp_init() do {} while (0)
-#endif
 extern void init_memory_mapping(unsigned long start, unsigned long end);
 
 extern void system_call(void); 
-- 
cgit v1.2.3


From 2b3b4835c94226681c496de9446d456dcf42ed08 Mon Sep 17 00:00:00 2001
From: Bernhard Kaindl <bk@suse.de>
Date: Wed, 2 May 2007 19:27:17 +0200
Subject: [PATCH] x86: Adds mtrr_save_fixed_ranges() for use in two later
 patches.

In this current implementation which is used in other patches,
mtrr_save_fixed_ranges() accepts a dummy void pointer because
in the current implementation of one of these patches, this
function may be called from smp_call_function_single() which
requires that this function takes a void pointer argument.

This function calls get_fixed_ranges(), passing mtrr_state.fixed_ranges
which is the element of the static struct which stores our current
backup of the fixed-range MTRR values which all CPUs shall be
using.

Because  mtrr_save_fixed_ranges calls get_fixed_ranges after
kernel initialisation time, __init needs to be removed from
the declaration of get_fixed_ranges().

If CONFIG_MTRR is not set, we define mtrr_save_fixed_ranges
as an empty statement because there is nothing to do.

AK: Moved prototypes for x86-64 around to fix warnings

Signed-off-by: Bernhard Kaindl <bk@suse.de>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andi Kleen <ak@suse.de>
Cc: Dave Jones <davej@codemonkey.org.uk>
---
 arch/i386/kernel/cpu/mtrr/generic.c | 7 ++++++-
 include/asm-i386/mtrr.h             | 2 ++
 include/asm-x86_64/mtrr.h           | 2 ++
 3 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'include/asm-x86_64')

diff --git a/arch/i386/kernel/cpu/mtrr/generic.c b/arch/i386/kernel/cpu/mtrr/generic.c
index 68b383788882..150cf5055a3c 100644
--- a/arch/i386/kernel/cpu/mtrr/generic.c
+++ b/arch/i386/kernel/cpu/mtrr/generic.c
@@ -37,7 +37,7 @@ get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr)
 	rdmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi);
 }
 
-static void __init
+static void
 get_fixed_ranges(mtrr_type * frs)
 {
 	unsigned int *p = (unsigned int *) frs;
@@ -51,6 +51,11 @@ get_fixed_ranges(mtrr_type * frs)
 		rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]);
 }
 
+void mtrr_save_fixed_ranges(void *info)
+{
+	get_fixed_ranges(mtrr_state.fixed_ranges);
+}
+
 static void __init print_fixed(unsigned base, unsigned step, const mtrr_type*types)
 {
 	unsigned i;
diff --git a/include/asm-i386/mtrr.h b/include/asm-i386/mtrr.h
index 07f063ae26ea..02a41b99bd7e 100644
--- a/include/asm-i386/mtrr.h
+++ b/include/asm-i386/mtrr.h
@@ -69,6 +69,7 @@ struct mtrr_gentry
 
 /*  The following functions are for use by other drivers  */
 # ifdef CONFIG_MTRR
+extern void mtrr_save_fixed_ranges(void *);
 extern int mtrr_add (unsigned long base, unsigned long size,
 		     unsigned int type, char increment);
 extern int mtrr_add_page (unsigned long base, unsigned long size,
@@ -79,6 +80,7 @@ extern void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi);
 extern void mtrr_ap_init(void);
 extern void mtrr_bp_init(void);
 #  else
+#define mtrr_save_fixed_ranges(arg) do {} while (0)
 static __inline__ int mtrr_add (unsigned long base, unsigned long size,
 				unsigned int type, char increment)
 {
diff --git a/include/asm-x86_64/mtrr.h b/include/asm-x86_64/mtrr.h
index 66809eca98cf..1b326cbb9305 100644
--- a/include/asm-x86_64/mtrr.h
+++ b/include/asm-x86_64/mtrr.h
@@ -138,9 +138,11 @@ struct mtrr_gentry32
 #ifdef CONFIG_MTRR
 extern void mtrr_ap_init(void);
 extern void mtrr_bp_init(void);
+extern void mtrr_save_fixed_ranges(void *);
 #else
 #define mtrr_ap_init() do {} while (0)
 #define mtrr_bp_init() do {} while (0)
+#define mtrr_save_fixed_ranges(arg) do {} while (0)
 #endif
 
 #endif /* __KERNEL__ */
-- 
cgit v1.2.3


From 2b1f6278d77c1f2f669346fc2bb48012b5e9495a Mon Sep 17 00:00:00 2001
From: Bernhard Kaindl <bk@suse.de>
Date: Wed, 2 May 2007 19:27:17 +0200
Subject: [PATCH] x86: Save the MTRRs of the BSP before booting an AP

Applied fix by Andew Morton:
http://lkml.org/lkml/2007/4/8/88 - Fix `make headers_check'.

AMD and Intel x86 CPU manuals state that it is the responsibility of
system software to initialize and maintain MTRR consistency across
all processors in Multi-Processing Environments.

Quote from page 188 of the AMD64 System Programming manual (Volume 2):

7.6.5 MTRRs in Multi-Processing Environments

"In multi-processing environments, the MTRRs located in all processors must
characterize memory in the same way. Generally, this means that identical
values are written to the MTRRs used by the processors." (short omission here)
"Failure to do so may result in coherency violations or loss of atomicity.
Processor implementations do not check the MTRR settings in other processors
to ensure consistency. It is the responsibility of system software to
initialize and maintain MTRR consistency across all processors."

Current Linux MTRR code already implements the above in the case that the
BIOS does not properly initialize MTRRs on the secondary processors,
but the case where the fixed-range MTRRs of the boot processor are changed
after Linux started to boot, before the initialsation of a secondary
processor, is not handled yet.

In this case, secondary processors are currently initialized by Linux
with MTRRs which the boot processor had very early, when mtrr_bp_init()
did run, but not with the MTRRs which the boot processor uses at the
time when that secondary processors is actually booted,
causing differing MTRR contents on the secondary processors.

Such situation happens on Acer Ferrari 1000 and 5000 notebooks where the
BIOS enables and sets AMD-specific IORR bits in the fixed-range MTRRs
of the boot processor when it transitions the system into ACPI mode.
The SMI handler of the BIOS does this in SMM, entered while Linux ACPI
code runs acpi_enable().

Other occasions where the SMI handler of the BIOS may change bits in
the MTRRs could occur as well. To initialize newly booted secodary
processors with the fixed-range MTRRs which the boot processor uses
at that time, this patch saves the fixed-range MTRRs of the boot
processor before new secondary processors are started. When the
secondary processors run their Linux initialisation code, their
fixed-range MTRRs will be updated with the saved fixed-range MTRRs.

If CONFIG_MTRR is not set, we define mtrr_save_state
as an empty statement because there is nothing to do.

Possible TODOs:

*) CPU-hotplugging outside of SMP suspend/resume is not yet tested
   with this patch.

*) If, even in this case, an AP never runs i386/do_boot_cpu or x86_64/cpu_up,
   then the calls to mtrr_save_state() could be replaced by calls to
   mtrr_save_fixed_ranges(NULL) and  mtrr_save_state() would not be
   needed.

   That would need either verification of the CPU-hotplug code or
   at least a test on a >2 CPU machine.

*) The MTRRs of other running processors are not yet checked at this
   time but it might be interesting to syncronize the MTTRs of all
   processors before booting. That would be an incremental patch,
   but of rather low priority since there is no machine known so
   far which would require this.

AK: moved prototypes on x86-64 around to fix warnings

Signed-off-by: Bernhard Kaindl <bk@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Cc: Dave Jones <davej@codemonkey.org.uk>
---
 arch/i386/kernel/cpu/mtrr/main.c | 11 +++++++++++
 arch/i386/kernel/smpboot.c       |  7 +++++++
 arch/x86_64/kernel/smpboot.c     |  6 ++++++
 include/asm-i386/mtrr.h          |  2 ++
 include/asm-x86_64/mtrr.h        |  2 ++
 5 files changed, 28 insertions(+)

(limited to 'include/asm-x86_64')

diff --git a/arch/i386/kernel/cpu/mtrr/main.c b/arch/i386/kernel/cpu/mtrr/main.c
index 0acfb6a5a220..02a2f39e5e0a 100644
--- a/arch/i386/kernel/cpu/mtrr/main.c
+++ b/arch/i386/kernel/cpu/mtrr/main.c
@@ -729,6 +729,17 @@ void mtrr_ap_init(void)
 	local_irq_restore(flags);
 }
 
+/**
+ * Save current fixed-range MTRR state of the BSP
+ */
+void mtrr_save_state(void)
+{
+	if (smp_processor_id() == 0)
+		mtrr_save_fixed_ranges(NULL);
+	else
+		smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1, 1);
+}
+
 static int __init mtrr_init_finialize(void)
 {
 	if (!mtrr_if)
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index 7c1dbef399cd..f14d93351a82 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -58,6 +58,7 @@
 #include <mach_wakecpu.h>
 #include <smpboot_hooks.h>
 #include <asm/vmi.h>
+#include <asm/mtrr.h>
 
 /* Set if we find a B stepping CPU */
 static int __devinitdata smp_b_stepping;
@@ -814,6 +815,12 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
 	unsigned long start_eip;
 	unsigned short nmi_high = 0, nmi_low = 0;
 
+	/*
+	 * Save current MTRR state in case it was changed since early boot
+	 * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync:
+	 */
+	mtrr_save_state();
+
 	/*
 	 * We can't use kernel_thread since we must avoid to
 	 * reschedule the child.
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index 14724be48bec..ddc392bee243 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -944,6 +944,12 @@ int __cpuinit __cpu_up(unsigned int cpu)
  		return -ENOSYS;
 	}
 
+	/*
+	 * Save current MTRR state in case it was changed since early boot
+	 * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync:
+	 */
+	mtrr_save_state();
+
 	per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
 	/* Boot it! */
 	err = do_boot_cpu(cpu, apicid);
diff --git a/include/asm-i386/mtrr.h b/include/asm-i386/mtrr.h
index 02a41b99bd7e..7e9c7ccbdcfe 100644
--- a/include/asm-i386/mtrr.h
+++ b/include/asm-i386/mtrr.h
@@ -70,6 +70,7 @@ struct mtrr_gentry
 /*  The following functions are for use by other drivers  */
 # ifdef CONFIG_MTRR
 extern void mtrr_save_fixed_ranges(void *);
+extern void mtrr_save_state(void);
 extern int mtrr_add (unsigned long base, unsigned long size,
 		     unsigned int type, char increment);
 extern int mtrr_add_page (unsigned long base, unsigned long size,
@@ -81,6 +82,7 @@ extern void mtrr_ap_init(void);
 extern void mtrr_bp_init(void);
 #  else
 #define mtrr_save_fixed_ranges(arg) do {} while (0)
+#define mtrr_save_state() do {} while (0)
 static __inline__ int mtrr_add (unsigned long base, unsigned long size,
 				unsigned int type, char increment)
 {
diff --git a/include/asm-x86_64/mtrr.h b/include/asm-x86_64/mtrr.h
index 1b326cbb9305..b557c486bef8 100644
--- a/include/asm-x86_64/mtrr.h
+++ b/include/asm-x86_64/mtrr.h
@@ -139,10 +139,12 @@ struct mtrr_gentry32
 extern void mtrr_ap_init(void);
 extern void mtrr_bp_init(void);
 extern void mtrr_save_fixed_ranges(void *);
+extern void mtrr_save_state(void);
 #else
 #define mtrr_ap_init() do {} while (0)
 #define mtrr_bp_init() do {} while (0)
 #define mtrr_save_fixed_ranges(arg) do {} while (0)
+#define mtrr_save_state() do {} while (0)
 #endif
 
 #endif /* __KERNEL__ */
-- 
cgit v1.2.3


From 8339e9fba33aa3205f541478c413982c0ac5a37f Mon Sep 17 00:00:00 2001
From: Fernando Luis VazquezCao <fernando@oss.ntt.co.jp>
Date: Wed, 2 May 2007 19:27:17 +0200
Subject: [PATCH] x86-64: safe_apic_wait_icr_idle - x86_64
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

apic_wait_icr_idle looks like this:

static __inline__ void apic_wait_icr_idle(void)
{
  while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
    cpu_relax();
}

The busy loop in this function would not be problematic if the
corresponding status bit in the ICR were always updated, but that does
not seem to be the case under certain crash scenarios. Kdump uses an IPI
to stop the other CPUs in the event of a crash, but when any of the
other CPUs are locked-up inside the NMI handler the CPU that sends the
IPI will end up looping forever in the ICR check, effectively
hard-locking the whole system.

Quoting from Intel's "MultiProcessor Specification" (Version 1.4), B-3:

"A local APIC unit indicates successful dispatch of an IPI by
resetting the Delivery Status bit in the Interrupt Command
Register (ICR). The operating system polls the delivery status
bit after sending an INIT or STARTUP IPI until the command has
been dispatched.

A period of 20 microseconds should be sufficient for IPI dispatch
to complete under normal operating conditions. If the IPI is not
successfully dispatched, the operating system can abort the
command. Alternatively, the operating system can retry the IPI by
writing the lower 32-bit double word of the ICR. This “time-out”
mechanism can be implemented through an external interrupt, if
interrupts are enabled on the processor, or through execution of
an instruction or time-stamp counter spin loop."

Intel's documentation suggests the implementation of a time-out
mechanism, which, by the way, is already being open-coded in some parts
of the kernel that tinker with ICR.

Create a apic_wait_icr_idle replacement that implements the time-out
mechanism and that can be used to solve the aforementioned problem.

AK: moved both functions out of line
AK: Added improved loop from Keith Owens

Signed-off-by: Fernando Luis Vazquez Cao <fernando@oss.ntt.co.jp>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/apic.c | 22 ++++++++++++++++++++++
 include/asm-x86_64/apic.h |  8 +++-----
 2 files changed, 25 insertions(+), 5 deletions(-)

(limited to 'include/asm-x86_64')

diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c
index 3421f21b6c70..943ec4d1bd8e 100644
--- a/arch/x86_64/kernel/apic.c
+++ b/arch/x86_64/kernel/apic.c
@@ -68,6 +68,28 @@ int using_apic_timer __read_mostly = 0;
 
 static void apic_pm_activate(void);
 
+void apic_wait_icr_idle(void)
+{
+	while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
+		cpu_relax();
+}
+
+unsigned int safe_apic_wait_icr_idle(void)
+{
+	unsigned int send_status;
+	int timeout;
+
+	timeout = 0;
+	do {
+		send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
+		if (!send_status)
+			break;
+		udelay(100);
+	} while (timeout++ < 1000);
+
+	return send_status;
+}
+
 void enable_NMI_through_LVT0 (void * dummy)
 {
 	unsigned int v;
diff --git a/include/asm-x86_64/apic.h b/include/asm-x86_64/apic.h
index 2f3b013595ab..45e9fca1febc 100644
--- a/include/asm-x86_64/apic.h
+++ b/include/asm-x86_64/apic.h
@@ -2,6 +2,7 @@
 #define __ASM_APIC_H
 
 #include <linux/pm.h>
+#include <linux/delay.h>
 #include <asm/fixmap.h>
 #include <asm/apicdef.h>
 #include <asm/system.h>
@@ -47,11 +48,8 @@ static __inline unsigned int apic_read(unsigned long reg)
 	return *((volatile unsigned int *)(APIC_BASE+reg));
 }
 
-static __inline__ void apic_wait_icr_idle(void)
-{
-	while (apic_read( APIC_ICR ) & APIC_ICR_BUSY)
-		cpu_relax();
-}
+extern void apic_wait_icr_idle(void);
+extern unsigned int safe_apic_wait_icr_idle(void);
 
 static inline void ack_APIC_irq(void)
 {
-- 
cgit v1.2.3


From 9062d888aa448318e38792b6879a795dd10adda4 Mon Sep 17 00:00:00 2001
From: "Fernando Luis [** ISO-8859-1 charset **] V�zquezCao"
 <fernando@oss.ntt.co.jp>
Date: Wed, 2 May 2007 19:27:18 +0200
Subject: [PATCH] x86-64: __send_IPI_dest_field - x86_64

Implement __send_IPI_dest_field which can be used to send IPIs when the
"destination shorthand" field of the ICR is set to 00 (destination
field). Use it whenever possible.

Signed-off-by: Fernando Luis Vazquez Cao <fernando@oss.ntt.co.jp>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/genapic_flat.c | 23 +----------------
 include/asm-x86_64/ipi.h          | 54 +++++++++++++++++++++++----------------
 2 files changed, 33 insertions(+), 44 deletions(-)

(limited to 'include/asm-x86_64')

diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c
index 9e0a552f0e4a..01d3939e3a9c 100644
--- a/arch/x86_64/kernel/genapic_flat.c
+++ b/arch/x86_64/kernel/genapic_flat.c
@@ -61,31 +61,10 @@ static void flat_init_apic_ldr(void)
 static void flat_send_IPI_mask(cpumask_t cpumask, int vector)
 {
 	unsigned long mask = cpus_addr(cpumask)[0];
-	unsigned long cfg;
 	unsigned long flags;
 
 	local_irq_save(flags);
-
-	/*
-	 * Wait for idle.
-	 */
-	apic_wait_icr_idle();
-
-	/*
-	 * prepare target chip field
-	 */
-	cfg = __prepare_ICR2(mask);
-	apic_write(APIC_ICR2, cfg);
-
-	/*
-	 * program the ICR
-	 */
-	cfg = __prepare_ICR(0, vector, APIC_DEST_LOGICAL);
-
-	/*
-	 * Send the IPI. The write to APIC_ICR fires this off.
-	 */
-	apic_write(APIC_ICR, cfg);
+	__send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL);
 	local_irq_restore(flags);
 }
 
diff --git a/include/asm-x86_64/ipi.h b/include/asm-x86_64/ipi.h
index ffa6f1517f1a..26961e671948 100644
--- a/include/asm-x86_64/ipi.h
+++ b/include/asm-x86_64/ipi.h
@@ -74,10 +74,39 @@ static inline void __send_IPI_shortcut(unsigned int shortcut, int vector, unsign
 	apic_write(APIC_ICR, cfg);
 }
 
+/*
+ * This is used to send an IPI with no shorthand notation (the destination is
+ * specified in bits 56 to 63 of the ICR).
+ */
+static inline void __send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest)
+{
+	unsigned long cfg;
+
+	/*
+	 * Wait for idle.
+	 */
+	apic_wait_icr_idle();
+
+	/*
+	 * prepare target chip field
+	 */
+	cfg = __prepare_ICR2(mask);
+	apic_write(APIC_ICR2, cfg);
+
+	/*
+	 * program the ICR
+	 */
+	cfg = __prepare_ICR(0, vector, dest);
+
+	/*
+	 * Send the IPI. The write to APIC_ICR fires this off.
+	 */
+	apic_write(APIC_ICR, cfg);
+}
 
 static inline void send_IPI_mask_sequence(cpumask_t mask, int vector)
 {
-	unsigned long cfg, flags;
+	unsigned long flags;
 	unsigned long query_cpu;
 
 	/*
@@ -86,28 +115,9 @@ static inline void send_IPI_mask_sequence(cpumask_t mask, int vector)
 	 * - mbligh
 	 */
 	local_irq_save(flags);
-
 	for_each_cpu_mask(query_cpu, mask) {
-		/*
-		 * Wait for idle.
-		 */
-		apic_wait_icr_idle();
-
-		/*
-		 * prepare target chip field
-		 */
-		cfg = __prepare_ICR2(x86_cpu_to_apicid[query_cpu]);
-		apic_write(APIC_ICR2, cfg);
-
-		/*
-		 * program the ICR
-		 */
-		cfg = __prepare_ICR(0, vector, APIC_DEST_PHYSICAL);
-
-		/*
-		 * Send the IPI. The write to APIC_ICR fires this off.
-		 */
-		apic_write(APIC_ICR, cfg);
+		__send_IPI_dest_field(x86_cpu_to_apicid[query_cpu],
+				      vector, APIC_DEST_PHYSICAL);
 	}
 	local_irq_restore(flags);
 }
-- 
cgit v1.2.3


From 70ae77f497a57b3ef6b0987b6310327264517cb0 Mon Sep 17 00:00:00 2001
From: "Fernando Luis [** ISO-8859-1 charset **] V�zquezCao"
 <fernando@oss.ntt.co.jp>
Date: Wed, 2 May 2007 19:27:18 +0200
Subject: [PATCH] x86-64: Use safe_apic_wait_icr_idle in __send_IPI_dest_field
 - x86_64

Use safe_apic_wait_icr_idle to check ICR idle bit if the vector is
NMI_VECTOR to avoid potential hangups in the event of crash when kdump
tries to stop the other CPUs.

Signed-off-by: Fernando Luis Vazquez Cao <fernando@oss.ntt.co.jp>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 include/asm-x86_64/ipi.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/asm-x86_64')

diff --git a/include/asm-x86_64/ipi.h b/include/asm-x86_64/ipi.h
index 26961e671948..a7c75ea408a8 100644
--- a/include/asm-x86_64/ipi.h
+++ b/include/asm-x86_64/ipi.h
@@ -85,7 +85,10 @@ static inline void __send_IPI_dest_field(unsigned int mask, int vector, unsigned
 	/*
 	 * Wait for idle.
 	 */
-	apic_wait_icr_idle();
+	if (unlikely(vector == NMI_VECTOR))
+		safe_apic_wait_icr_idle();
+	else
+		apic_wait_icr_idle();
 
 	/*
 	 * prepare target chip field
-- 
cgit v1.2.3


From 57a4f91ae5571edd7c0428285d8df16bb8bf5f40 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Wed, 2 May 2007 19:27:18 +0200
Subject: [PATCH] x86-64: Auto compute __NR_syscall_max at compile time

No need to maintain it anymore

Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/asm-offsets.c | 10 ++++++++++
 arch/x86_64/kernel/syscall.c     |  1 +
 include/asm-x86_64/unistd.h      |  2 --
 3 files changed, 11 insertions(+), 2 deletions(-)

(limited to 'include/asm-x86_64')

diff --git a/arch/x86_64/kernel/asm-offsets.c b/arch/x86_64/kernel/asm-offsets.c
index 96687e2beb2c..778953bc636c 100644
--- a/arch/x86_64/kernel/asm-offsets.c
+++ b/arch/x86_64/kernel/asm-offsets.c
@@ -21,6 +21,14 @@
 
 #define BLANK() asm volatile("\n->" : : )
 
+#define __NO_STUBS 1
+#undef __SYSCALL
+#undef _ASM_X86_64_UNISTD_H_
+#define __SYSCALL(nr, sym) [nr] = 1,
+static char syscalls[] = {
+#include <asm/unistd.h>
+};
+
 int main(void)
 {
 #define ENTRY(entry) DEFINE(tsk_ ## entry, offsetof(struct task_struct, entry))
@@ -71,5 +79,7 @@ int main(void)
 	DEFINE(TSS_ist, offsetof(struct tss_struct, ist));
 	BLANK();
 	DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx));
+	BLANK();
+	DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
 	return 0;
 }
diff --git a/arch/x86_64/kernel/syscall.c b/arch/x86_64/kernel/syscall.c
index 213fd6ab789d..63d592c276cc 100644
--- a/arch/x86_64/kernel/syscall.c
+++ b/arch/x86_64/kernel/syscall.c
@@ -3,6 +3,7 @@
 #include <linux/linkage.h>
 #include <linux/sys.h>
 #include <linux/cache.h>
+#include <asm/asm-offsets.h>
 
 #define __NO_STUBS
 
diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index 576b29732d3c..26e23e01c54a 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -620,8 +620,6 @@ __SYSCALL(__NR_vmsplice, sys_vmsplice)
 #define __NR_move_pages		279
 __SYSCALL(__NR_move_pages, sys_move_pages)
 
-#define __NR_syscall_max __NR_move_pages
-
 #ifndef __NO_STUBS
 #define __ARCH_WANT_OLD_READDIR
 #define __ARCH_WANT_OLD_STAT
-- 
cgit v1.2.3


From 05cb007dac9a50148daf87d0b9469e0cd05fd5e7 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Wed, 2 May 2007 19:27:20 +0200
Subject: [PATCH] x86-64: Use the 32bit wd_ops for 64bit too.

This mainly removes a lot of code, replacing it with calls into the new 32bit
perfctr-watchdog.c

Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/Makefile |   4 +-
 arch/x86_64/kernel/nmi.c    | 678 +++-----------------------------------------
 include/asm-x86_64/nmi.h    |   9 +
 3 files changed, 45 insertions(+), 646 deletions(-)

(limited to 'include/asm-x86_64')

diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
index a613e13d0e75..4d94c51803d8 100644
--- a/arch/x86_64/kernel/Makefile
+++ b/arch/x86_64/kernel/Makefile
@@ -8,7 +8,8 @@ obj-y	:= process.o signal.o entry.o traps.o irq.o \
 		ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \
 		x8664_ksyms.o i387.o syscall.o vsyscall.o \
 		setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o \
-		pci-dma.o pci-nommu.o alternative.o hpet.o tsc.o bugs.o
+		pci-dma.o pci-nommu.o alternative.o hpet.o tsc.o bugs.o \
+		perfctr-watchdog.o
 
 obj-$(CONFIG_STACKTRACE)	+= stacktrace.o
 obj-$(CONFIG_X86_MCE)		+= mce.o therm_throt.o
@@ -57,3 +58,4 @@ i8237-y				+= ../../i386/kernel/i8237.o
 msr-$(subst m,y,$(CONFIG_X86_MSR))  += ../../i386/kernel/msr.o
 alternative-y			+= ../../i386/kernel/alternative.o
 pcspeaker-y			+= ../../i386/kernel/pcspeaker.o
+perfctr-watchdog-y		+= ../../i386/kernel/cpu/perfctr-watchdog.o
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index 010d3d9bd56a..6cd2b30e2ffc 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -27,28 +27,11 @@
 #include <asm/proto.h>
 #include <asm/kdebug.h>
 #include <asm/mce.h>
-#include <asm/intel_arch_perfmon.h>
 
 int unknown_nmi_panic;
 int nmi_watchdog_enabled;
 int panic_on_unrecovered_nmi;
 
-/* perfctr_nmi_owner tracks the ownership of the perfctr registers:
- * evtsel_nmi_owner tracks the ownership of the event selection
- * - different performance counters/ event selection may be reserved for
- *   different subsystems this reservation system just tries to coordinate
- *   things a little
- */
-
-/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
- * offset from MSR_P4_BSU_ESCR0.  It will be the max for all platforms (for now)
- */
-#define NMI_MAX_COUNTER_BITS 66
-#define NMI_MAX_COUNTER_LONGS BITS_TO_LONGS(NMI_MAX_COUNTER_BITS)
-
-static DEFINE_PER_CPU(unsigned, perfctr_nmi_owner[NMI_MAX_COUNTER_LONGS]);
-static DEFINE_PER_CPU(unsigned, evntsel_nmi_owner[NMI_MAX_COUNTER_LONGS]);
-
 static cpumask_t backtrace_mask = CPU_MASK_NONE;
 
 /* nmi_active:
@@ -63,191 +46,11 @@ int panic_on_timeout;
 unsigned int nmi_watchdog = NMI_DEFAULT;
 static unsigned int nmi_hz = HZ;
 
-struct nmi_watchdog_ctlblk {
-	int enabled;
-	u64 check_bit;
-	unsigned int cccr_msr;
-	unsigned int perfctr_msr;  /* the MSR to reset in NMI handler */
-	unsigned int evntsel_msr;  /* the MSR to select the events to handle */
-};
-static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk);
+static DEFINE_PER_CPU(short, wd_enabled);
 
 /* local prototypes */
 static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu);
 
-/* converts an msr to an appropriate reservation bit */
-static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
-{
-	/* returns the bit offset of the performance counter register */
-	switch (boot_cpu_data.x86_vendor) {
-	case X86_VENDOR_AMD:
-		return (msr - MSR_K7_PERFCTR0);
-	case X86_VENDOR_INTEL:
-		if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
-			return (msr - MSR_ARCH_PERFMON_PERFCTR0);
-		else
-			return (msr - MSR_P4_BPU_PERFCTR0);
-	}
-	return 0;
-}
-
-/* converts an msr to an appropriate reservation bit */
-static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
-{
-	/* returns the bit offset of the event selection register */
-	switch (boot_cpu_data.x86_vendor) {
-	case X86_VENDOR_AMD:
-		return (msr - MSR_K7_EVNTSEL0);
-	case X86_VENDOR_INTEL:
-		if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
-			return (msr - MSR_ARCH_PERFMON_EVENTSEL0);
-		else
-			return (msr - MSR_P4_BSU_ESCR0);
-	}
-	return 0;
-}
-
-/* checks for a bit availability (hack for oprofile) */
-int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
-{
-	int cpu;
-	BUG_ON(counter > NMI_MAX_COUNTER_BITS);
-	for_each_possible_cpu (cpu) {
-		if (test_bit(counter, &per_cpu(perfctr_nmi_owner, cpu)))
-			return 0;
-	}
-	return 1;
-}
-
-/* checks the an msr for availability */
-int avail_to_resrv_perfctr_nmi(unsigned int msr)
-{
-	unsigned int counter;
-	int cpu;
-
-	counter = nmi_perfctr_msr_to_bit(msr);
-	BUG_ON(counter > NMI_MAX_COUNTER_BITS);
-
-	for_each_possible_cpu (cpu) {
-		if (test_bit(counter, &per_cpu(perfctr_nmi_owner, cpu)))
-			return 0;
-	}
-	return 1;
-}
-
-static int __reserve_perfctr_nmi(int cpu, unsigned int msr)
-{
-	unsigned int counter;
-	if (cpu < 0)
-		cpu = smp_processor_id();
-
-	counter = nmi_perfctr_msr_to_bit(msr);
-	BUG_ON(counter > NMI_MAX_COUNTER_BITS);
-
-	if (!test_and_set_bit(counter, &per_cpu(perfctr_nmi_owner, cpu)))
-		return 1;
-	return 0;
-}
-
-static void __release_perfctr_nmi(int cpu, unsigned int msr)
-{
-	unsigned int counter;
-	if (cpu < 0)
-		cpu = smp_processor_id();
-
-	counter = nmi_perfctr_msr_to_bit(msr);
-	BUG_ON(counter > NMI_MAX_COUNTER_BITS);
-
-	clear_bit(counter, &per_cpu(perfctr_nmi_owner, cpu));
-}
-
-int reserve_perfctr_nmi(unsigned int msr)
-{
-	int cpu, i;
-	for_each_possible_cpu (cpu) {
-		if (!__reserve_perfctr_nmi(cpu, msr)) {
-			for_each_possible_cpu (i) {
-				if (i >= cpu)
-					break;
-				__release_perfctr_nmi(i, msr);
-			}
-			return 0;
-		}
-	}
-	return 1;
-}
-
-void release_perfctr_nmi(unsigned int msr)
-{
-	int cpu;
-	for_each_possible_cpu (cpu)
-		__release_perfctr_nmi(cpu, msr);
-}
-
-int __reserve_evntsel_nmi(int cpu, unsigned int msr)
-{
-	unsigned int counter;
-	if (cpu < 0)
-		cpu = smp_processor_id();
-
-	counter = nmi_evntsel_msr_to_bit(msr);
-	BUG_ON(counter > NMI_MAX_COUNTER_BITS);
-
-	if (!test_and_set_bit(counter, &per_cpu(evntsel_nmi_owner, cpu)[0]))
-		return 1;
-	return 0;
-}
-
-static void __release_evntsel_nmi(int cpu, unsigned int msr)
-{
-	unsigned int counter;
-	if (cpu < 0)
-		cpu = smp_processor_id();
-
-	counter = nmi_evntsel_msr_to_bit(msr);
-	BUG_ON(counter > NMI_MAX_COUNTER_BITS);
-
-	clear_bit(counter, &per_cpu(evntsel_nmi_owner, cpu)[0]);
-}
-
-int reserve_evntsel_nmi(unsigned int msr)
-{
-	int cpu, i;
-	for_each_possible_cpu (cpu) {
-		if (!__reserve_evntsel_nmi(cpu, msr)) {
-			for_each_possible_cpu (i) {
-				if (i >= cpu)
-					break;
-				__release_evntsel_nmi(i, msr);
-			}
-			return 0;
-		}
-	}
-	return 1;
-}
-
-void release_evntsel_nmi(unsigned int msr)
-{
-	int cpu;
-	for_each_possible_cpu (cpu) {
-		__release_evntsel_nmi(cpu, msr);
-	}
-}
-
-static __cpuinit inline int nmi_known_cpu(void)
-{
-	switch (boot_cpu_data.x86_vendor) {
-	case X86_VENDOR_AMD:
-		return boot_cpu_data.x86 == 15 || boot_cpu_data.x86 == 16;
-	case X86_VENDOR_INTEL:
-		if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
-			return 1;
-		else
-			return (boot_cpu_data.x86 == 15);
-	}
-	return 0;
-}
-
 /* Run after command line and cpu_init init, but before all other checks */
 void nmi_watchdog_default(void)
 {
@@ -277,23 +80,6 @@ static __init void nmi_cpu_busy(void *data)
 }
 #endif
 
-static unsigned int adjust_for_32bit_ctr(unsigned int hz)
-{
-	unsigned int retval = hz;
-
-	/*
-	 * On Intel CPUs with ARCH_PERFMON only 32 bits in the counter
-	 * are writable, with higher bits sign extending from bit 31.
-	 * So, we can only program the counter with 31 bit values and
-	 * 32nd bit should be 1, for 33.. to be 1.
-	 * Find the appropriate nmi_hz
-	 */
- 	if ((((u64)cpu_khz * 1000) / retval) > 0x7fffffffULL) {
-		retval = ((u64)cpu_khz * 1000) / 0x7fffffffUL + 1;
-	}
-	return retval;
-}
-
 int __init check_nmi_watchdog (void)
 {
 	int *counts;
@@ -322,14 +108,14 @@ int __init check_nmi_watchdog (void)
 	mdelay((20*1000)/nmi_hz); // wait 20 ticks
 
 	for_each_online_cpu(cpu) {
-		if (!per_cpu(nmi_watchdog_ctlblk, cpu).enabled)
+		if (!per_cpu(wd_enabled, cpu))
 			continue;
 		if (cpu_pda(cpu)->__nmi_count - counts[cpu] <= 5) {
 			printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n",
 			       cpu,
 			       counts[cpu],
 			       cpu_pda(cpu)->__nmi_count);
-			per_cpu(nmi_watchdog_ctlblk, cpu).enabled = 0;
+			per_cpu(wd_enabled, cpu) = 0;
 			atomic_dec(&nmi_active);
 		}
 	}
@@ -344,13 +130,8 @@ int __init check_nmi_watchdog (void)
 
 	/* now that we know it works we can reduce NMI frequency to
 	   something more reasonable; makes a difference in some configs */
-	if (nmi_watchdog == NMI_LOCAL_APIC) {
-		struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
-		nmi_hz = 1;
-	 	if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR1)
-			nmi_hz = adjust_for_32bit_ctr(nmi_hz);
-	}
+	if (nmi_watchdog == NMI_LOCAL_APIC)
+		nmi_hz = lapic_adjust_nmi_hz(1);
 
 	kfree(counts);
 	return 0;
@@ -379,57 +160,6 @@ int __init setup_nmi_watchdog(char *str)
 
 __setup("nmi_watchdog=", setup_nmi_watchdog);
 
-static void disable_lapic_nmi_watchdog(void)
-{
-	BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
-
-	if (atomic_read(&nmi_active) <= 0)
-		return;
-
-	on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);
-
-	BUG_ON(atomic_read(&nmi_active) != 0);
-}
-
-static void enable_lapic_nmi_watchdog(void)
-{
-	BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
-
-	/* are we already enabled */
-	if (atomic_read(&nmi_active) != 0)
-		return;
-
-	/* are we lapic aware */
-	if (nmi_known_cpu() <= 0)
-		return;
-
-	on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
-	touch_nmi_watchdog();
-}
-
-void disable_timer_nmi_watchdog(void)
-{
-	BUG_ON(nmi_watchdog != NMI_IO_APIC);
-
-	if (atomic_read(&nmi_active) <= 0)
-		return;
-
-	disable_irq(0);
-	on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);
-
-	BUG_ON(atomic_read(&nmi_active) != 0);
-}
-
-void enable_timer_nmi_watchdog(void)
-{
-	BUG_ON(nmi_watchdog != NMI_IO_APIC);
-
-	if (atomic_read(&nmi_active) == 0) {
-		touch_nmi_watchdog();
-		on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
-		enable_irq(0);
-	}
-}
 
 static void __acpi_nmi_disable(void *__unused)
 {
@@ -515,275 +245,9 @@ late_initcall(init_lapic_nmi_sysfs);
 
 #endif	/* CONFIG_PM */
 
-/*
- * Activate the NMI watchdog via the local APIC.
- * Original code written by Keith Owens.
- */
-
-/* Note that these events don't tick when the CPU idles. This means
-   the frequency varies with CPU load. */
-
-#define K7_EVNTSEL_ENABLE	(1 << 22)
-#define K7_EVNTSEL_INT		(1 << 20)
-#define K7_EVNTSEL_OS		(1 << 17)
-#define K7_EVNTSEL_USR		(1 << 16)
-#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING	0x76
-#define K7_NMI_EVENT		K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
-
-static int setup_k7_watchdog(void)
-{
-	unsigned int perfctr_msr, evntsel_msr;
-	unsigned int evntsel;
-	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
-	perfctr_msr = MSR_K7_PERFCTR0;
-	evntsel_msr = MSR_K7_EVNTSEL0;
-	if (!__reserve_perfctr_nmi(-1, perfctr_msr))
-		goto fail;
-
-	if (!__reserve_evntsel_nmi(-1, evntsel_msr))
-		goto fail1;
-
-	/* Simulator may not support it */
-	if (checking_wrmsrl(evntsel_msr, 0UL))
-		goto fail2;
-	wrmsrl(perfctr_msr, 0UL);
-
-	evntsel = K7_EVNTSEL_INT
-		| K7_EVNTSEL_OS
-		| K7_EVNTSEL_USR
-		| K7_NMI_EVENT;
-
-	/* setup the timer */
-	wrmsr(evntsel_msr, evntsel, 0);
-	wrmsrl(perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz));
-	apic_write(APIC_LVTPC, APIC_DM_NMI);
-	evntsel |= K7_EVNTSEL_ENABLE;
-	wrmsr(evntsel_msr, evntsel, 0);
-
-	wd->perfctr_msr = perfctr_msr;
-	wd->evntsel_msr = evntsel_msr;
-	wd->cccr_msr = 0;  //unused
-	wd->check_bit = 1ULL<<63;
-	return 1;
-fail2:
-	__release_evntsel_nmi(-1, evntsel_msr);
-fail1:
-	__release_perfctr_nmi(-1, perfctr_msr);
-fail:
-	return 0;
-}
-
-static void stop_k7_watchdog(void)
-{
-	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
-	wrmsr(wd->evntsel_msr, 0, 0);
-
-	__release_evntsel_nmi(-1, wd->evntsel_msr);
-	__release_perfctr_nmi(-1, wd->perfctr_msr);
-}
-
-/* Note that these events don't tick when the CPU idles. This means
-   the frequency varies with CPU load. */
-
-#define MSR_P4_MISC_ENABLE_PERF_AVAIL	(1<<7)
-#define P4_ESCR_EVENT_SELECT(N)	((N)<<25)
-#define P4_ESCR_OS		(1<<3)
-#define P4_ESCR_USR		(1<<2)
-#define P4_CCCR_OVF_PMI0	(1<<26)
-#define P4_CCCR_OVF_PMI1	(1<<27)
-#define P4_CCCR_THRESHOLD(N)	((N)<<20)
-#define P4_CCCR_COMPLEMENT	(1<<19)
-#define P4_CCCR_COMPARE		(1<<18)
-#define P4_CCCR_REQUIRED	(3<<16)
-#define P4_CCCR_ESCR_SELECT(N)	((N)<<13)
-#define P4_CCCR_ENABLE		(1<<12)
-#define P4_CCCR_OVF 		(1<<31)
-/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
-   CRU_ESCR0 (with any non-null event selector) through a complemented
-   max threshold. [IA32-Vol3, Section 14.9.9] */
-
-static int setup_p4_watchdog(void)
-{
-	unsigned int perfctr_msr, evntsel_msr, cccr_msr;
-	unsigned int evntsel, cccr_val;
-	unsigned int misc_enable, dummy;
-	unsigned int ht_num;
-	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
-	rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy);
-	if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
-		return 0;
-
-#ifdef CONFIG_SMP
-	/* detect which hyperthread we are on */
-	if (smp_num_siblings == 2) {
-		unsigned int ebx, apicid;
-
-        	ebx = cpuid_ebx(1);
-	        apicid = (ebx >> 24) & 0xff;
-        	ht_num = apicid & 1;
-	} else
-#endif
-		ht_num = 0;
-
-	/* performance counters are shared resources
-	 * assign each hyperthread its own set
-	 * (re-use the ESCR0 register, seems safe
-	 * and keeps the cccr_val the same)
-	 */
-	if (!ht_num) {
-		/* logical cpu 0 */
-		perfctr_msr = MSR_P4_IQ_PERFCTR0;
-		evntsel_msr = MSR_P4_CRU_ESCR0;
-		cccr_msr = MSR_P4_IQ_CCCR0;
-		cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
-	} else {
-		/* logical cpu 1 */
-		perfctr_msr = MSR_P4_IQ_PERFCTR1;
-		evntsel_msr = MSR_P4_CRU_ESCR0;
-		cccr_msr = MSR_P4_IQ_CCCR1;
-		cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4);
-	}
-
-	if (!__reserve_perfctr_nmi(-1, perfctr_msr))
-		goto fail;
-
-	if (!__reserve_evntsel_nmi(-1, evntsel_msr))
-		goto fail1;
-
-	evntsel = P4_ESCR_EVENT_SELECT(0x3F)
-	 	| P4_ESCR_OS
-		| P4_ESCR_USR;
-
-	cccr_val |= P4_CCCR_THRESHOLD(15)
-		 | P4_CCCR_COMPLEMENT
-		 | P4_CCCR_COMPARE
-		 | P4_CCCR_REQUIRED;
-
-	wrmsr(evntsel_msr, evntsel, 0);
-	wrmsr(cccr_msr, cccr_val, 0);
-	wrmsrl(perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz));
-	apic_write(APIC_LVTPC, APIC_DM_NMI);
-	cccr_val |= P4_CCCR_ENABLE;
-	wrmsr(cccr_msr, cccr_val, 0);
-
-	wd->perfctr_msr = perfctr_msr;
-	wd->evntsel_msr = evntsel_msr;
-	wd->cccr_msr = cccr_msr;
-	wd->check_bit = 1ULL<<39;
-	return 1;
-fail1:
-	__release_perfctr_nmi(-1, perfctr_msr);
-fail:
-	return 0;
-}
-
-static void stop_p4_watchdog(void)
-{
-	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
-	wrmsr(wd->cccr_msr, 0, 0);
-	wrmsr(wd->evntsel_msr, 0, 0);
-
-	__release_evntsel_nmi(-1, wd->evntsel_msr);
-	__release_perfctr_nmi(-1, wd->perfctr_msr);
-}
-
-#define ARCH_PERFMON_NMI_EVENT_SEL	ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
-#define ARCH_PERFMON_NMI_EVENT_UMASK	ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
-
-static int setup_intel_arch_watchdog(void)
-{
-	unsigned int ebx;
-	union cpuid10_eax eax;
-	unsigned int unused;
-	unsigned int perfctr_msr, evntsel_msr;
-	unsigned int evntsel;
-	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
-	/*
-	 * Check whether the Architectural PerfMon supports
-	 * Unhalted Core Cycles Event or not.
-	 * NOTE: Corresponding bit = 0 in ebx indicates event present.
-	 */
-	cpuid(10, &(eax.full), &ebx, &unused, &unused);
-	if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
-	    (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
-		goto fail;
-
-	perfctr_msr = MSR_ARCH_PERFMON_PERFCTR1;
-	evntsel_msr = MSR_ARCH_PERFMON_EVENTSEL1;
-
-	if (!__reserve_perfctr_nmi(-1, perfctr_msr))
-		goto fail;
-
-	if (!__reserve_evntsel_nmi(-1, evntsel_msr))
-		goto fail1;
-
-	wrmsrl(perfctr_msr, 0UL);
-
-	evntsel = ARCH_PERFMON_EVENTSEL_INT
-		| ARCH_PERFMON_EVENTSEL_OS
-		| ARCH_PERFMON_EVENTSEL_USR
-		| ARCH_PERFMON_NMI_EVENT_SEL
-		| ARCH_PERFMON_NMI_EVENT_UMASK;
-
-	/* setup the timer */
-	wrmsr(evntsel_msr, evntsel, 0);
-
-	nmi_hz = adjust_for_32bit_ctr(nmi_hz);
-	wrmsr(perfctr_msr, (u32)(-((u64)cpu_khz * 1000 / nmi_hz)), 0);
-
-	apic_write(APIC_LVTPC, APIC_DM_NMI);
-	evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
-	wrmsr(evntsel_msr, evntsel, 0);
-
-	wd->perfctr_msr = perfctr_msr;
-	wd->evntsel_msr = evntsel_msr;
-	wd->cccr_msr = 0;  //unused
-	wd->check_bit = 1ULL << (eax.split.bit_width - 1);
-	return 1;
-fail1:
-	__release_perfctr_nmi(-1, perfctr_msr);
-fail:
-	return 0;
-}
-
-static void stop_intel_arch_watchdog(void)
-{
-	unsigned int ebx;
-	union cpuid10_eax eax;
-	unsigned int unused;
-	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
-	/*
-	 * Check whether the Architectural PerfMon supports
-	 * Unhalted Core Cycles Event or not.
-	 * NOTE: Corresponding bit = 0 in ebx indicates event present.
-	 */
-	cpuid(10, &(eax.full), &ebx, &unused, &unused);
-	if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
-	    (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
-		return;
-
-	wrmsr(wd->evntsel_msr, 0, 0);
-
-	__release_evntsel_nmi(-1, wd->evntsel_msr);
-	__release_perfctr_nmi(-1, wd->perfctr_msr);
-}
-
 void setup_apic_nmi_watchdog(void *unused)
 {
-	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
-	/* only support LOCAL and IO APICs for now */
-	if ((nmi_watchdog != NMI_LOCAL_APIC) &&
-	    (nmi_watchdog != NMI_IO_APIC))
-	    	return;
-
-	if (wd->enabled == 1)
+	if (__get_cpu_var(wd_enabled) == 1)
 		return;
 
 	/* cheap hack to support suspend/resume */
@@ -791,62 +255,31 @@ void setup_apic_nmi_watchdog(void *unused)
 	if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0))
 		return;
 
-	if (nmi_watchdog == NMI_LOCAL_APIC) {
-		switch (boot_cpu_data.x86_vendor) {
-		case X86_VENDOR_AMD:
-			if (strstr(boot_cpu_data.x86_model_id, "Screwdriver"))
-				return;
-			if (!setup_k7_watchdog())
-				return;
-			break;
-		case X86_VENDOR_INTEL:
-			if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
-				if (!setup_intel_arch_watchdog())
-					return;
-				break;
-			}
-			if (!setup_p4_watchdog())
-				return;
-			break;
-		default:
+	switch (nmi_watchdog) {
+	case NMI_LOCAL_APIC:
+		__get_cpu_var(wd_enabled) = 1;
+		if (lapic_watchdog_init(nmi_hz) < 0) {
+			__get_cpu_var(wd_enabled) = 0;
 			return;
 		}
+		/* FALL THROUGH */
+	case NMI_IO_APIC:
+		__get_cpu_var(wd_enabled) = 1;
+		atomic_inc(&nmi_active);
 	}
-	wd->enabled = 1;
-	atomic_inc(&nmi_active);
 }
 
 void stop_apic_nmi_watchdog(void *unused)
 {
-	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
 	/* only support LOCAL and IO APICs for now */
 	if ((nmi_watchdog != NMI_LOCAL_APIC) &&
 	    (nmi_watchdog != NMI_IO_APIC))
 	    	return;
-
-	if (wd->enabled == 0)
+	if (__get_cpu_var(wd_enabled) == 0)
 		return;
-
-	if (nmi_watchdog == NMI_LOCAL_APIC) {
-		switch (boot_cpu_data.x86_vendor) {
-		case X86_VENDOR_AMD:
-			if (strstr(boot_cpu_data.x86_model_id, "Screwdriver"))
-				return;
-			stop_k7_watchdog();
-			break;
-		case X86_VENDOR_INTEL:
-			if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
-				stop_intel_arch_watchdog();
-				break;
-			}
-			stop_p4_watchdog();
-			break;
-		default:
-			return;
-		}
-	}
-	wd->enabled = 0;
+	if (nmi_watchdog == NMI_LOCAL_APIC)
+		lapic_watchdog_stop();
+	__get_cpu_var(wd_enabled) = 0;
 	atomic_dec(&nmi_active);
 }
 
@@ -885,9 +318,7 @@ int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
 	int sum;
 	int touched = 0;
 	int cpu = smp_processor_id();
-	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-	u64 dummy;
-	int rc=0;
+	int rc = 0;
 
 	/* check for other users first */
 	if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
@@ -934,55 +365,20 @@ int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
 	}
 
 	/* see if the nmi watchdog went off */
-	if (wd->enabled) {
-		if (nmi_watchdog == NMI_LOCAL_APIC) {
-			rdmsrl(wd->perfctr_msr, dummy);
-			if (dummy & wd->check_bit){
-				/* this wasn't a watchdog timer interrupt */
-				goto done;
-			}
-
-			/* only Intel uses the cccr msr */
-	 		if (wd->cccr_msr != 0) {
-	 			/*
-	 			 * P4 quirks:
-	 			 * - An overflown perfctr will assert its interrupt
-	 			 *   until the OVF flag in its CCCR is cleared.
-	 			 * - LVTPC is masked on interrupt and must be
-	 			 *   unmasked by the LVTPC handler.
-	 			 */
-				rdmsrl(wd->cccr_msr, dummy);
-				dummy &= ~P4_CCCR_OVF;
-	 			wrmsrl(wd->cccr_msr, dummy);
-	 			apic_write(APIC_LVTPC, APIC_DM_NMI);
-				/* start the cycle over again */
-				wrmsrl(wd->perfctr_msr,
-				       -((u64)cpu_khz * 1000 / nmi_hz));
-	 		} else if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR1) {
-				/*
-				 * ArchPerfom/Core Duo needs to re-unmask
-				 * the apic vector
-				 */
-				apic_write(APIC_LVTPC, APIC_DM_NMI);
-				/* ARCH_PERFMON has 32 bit counter writes */
-				wrmsr(wd->perfctr_msr,
-				     (u32)(-((u64)cpu_khz * 1000 / nmi_hz)), 0);
-			} else {
-				/* start the cycle over again */
-				wrmsrl(wd->perfctr_msr,
-				       -((u64)cpu_khz * 1000 / nmi_hz));
-			}
-			rc = 1;
-		} else 	if (nmi_watchdog == NMI_IO_APIC) {
-			/* don't know how to accurately check for this.
-			 * just assume it was a watchdog timer interrupt
-			 * This matches the old behaviour.
-			 */
-			rc = 1;
-		} else
-			printk(KERN_WARNING "Unknown enabled NMI hardware?!\n");
+	if (!__get_cpu_var(wd_enabled))
+		return rc;
+	switch (nmi_watchdog) {
+	case NMI_LOCAL_APIC:
+		rc |= lapic_wd_event(nmi_hz);
+		break;
+	case NMI_IO_APIC:
+		/* don't know how to accurately check for this.
+		 * just assume it was a watchdog timer interrupt
+		 * This matches the old behaviour.
+		 */
+		rc = 1;
+		break;
 	}
-done:
 	return rc;
 }
 
@@ -1067,12 +463,4 @@ void __trigger_all_cpu_backtrace(void)
 
 EXPORT_SYMBOL(nmi_active);
 EXPORT_SYMBOL(nmi_watchdog);
-EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi);
-EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
-EXPORT_SYMBOL(reserve_perfctr_nmi);
-EXPORT_SYMBOL(release_perfctr_nmi);
-EXPORT_SYMBOL(reserve_evntsel_nmi);
-EXPORT_SYMBOL(release_evntsel_nmi);
-EXPORT_SYMBOL(disable_timer_nmi_watchdog);
-EXPORT_SYMBOL(enable_timer_nmi_watchdog);
 EXPORT_SYMBOL(touch_nmi_watchdog);
diff --git a/include/asm-x86_64/nmi.h b/include/asm-x86_64/nmi.h
index 72375e7d32a8..d0a7f53b1497 100644
--- a/include/asm-x86_64/nmi.h
+++ b/include/asm-x86_64/nmi.h
@@ -80,4 +80,13 @@ extern int unknown_nmi_panic;
 void __trigger_all_cpu_backtrace(void);
 #define trigger_all_cpu_backtrace() __trigger_all_cpu_backtrace()
 
+
+void lapic_watchdog_stop(void);
+int lapic_watchdog_init(unsigned nmi_hz);
+int lapic_wd_event(unsigned nmi_hz);
+unsigned lapic_adjust_nmi_hz(unsigned hz);
+int lapic_watchdog_ok(void);
+void disable_lapic_nmi_watchdog(void);
+void enable_lapic_nmi_watchdog(void);
+
 #endif /* ASM_NMI_H */
-- 
cgit v1.2.3