From 79121ea9f098934850347eeed3857ffd836810c3 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Mon, 26 Jun 2006 13:56:07 +0200
Subject: [PATCH] x86_64: Use __always_inline for __inline_memcpy

Inspired from i386 changes

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-x86_64/string.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/asm-x86_64/string.h b/include/asm-x86_64/string.h
index ee6bf275349e..9505d9f4bead 100644
--- a/include/asm-x86_64/string.h
+++ b/include/asm-x86_64/string.h
@@ -6,7 +6,8 @@
 /* Written 2002 by Andi Kleen */ 
 
 /* Only used for special circumstances. Stolen from i386/string.h */ 
-static inline void * __inline_memcpy(void * to, const void * from, size_t n)
+static __always_inline void *
+__inline_memcpy(void * to, const void * from, size_t n)
 {
 unsigned long d0, d1, d2;
 __asm__ __volatile__(
-- 
cgit v1.2.3


From 240cd6a80642da528bfa382ec2ae4e3cb8991ea7 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Mon, 26 Jun 2006 13:56:13 +0200
Subject: [PATCH] i386/x86-64: Emulate CPUID4 on AMD

Intel systems report the cache level data from CPUID 4 in sysfs.
Add a CPUID 4 emulation for AMD CPUs to report the same
information for them. This allows programs to read this
information in a uniform way.

The AMD way to report this is less flexible so some assumptions
are hardcoded (e.g. no L3)

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/cpu/amd.c             |   2 +
 arch/i386/kernel/cpu/intel_cacheinfo.c | 113 +++++++++++++++++++++++++++++----
 arch/x86_64/kernel/setup.c             |   3 +
 include/asm-i386/processor.h           |   1 +
 include/asm-x86_64/processor.h         |   1 +
 5 files changed, 107 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c
index b9c93d9789e7..fd0457c9c827 100644
--- a/arch/i386/kernel/cpu/amd.c
+++ b/arch/i386/kernel/cpu/amd.c
@@ -242,6 +242,8 @@ static void __init init_amd(struct cpuinfo_x86 *c)
 	}
 #endif
 
+	if (cpuid_eax(0x80000000) >= 0x80000006)
+		num_cache_leaves = 3;
 }
 
 static unsigned int amd_size_cache(struct cpuinfo_x86 * c, unsigned int size)
diff --git a/arch/i386/kernel/cpu/intel_cacheinfo.c b/arch/i386/kernel/cpu/intel_cacheinfo.c
index c8547a6fa7e6..6c37b4fd8ce2 100644
--- a/arch/i386/kernel/cpu/intel_cacheinfo.c
+++ b/arch/i386/kernel/cpu/intel_cacheinfo.c
@@ -4,6 +4,7 @@
  *      Changes:
  *      Venkatesh Pallipadi	: Adding cache identification through cpuid(4)
  *		Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure.
+ *	Andi Kleen		: CPUID4 emulation on AMD.
  */
 
 #include <linux/init.h>
@@ -130,25 +131,111 @@ struct _cpuid4_info {
 	cpumask_t shared_cpu_map;
 };
 
-static unsigned short			num_cache_leaves;
+unsigned short			num_cache_leaves;
+
+/* AMD doesn't have CPUID4. Emulate it here to report the same
+   information to the user.  This makes some assumptions about the machine:
+   No L3, L2 not shared, no SMT etc. that is currently true on AMD CPUs.
+
+   In theory the TLBs could be reported as fake type (they are in "dummy").
+   Maybe later */
+union l1_cache {
+	struct {
+		unsigned line_size : 8;
+		unsigned lines_per_tag : 8;
+		unsigned assoc : 8;
+		unsigned size_in_kb : 8;
+	};
+	unsigned val;
+};
+
+union l2_cache {
+	struct {
+		unsigned line_size : 8;
+		unsigned lines_per_tag : 4;
+		unsigned assoc : 4;
+		unsigned size_in_kb : 16;
+	};
+	unsigned val;
+};
+
+static unsigned short assocs[] = {
+	[1] = 1, [2] = 2, [4] = 4, [6] = 8,
+	[8] = 16,
+	[0xf] = 0xffff // ??
+	};
+static unsigned char levels[] = { 1, 1, 2 };
+static unsigned char types[] = { 1, 2, 3 };
+
+static void __cpuinit amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
+		       union _cpuid4_leaf_ebx *ebx,
+		       union _cpuid4_leaf_ecx *ecx)
+{
+	unsigned dummy;
+	unsigned line_size, lines_per_tag, assoc, size_in_kb;
+	union l1_cache l1i, l1d;
+	union l2_cache l2;
+
+	eax->full = 0;
+	ebx->full = 0;
+	ecx->full = 0;
+
+	cpuid(0x80000005, &dummy, &dummy, &l1d.val, &l1i.val);
+	cpuid(0x80000006, &dummy, &dummy, &l2.val, &dummy);
+
+	if (leaf > 2 || !l1d.val || !l1i.val || !l2.val)
+		return;
+
+	eax->split.is_self_initializing = 1;
+	eax->split.type = types[leaf];
+	eax->split.level = levels[leaf];
+	eax->split.num_threads_sharing = 0;
+	eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1;
+
+	if (leaf <= 1) {
+		union l1_cache *l1 = leaf == 0 ? &l1d : &l1i;
+		assoc = l1->assoc;
+		line_size = l1->line_size;
+		lines_per_tag = l1->lines_per_tag;
+		size_in_kb = l1->size_in_kb;
+	} else {
+		assoc = l2.assoc;
+		line_size = l2.line_size;
+		lines_per_tag = l2.lines_per_tag;
+		/* cpu_data has errata corrections for K7 applied */
+		size_in_kb = current_cpu_data.x86_cache_size;
+	}
+
+	if (assoc == 0xf)
+		eax->split.is_fully_associative = 1;
+	ebx->split.coherency_line_size = line_size - 1;
+	ebx->split.ways_of_associativity = assocs[assoc] - 1;
+	ebx->split.physical_line_partition = lines_per_tag - 1;
+	ecx->split.number_of_sets = (size_in_kb * 1024) / line_size /
+		(ebx->split.ways_of_associativity + 1) - 1;
+}
 
 static int __cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf)
 {
-	unsigned int		eax, ebx, ecx, edx;
-	union _cpuid4_leaf_eax	cache_eax;
+	union _cpuid4_leaf_eax 	eax;
+	union _cpuid4_leaf_ebx 	ebx;
+	union _cpuid4_leaf_ecx 	ecx;
+	unsigned		edx;
 
-	cpuid_count(4, index, &eax, &ebx, &ecx, &edx);
-	cache_eax.full = eax;
-	if (cache_eax.split.type == CACHE_TYPE_NULL)
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+		amd_cpuid4(index, &eax, &ebx, &ecx);
+	else
+		cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full,  &edx);
+	if (eax.split.type == CACHE_TYPE_NULL)
 		return -EIO; /* better error ? */
 
-	this_leaf->eax.full = eax;
-	this_leaf->ebx.full = ebx;
-	this_leaf->ecx.full = ecx;
-	this_leaf->size = (this_leaf->ecx.split.number_of_sets + 1) *
-		(this_leaf->ebx.split.coherency_line_size + 1) *
-		(this_leaf->ebx.split.physical_line_partition + 1) *
-		(this_leaf->ebx.split.ways_of_associativity + 1);
+	this_leaf->eax = eax;
+	this_leaf->ebx = ebx;
+	this_leaf->ecx = ecx;
+	this_leaf->size = (ecx.split.number_of_sets + 1) *
+		(ebx.split.coherency_line_size + 1) *
+		(ebx.split.physical_line_partition + 1) *
+		(ebx.split.ways_of_associativity + 1);
 	return 0;
 }
 
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 1cb3e21c571a..4b7e02216970 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -976,6 +976,9 @@ static int __init init_amd(struct cpuinfo_x86 *c)
 	if (c->extended_cpuid_level >= 0x80000008)
 		amd_detect_cmp(c);
 
+	/* Fix cpuid4 emulation for more */
+	num_cache_leaves = 3;
+
 	return r;
 }
 
diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h
index 0c83cf12eec9..b796210c0f5c 100644
--- a/include/asm-i386/processor.h
+++ b/include/asm-i386/processor.h
@@ -112,6 +112,7 @@ extern char ignore_fpu_irq;
 extern void identify_cpu(struct cpuinfo_x86 *);
 extern void print_cpu_info(struct cpuinfo_x86 *);
 extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
+extern unsigned short num_cache_leaves;
 
 #ifdef CONFIG_X86_HT
 extern void detect_ht(struct cpuinfo_x86 *c);
diff --git a/include/asm-x86_64/processor.h b/include/asm-x86_64/processor.h
index 3061a38a3b1d..e583f0d95209 100644
--- a/include/asm-x86_64/processor.h
+++ b/include/asm-x86_64/processor.h
@@ -96,6 +96,7 @@ extern char ignore_irq13;
 extern void identify_cpu(struct cpuinfo_x86 *);
 extern void print_cpu_info(struct cpuinfo_x86 *);
 extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
+extern unsigned short num_cache_leaves;
 
 /*
  * EFLAGS bits
-- 
cgit v1.2.3


From d167a51877e94dda73dd656c51f363502309f713 Mon Sep 17 00:00:00 2001
From: Gerd Hoffmann <kraxel@suse.de>
Date: Mon, 26 Jun 2006 13:56:16 +0200
Subject: [PATCH] x86_64: x86_64 version of the smp alternative patch.

Changes are largely identical to the i386 version:

 * alternative #define are moved to the new alternative.h file.
 * one new elf section with pointers to the lock prefixes which can be
   nop'ed out for non-smp.
 * two new elf sections simliar to the "classic" alternatives to
   replace SMP code with simpler UP code.
 * fixup headers to use alternative.h instead of defining their own
   LOCK / LOCK_PREFIX macros.

The patch reuses the i386 version of the alternatives code to avoid code
duplication.  The code in alternatives.c was shuffled around a bit to
reduce the number of #ifdefs needed.  It also got some tweaks needed for
x86_64 (vsyscall page handling) and new features (noreplacement option
which was x86_64 only up to now).  Debug printk's are changed from
compile-time to runtime.

Loosely based on a early version from Bastian Blank <waldi@debian.org>

Signed-off-by: Gerd Hoffmann <kraxel@suse.de>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/alternative.c   | 118 ++++++++++++++++++++++---------
 arch/x86_64/kernel/Makefile      |   4 +-
 arch/x86_64/kernel/module.c      |  38 ++++++----
 arch/x86_64/kernel/setup.c       |  76 +-------------------
 arch/x86_64/kernel/smpboot.c     |   4 ++
 arch/x86_64/kernel/vmlinux.lds.S |  20 ++++++
 arch/x86_64/mm/init.c            |  27 ++++----
 include/asm-i386/alternative.h   |   2 +
 include/asm-x86_64/alternative.h | 146 +++++++++++++++++++++++++++++++++++++++
 include/asm-x86_64/atomic.h      |  42 +++++------
 include/asm-x86_64/bitops.h      |   7 +-
 include/asm-x86_64/cpufeature.h  |   2 +
 include/asm-x86_64/mutex.h       |   4 +-
 include/asm-x86_64/rwlock.h      |   8 +--
 include/asm-x86_64/semaphore.h   |   8 +--
 include/asm-x86_64/spinlock.h    |  10 ++-
 include/asm-x86_64/system.h      |  86 +----------------------
 17 files changed, 344 insertions(+), 258 deletions(-)
 create mode 100644 include/asm-x86_64/alternative.h

(limited to 'include')

diff --git a/arch/i386/kernel/alternative.c b/arch/i386/kernel/alternative.c
index 5cbd6f99fb2a..50eb0e03777e 100644
--- a/arch/i386/kernel/alternative.c
+++ b/arch/i386/kernel/alternative.c
@@ -4,27 +4,41 @@
 #include <asm/alternative.h>
 #include <asm/sections.h>
 
-#define DEBUG 0
-#if DEBUG
-# define DPRINTK(fmt, args...) printk(fmt, args)
-#else
-# define DPRINTK(fmt, args...)
-#endif
+static int no_replacement    = 0;
+static int smp_alt_once      = 0;
+static int debug_alternative = 0;
+
+static int __init noreplacement_setup(char *s)
+{
+	no_replacement = 1;
+	return 1;
+}
+static int __init bootonly(char *str)
+{
+	smp_alt_once = 1;
+	return 1;
+}
+static int __init debug_alt(char *str)
+{
+	debug_alternative = 1;
+	return 1;
+}
 
+__setup("noreplacement", noreplacement_setup);
+__setup("smp-alt-boot", bootonly);
+__setup("debug-alternative", debug_alt);
+
+#define DPRINTK(fmt, args...) if (debug_alternative) \
+	printk(KERN_DEBUG fmt, args)
+
+#ifdef GENERIC_NOP1
 /* Use inline assembly to define this because the nops are defined
    as inline assembly strings in the include files and we cannot
    get them easily into strings. */
 asm("\t.data\nintelnops: "
 	GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6
 	GENERIC_NOP7 GENERIC_NOP8);
-asm("\t.data\nk8nops: "
-	K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6
-	K8_NOP7 K8_NOP8);
-asm("\t.data\nk7nops: "
-	K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6
-	K7_NOP7 K7_NOP8);
-
-extern unsigned char intelnops[], k8nops[], k7nops[];
+extern unsigned char intelnops[];
 static unsigned char *intel_nops[ASM_NOP_MAX+1] = {
 	NULL,
 	intelnops,
@@ -36,6 +50,13 @@ static unsigned char *intel_nops[ASM_NOP_MAX+1] = {
 	intelnops + 1 + 2 + 3 + 4 + 5 + 6,
 	intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
 };
+#endif
+
+#ifdef K8_NOP1
+asm("\t.data\nk8nops: "
+	K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6
+	K8_NOP7 K8_NOP8);
+extern unsigned char k8nops[];
 static unsigned char *k8_nops[ASM_NOP_MAX+1] = {
 	NULL,
 	k8nops,
@@ -47,6 +68,13 @@ static unsigned char *k8_nops[ASM_NOP_MAX+1] = {
 	k8nops + 1 + 2 + 3 + 4 + 5 + 6,
 	k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
 };
+#endif
+
+#ifdef K7_NOP1
+asm("\t.data\nk7nops: "
+	K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6
+	K7_NOP7 K7_NOP8);
+extern unsigned char k7nops[];
 static unsigned char *k7_nops[ASM_NOP_MAX+1] = {
 	NULL,
 	k7nops,
@@ -58,6 +86,18 @@ static unsigned char *k7_nops[ASM_NOP_MAX+1] = {
 	k7nops + 1 + 2 + 3 + 4 + 5 + 6,
 	k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
 };
+#endif
+
+#ifdef CONFIG_X86_64
+
+extern char __vsyscall_0;
+static inline unsigned char** find_nop_table(void)
+{
+	return k8_nops;
+}
+
+#else /* CONFIG_X86_64 */
+
 static struct nop {
 	int cpuid;
 	unsigned char **noptable;
@@ -67,14 +107,6 @@ static struct nop {
 	{ -1, NULL }
 };
 
-
-extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
-extern struct alt_instr __smp_alt_instructions[], __smp_alt_instructions_end[];
-extern u8 *__smp_locks[], *__smp_locks_end[];
-
-extern u8 __smp_alt_begin[], __smp_alt_end[];
-
-
 static unsigned char** find_nop_table(void)
 {
 	unsigned char **noptable = intel_nops;
@@ -89,6 +121,14 @@ static unsigned char** find_nop_table(void)
 	return noptable;
 }
 
+#endif /* CONFIG_X86_64 */
+
+extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
+extern struct alt_instr __smp_alt_instructions[], __smp_alt_instructions_end[];
+extern u8 *__smp_locks[], *__smp_locks_end[];
+
+extern u8 __smp_alt_begin[], __smp_alt_end[];
+
 /* Replace instructions with better alternatives for this CPU type.
    This runs before SMP is initialized to avoid SMP problems with
    self modifying code. This implies that assymetric systems where
@@ -99,6 +139,7 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
 {
 	unsigned char **noptable = find_nop_table();
 	struct alt_instr *a;
+	u8 *instr;
 	int diff, i, k;
 
 	DPRINTK("%s: alt table %p -> %p\n", __FUNCTION__, start, end);
@@ -106,7 +147,16 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
 		BUG_ON(a->replacementlen > a->instrlen);
 		if (!boot_cpu_has(a->cpuid))
 			continue;
-		memcpy(a->instr, a->replacement, a->replacementlen);
+		instr = a->instr;
+#ifdef CONFIG_X86_64
+		/* vsyscall code is not mapped yet. resolve it manually. */
+		if (instr >= (u8 *)VSYSCALL_START && instr < (u8*)VSYSCALL_END) {
+			instr = __va(instr - (u8*)VSYSCALL_START + (u8*)__pa_symbol(&__vsyscall_0));
+			DPRINTK("%s: vsyscall fixup: %p => %p\n",
+				__FUNCTION__, a->instr, instr);
+		}
+#endif
+		memcpy(instr, a->replacement, a->replacementlen);
 		diff = a->instrlen - a->replacementlen;
 		/* Pad the rest with nops */
 		for (i = a->replacementlen; diff > 0; diff -= k, i += k) {
@@ -186,14 +236,6 @@ struct smp_alt_module {
 static LIST_HEAD(smp_alt_modules);
 static DEFINE_SPINLOCK(smp_alt);
 
-static int smp_alt_once = 0;
-static int __init bootonly(char *str)
-{
-	smp_alt_once = 1;
-	return 1;
-}
-__setup("smp-alt-boot", bootonly);
-
 void alternatives_smp_module_add(struct module *mod, char *name,
 				 void *locks, void *locks_end,
 				 void *text,  void *text_end)
@@ -201,6 +243,9 @@ void alternatives_smp_module_add(struct module *mod, char *name,
 	struct smp_alt_module *smp;
 	unsigned long flags;
 
+	if (no_replacement)
+		return;
+
 	if (smp_alt_once) {
 		if (boot_cpu_has(X86_FEATURE_UP))
 			alternatives_smp_unlock(locks, locks_end,
@@ -235,7 +280,7 @@ void alternatives_smp_module_del(struct module *mod)
 	struct smp_alt_module *item;
 	unsigned long flags;
 
-	if (smp_alt_once)
+	if (no_replacement || smp_alt_once)
 		return;
 
 	spin_lock_irqsave(&smp_alt, flags);
@@ -256,7 +301,7 @@ void alternatives_smp_switch(int smp)
 	struct smp_alt_module *mod;
 	unsigned long flags;
 
-	if (smp_alt_once)
+	if (no_replacement || smp_alt_once)
 		return;
 	BUG_ON(!smp && (num_online_cpus() > 1));
 
@@ -285,6 +330,13 @@ void alternatives_smp_switch(int smp)
 
 void __init alternative_instructions(void)
 {
+	if (no_replacement) {
+		printk(KERN_INFO "(SMP-)alternatives turned off\n");
+		free_init_pages("SMP alternatives",
+				(unsigned long)__smp_alt_begin,
+				(unsigned long)__smp_alt_end);
+		return;
+	}
 	apply_alternatives(__alt_instructions, __alt_instructions_end);
 
 	/* switch to patch-once-at-boottime-only mode and free the
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
index 059c88313f4e..381bc6ad743e 100644
--- a/arch/x86_64/kernel/Makefile
+++ b/arch/x86_64/kernel/Makefile
@@ -8,7 +8,7 @@ obj-y	:= process.o signal.o entry.o traps.o irq.o \
 		ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \
 		x8664_ksyms.o i387.o syscall.o vsyscall.o \
 		setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o \
-		pci-dma.o pci-nommu.o
+		pci-dma.o pci-nommu.o alternative.o
 
 obj-$(CONFIG_X86_MCE)         += mce.o
 obj-$(CONFIG_X86_MCE_INTEL)	+= mce_intel.o
@@ -49,3 +49,5 @@ intel_cacheinfo-y		+= ../../i386/kernel/cpu/intel_cacheinfo.o
 quirks-y			+= ../../i386/kernel/quirks.o
 i8237-y				+= ../../i386/kernel/i8237.o
 msr-$(subst m,y,$(CONFIG_X86_MSR))  += ../../i386/kernel/msr.o
+alternative-y			+= ../../i386/kernel/alternative.o
+
diff --git a/arch/x86_64/kernel/module.c b/arch/x86_64/kernel/module.c
index bac195c74bcc..9d0958ff547f 100644
--- a/arch/x86_64/kernel/module.c
+++ b/arch/x86_64/kernel/module.c
@@ -145,26 +145,38 @@ int apply_relocate(Elf_Shdr *sechdrs,
 	return -ENOSYS;
 } 
 
-extern void apply_alternatives(void *start, void *end); 
-
 int module_finalize(const Elf_Ehdr *hdr,
-		    const Elf_Shdr *sechdrs,
-		    struct module *me)
+                    const Elf_Shdr *sechdrs,
+                    struct module *me)
 {
-	const Elf_Shdr *s;
+	const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL;
 	char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
 
-	/* look for .altinstructions to patch */ 
-	for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { 
-		void *seg; 		
-		if (strcmp(".altinstructions", secstrings + s->sh_name))
-			continue;
-		seg = (void *)s->sh_addr; 
-		apply_alternatives(seg, seg + s->sh_size); 
-	} 	
+	for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
+		if (!strcmp(".text", secstrings + s->sh_name))
+			text = s;
+		if (!strcmp(".altinstructions", secstrings + s->sh_name))
+			alt = s;
+		if (!strcmp(".smp_locks", secstrings + s->sh_name))
+			locks= s;
+	}
+
+	if (alt) {
+		/* patch .altinstructions */
+		void *aseg = (void *)alt->sh_addr;
+		apply_alternatives(aseg, aseg + alt->sh_size);
+	}
+	if (locks && text) {
+		void *lseg = (void *)locks->sh_addr;
+		void *tseg = (void *)text->sh_addr;
+		alternatives_smp_module_add(me, me->name,
+					    lseg, lseg + locks->sh_size,
+					    tseg, tseg + text->sh_size);
+	}
 	return 0;
 }
 
 void module_arch_cleanup(struct module *mod)
 {
+	alternatives_smp_module_del(mod);
 }
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 4b7e02216970..64640c8f5eed 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -473,80 +473,6 @@ contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
 } 
 #endif
 
-/* Use inline assembly to define this because the nops are defined 
-   as inline assembly strings in the include files and we cannot 
-   get them easily into strings. */
-asm("\t.data\nk8nops: " 
-    K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6
-    K8_NOP7 K8_NOP8); 
-    
-extern unsigned char k8nops[];
-static unsigned char *k8_nops[ASM_NOP_MAX+1] = { 
-     NULL,
-     k8nops,
-     k8nops + 1,
-     k8nops + 1 + 2,
-     k8nops + 1 + 2 + 3,
-     k8nops + 1 + 2 + 3 + 4,
-     k8nops + 1 + 2 + 3 + 4 + 5,
-     k8nops + 1 + 2 + 3 + 4 + 5 + 6,
-     k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
-}; 
-
-extern char __vsyscall_0;
-
-/* Replace instructions with better alternatives for this CPU type.
-
-   This runs before SMP is initialized to avoid SMP problems with
-   self modifying code. This implies that assymetric systems where
-   APs have less capabilities than the boot processor are not handled. 
-   In this case boot with "noreplacement". */ 
-void apply_alternatives(void *start, void *end) 
-{ 
-	struct alt_instr *a; 
-	int diff, i, k;
-	for (a = start; (void *)a < end; a++) { 
-		u8 *instr;
-
-		if (!boot_cpu_has(a->cpuid))
-			continue;
-
-		BUG_ON(a->replacementlen > a->instrlen); 
-		instr = a->instr;
-		/* vsyscall code is not mapped yet. resolve it manually. */
-		if (instr >= (u8 *)VSYSCALL_START && instr < (u8*)VSYSCALL_END)
-			instr = __va(instr - (u8*)VSYSCALL_START + (u8*)__pa_symbol(&__vsyscall_0));
-		__inline_memcpy(instr, a->replacement, a->replacementlen);
-		diff = a->instrlen - a->replacementlen; 
-
-		/* Pad the rest with nops */
-		for (i = a->replacementlen; diff > 0; diff -= k, i += k) {
-			k = diff;
-			if (k > ASM_NOP_MAX)
-				k = ASM_NOP_MAX;
-			__inline_memcpy(instr + i, k8_nops[k], k);
-		} 
-	}
-} 
-
-static int no_replacement __initdata = 0; 
- 
-void __init alternative_instructions(void)
-{
-	extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
-	if (no_replacement) 
-		return;
-	apply_alternatives(__alt_instructions, __alt_instructions_end);
-}
-
-static int __init noreplacement_setup(char *s)
-{ 
-     no_replacement = 1; 
-     return 1;
-} 
-
-__setup("noreplacement", noreplacement_setup); 
-
 #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
 struct edd edd;
 #ifdef CONFIG_EDD_MODULE
@@ -1303,7 +1229,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 		/* Other (Linux-defined) */
 		"cxmmx", NULL, "cyrix_arr", "centaur_mcr", NULL,
 		"constant_tsc", NULL, NULL,
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+		"up", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index 71a7222cf9ce..06535e7687ce 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -797,6 +797,8 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid)
 	}
 
 
+	alternatives_smp_switch(1);
+
 	c_idle.idle = get_idle_for_cpu(cpu);
 
 	if (c_idle.idle) {
@@ -1259,6 +1261,8 @@ void __cpu_die(unsigned int cpu)
 		/* They ack this in play_dead by setting CPU_DEAD */
 		if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
 			printk ("CPU %d is now offline\n", cpu);
+			if (1 == num_online_cpus())
+				alternatives_smp_switch(0);
 			return;
 		}
 		msleep(100);
diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S
index b81f473c4a19..5968c2415da9 100644
--- a/arch/x86_64/kernel/vmlinux.lds.S
+++ b/arch/x86_64/kernel/vmlinux.lds.S
@@ -131,6 +131,26 @@ SECTIONS
 	*(.data.page_aligned)
   }
 
+  /* might get freed after init */
+  . = ALIGN(4096);
+  __smp_alt_begin = .;
+  __smp_alt_instructions = .;
+  .smp_altinstructions : AT(ADDR(.smp_altinstructions) - LOAD_OFFSET) {
+	*(.smp_altinstructions)
+  }
+  __smp_alt_instructions_end = .;
+  . = ALIGN(8);
+  __smp_locks = .;
+  .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
+	*(.smp_locks)
+  }
+  __smp_locks_end = .;
+  .smp_altinstr_replacement : AT(ADDR(.smp_altinstr_replacement) - LOAD_OFFSET) {
+	*(.smp_altinstr_replacement)
+  }
+  . = ALIGN(4096);
+  __smp_alt_end = .;
+
   . = ALIGN(4096);		/* Init code and data */
   __init_begin = .;
   .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index 4ba34e95d835..a70a8e3e312d 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -644,20 +644,29 @@ void __init mem_init(void)
 #endif
 }
 
-void free_initmem(void)
+void free_init_pages(char *what, unsigned long begin, unsigned long end)
 {
 	unsigned long addr;
 
-	addr = (unsigned long)(&__init_begin);
-	for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
+	if (begin >= end)
+		return;
+
+	printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
+	for (addr = begin; addr < end; addr += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(addr));
 		init_page_count(virt_to_page(addr));
 		memset((void *)(addr & ~(PAGE_SIZE-1)), 0xcc, PAGE_SIZE); 
 		free_page(addr);
 		totalram_pages++;
 	}
+}
+
+void free_initmem(void)
+{
 	memset(__initdata_begin, 0xba, __initdata_end - __initdata_begin);
-	printk ("Freeing unused kernel memory: %luk freed\n", (__init_end - __init_begin) >> 10);
+	free_init_pages("unused kernel memory",
+			(unsigned long)(&__init_begin),
+			(unsigned long)(&__init_end));
 }
 
 #ifdef CONFIG_DEBUG_RODATA
@@ -686,15 +695,7 @@ void mark_rodata_ro(void)
 #ifdef CONFIG_BLK_DEV_INITRD
 void free_initrd_mem(unsigned long start, unsigned long end)
 {
-	if (start >= end)
-		return;
-	printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
-	for (; start < end; start += PAGE_SIZE) {
-		ClearPageReserved(virt_to_page(start));
-		init_page_count(virt_to_page(start));
-		free_page(start);
-		totalram_pages++;
-	}
+	free_init_pages("initrd memory", start, end);
 }
 #endif
 
diff --git a/include/asm-i386/alternative.h b/include/asm-i386/alternative.h
index d79e9ee10fd7..c61bd1a17f37 100644
--- a/include/asm-i386/alternative.h
+++ b/include/asm-i386/alternative.h
@@ -5,6 +5,8 @@
 
 #include <asm/types.h>
 
+#include <linux/types.h>
+
 struct alt_instr {
 	u8 *instr; 		/* original instruction */
 	u8 *replacement;
diff --git a/include/asm-x86_64/alternative.h b/include/asm-x86_64/alternative.h
new file mode 100644
index 000000000000..387c8f66af7d
--- /dev/null
+++ b/include/asm-x86_64/alternative.h
@@ -0,0 +1,146 @@
+#ifndef _X86_64_ALTERNATIVE_H
+#define _X86_64_ALTERNATIVE_H
+
+#ifdef __KERNEL__
+
+#include <linux/types.h>
+
+struct alt_instr {
+	u8 *instr; 		/* original instruction */
+	u8 *replacement;
+	u8  cpuid;		/* cpuid bit set for replacement */
+	u8  instrlen;		/* length of original instruction */
+	u8  replacementlen; 	/* length of new instruction, <= instrlen */
+	u8  pad[5];
+};
+
+extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
+
+struct module;
+extern void alternatives_smp_module_add(struct module *mod, char *name,
+					void *locks, void *locks_end,
+					void *text, void *text_end);
+extern void alternatives_smp_module_del(struct module *mod);
+extern void alternatives_smp_switch(int smp);
+
+#endif
+
+/*
+ * Alternative instructions for different CPU types or capabilities.
+ *
+ * This allows to use optimized instructions even on generic binary
+ * kernels.
+ *
+ * length of oldinstr must be longer or equal the length of newinstr
+ * It can be padded with nops as needed.
+ *
+ * For non barrier like inlines please define new variants
+ * without volatile and memory clobber.
+ */
+#define alternative(oldinstr, newinstr, feature) 	\
+	asm volatile ("661:\n\t" oldinstr "\n662:\n" 		     \
+		      ".section .altinstructions,\"a\"\n"     	     \
+		      "  .align 8\n"				       \
+		      "  .quad 661b\n"            /* label */          \
+		      "  .quad 663f\n"		  /* new instruction */ \
+		      "  .byte %c0\n"             /* feature bit */    \
+		      "  .byte 662b-661b\n"       /* sourcelen */      \
+		      "  .byte 664f-663f\n"       /* replacementlen */ \
+		      ".previous\n"					\
+		      ".section .altinstr_replacement,\"ax\"\n"		\
+		      "663:\n\t" newinstr "\n664:\n"   /* replacement */ \
+		      ".previous" :: "i" (feature) : "memory")
+
+/*
+ * Alternative inline assembly with input.
+ *
+ * Pecularities:
+ * No memory clobber here.
+ * Argument numbers start with 1.
+ * Best is to use constraints that are fixed size (like (%1) ... "r")
+ * If you use variable sized constraints like "m" or "g" in the
+ * replacement make sure to pad to the worst case length.
+ */
+#define alternative_input(oldinstr, newinstr, feature, input...)	\
+	asm volatile ("661:\n\t" oldinstr "\n662:\n"			\
+		      ".section .altinstructions,\"a\"\n"		\
+		      "  .align 8\n"					\
+		      "  .quad 661b\n"            /* label */		\
+		      "  .quad 663f\n"		  /* new instruction */	\
+		      "  .byte %c0\n"             /* feature bit */	\
+		      "  .byte 662b-661b\n"       /* sourcelen */	\
+		      "  .byte 664f-663f\n"       /* replacementlen */	\
+		      ".previous\n"					\
+		      ".section .altinstr_replacement,\"ax\"\n"		\
+		      "663:\n\t" newinstr "\n664:\n"   /* replacement */ \
+		      ".previous" :: "i" (feature), ##input)
+
+/* Like alternative_input, but with a single output argument */
+#define alternative_io(oldinstr, newinstr, feature, output, input...) \
+	asm volatile ("661:\n\t" oldinstr "\n662:\n"			\
+		      ".section .altinstructions,\"a\"\n"		\
+		      "  .align 8\n"					\
+		      "  .quad 661b\n"            /* label */		\
+		      "  .quad 663f\n"		  /* new instruction */	\
+		      "  .byte %c[feat]\n"        /* feature bit */	\
+		      "  .byte 662b-661b\n"       /* sourcelen */	\
+		      "  .byte 664f-663f\n"       /* replacementlen */	\
+		      ".previous\n"					\
+		      ".section .altinstr_replacement,\"ax\"\n"		\
+		      "663:\n\t" newinstr "\n664:\n"   /* replacement */ \
+		      ".previous" : output : [feat] "i" (feature), ##input)
+
+/*
+ * Alternative inline assembly for SMP.
+ *
+ * alternative_smp() takes two versions (SMP first, UP second) and is
+ * for more complex stuff such as spinlocks.
+ *
+ * The LOCK_PREFIX macro defined here replaces the LOCK and
+ * LOCK_PREFIX macros used everywhere in the source tree.
+ *
+ * SMP alternatives use the same data structures as the other
+ * alternatives and the X86_FEATURE_UP flag to indicate the case of a
+ * UP system running a SMP kernel.  The existing apply_alternatives()
+ * works fine for patching a SMP kernel for UP.
+ *
+ * The SMP alternative tables can be kept after boot and contain both
+ * UP and SMP versions of the instructions to allow switching back to
+ * SMP at runtime, when hotplugging in a new CPU, which is especially
+ * useful in virtualized environments.
+ *
+ * The very common lock prefix is handled as special case in a
+ * separate table which is a pure address list without replacement ptr
+ * and size information.  That keeps the table sizes small.
+ */
+
+#ifdef CONFIG_SMP
+#define alternative_smp(smpinstr, upinstr, args...)			\
+	asm volatile ("661:\n\t" smpinstr "\n662:\n"			\
+		      ".section .smp_altinstructions,\"a\"\n"		\
+		      "  .align 8\n"					\
+		      "  .quad 661b\n"            /* label */		\
+		      "  .quad 663f\n"		  /* new instruction */	\
+		      "  .byte 0x66\n"            /* X86_FEATURE_UP */	\
+		      "  .byte 662b-661b\n"       /* sourcelen */	\
+		      "  .byte 664f-663f\n"       /* replacementlen */	\
+		      ".previous\n"					\
+		      ".section .smp_altinstr_replacement,\"awx\"\n"	\
+		      "663:\n\t" upinstr "\n"     /* replacement */	\
+		      "664:\n\t.fill 662b-661b,1,0x42\n" /* space for original */ \
+		      ".previous" : args)
+
+#define LOCK_PREFIX \
+		".section .smp_locks,\"a\"\n"	\
+		"  .align 8\n"			\
+		"  .quad 661f\n" /* address */	\
+		".previous\n"			\
+	       	"661:\n\tlock; "
+
+#else /* ! CONFIG_SMP */
+#define alternative_smp(smpinstr, upinstr, args...) \
+	asm volatile (upinstr : args)
+#define LOCK_PREFIX ""
+#endif
+
+#endif /* _X86_64_ALTERNATIVE_H */
diff --git a/include/asm-x86_64/atomic.h b/include/asm-x86_64/atomic.h
index bd3fa67ed835..007e88d6d43f 100644
--- a/include/asm-x86_64/atomic.h
+++ b/include/asm-x86_64/atomic.h
@@ -1,7 +1,7 @@
 #ifndef __ARCH_X86_64_ATOMIC__
 #define __ARCH_X86_64_ATOMIC__
 
-#include <asm/types.h>
+#include <asm/alternative.h>
 
 /* atomic_t should be 32 bit signed type */
 
@@ -52,7 +52,7 @@ typedef struct { volatile int counter; } atomic_t;
 static __inline__ void atomic_add(int i, atomic_t *v)
 {
 	__asm__ __volatile__(
-		LOCK "addl %1,%0"
+		LOCK_PREFIX "addl %1,%0"
 		:"=m" (v->counter)
 		:"ir" (i), "m" (v->counter));
 }
@@ -67,7 +67,7 @@ static __inline__ void atomic_add(int i, atomic_t *v)
 static __inline__ void atomic_sub(int i, atomic_t *v)
 {
 	__asm__ __volatile__(
-		LOCK "subl %1,%0"
+		LOCK_PREFIX "subl %1,%0"
 		:"=m" (v->counter)
 		:"ir" (i), "m" (v->counter));
 }
@@ -86,7 +86,7 @@ static __inline__ int atomic_sub_and_test(int i, atomic_t *v)
 	unsigned char c;
 
 	__asm__ __volatile__(
-		LOCK "subl %2,%0; sete %1"
+		LOCK_PREFIX "subl %2,%0; sete %1"
 		:"=m" (v->counter), "=qm" (c)
 		:"ir" (i), "m" (v->counter) : "memory");
 	return c;
@@ -101,7 +101,7 @@ static __inline__ int atomic_sub_and_test(int i, atomic_t *v)
 static __inline__ void atomic_inc(atomic_t *v)
 {
 	__asm__ __volatile__(
-		LOCK "incl %0"
+		LOCK_PREFIX "incl %0"
 		:"=m" (v->counter)
 		:"m" (v->counter));
 }
@@ -115,7 +115,7 @@ static __inline__ void atomic_inc(atomic_t *v)
 static __inline__ void atomic_dec(atomic_t *v)
 {
 	__asm__ __volatile__(
-		LOCK "decl %0"
+		LOCK_PREFIX "decl %0"
 		:"=m" (v->counter)
 		:"m" (v->counter));
 }
@@ -133,7 +133,7 @@ static __inline__ int atomic_dec_and_test(atomic_t *v)
 	unsigned char c;
 
 	__asm__ __volatile__(
-		LOCK "decl %0; sete %1"
+		LOCK_PREFIX "decl %0; sete %1"
 		:"=m" (v->counter), "=qm" (c)
 		:"m" (v->counter) : "memory");
 	return c != 0;
@@ -152,7 +152,7 @@ static __inline__ int atomic_inc_and_test(atomic_t *v)
 	unsigned char c;
 
 	__asm__ __volatile__(
-		LOCK "incl %0; sete %1"
+		LOCK_PREFIX "incl %0; sete %1"
 		:"=m" (v->counter), "=qm" (c)
 		:"m" (v->counter) : "memory");
 	return c != 0;
@@ -172,7 +172,7 @@ static __inline__ int atomic_add_negative(int i, atomic_t *v)
 	unsigned char c;
 
 	__asm__ __volatile__(
-		LOCK "addl %2,%0; sets %1"
+		LOCK_PREFIX "addl %2,%0; sets %1"
 		:"=m" (v->counter), "=qm" (c)
 		:"ir" (i), "m" (v->counter) : "memory");
 	return c;
@@ -189,7 +189,7 @@ static __inline__ int atomic_add_return(int i, atomic_t *v)
 {
 	int __i = i;
 	__asm__ __volatile__(
-		LOCK "xaddl %0, %1;"
+		LOCK_PREFIX "xaddl %0, %1;"
 		:"=r"(i)
 		:"m"(v->counter), "0"(i));
 	return i + __i;
@@ -237,7 +237,7 @@ typedef struct { volatile long counter; } atomic64_t;
 static __inline__ void atomic64_add(long i, atomic64_t *v)
 {
 	__asm__ __volatile__(
-		LOCK "addq %1,%0"
+		LOCK_PREFIX "addq %1,%0"
 		:"=m" (v->counter)
 		:"ir" (i), "m" (v->counter));
 }
@@ -252,7 +252,7 @@ static __inline__ void atomic64_add(long i, atomic64_t *v)
 static __inline__ void atomic64_sub(long i, atomic64_t *v)
 {
 	__asm__ __volatile__(
-		LOCK "subq %1,%0"
+		LOCK_PREFIX "subq %1,%0"
 		:"=m" (v->counter)
 		:"ir" (i), "m" (v->counter));
 }
@@ -271,7 +271,7 @@ static __inline__ int atomic64_sub_and_test(long i, atomic64_t *v)
 	unsigned char c;
 
 	__asm__ __volatile__(
-		LOCK "subq %2,%0; sete %1"
+		LOCK_PREFIX "subq %2,%0; sete %1"
 		:"=m" (v->counter), "=qm" (c)
 		:"ir" (i), "m" (v->counter) : "memory");
 	return c;
@@ -286,7 +286,7 @@ static __inline__ int atomic64_sub_and_test(long i, atomic64_t *v)
 static __inline__ void atomic64_inc(atomic64_t *v)
 {
 	__asm__ __volatile__(
-		LOCK "incq %0"
+		LOCK_PREFIX "incq %0"
 		:"=m" (v->counter)
 		:"m" (v->counter));
 }
@@ -300,7 +300,7 @@ static __inline__ void atomic64_inc(atomic64_t *v)
 static __inline__ void atomic64_dec(atomic64_t *v)
 {
 	__asm__ __volatile__(
-		LOCK "decq %0"
+		LOCK_PREFIX "decq %0"
 		:"=m" (v->counter)
 		:"m" (v->counter));
 }
@@ -318,7 +318,7 @@ static __inline__ int atomic64_dec_and_test(atomic64_t *v)
 	unsigned char c;
 
 	__asm__ __volatile__(
-		LOCK "decq %0; sete %1"
+		LOCK_PREFIX "decq %0; sete %1"
 		:"=m" (v->counter), "=qm" (c)
 		:"m" (v->counter) : "memory");
 	return c != 0;
@@ -337,7 +337,7 @@ static __inline__ int atomic64_inc_and_test(atomic64_t *v)
 	unsigned char c;
 
 	__asm__ __volatile__(
-		LOCK "incq %0; sete %1"
+		LOCK_PREFIX "incq %0; sete %1"
 		:"=m" (v->counter), "=qm" (c)
 		:"m" (v->counter) : "memory");
 	return c != 0;
@@ -357,7 +357,7 @@ static __inline__ int atomic64_add_negative(long i, atomic64_t *v)
 	unsigned char c;
 
 	__asm__ __volatile__(
-		LOCK "addq %2,%0; sets %1"
+		LOCK_PREFIX "addq %2,%0; sets %1"
 		:"=m" (v->counter), "=qm" (c)
 		:"ir" (i), "m" (v->counter) : "memory");
 	return c;
@@ -374,7 +374,7 @@ static __inline__ long atomic64_add_return(long i, atomic64_t *v)
 {
 	long __i = i;
 	__asm__ __volatile__(
-		LOCK "xaddq %0, %1;"
+		LOCK_PREFIX "xaddq %0, %1;"
 		:"=r"(i)
 		:"m"(v->counter), "0"(i));
 	return i + __i;
@@ -418,11 +418,11 @@ static __inline__ long atomic64_sub_return(long i, atomic64_t *v)
 
 /* These are x86-specific, used by some header files */
 #define atomic_clear_mask(mask, addr) \
-__asm__ __volatile__(LOCK "andl %0,%1" \
+__asm__ __volatile__(LOCK_PREFIX "andl %0,%1" \
 : : "r" (~(mask)),"m" (*addr) : "memory")
 
 #define atomic_set_mask(mask, addr) \
-__asm__ __volatile__(LOCK "orl %0,%1" \
+__asm__ __volatile__(LOCK_PREFIX "orl %0,%1" \
 : : "r" ((unsigned)mask),"m" (*(addr)) : "memory")
 
 /* Atomic operations are already serializing on x86 */
diff --git a/include/asm-x86_64/bitops.h b/include/asm-x86_64/bitops.h
index e9bf933d25d0..f7ba57b1cc08 100644
--- a/include/asm-x86_64/bitops.h
+++ b/include/asm-x86_64/bitops.h
@@ -5,12 +5,7 @@
  * Copyright 1992, Linus Torvalds.
  */
 
-
-#ifdef CONFIG_SMP
-#define LOCK_PREFIX "lock ; "
-#else
-#define LOCK_PREFIX ""
-#endif
+#include <asm/alternative.h>
 
 #define ADDR (*(volatile long *) addr)
 
diff --git a/include/asm-x86_64/cpufeature.h b/include/asm-x86_64/cpufeature.h
index 662964b74e34..afc44e557400 100644
--- a/include/asm-x86_64/cpufeature.h
+++ b/include/asm-x86_64/cpufeature.h
@@ -65,6 +65,8 @@
 #define X86_FEATURE_CONSTANT_TSC (3*32+5) /* TSC runs at constant rate */
 #define X86_FEATURE_SYNC_RDTSC  (3*32+6)  /* RDTSC syncs CPU core */
 #define X86_FEATURE_FXSAVE_LEAK (3*32+7)  /* FIP/FOP/FDP leaks through FXSAVE */
+#define X86_FEATURE_UP		(3*32+8) /* SMP kernel running on UP */
+
 
 /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
 #define X86_FEATURE_XMM3	(4*32+ 0) /* Streaming SIMD Extensions-3 */
diff --git a/include/asm-x86_64/mutex.h b/include/asm-x86_64/mutex.h
index 11fbee2bd6c0..06fab6de2a88 100644
--- a/include/asm-x86_64/mutex.h
+++ b/include/asm-x86_64/mutex.h
@@ -24,7 +24,7 @@ do {									\
 	typecheck_fn(fastcall void (*)(atomic_t *), fail_fn);		\
 									\
 	__asm__ __volatile__(						\
-		LOCK	"   decl (%%rdi)	\n"			\
+		LOCK_PREFIX "   decl (%%rdi)	\n"			\
 			"   js 2f		\n"			\
 			"1:			\n"			\
 									\
@@ -74,7 +74,7 @@ do {									\
 	typecheck_fn(fastcall void (*)(atomic_t *), fail_fn);		\
 									\
 	__asm__ __volatile__(						\
-		LOCK	"   incl (%%rdi)	\n"			\
+		LOCK_PREFIX "   incl (%%rdi)	\n"			\
 			"   jle 2f		\n"			\
 			"1:			\n"			\
 									\
diff --git a/include/asm-x86_64/rwlock.h b/include/asm-x86_64/rwlock.h
index 9942cc393064..dea0e9459264 100644
--- a/include/asm-x86_64/rwlock.h
+++ b/include/asm-x86_64/rwlock.h
@@ -24,7 +24,7 @@
 #define RW_LOCK_BIAS_STR	"0x01000000"
 
 #define __build_read_lock_ptr(rw, helper)   \
-	asm volatile(LOCK "subl $1,(%0)\n\t" \
+	asm volatile(LOCK_PREFIX "subl $1,(%0)\n\t" \
 		     "js 2f\n" \
 		     "1:\n" \
 		    LOCK_SECTION_START("") \
@@ -34,7 +34,7 @@
 		     ::"a" (rw) : "memory")
 
 #define __build_read_lock_const(rw, helper)   \
-	asm volatile(LOCK "subl $1,%0\n\t" \
+	asm volatile(LOCK_PREFIX "subl $1,%0\n\t" \
 		     "js 2f\n" \
 		     "1:\n" \
 		    LOCK_SECTION_START("") \
@@ -54,7 +54,7 @@
 					} while (0)
 
 #define __build_write_lock_ptr(rw, helper) \
-	asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",(%0)\n\t" \
+	asm volatile(LOCK_PREFIX "subl $" RW_LOCK_BIAS_STR ",(%0)\n\t" \
 		     "jnz 2f\n" \
 		     "1:\n" \
 		     LOCK_SECTION_START("") \
@@ -64,7 +64,7 @@
 		     ::"a" (rw) : "memory")
 
 #define __build_write_lock_const(rw, helper) \
-	asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",%0\n\t" \
+	asm volatile(LOCK_PREFIX "subl $" RW_LOCK_BIAS_STR ",%0\n\t" \
 		     "jnz 2f\n" \
 		     "1:\n" \
 		    LOCK_SECTION_START("") \
diff --git a/include/asm-x86_64/semaphore.h b/include/asm-x86_64/semaphore.h
index a389aa6fe80f..064df08b9a0f 100644
--- a/include/asm-x86_64/semaphore.h
+++ b/include/asm-x86_64/semaphore.h
@@ -106,7 +106,7 @@ static inline void down(struct semaphore * sem)
 
 	__asm__ __volatile__(
 		"# atomic down operation\n\t"
-		LOCK "decl %0\n\t"     /* --sem->count */
+		LOCK_PREFIX "decl %0\n\t"     /* --sem->count */
 		"js 2f\n"
 		"1:\n"
 		LOCK_SECTION_START("")
@@ -130,7 +130,7 @@ static inline int down_interruptible(struct semaphore * sem)
 
 	__asm__ __volatile__(
 		"# atomic interruptible down operation\n\t"
-		LOCK "decl %1\n\t"     /* --sem->count */
+		LOCK_PREFIX "decl %1\n\t"     /* --sem->count */
 		"js 2f\n\t"
 		"xorl %0,%0\n"
 		"1:\n"
@@ -154,7 +154,7 @@ static inline int down_trylock(struct semaphore * sem)
 
 	__asm__ __volatile__(
 		"# atomic interruptible down operation\n\t"
-		LOCK "decl %1\n\t"     /* --sem->count */
+		LOCK_PREFIX "decl %1\n\t"     /* --sem->count */
 		"js 2f\n\t"
 		"xorl %0,%0\n"
 		"1:\n"
@@ -178,7 +178,7 @@ static inline void up(struct semaphore * sem)
 {
 	__asm__ __volatile__(
 		"# atomic up operation\n\t"
-		LOCK "incl %0\n\t"     /* ++sem->count */
+		LOCK_PREFIX "incl %0\n\t"     /* ++sem->count */
 		"jle 2f\n"
 		"1:\n"
 		LOCK_SECTION_START("")
diff --git a/include/asm-x86_64/spinlock.h b/include/asm-x86_64/spinlock.h
index 5d8a5e3589ff..8d3421996f94 100644
--- a/include/asm-x86_64/spinlock.h
+++ b/include/asm-x86_64/spinlock.h
@@ -31,15 +31,19 @@
 	"jmp 1b\n" \
 	LOCK_SECTION_END
 
+#define __raw_spin_lock_string_up \
+	"\n\tdecl %0"
+
 #define __raw_spin_unlock_string \
 	"movl $1,%0" \
 		:"=m" (lock->slock) : : "memory"
 
 static inline void __raw_spin_lock(raw_spinlock_t *lock)
 {
-	__asm__ __volatile__(
-		__raw_spin_lock_string
-		:"=m" (lock->slock) : : "memory");
+	alternative_smp(
+		__raw_spin_lock_string,
+		__raw_spin_lock_string_up,
+		"=m" (lock->slock) : : "memory");
 }
 
 #define __raw_spin_lock_flags(lock, flags) __raw_spin_lock(lock)
diff --git a/include/asm-x86_64/system.h b/include/asm-x86_64/system.h
index f48e0dad8b3d..68e559f3631c 100644
--- a/include/asm-x86_64/system.h
+++ b/include/asm-x86_64/system.h
@@ -3,15 +3,10 @@
 
 #include <linux/kernel.h>
 #include <asm/segment.h>
+#include <asm/alternative.h>
 
 #ifdef __KERNEL__
 
-#ifdef CONFIG_SMP
-#define LOCK_PREFIX "lock ; "
-#else
-#define LOCK_PREFIX ""
-#endif
-
 #define __STR(x) #x
 #define STR(x) __STR(x)
 
@@ -34,7 +29,7 @@
 		     "thread_return:\n\t"					    \
 		     "movq %%gs:%P[pda_pcurrent],%%rsi\n\t"			  \
 		     "movq %P[thread_info](%%rsi),%%r8\n\t"			  \
-		     LOCK "btr  %[tif_fork],%P[ti_flags](%%r8)\n\t"		  \
+		     LOCK_PREFIX "btr  %[tif_fork],%P[ti_flags](%%r8)\n\t"	  \
 		     "movq %%rax,%%rdi\n\t" 					  \
 		     "jc   ret_from_fork\n\t"					  \
 		     RESTORE_CONTEXT						    \
@@ -69,82 +64,6 @@ extern void load_gs_index(unsigned);
 		".previous"			\
 		: :"r" (value), "r" (0))
 
-#ifdef __KERNEL__
-struct alt_instr { 
-	__u8 *instr; 		/* original instruction */
-	__u8 *replacement;
-	__u8  cpuid;		/* cpuid bit set for replacement */
-	__u8  instrlen;		/* length of original instruction */
-	__u8  replacementlen; 	/* length of new instruction, <= instrlen */ 
-	__u8  pad[5];
-}; 
-#endif
-
-/*
- * Alternative instructions for different CPU types or capabilities.
- * 
- * This allows to use optimized instructions even on generic binary
- * kernels.
- * 
- * length of oldinstr must be longer or equal the length of newinstr
- * It can be padded with nops as needed.
- * 
- * For non barrier like inlines please define new variants
- * without volatile and memory clobber.
- */
-#define alternative(oldinstr, newinstr, feature) 	\
-	asm volatile ("661:\n\t" oldinstr "\n662:\n" 		     \
-		      ".section .altinstructions,\"a\"\n"     	     \
-		      "  .align 8\n"				       \
-		      "  .quad 661b\n"            /* label */          \
-		      "  .quad 663f\n"		  /* new instruction */ \
-		      "  .byte %c0\n"             /* feature bit */    \
-		      "  .byte 662b-661b\n"       /* sourcelen */      \
-		      "  .byte 664f-663f\n"       /* replacementlen */ \
-		      ".previous\n"					\
-		      ".section .altinstr_replacement,\"ax\"\n"		\
-		      "663:\n\t" newinstr "\n664:\n"   /* replacement */ \
-		      ".previous" :: "i" (feature) : "memory")  
-
-/*
- * Alternative inline assembly with input.
- * 
- * Peculiarities:
- * No memory clobber here. 
- * Argument numbers start with 1.
- * Best is to use constraints that are fixed size (like (%1) ... "r")
- * If you use variable sized constraints like "m" or "g" in the 
- * replacement make sure to pad to the worst case length.
- */
-#define alternative_input(oldinstr, newinstr, feature, input...)	\
-	asm volatile ("661:\n\t" oldinstr "\n662:\n"			\
-		      ".section .altinstructions,\"a\"\n"		\
-		      "  .align 8\n"					\
-		      "  .quad 661b\n"            /* label */		\
-		      "  .quad 663f\n"		  /* new instruction */	\
-		      "  .byte %c0\n"             /* feature bit */	\
-		      "  .byte 662b-661b\n"       /* sourcelen */	\
-		      "  .byte 664f-663f\n"       /* replacementlen */	\
-		      ".previous\n"					\
-		      ".section .altinstr_replacement,\"ax\"\n"		\
-		      "663:\n\t" newinstr "\n664:\n"   /* replacement */ \
-		      ".previous" :: "i" (feature), ##input)
-
-/* Like alternative_input, but with a single output argument */
-#define alternative_io(oldinstr, newinstr, feature, output, input...) \
-	asm volatile ("661:\n\t" oldinstr "\n662:\n"			\
-		      ".section .altinstructions,\"a\"\n"		\
-		      "  .align 8\n"					\
-		      "  .quad 661b\n"            /* label */		\
-		      "  .quad 663f\n"		  /* new instruction */	\
-		      "  .byte %c[feat]\n"        /* feature bit */	\
-		      "  .byte 662b-661b\n"       /* sourcelen */	\
-		      "  .byte 664f-663f\n"       /* replacementlen */	\
-		      ".previous\n"					\
-		      ".section .altinstr_replacement,\"ax\"\n"		\
-		      "663:\n\t" newinstr "\n664:\n"   /* replacement */ \
-		      ".previous" : output : [feat] "i" (feature), ##input)
-
 /*
  * Clear and set 'TS' bit respectively
  */
@@ -366,5 +285,6 @@ static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
 void cpu_idle_wait(void);
 
 extern unsigned long arch_align_stack(unsigned long sp);
+extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
 
 #endif
-- 
cgit v1.2.3


From 0c90bb87730613709e65c03c86d614e31a675d4f Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Mon, 26 Jun 2006 13:56:22 +0200
Subject: [PATCH] x86_64: Remove most of ia32_unistd.h

It's only needed for three system calls, no need to maintain
a full list forever.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-x86_64/ia32_unistd.h | 308 +--------------------------------------
 1 file changed, 3 insertions(+), 305 deletions(-)

(limited to 'include')

diff --git a/include/asm-x86_64/ia32_unistd.h b/include/asm-x86_64/ia32_unistd.h
index b4f4b172b15a..5b52ce507338 100644
--- a/include/asm-x86_64/ia32_unistd.h
+++ b/include/asm-x86_64/ia32_unistd.h
@@ -4,317 +4,15 @@
 /*
  * This file contains the system call numbers of the ia32 port,
  * this is for the kernel only.
+ * Only add syscalls here where some part of the kernel needs to know
+ * the number. This should be otherwise in sync with asm-i386/unistd.h. -AK
  */
 
 #define __NR_ia32_restart_syscall 0
 #define __NR_ia32_exit		  1
-#define __NR_ia32_fork		  2
 #define __NR_ia32_read		  3
 #define __NR_ia32_write		  4
-#define __NR_ia32_open		  5
-#define __NR_ia32_close		  6
-#define __NR_ia32_waitpid		  7
-#define __NR_ia32_creat		  8
-#define __NR_ia32_link		  9
-#define __NR_ia32_unlink		 10
-#define __NR_ia32_execve		 11
-#define __NR_ia32_chdir		 12
-#define __NR_ia32_time		 13
-#define __NR_ia32_mknod		 14
-#define __NR_ia32_chmod		 15
-#define __NR_ia32_lchown		 16
-#define __NR_ia32_break		 17
-#define __NR_ia32_oldstat		 18
-#define __NR_ia32_lseek		 19
-#define __NR_ia32_getpid		 20
-#define __NR_ia32_mount		 21
-#define __NR_ia32_umount		 22
-#define __NR_ia32_setuid		 23
-#define __NR_ia32_getuid		 24
-#define __NR_ia32_stime		 25
-#define __NR_ia32_ptrace		 26
-#define __NR_ia32_alarm		 27
-#define __NR_ia32_oldfstat		 28
-#define __NR_ia32_pause		 29
-#define __NR_ia32_utime		 30
-#define __NR_ia32_stty		 31
-#define __NR_ia32_gtty		 32
-#define __NR_ia32_access		 33
-#define __NR_ia32_nice		 34
-#define __NR_ia32_ftime		 35
-#define __NR_ia32_sync		 36
-#define __NR_ia32_kill		 37
-#define __NR_ia32_rename		 38
-#define __NR_ia32_mkdir		 39
-#define __NR_ia32_rmdir		 40
-#define __NR_ia32_dup		 41
-#define __NR_ia32_pipe		 42
-#define __NR_ia32_times		 43
-#define __NR_ia32_prof		 44
-#define __NR_ia32_brk		 45
-#define __NR_ia32_setgid		 46
-#define __NR_ia32_getgid		 47
-#define __NR_ia32_signal		 48
-#define __NR_ia32_geteuid		 49
-#define __NR_ia32_getegid		 50
-#define __NR_ia32_acct		 51
-#define __NR_ia32_umount2		 52
-#define __NR_ia32_lock		 53
-#define __NR_ia32_ioctl		 54
-#define __NR_ia32_fcntl		 55
-#define __NR_ia32_mpx		 56
-#define __NR_ia32_setpgid		 57
-#define __NR_ia32_ulimit		 58
-#define __NR_ia32_oldolduname	 59
-#define __NR_ia32_umask		 60
-#define __NR_ia32_chroot		 61
-#define __NR_ia32_ustat		 62
-#define __NR_ia32_dup2		 63
-#define __NR_ia32_getppid		 64
-#define __NR_ia32_getpgrp		 65
-#define __NR_ia32_setsid		 66
-#define __NR_ia32_sigaction		 67
-#define __NR_ia32_sgetmask		 68
-#define __NR_ia32_ssetmask		 69
-#define __NR_ia32_setreuid		 70
-#define __NR_ia32_setregid		 71
-#define __NR_ia32_sigsuspend		 72
-#define __NR_ia32_sigpending		 73
-#define __NR_ia32_sethostname	 74
-#define __NR_ia32_setrlimit		 75
-#define __NR_ia32_getrlimit		 76	/* Back compatible 2Gig limited rlimit */
-#define __NR_ia32_getrusage		 77
-#define __NR_ia32_gettimeofday	 78
-#define __NR_ia32_settimeofday	 79
-#define __NR_ia32_getgroups		 80
-#define __NR_ia32_setgroups		 81
-#define __NR_ia32_select		 82
-#define __NR_ia32_symlink		 83
-#define __NR_ia32_oldlstat		 84
-#define __NR_ia32_readlink		 85
-#define __NR_ia32_uselib		 86
-#define __NR_ia32_swapon		 87
-#define __NR_ia32_reboot		 88
-#define __NR_ia32_readdir		 89
-#define __NR_ia32_mmap		 90
-#define __NR_ia32_munmap		 91
-#define __NR_ia32_truncate		 92
-#define __NR_ia32_ftruncate		 93
-#define __NR_ia32_fchmod		 94
-#define __NR_ia32_fchown		 95
-#define __NR_ia32_getpriority	 96
-#define __NR_ia32_setpriority	 97
-#define __NR_ia32_profil		 98
-#define __NR_ia32_statfs		 99
-#define __NR_ia32_fstatfs		100
-#define __NR_ia32_ioperm		101
-#define __NR_ia32_socketcall		102
-#define __NR_ia32_syslog		103
-#define __NR_ia32_setitimer		104
-#define __NR_ia32_getitimer		105
-#define __NR_ia32_stat		106
-#define __NR_ia32_lstat		107
-#define __NR_ia32_fstat		108
-#define __NR_ia32_olduname		109
-#define __NR_ia32_iopl		110
-#define __NR_ia32_vhangup		111
-#define __NR_ia32_idle		112
-#define __NR_ia32_vm86old		113
-#define __NR_ia32_wait4		114
-#define __NR_ia32_swapoff		115
-#define __NR_ia32_sysinfo		116
-#define __NR_ia32_ipc		117
-#define __NR_ia32_fsync		118
-#define __NR_ia32_sigreturn		119
-#define __NR_ia32_clone		120
-#define __NR_ia32_setdomainname	121
-#define __NR_ia32_uname		122
-#define __NR_ia32_modify_ldt		123
-#define __NR_ia32_adjtimex		124
-#define __NR_ia32_mprotect		125
-#define __NR_ia32_sigprocmask	126
-#define __NR_ia32_create_module	127
-#define __NR_ia32_init_module	128
-#define __NR_ia32_delete_module	129
-#define __NR_ia32_get_kernel_syms	130
-#define __NR_ia32_quotactl		131
-#define __NR_ia32_getpgid		132
-#define __NR_ia32_fchdir		133
-#define __NR_ia32_bdflush		134
-#define __NR_ia32_sysfs		135
-#define __NR_ia32_personality	136
-#define __NR_ia32_afs_syscall	137 /* Syscall for Andrew File System */
-#define __NR_ia32_setfsuid		138
-#define __NR_ia32_setfsgid		139
-#define __NR_ia32__llseek		140
-#define __NR_ia32_getdents		141
-#define __NR_ia32__newselect		142
-#define __NR_ia32_flock		143
-#define __NR_ia32_msync		144
-#define __NR_ia32_readv		145
-#define __NR_ia32_writev		146
-#define __NR_ia32_getsid		147
-#define __NR_ia32_fdatasync		148
-#define __NR_ia32__sysctl		149
-#define __NR_ia32_mlock		150
-#define __NR_ia32_munlock		151
-#define __NR_ia32_mlockall		152
-#define __NR_ia32_munlockall		153
-#define __NR_ia32_sched_setparam		154
-#define __NR_ia32_sched_getparam		155
-#define __NR_ia32_sched_setscheduler		156
-#define __NR_ia32_sched_getscheduler		157
-#define __NR_ia32_sched_yield		158
-#define __NR_ia32_sched_get_priority_max	159
-#define __NR_ia32_sched_get_priority_min	160
-#define __NR_ia32_sched_rr_get_interval	161
-#define __NR_ia32_nanosleep		162
-#define __NR_ia32_mremap		163
-#define __NR_ia32_setresuid		164
-#define __NR_ia32_getresuid		165
-#define __NR_ia32_vm86		166
-#define __NR_ia32_query_module	167
-#define __NR_ia32_poll		168
-#define __NR_ia32_nfsservctl		169
-#define __NR_ia32_setresgid		170
-#define __NR_ia32_getresgid		171
-#define __NR_ia32_prctl              172
+#define __NR_ia32_sigreturn	119
 #define __NR_ia32_rt_sigreturn	173
-#define __NR_ia32_rt_sigaction	174
-#define __NR_ia32_rt_sigprocmask	175
-#define __NR_ia32_rt_sigpending	176
-#define __NR_ia32_rt_sigtimedwait	177
-#define __NR_ia32_rt_sigqueueinfo	178
-#define __NR_ia32_rt_sigsuspend	179
-#define __NR_ia32_pread		180
-#define __NR_ia32_pwrite		181
-#define __NR_ia32_chown		182
-#define __NR_ia32_getcwd		183
-#define __NR_ia32_capget		184
-#define __NR_ia32_capset		185
-#define __NR_ia32_sigaltstack	186
-#define __NR_ia32_sendfile		187
-#define __NR_ia32_getpmsg		188	/* some people actually want streams */
-#define __NR_ia32_putpmsg		189	/* some people actually want streams */
-#define __NR_ia32_vfork		190
-#define __NR_ia32_ugetrlimit		191	/* SuS compliant getrlimit */
-#define __NR_ia32_mmap2		192
-#define __NR_ia32_truncate64		193
-#define __NR_ia32_ftruncate64	194
-#define __NR_ia32_stat64		195
-#define __NR_ia32_lstat64		196
-#define __NR_ia32_fstat64		197
-#define __NR_ia32_lchown32		198
-#define __NR_ia32_getuid32		199
-#define __NR_ia32_getgid32		200
-#define __NR_ia32_geteuid32		201
-#define __NR_ia32_getegid32		202
-#define __NR_ia32_setreuid32		203
-#define __NR_ia32_setregid32		204
-#define __NR_ia32_getgroups32	205
-#define __NR_ia32_setgroups32	206
-#define __NR_ia32_fchown32		207
-#define __NR_ia32_setresuid32	208
-#define __NR_ia32_getresuid32	209
-#define __NR_ia32_setresgid32	210
-#define __NR_ia32_getresgid32	211
-#define __NR_ia32_chown32		212
-#define __NR_ia32_setuid32		213
-#define __NR_ia32_setgid32		214
-#define __NR_ia32_setfsuid32		215
-#define __NR_ia32_setfsgid32		216
-#define __NR_ia32_pivot_root		217
-#define __NR_ia32_mincore		218
-#define __NR_ia32_madvise		219
-#define __NR_ia32_madvise1		219	/* delete when C lib stub is removed */
-#define __NR_ia32_getdents64		220
-#define __NR_ia32_fcntl64		221
-#define __NR_ia32_tuxcall		222
-#define __NR_ia32_security		223
-#define __NR_ia32_gettid		224
-#define __NR_ia32_readahead		225
-#define __NR_ia32_setxattr		226
-#define __NR_ia32_lsetxattr		227
-#define __NR_ia32_fsetxattr		228
-#define __NR_ia32_getxattr		229
-#define __NR_ia32_lgetxattr		230
-#define __NR_ia32_fgetxattr		231
-#define __NR_ia32_listxattr		232
-#define __NR_ia32_llistxattr		233
-#define __NR_ia32_flistxattr		234
-#define __NR_ia32_removexattr	235
-#define __NR_ia32_lremovexattr	236
-#define __NR_ia32_fremovexattr	237
-#define __NR_ia32_tkill		238
-#define __NR_ia32_sendfile64		239
-#define __NR_ia32_futex		240
-#define __NR_ia32_sched_setaffinity	241
-#define __NR_ia32_sched_getaffinity	242
-#define __NR_ia32_set_thread_area   243
-#define __NR_ia32_get_thread_area	244
-#define __NR_ia32_io_setup		245
-#define __NR_ia32_io_destroy		246
-#define __NR_ia32_io_getevents	247
-#define __NR_ia32_io_submit		248
-#define __NR_ia32_io_cancel		249
-#define __NR_ia32_exit_group		252
-#define __NR_ia32_lookup_dcookie	253
-#define __NR_ia32_sys_epoll_create	254
-#define __NR_ia32_sys_epoll_ctl	255
-#define __NR_ia32_sys_epoll_wait	256
-#define __NR_ia32_remap_file_pages	257
-#define __NR_ia32_set_tid_address	258
-#define __NR_ia32_timer_create		259
-#define __NR_ia32_timer_settime	(__NR_ia32_timer_create+1)
-#define __NR_ia32_timer_gettime	(__NR_ia32_timer_create+2)
-#define __NR_ia32_timer_getoverrun	(__NR_ia32_timer_create+3)
-#define __NR_ia32_timer_delete	(__NR_ia32_timer_create+4)
-#define __NR_ia32_clock_settime	(__NR_ia32_timer_create+5)
-#define __NR_ia32_clock_gettime	(__NR_ia32_timer_create+6)
-#define __NR_ia32_clock_getres	(__NR_ia32_timer_create+7)
-#define __NR_ia32_clock_nanosleep	(__NR_ia32_timer_create+8)
-#define __NR_ia32_statfs64		268
-#define __NR_ia32_fstatfs64		269
-#define __NR_ia32_tgkill		270
-#define __NR_ia32_utimes		271
-#define __NR_ia32_fadvise64_64		272
-#define __NR_ia32_vserver		273
-#define __NR_ia32_mbind		274
-#define __NR_ia32_get_mempolicy	275
-#define __NR_ia32_set_mempolicy	276
-#define __NR_ia32_mq_open 		277
-#define __NR_ia32_mq_unlink		(__NR_ia32_mq_open+1)
-#define __NR_ia32_mq_timedsend	(__NR_ia32_mq_open+2)
-#define __NR_ia32_mq_timedreceive	(__NR_ia32_mq_open+3)
-#define __NR_ia32_mq_notify		(__NR_ia32_mq_open+4)
-#define __NR_ia32_mq_getsetattr	(__NR_ia32_mq_open+5)
-#define __NR_ia32_kexec		283
-#define __NR_ia32_waitid		284
-/* #define __NR_sys_setaltroot	285 */
-#define __NR_ia32_add_key		286
-#define __NR_ia32_request_key	287
-#define __NR_ia32_keyctl		288
-#define __NR_ia32_ioprio_set		289
-#define __NR_ia32_ioprio_get		290
-#define __NR_ia32_inotify_init		291
-#define __NR_ia32_inotify_add_watch	292
-#define __NR_ia32_inotify_rm_watch	293
-#define __NR_ia32_migrate_pages		294
-#define __NR_ia32_openat		295
-#define __NR_ia32_mkdirat		296
-#define __NR_ia32_mknodat		297
-#define __NR_ia32_fchownat		298
-#define __NR_ia32_futimesat		299
-#define __NR_ia32_fstatat64		300
-#define __NR_ia32_unlinkat		301
-#define __NR_ia32_renameat		302
-#define __NR_ia32_linkat		303
-#define __NR_ia32_symlinkat		304
-#define __NR_ia32_readlinkat		305
-#define __NR_ia32_fchmodat		306
-#define __NR_ia32_faccessat		307
-#define __NR_ia32_pselect6		308
-#define __NR_ia32_ppoll			309
-#define __NR_ia32_unshare		310
 
 #endif /* _ASM_X86_64_IA32_UNISTD_H_ */
-- 
cgit v1.2.3


From ed0a893fc8866baf116323acbcd883a3cc4a36a3 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Mon, 26 Jun 2006 13:56:28 +0200
Subject: [PATCH] x86_64: Remove bogus comment in topology.h

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-x86_64/topology.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include')

diff --git a/include/asm-x86_64/topology.h b/include/asm-x86_64/topology.h
index 80c4e44d011c..d4009f8dade0 100644
--- a/include/asm-x86_64/topology.h
+++ b/include/asm-x86_64/topology.h
@@ -7,8 +7,6 @@
 #include <asm/mpspec.h>
 #include <asm/bitops.h>
 
-/* Map the K8 CPU local memory controllers to a simple 1:1 CPU:NODE topology */
-
 extern cpumask_t cpu_online_map;
 
 extern unsigned char cpu_to_node[];
-- 
cgit v1.2.3


From a32073bffc656ca4bde6002b6cf7c1a8e0e22712 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Mon, 26 Jun 2006 13:56:40 +0200
Subject: [PATCH] x86_64: Clean and enhance up K8 northbridge access code

 - Factor out the duplicated access/cache code into a single file
   * Shared between i386/x86-64.
 - Share flush code between AGP and IOMMU
   * Fix a bug: AGP didn't wait for end of flush before
 - Drop 8 northbridges limit and allocate dynamically
 - Add lock to serialize AGP and IOMMU GART flushes
 - Add PCI ID for next AMD northbridge
 - Random related cleanups

The old K8 NUMA discovery code is unchanged. New systems
should all use SRAT for this.

Cc: "Navin Boppuri" <navin.boppuri@newisys.com>
Cc: Dave Jones <davej@redhat.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/Kconfig             |   4 ++
 arch/i386/kernel/Makefile     |   4 ++
 arch/x86_64/Kconfig           |   4 ++
 arch/x86_64/kernel/Makefile   |   1 +
 arch/x86_64/kernel/aperture.c |  24 ++++-----
 arch/x86_64/kernel/k8.c       | 118 ++++++++++++++++++++++++++++++++++++++++++
 arch/x86_64/kernel/pci-gart.c |  93 ++++++++-------------------------
 arch/x86_64/pci/k8-bus.c      |  10 ++--
 drivers/char/agp/amd64-agp.c  |  77 +++++++++------------------
 include/asm-i386/k8.h         |   1 +
 include/asm-x86_64/k8.h       |  14 +++++
 11 files changed, 209 insertions(+), 141 deletions(-)
 create mode 100644 arch/x86_64/kernel/k8.c
 create mode 100644 include/asm-i386/k8.h
 create mode 100644 include/asm-x86_64/k8.h

(limited to 'include')

diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index 1596101cfaf8..2206bf6637de 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -1054,6 +1054,10 @@ config SCx200
 	  This support is also available as a module.  If compiled as a
 	  module, it will be called scx200.
 
+config K8_NB
+	def_bool y
+	depends on AGP_AMD64
+
 source "drivers/pcmcia/Kconfig"
 
 source "drivers/pci/hotplug/Kconfig"
diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile
index 96fb8a020af2..28b14d6c7b1a 100644
--- a/arch/i386/kernel/Makefile
+++ b/arch/i386/kernel/Makefile
@@ -37,6 +37,7 @@ obj-$(CONFIG_EFI) 		+= efi.o efi_stub.o
 obj-$(CONFIG_DOUBLEFAULT) 	+= doublefault.o
 obj-$(CONFIG_VM86)		+= vm86.o
 obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
+obj-$(CONFIG_K8_NB)		+= k8.o
 
 EXTRA_AFLAGS   := -traditional
 
@@ -76,3 +77,6 @@ SYSCFLAGS_vsyscall-syms.o = -r
 $(obj)/vsyscall-syms.o: $(src)/vsyscall.lds \
 			$(obj)/vsyscall-sysenter.o $(obj)/vsyscall-note.o FORCE
 	$(call if_changed,syscall)
+
+k8-y                      += ../../x86_64/kernel/k8.o
+
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index af44130f0d65..fc75275d8c72 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -501,6 +501,10 @@ config REORDER
          optimal TLB usage. If you have pretty much any version of binutils, 
 	 this can increase your kernel build time by roughly one minute.
 
+config K8_NB
+	def_bool y
+	depends on AGP_AMD64 || GART_IOMMU || (PCI && NUMA)
+
 endmenu
 
 #
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
index 381bc6ad743e..f927d11065fe 100644
--- a/arch/x86_64/kernel/Makefile
+++ b/arch/x86_64/kernel/Makefile
@@ -33,6 +33,7 @@ obj-$(CONFIG_SWIOTLB)		+= pci-swiotlb.o
 obj-$(CONFIG_KPROBES)		+= kprobes.o
 obj-$(CONFIG_X86_PM_TIMER)	+= pmtimer.o
 obj-$(CONFIG_X86_VSMP)		+= vsmp.o
+obj-$(CONFIG_K8_NB)		+= k8.o
 
 obj-$(CONFIG_MODULES)		+= module.o
 
diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c
index 70b9d21ed675..a7ad03ee98cf 100644
--- a/arch/x86_64/kernel/aperture.c
+++ b/arch/x86_64/kernel/aperture.c
@@ -24,6 +24,7 @@
 #include <asm/proto.h>
 #include <asm/pci-direct.h>
 #include <asm/dma.h>
+#include <asm/k8.h>
 
 int iommu_aperture;
 int iommu_aperture_disabled __initdata = 0;
@@ -37,8 +38,6 @@ int fix_aperture __initdata = 1;
 /* This code runs before the PCI subsystem is initialized, so just
    access the northbridge directly. */
 
-#define NB_ID_3 (PCI_VENDOR_ID_AMD | (0x1103<<16))
-
 static u32 __init allocate_aperture(void) 
 {
 	pg_data_t *nd0 = NODE_DATA(0);
@@ -68,20 +67,20 @@ static u32 __init allocate_aperture(void)
 	return (u32)__pa(p); 
 }
 
-static int __init aperture_valid(char *name, u64 aper_base, u32 aper_size) 
+static int __init aperture_valid(u64 aper_base, u32 aper_size)
 { 
 	if (!aper_base) 
 		return 0;
 	if (aper_size < 64*1024*1024) { 
-		printk("Aperture from %s too small (%d MB)\n", name, aper_size>>20); 
+		printk("Aperture too small (%d MB)\n", aper_size>>20);
 		return 0;
 	}
 	if (aper_base + aper_size >= 0xffffffff) { 
-		printk("Aperture from %s beyond 4GB. Ignoring.\n",name);
+		printk("Aperture beyond 4GB. Ignoring.\n");
 		return 0; 
 	}
 	if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) {
-		printk("Aperture from %s pointing to e820 RAM. Ignoring.\n",name);
+		printk("Aperture pointing to e820 RAM. Ignoring.\n");
 		return 0; 
 	} 
 	return 1;
@@ -140,7 +139,7 @@ static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order)
 	printk("Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n", 
 	       aper, 32 << *order, apsizereg);
 
-	if (!aperture_valid("AGP bridge", aper, (32*1024*1024) << *order))
+	if (!aperture_valid(aper, (32*1024*1024) << *order))
 	    return 0;
 	return (u32)aper; 
 } 
@@ -208,9 +207,8 @@ void __init iommu_hole_init(void)
 
 	fix = 0;
 	for (num = 24; num < 32; num++) {		
-		char name[30];
-		if (read_pci_config(0, num, 3, 0x00) != NB_ID_3) 
-			continue;	
+		if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
+			continue;
 
 		iommu_aperture = 1; 
 
@@ -222,9 +220,7 @@ void __init iommu_hole_init(void)
 		printk("CPU %d: aperture @ %Lx size %u MB\n", num-24, 
 		       aper_base, aper_size>>20);
 		
-		sprintf(name, "northbridge cpu %d", num-24); 
-
-		if (!aperture_valid(name, aper_base, aper_size)) { 
+		if (!aperture_valid(aper_base, aper_size)) {
 			fix = 1; 
 			break; 
 		}
@@ -273,7 +269,7 @@ void __init iommu_hole_init(void)
 
 	/* Fix up the north bridges */
 	for (num = 24; num < 32; num++) { 		
-		if (read_pci_config(0, num, 3, 0x00) != NB_ID_3) 
+		if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
 			continue;	
 
 		/* Don't enable translation yet. That is done later. 
diff --git a/arch/x86_64/kernel/k8.c b/arch/x86_64/kernel/k8.c
new file mode 100644
index 000000000000..6416682d33d0
--- /dev/null
+++ b/arch/x86_64/kernel/k8.c
@@ -0,0 +1,118 @@
+/*
+ * Shared support code for AMD K8 northbridges and derivates.
+ * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2.
+ */
+#include <linux/gfp.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <asm/k8.h>
+
+int num_k8_northbridges;
+EXPORT_SYMBOL(num_k8_northbridges);
+
+static u32 *flush_words;
+
+struct pci_device_id k8_nb_ids[] = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) },
+	{}
+};
+EXPORT_SYMBOL(k8_nb_ids);
+
+struct pci_dev **k8_northbridges;
+EXPORT_SYMBOL(k8_northbridges);
+
+static struct pci_dev *next_k8_northbridge(struct pci_dev *dev)
+{
+	do {
+		dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
+		if (!dev)
+			break;
+	} while (!pci_match_id(&k8_nb_ids[0], dev));
+	return dev;
+}
+
+int cache_k8_northbridges(void)
+{
+	int i;
+	struct pci_dev *dev;
+	if (num_k8_northbridges)
+		return 0;
+
+	num_k8_northbridges = 0;
+	dev = NULL;
+	while ((dev = next_k8_northbridge(dev)) != NULL)
+		num_k8_northbridges++;
+
+	k8_northbridges = kmalloc((num_k8_northbridges + 1) * sizeof(void *),
+				  GFP_KERNEL);
+	if (!k8_northbridges)
+		return -ENOMEM;
+
+	flush_words = kmalloc(num_k8_northbridges * sizeof(u32), GFP_KERNEL);
+	if (!flush_words) {
+		kfree(k8_northbridges);
+		return -ENOMEM;
+	}
+
+	dev = NULL;
+	i = 0;
+	while ((dev = next_k8_northbridge(dev)) != NULL) {
+		k8_northbridges[i++] = dev;
+		pci_read_config_dword(dev, 0x9c, &flush_words[i]);
+	}
+	k8_northbridges[i] = NULL;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(cache_k8_northbridges);
+
+/* Ignores subdevice/subvendor but as far as I can figure out
+   they're useless anyways */
+int __init early_is_k8_nb(u32 device)
+{
+	struct pci_device_id *id;
+	u32 vendor = device & 0xffff;
+	device >>= 16;
+	for (id = k8_nb_ids; id->vendor; id++)
+		if (vendor == id->vendor && device == id->device)
+			return 1;
+	return 0;
+}
+
+void k8_flush_garts(void)
+{
+	int flushed, i;
+	unsigned long flags;
+	static DEFINE_SPINLOCK(gart_lock);
+
+	/* Avoid races between AGP and IOMMU. In theory it's not needed
+	   but I'm not sure if the hardware won't lose flush requests
+	   when another is pending. This whole thing is so expensive anyways
+	   that it doesn't matter to serialize more. -AK */
+	spin_lock_irqsave(&gart_lock, flags);
+	flushed = 0;
+	for (i = 0; i < num_k8_northbridges; i++) {
+		pci_write_config_dword(k8_northbridges[i], 0x9c,
+				       flush_words[i]|1);
+		flushed++;
+	}
+	for (i = 0; i < num_k8_northbridges; i++) {
+		u32 w;
+		/* Make sure the hardware actually executed the flush*/
+		for (;;) {
+			pci_read_config_dword(k8_northbridges[i],
+					      0x9c, &w);
+			if (!(w & 1))
+				break;
+			cpu_relax();
+		}
+	}
+	spin_unlock_irqrestore(&gart_lock, flags);
+	if (!flushed)
+		printk("nothing to flush?\n");
+}
+EXPORT_SYMBOL_GPL(k8_flush_garts);
+
diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c
index ea8f4041794e..ded3af3bceec 100644
--- a/arch/x86_64/kernel/pci-gart.c
+++ b/arch/x86_64/kernel/pci-gart.c
@@ -32,6 +32,7 @@
 #include <asm/kdebug.h>
 #include <asm/swiotlb.h>
 #include <asm/dma.h>
+#include <asm/k8.h>
 
 unsigned long iommu_bus_base;	/* GART remapping area (physical) */
 static unsigned long iommu_size; 	/* size of remapping area bytes */
@@ -46,8 +47,6 @@ u32 *iommu_gatt_base; 		/* Remapping table */
    also seen with Qlogic at least). */
 int iommu_fullflush = 1;
 
-#define MAX_NB 8
-
 /* Allocation bitmap for the remapping area */ 
 static DEFINE_SPINLOCK(iommu_bitmap_lock);
 static unsigned long *iommu_gart_bitmap; /* guarded by iommu_bitmap_lock */
@@ -63,13 +62,6 @@ static u32 gart_unmapped_entry;
 #define to_pages(addr,size) \
 	(round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT)
 
-#define for_all_nb(dev) \
-	dev = NULL;	\
-	while ((dev = pci_get_device(PCI_VENDOR_ID_AMD, 0x1103, dev))!=NULL)
-
-static struct pci_dev *northbridges[MAX_NB];
-static u32 northbridge_flush_word[MAX_NB];
-
 #define EMERGENCY_PAGES 32 /* = 128KB */ 
 
 #ifdef CONFIG_AGP
@@ -120,44 +112,17 @@ static void free_iommu(unsigned long offset, int size)
 /* 
  * Use global flush state to avoid races with multiple flushers.
  */
-static void flush_gart(struct device *dev)
+static void flush_gart(void)
 { 
 	unsigned long flags;
-	int flushed = 0;
-	int i, max;
-
 	spin_lock_irqsave(&iommu_bitmap_lock, flags);
-	if (need_flush) { 
-		max = 0;
-		for (i = 0; i < MAX_NB; i++) {
-			if (!northbridges[i]) 
-				continue;
-			pci_write_config_dword(northbridges[i], 0x9c, 
-					       northbridge_flush_word[i] | 1); 
-			flushed++;
-			max = i;
-		}
-		for (i = 0; i <= max; i++) {
-			u32 w;
-			if (!northbridges[i])
-				continue;
-			/* Make sure the hardware actually executed the flush. */
-			for (;;) { 
-				pci_read_config_dword(northbridges[i], 0x9c, &w);
-				if (!(w & 1))
-					break;
-				cpu_relax();
-			}
-		} 
-		if (!flushed) 
-			printk("nothing to flush?\n");
+	if (need_flush) {
+		k8_flush_garts();
 		need_flush = 0;
 	} 
 	spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
 } 
 
-
-
 #ifdef CONFIG_IOMMU_LEAK
 
 #define SET_LEAK(x) if (iommu_leak_tab) \
@@ -266,7 +231,7 @@ static dma_addr_t gart_map_simple(struct device *dev, char *buf,
 				 size_t size, int dir)
 {
 	dma_addr_t map = dma_map_area(dev, virt_to_bus(buf), size, dir);
-	flush_gart(dev);
+	flush_gart();
 	return map;
 }
 
@@ -351,7 +316,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
 		s->dma_address = addr;
 		s->dma_length = s->length;
 	}
-	flush_gart(dev);
+	flush_gart();
 	return nents;
 }
 
@@ -458,13 +423,13 @@ int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
 	if (dma_map_cont(sg, start, i, sg+out, pages, need) < 0)
 		goto error;
 	out++;
-	flush_gart(dev);
+	flush_gart();
 	if (out < nents) 
 		sg[out].dma_length = 0; 
 	return out;
 
 error:
-	flush_gart(NULL);
+	flush_gart();
 	gart_unmap_sg(dev, sg, nents, dir);
 	/* When it was forced or merged try again in a dumb way */
 	if (force_iommu || iommu_merge) {
@@ -532,10 +497,13 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
 	void *gatt;
 	unsigned aper_base, new_aper_base;
 	unsigned aper_size, gatt_size, new_aper_size;
-	
+	int i;
+
 	printk(KERN_INFO "PCI-DMA: Disabling AGP.\n");
 	aper_size = aper_base = info->aper_size = 0;
-	for_all_nb(dev) { 
+	dev = NULL;
+	for (i = 0; i < num_k8_northbridges; i++) {
+		dev = k8_northbridges[i];
 		new_aper_base = read_aperture(dev, &new_aper_size); 
 		if (!new_aper_base) 
 			goto nommu; 
@@ -558,11 +526,12 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
 		panic("Cannot allocate GATT table"); 
 	memset(gatt, 0, gatt_size); 
 	agp_gatt_table = gatt;
-	
-	for_all_nb(dev) { 
+
+	for (i = 0; i < num_k8_northbridges; i++) {
 		u32 ctl; 
 		u32 gatt_reg; 
 
+		dev = k8_northbridges[i];
 		gatt_reg = __pa(gatt) >> 12; 
 		gatt_reg <<= 4; 
 		pci_write_config_dword(dev, 0x98, gatt_reg);
@@ -573,7 +542,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
 
 		pci_write_config_dword(dev, 0x90, ctl); 
 	}
-	flush_gart(NULL); 
+	flush_gart();
 	
 	printk("PCI-DMA: aperture base @ %x size %u KB\n",aper_base, aper_size>>10); 
 	return 0;
@@ -607,10 +576,14 @@ static int __init pci_iommu_init(void)
 	struct agp_kern_info info;
 	unsigned long aper_size;
 	unsigned long iommu_start;
-	struct pci_dev *dev;
 	unsigned long scratch;
 	long i;
 
+	if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) {
+		printk(KERN_INFO "PCI-GART: No AMD northbridge found.\n");
+		return -1;
+	}
+
 #ifndef CONFIG_AGP_AMD64
 	no_agp = 1; 
 #else
@@ -637,14 +610,6 @@ static int __init pci_iommu_init(void)
 		return -1;
 	}
 
-	i = 0;
-	for_all_nb(dev)
-		i++;
-	if (i > MAX_NB) {
-		printk(KERN_ERR "PCI-GART: Too many northbridges (%ld). Disabled\n", i);
-		return -1;
-	}
-
 	printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n");
 	aper_size = info.aper_size * 1024 * 1024;	
 	iommu_size = check_iommu_size(info.aper_base, aper_size); 
@@ -707,20 +672,8 @@ static int __init pci_iommu_init(void)
 	for (i = EMERGENCY_PAGES; i < iommu_pages; i++) 
 		iommu_gatt_base[i] = gart_unmapped_entry;
 
-	for_all_nb(dev) {
-		u32 flag; 
-		int cpu = PCI_SLOT(dev->devfn) - 24;
-		if (cpu >= MAX_NB)
-			continue;
-		northbridges[cpu] = dev;
-		pci_read_config_dword(dev, 0x9c, &flag); /* cache flush word */
-		northbridge_flush_word[cpu] = flag; 
-	}
-		     
-	flush_gart(NULL);
-
+	flush_gart();
 	dma_ops = &gart_dma_ops;
-
 	return 0;
 } 
 
diff --git a/arch/x86_64/pci/k8-bus.c b/arch/x86_64/pci/k8-bus.c
index 3acf60ded2a0..b50a7c7c47f8 100644
--- a/arch/x86_64/pci/k8-bus.c
+++ b/arch/x86_64/pci/k8-bus.c
@@ -2,6 +2,7 @@
 #include <linux/pci.h>
 #include <asm/mpspec.h>
 #include <linux/cpumask.h>
+#include <asm/k8.h>
 
 /*
  * This discovers the pcibus <-> node mapping on AMD K8.
@@ -18,7 +19,6 @@
 #define NR_LDT_BUS_NUMBER_REGISTERS 3
 #define SECONDARY_LDT_BUS_NUMBER(dword) ((dword >> 8) & 0xFF)
 #define SUBORDINATE_LDT_BUS_NUMBER(dword) ((dword >> 16) & 0xFF)
-#define PCI_DEVICE_ID_K8HTCONFIG 0x1100
 
 /**
  * fill_mp_bus_to_cpumask()
@@ -28,8 +28,7 @@
 __init static int
 fill_mp_bus_to_cpumask(void)
 {
-	struct pci_dev *nb_dev = NULL;
-	int i, j;
+	int i, j, k;
 	u32 ldtbus, nid;
 	static int lbnr[3] = {
 		LDT_BUS_NUMBER_REGISTER_0,
@@ -37,8 +36,9 @@ fill_mp_bus_to_cpumask(void)
 		LDT_BUS_NUMBER_REGISTER_2
 	};
 
-	while ((nb_dev = pci_get_device(PCI_VENDOR_ID_AMD,
-			PCI_DEVICE_ID_K8HTCONFIG, nb_dev))) {
+	cache_k8_northbridges();
+	for (k = 0; k < num_k8_northbridges; k++) {
+		struct pci_dev *nb_dev = k8_northbridges[k];
 		pci_read_config_dword(nb_dev, NODE_ID_REGISTER, &nid);
 
 		for (i = 0; i < NR_LDT_BUS_NUMBER_REGISTERS; i++) {
diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c
index ac3c33a2e37d..229d015757f9 100644
--- a/drivers/char/agp/amd64-agp.c
+++ b/drivers/char/agp/amd64-agp.c
@@ -15,11 +15,9 @@
 #include <linux/agp_backend.h>
 #include <linux/mmzone.h>
 #include <asm/page.h>		/* PAGE_SIZE */
+#include <asm/k8.h>
 #include "agp.h"
 
-/* Will need to be increased if AMD64 ever goes >8-way. */
-#define MAX_HAMMER_GARTS   8
-
 /* PTE bits. */
 #define GPTE_VALID	1
 #define GPTE_COHERENT	2
@@ -53,28 +51,12 @@
 #define ULI_X86_64_HTT_FEA_REG		0x50
 #define ULI_X86_64_ENU_SCR_REG		0x54
 
-static int nr_garts;
-static struct pci_dev * hammers[MAX_HAMMER_GARTS];
-
 static struct resource *aperture_resource;
 static int __initdata agp_try_unsupported = 1;
 
-#define for_each_nb() for(gart_iterator=0;gart_iterator<nr_garts;gart_iterator++)
-
-static void flush_amd64_tlb(struct pci_dev *dev)
-{
-	u32 tmp;
-
-	pci_read_config_dword (dev, AMD64_GARTCACHECTL, &tmp);
-	tmp |= INVGART;
-	pci_write_config_dword (dev, AMD64_GARTCACHECTL, tmp);
-}
-
 static void amd64_tlbflush(struct agp_memory *temp)
 {
-	int gart_iterator;
-	for_each_nb()
-		flush_amd64_tlb(hammers[gart_iterator]);
+	k8_flush_garts();
 }
 
 static int amd64_insert_memory(struct agp_memory *mem, off_t pg_start, int type)
@@ -153,7 +135,7 @@ static int amd64_fetch_size(void)
 	u32 temp;
 	struct aper_size_info_32 *values;
 
-	dev = hammers[0];
+	dev = k8_northbridges[0];
 	if (dev==NULL)
 		return 0;
 
@@ -201,9 +183,6 @@ static u64 amd64_configure (struct pci_dev *hammer, u64 gatt_table)
 	tmp &= ~(DISGARTCPU | DISGARTIO);
 	pci_write_config_dword(hammer, AMD64_GARTAPERTURECTL, tmp);
 
-	/* keep CPU's coherent. */
-	flush_amd64_tlb (hammer);
-
 	return aper_base;
 }
 
@@ -222,13 +201,14 @@ static struct aper_size_info_32 amd_8151_sizes[7] =
 static int amd_8151_configure(void)
 {
 	unsigned long gatt_bus = virt_to_gart(agp_bridge->gatt_table_real);
-	int gart_iterator;
+	int i;
 
 	/* Configure AGP regs in each x86-64 host bridge. */
-	for_each_nb() {
+        for (i = 0; i < num_k8_northbridges; i++) {
 		agp_bridge->gart_bus_addr =
-				amd64_configure(hammers[gart_iterator],gatt_bus);
+				amd64_configure(k8_northbridges[i], gatt_bus);
 	}
+	k8_flush_garts();
 	return 0;
 }
 
@@ -236,12 +216,13 @@ static int amd_8151_configure(void)
 static void amd64_cleanup(void)
 {
 	u32 tmp;
-	int gart_iterator;
-	for_each_nb() {
+	int i;
+        for (i = 0; i < num_k8_northbridges; i++) {
+		struct pci_dev *dev = k8_northbridges[i];
 		/* disable gart translation */
-		pci_read_config_dword (hammers[gart_iterator], AMD64_GARTAPERTURECTL, &tmp);
+		pci_read_config_dword (dev, AMD64_GARTAPERTURECTL, &tmp);
 		tmp &= ~AMD64_GARTEN;
-		pci_write_config_dword (hammers[gart_iterator], AMD64_GARTAPERTURECTL, tmp);
+		pci_write_config_dword (dev, AMD64_GARTAPERTURECTL, tmp);
 	}
 }
 
@@ -361,17 +342,15 @@ static __devinit int fix_northbridge(struct pci_dev *nb, struct pci_dev *agp,
 
 static __devinit int cache_nbs (struct pci_dev *pdev, u32 cap_ptr)
 {
-	struct pci_dev *loop_dev = NULL;
-	int i = 0;
-
-	/* cache pci_devs of northbridges. */
-	while ((loop_dev = pci_get_device(PCI_VENDOR_ID_AMD, 0x1103, loop_dev))
-			!= NULL) {
-		if (i == MAX_HAMMER_GARTS) {
-			printk(KERN_ERR PFX "Too many northbridges for AGP\n");
-			return -1;
-		}
-		if (fix_northbridge(loop_dev, pdev, cap_ptr) < 0) {
+	int i;
+
+	if (cache_k8_northbridges() < 0)
+		return -ENODEV;
+
+	i = 0;
+	for (i = 0; i < num_k8_northbridges; i++) {
+		struct pci_dev *dev = k8_northbridges[i];
+		if (fix_northbridge(dev, pdev, cap_ptr) < 0) {
 			printk(KERN_ERR PFX "No usable aperture found.\n");
 #ifdef __x86_64__
 			/* should port this to i386 */
@@ -379,10 +358,8 @@ static __devinit int cache_nbs (struct pci_dev *pdev, u32 cap_ptr)
 #endif
 			return -1;
 		}
-		hammers[i++] = loop_dev;
 	}
-		nr_garts = i;
-	return i == 0 ? -1 : 0;
+	return 0;
 }
 
 /* Handle AMD 8151 quirks */
@@ -450,7 +427,7 @@ static int __devinit uli_agp_init(struct pci_dev *pdev)
 	}
 
 	/* shadow x86-64 registers into ULi registers */
-	pci_read_config_dword (hammers[0], AMD64_GARTAPERTUREBASE, &httfea);
+	pci_read_config_dword (k8_northbridges[0], AMD64_GARTAPERTUREBASE, &httfea);
 
 	/* if x86-64 aperture base is beyond 4G, exit here */
 	if ((httfea & 0x7fff) >> (32 - 25))
@@ -513,7 +490,7 @@ static int __devinit nforce3_agp_init(struct pci_dev *pdev)
 	pci_write_config_dword(dev1, NVIDIA_X86_64_1_APSIZE, tmp);
 
 	/* shadow x86-64 registers into NVIDIA registers */
-	pci_read_config_dword (hammers[0], AMD64_GARTAPERTUREBASE, &apbase);
+	pci_read_config_dword (k8_northbridges[0], AMD64_GARTAPERTUREBASE, &apbase);
 
 	/* if x86-64 aperture base is beyond 4G, exit here */
 	if ( (apbase & 0x7fff) >> (32 - 25) ) {
@@ -754,10 +731,6 @@ static struct pci_driver agp_amd64_pci_driver = {
 int __init agp_amd64_init(void)
 {
 	int err = 0;
-	static struct pci_device_id amd64nb[] = {
-		{ PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) },
-		{ },
-	};
 
 	if (agp_off)
 		return -EINVAL;
@@ -774,7 +747,7 @@ int __init agp_amd64_init(void)
 		}
 
 		/* First check that we have at least one AMD64 NB */
-		if (!pci_dev_present(amd64nb))
+		if (!pci_dev_present(k8_nb_ids))
 			return -ENODEV;
 
 		/* Look for any AGP bridge */
diff --git a/include/asm-i386/k8.h b/include/asm-i386/k8.h
new file mode 100644
index 000000000000..dfd88a6e6040
--- /dev/null
+++ b/include/asm-i386/k8.h
@@ -0,0 +1 @@
+#include <asm-x86_64/k8.h>
diff --git a/include/asm-x86_64/k8.h b/include/asm-x86_64/k8.h
new file mode 100644
index 000000000000..699dd6961eda
--- /dev/null
+++ b/include/asm-x86_64/k8.h
@@ -0,0 +1,14 @@
+#ifndef _ASM_K8_H
+#define _ASM_K8_H 1
+
+#include <linux/pci.h>
+
+extern struct pci_device_id k8_nb_ids[];
+
+extern int early_is_k8_nb(u32 value);
+extern struct pci_dev **k8_northbridges;
+extern int num_k8_northbridges;
+extern int cache_k8_northbridges(void);
+extern void k8_flush_garts(void);
+
+#endif
-- 
cgit v1.2.3


From bebfa1013eee1d91b3242e5801cc8fbdfaf148ec Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Mon, 26 Jun 2006 13:56:52 +0200
Subject: [PATCH] x86_64: Add compat_printk and sysctl to turn off compat layer
 warnings

Sometimes e.g. with crashme the compat layer warnings can be noisy.
Add a way to turn them off by gating all output through compat_printk
that checks a global sysctl. The default is not changed.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/ia32/sys_ia32.c |  4 ++--
 fs/compat.c                 | 16 +++++++++++++++-
 include/linux/compat.h      |  2 ++
 include/linux/sysctl.h      |  2 ++
 kernel/sysctl.c             | 11 +++++++++++
 5 files changed, 32 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/arch/x86_64/ia32/sys_ia32.c b/arch/x86_64/ia32/sys_ia32.c
index f182b20858e2..ee30557629dc 100644
--- a/arch/x86_64/ia32/sys_ia32.c
+++ b/arch/x86_64/ia32/sys_ia32.c
@@ -514,7 +514,7 @@ int sys32_ni_syscall(int call)
 	static char lastcomm[sizeof(me->comm)];
 
 	if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) {
-		printk(KERN_INFO "IA32 syscall %d from %s not implemented\n",
+		compat_printk(KERN_INFO "IA32 syscall %d from %s not implemented\n",
 		       call, me->comm);
 		strncpy(lastcomm, me->comm, sizeof(lastcomm));
 	} 
@@ -916,7 +916,7 @@ long sys32_vm86_warning(void)
 	struct task_struct *me = current;
 	static char lastcomm[sizeof(me->comm)];
 	if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) {
-		printk(KERN_INFO "%s: vm86 mode not supported on 64 bit kernel\n",
+		compat_printk(KERN_INFO "%s: vm86 mode not supported on 64 bit kernel\n",
 		       me->comm);
 		strncpy(lastcomm, me->comm, sizeof(lastcomm));
 	} 
diff --git a/fs/compat.c b/fs/compat.c
index 7e7e5bc4f3cf..e31e9cf96647 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -55,6 +55,20 @@
 
 extern void sigset_from_compat(sigset_t *set, compat_sigset_t *compat);
 
+int compat_log = 1;
+
+int compat_printk(const char *fmt, ...)
+{
+	va_list ap;
+	int ret;
+	if (!compat_log)
+		return 0;
+	va_start(ap, fmt);
+	ret = vprintk(fmt, ap);
+	va_end(ap);
+	return ret;
+}
+
 /*
  * Not all architectures have sys_utime, so implement this in terms
  * of sys_utimes.
@@ -359,7 +373,7 @@ static void compat_ioctl_error(struct file *filp, unsigned int fd,
 	sprintf(buf,"'%c'", (cmd>>24) & 0x3f);
 	if (!isprint(buf[1]))
 		sprintf(buf, "%02x", buf[1]);
-	printk("ioctl32(%s:%d): Unknown cmd fd(%d) "
+	compat_printk("ioctl32(%s:%d): Unknown cmd fd(%d) "
 			"cmd(%08x){%s} arg(%08x) on %s\n",
 			current->comm, current->pid,
 			(int)fd, (unsigned int)cmd, buf,
diff --git a/include/linux/compat.h b/include/linux/compat.h
index dda1697ec753..9760753e662b 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -226,5 +226,7 @@ static inline int compat_timespec_compare(struct compat_timespec *lhs,
 
 asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp);
 
+extern int compat_printk(const char *fmt, ...);
+
 #endif /* CONFIG_COMPAT */
 #endif /* _LINUX_COMPAT_H */
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 6a60770984e9..349ef908a222 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -148,9 +148,11 @@ enum
 	KERN_SPIN_RETRY=70,	/* int: number of spinlock retries */
 	KERN_ACPI_VIDEO_FLAGS=71, /* int: flags for setting up video after ACPI sleep */
 	KERN_IA64_UNALIGNED=72, /* int: ia64 unaligned userland trap enable */
+	KERN_COMPAT_LOG=73,	/* int: print compat layer  messages */
 };
 
 
+
 /* CTL_VM names: */
 enum
 {
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 2c0e65819448..f1a4eb1a655e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -73,6 +73,7 @@ extern int printk_ratelimit_burst;
 extern int pid_max_min, pid_max_max;
 extern int sysctl_drop_caches;
 extern int percpu_pagelist_fraction;
+extern int compat_log;
 
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
 int unknown_nmi_panic;
@@ -676,6 +677,16 @@ static ctl_table kern_table[] = {
 	 	.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+#endif
+#ifdef CONFIG_COMPAT
+	{
+		.ctl_name	= KERN_COMPAT_LOG,
+		.procname	= "compat-log",
+		.data		= &compat_log,
+		.maxlen		= sizeof (int),
+	 	.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 #endif
 	{ .ctl_name = 0 }
 };
-- 
cgit v1.2.3


From 3e4ff115740c28dea463561aa1405a3c0de0d2d0 Mon Sep 17 00:00:00 2001
From: Don Zickus <dzickus@redhat.com>
Date: Mon, 26 Jun 2006 13:57:01 +0200
Subject: [PATCH] x86_64: nmi watchdog header cleanup

Misc header cleanup for nmi watchdog.

Signed-off-by: Don Zickus <dzickus@redhat.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/apic.c              |  1 +
 arch/i386/kernel/io_apic.c           |  1 +
 arch/i386/kernel/nmi.c               |  7 +------
 arch/i386/kernel/smpboot.c           |  1 +
 arch/i386/oprofile/op_model_athlon.c |  1 +
 arch/i386/oprofile/op_model_p4.c     |  1 +
 arch/i386/oprofile/op_model_ppro.c   |  1 +
 arch/x86_64/kernel/io_apic.c         |  1 +
 arch/x86_64/kernel/nmi.c             |  8 --------
 include/asm-i386/apic.h              | 12 ------------
 include/asm-i386/nmi.h               | 28 +++++++++++++++++++++-------
 include/asm-x86_64/apic.h            | 16 ----------------
 include/asm-x86_64/nmi.h             | 30 ++++++++++++++++++++++++------
 13 files changed, 53 insertions(+), 55 deletions(-)

(limited to 'include')

diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c
index 5ab59c12335b..5f197e197c55 100644
--- a/arch/i386/kernel/apic.c
+++ b/arch/i386/kernel/apic.c
@@ -36,6 +36,7 @@
 #include <asm/arch_hooks.h>
 #include <asm/hpet.h>
 #include <asm/i8253.h>
+#include <asm/nmi.h>
 
 #include <mach_apic.h>
 #include <mach_apicdef.h>
diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index 5a1a5412015c..61317f4be444 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -38,6 +38,7 @@
 #include <asm/desc.h>
 #include <asm/timer.h>
 #include <asm/i8259.h>
+#include <asm/nmi.h>
 
 #include <mach_apic.h>
 
diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index d43b498ec745..bd3875419630 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -14,20 +14,15 @@
  */
 
 #include <linux/config.h>
-#include <linux/mm.h>
 #include <linux/delay.h>
-#include <linux/bootmem.h>
-#include <linux/smp_lock.h>
 #include <linux/interrupt.h>
-#include <linux/mc146818rtc.h>
-#include <linux/kernel_stat.h>
 #include <linux/module.h>
 #include <linux/nmi.h>
 #include <linux/sysdev.h>
 #include <linux/sysctl.h>
+#include <linux/percpu.h>
 
 #include <asm/smp.h>
-#include <asm/div64.h>
 #include <asm/nmi.h>
 
 #include "mach_traps.h"
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index bd0ca5c9f053..bce5470ecb42 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -52,6 +52,7 @@
 #include <asm/tlbflush.h>
 #include <asm/desc.h>
 #include <asm/arch_hooks.h>
+#include <asm/nmi.h>
 
 #include <mach_apic.h>
 #include <mach_wakecpu.h>
diff --git a/arch/i386/oprofile/op_model_athlon.c b/arch/i386/oprofile/op_model_athlon.c
index 3ad9a72a5036..693bdea4a52b 100644
--- a/arch/i386/oprofile/op_model_athlon.c
+++ b/arch/i386/oprofile/op_model_athlon.c
@@ -13,6 +13,7 @@
 #include <linux/oprofile.h>
 #include <asm/ptrace.h>
 #include <asm/msr.h>
+#include <asm/nmi.h>
  
 #include "op_x86_model.h"
 #include "op_counter.h"
diff --git a/arch/i386/oprofile/op_model_p4.c b/arch/i386/oprofile/op_model_p4.c
index ac8a066035c2..7c61d357b82b 100644
--- a/arch/i386/oprofile/op_model_p4.c
+++ b/arch/i386/oprofile/op_model_p4.c
@@ -14,6 +14,7 @@
 #include <asm/ptrace.h>
 #include <asm/fixmap.h>
 #include <asm/apic.h>
+#include <asm/nmi.h>
 
 #include "op_x86_model.h"
 #include "op_counter.h"
diff --git a/arch/i386/oprofile/op_model_ppro.c b/arch/i386/oprofile/op_model_ppro.c
index d719015fc044..5c3ab4b027ad 100644
--- a/arch/i386/oprofile/op_model_ppro.c
+++ b/arch/i386/oprofile/op_model_ppro.c
@@ -14,6 +14,7 @@
 #include <asm/ptrace.h>
 #include <asm/msr.h>
 #include <asm/apic.h>
+#include <asm/nmi.h>
  
 #include "op_x86_model.h"
 #include "op_counter.h"
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index 15b2f6932de1..38a3ff30bde1 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -41,6 +41,7 @@
 #include <asm/mach_apic.h>
 #include <asm/acpi.h>
 #include <asm/dma.h>
+#include <asm/nmi.h>
 
 #define __apicdebuginit  __init
 
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index 4e6357fe0ec3..ab421e22fa67 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -15,11 +15,7 @@
 #include <linux/config.h>
 #include <linux/mm.h>
 #include <linux/delay.h>
-#include <linux/bootmem.h>
-#include <linux/smp_lock.h>
 #include <linux/interrupt.h>
-#include <linux/mc146818rtc.h>
-#include <linux/kernel_stat.h>
 #include <linux/module.h>
 #include <linux/sysdev.h>
 #include <linux/nmi.h>
@@ -27,13 +23,9 @@
 #include <linux/kprobes.h>
 
 #include <asm/smp.h>
-#include <asm/mtrr.h>
-#include <asm/mpspec.h>
 #include <asm/nmi.h>
-#include <asm/msr.h>
 #include <asm/proto.h>
 #include <asm/kdebug.h>
-#include <asm/local.h>
 #include <asm/mce.h>
 
 /*
diff --git a/include/asm-i386/apic.h b/include/asm-i386/apic.h
index 1d8362cb2c5d..2c1e371cebb6 100644
--- a/include/asm-i386/apic.h
+++ b/include/asm-i386/apic.h
@@ -111,24 +111,12 @@ extern void init_apic_mappings (void);
 extern void smp_local_timer_interrupt (struct pt_regs * regs);
 extern void setup_boot_APIC_clock (void);
 extern void setup_secondary_APIC_clock (void);
-extern void setup_apic_nmi_watchdog (void);
-extern int reserve_lapic_nmi(void);
-extern void release_lapic_nmi(void);
-extern void disable_timer_nmi_watchdog(void);
-extern void enable_timer_nmi_watchdog(void);
-extern void nmi_watchdog_tick (struct pt_regs * regs);
 extern int APIC_init_uniprocessor (void);
 extern void disable_APIC_timer(void);
 extern void enable_APIC_timer(void);
 
 extern void enable_NMI_through_LVT0 (void * dummy);
 
-extern unsigned int nmi_watchdog;
-#define NMI_NONE	0
-#define NMI_IO_APIC	1
-#define NMI_LOCAL_APIC	2
-#define NMI_INVALID	3
-
 extern int disable_timer_pin_1;
 
 void smp_send_timer_broadcast_ipi(struct pt_regs *regs);
diff --git a/include/asm-i386/nmi.h b/include/asm-i386/nmi.h
index 21f16638fc61..67d994799999 100644
--- a/include/asm-i386/nmi.h
+++ b/include/asm-i386/nmi.h
@@ -5,24 +5,38 @@
 #define ASM_NMI_H
 
 #include <linux/pm.h>
- 
+
 struct pt_regs;
- 
+
 typedef int (*nmi_callback_t)(struct pt_regs * regs, int cpu);
- 
-/** 
+
+/**
  * set_nmi_callback
  *
  * Set a handler for an NMI. Only one handler may be
  * set. Return 1 if the NMI was handled.
  */
 void set_nmi_callback(nmi_callback_t callback);
- 
-/** 
+
+/**
  * unset_nmi_callback
  *
  * Remove the handler previously set.
  */
 void unset_nmi_callback(void);
- 
+
+extern void setup_apic_nmi_watchdog (void);
+extern int reserve_lapic_nmi(void);
+extern void release_lapic_nmi(void);
+extern void disable_timer_nmi_watchdog(void);
+extern void enable_timer_nmi_watchdog(void);
+extern void nmi_watchdog_tick (struct pt_regs * regs);
+
+extern unsigned int nmi_watchdog;
+#define NMI_DEFAULT     -1
+#define NMI_NONE	0
+#define NMI_IO_APIC	1
+#define NMI_LOCAL_APIC	2
+#define NMI_INVALID	3
+
 #endif /* ASM_NMI_H */
diff --git a/include/asm-x86_64/apic.h b/include/asm-x86_64/apic.h
index a731be2204d2..c9e6c2501a48 100644
--- a/include/asm-x86_64/apic.h
+++ b/include/asm-x86_64/apic.h
@@ -79,27 +79,11 @@ extern void init_apic_mappings (void);
 extern void smp_local_timer_interrupt (struct pt_regs * regs);
 extern void setup_boot_APIC_clock (void);
 extern void setup_secondary_APIC_clock (void);
-extern void setup_apic_nmi_watchdog (void);
-extern int reserve_lapic_nmi(void);
-extern void release_lapic_nmi(void);
-extern void disable_timer_nmi_watchdog(void);
-extern void enable_timer_nmi_watchdog(void);
-extern void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason);
 extern int APIC_init_uniprocessor (void);
 extern void disable_APIC_timer(void);
 extern void enable_APIC_timer(void);
 extern void clustered_apic_check(void);
 
-extern void nmi_watchdog_default(void);
-extern int setup_nmi_watchdog(char *);
-
-extern unsigned int nmi_watchdog;
-#define NMI_DEFAULT	-1
-#define NMI_NONE	0
-#define NMI_IO_APIC	1
-#define NMI_LOCAL_APIC	2
-#define NMI_INVALID	3
-
 extern int disable_timer_pin_1;
 
 extern void setup_threshold_lvt(unsigned long lvt_off);
diff --git a/include/asm-x86_64/nmi.h b/include/asm-x86_64/nmi.h
index d3abfc6a8fd5..efb45c894d76 100644
--- a/include/asm-x86_64/nmi.h
+++ b/include/asm-x86_64/nmi.h
@@ -5,26 +5,27 @@
 #define ASM_NMI_H
 
 #include <linux/pm.h>
+#include <asm/io.h>
  
 struct pt_regs;
- 
+
 typedef int (*nmi_callback_t)(struct pt_regs * regs, int cpu);
- 
-/** 
+
+/**
  * set_nmi_callback
  *
  * Set a handler for an NMI. Only one handler may be
  * set. Return 1 if the NMI was handled.
  */
 void set_nmi_callback(nmi_callback_t callback);
- 
-/** 
+
+/**
  * unset_nmi_callback
  *
  * Remove the handler previously set.
  */
 void unset_nmi_callback(void);
- 
+
 #ifdef CONFIG_PM
  
 /** Replace the PM callback routine for NMI. */
@@ -56,4 +57,21 @@ extern int unknown_nmi_panic;
 
 extern int check_nmi_watchdog(void);
  
+extern void setup_apic_nmi_watchdog (void);
+extern int reserve_lapic_nmi(void);
+extern void release_lapic_nmi(void);
+extern void disable_timer_nmi_watchdog(void);
+extern void enable_timer_nmi_watchdog(void);
+extern void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason);
+
+extern void nmi_watchdog_default(void);
+extern int setup_nmi_watchdog(char *);
+
+extern unsigned int nmi_watchdog;
+#define NMI_DEFAULT	-1
+#define NMI_NONE	0
+#define NMI_IO_APIC	1
+#define NMI_LOCAL_APIC	2
+#define NMI_INVALID	3
+
 #endif /* ASM_NMI_H */
-- 
cgit v1.2.3


From 5c0f80fab3724aa44b3352d88155fe0eaae0e54a Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Mon, 26 Jun 2006 13:57:04 +0200
Subject: [PATCH] x86_64: Remove long obsolete CVS

Early development of x86-64 Linux was in CVS, but that hasn't been
the case for a long time now. Remove the obsolete $Id$s.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/ia32/fpu32.c       | 1 -
 arch/x86_64/ia32/ia32_signal.c | 2 --
 arch/x86_64/ia32/ptrace32.c    | 2 --
 arch/x86_64/ia32/sys_ia32.c    | 8 --------
 arch/x86_64/kernel/aperture.c  | 1 -
 arch/x86_64/kernel/e820.c      | 1 -
 arch/x86_64/kernel/head64.c    | 2 --
 arch/x86_64/kernel/process.c   | 1 -
 arch/x86_64/kernel/setup.c     | 2 --
 arch/x86_64/kernel/setup64.c   | 1 -
 arch/x86_64/kernel/signal.c    | 2 --
 arch/x86_64/kernel/traps.c     | 2 --
 include/asm-x86_64/dma.h       | 2 +-
 include/asm-x86_64/hw_irq.h    | 2 --
 14 files changed, 1 insertion(+), 28 deletions(-)

(limited to 'include')

diff --git a/arch/x86_64/ia32/fpu32.c b/arch/x86_64/ia32/fpu32.c
index 1c23095f1813..2c8209a3605a 100644
--- a/arch/x86_64/ia32/fpu32.c
+++ b/arch/x86_64/ia32/fpu32.c
@@ -2,7 +2,6 @@
  * Copyright 2002 Andi Kleen, SuSE Labs.
  * FXSAVE<->i387 conversion support. Based on code by Gareth Hughes.
  * This is used for ptrace, signals and coredumps in 32bit emulation.
- * $Id: fpu32.c,v 1.1 2002/03/21 14:16:32 ak Exp $
  */ 
 
 #include <linux/sched.h>
diff --git a/arch/x86_64/ia32/ia32_signal.c b/arch/x86_64/ia32/ia32_signal.c
index e0a92439f634..25e5ca22204c 100644
--- a/arch/x86_64/ia32/ia32_signal.c
+++ b/arch/x86_64/ia32/ia32_signal.c
@@ -6,8 +6,6 @@
  *  1997-11-28  Modified for POSIX.1b signals by Richard Henderson
  *  2000-06-20  Pentium III FXSR, SSE support by Gareth Hughes
  *  2000-12-*   x86-64 compatibility mode signal handling by Andi Kleen
- * 
- *  $Id: ia32_signal.c,v 1.22 2002/07/29 10:34:03 ak Exp $
  */
 
 #include <linux/sched.h>
diff --git a/arch/x86_64/ia32/ptrace32.c b/arch/x86_64/ia32/ptrace32.c
index 4defe63f5916..a590b7a0d92d 100644
--- a/arch/x86_64/ia32/ptrace32.c
+++ b/arch/x86_64/ia32/ptrace32.c
@@ -7,8 +7,6 @@
  * 
  * This allows to access 64bit processes too; but there is no way to see the extended 
  * register contents.
- *
- * $Id: ptrace32.c,v 1.16 2003/03/14 16:06:35 ak Exp $
  */ 
 
 #include <linux/kernel.h>
diff --git a/arch/x86_64/ia32/sys_ia32.c b/arch/x86_64/ia32/sys_ia32.c
index ee30557629dc..afbebde9326d 100644
--- a/arch/x86_64/ia32/sys_ia32.c
+++ b/arch/x86_64/ia32/sys_ia32.c
@@ -929,13 +929,5 @@ long sys32_lookup_dcookie(u32 addr_low, u32 addr_high,
 	return sys_lookup_dcookie(((u64)addr_high << 32) | addr_low, buf, len);
 }
 
-static int __init ia32_init (void)
-{
-	printk("IA32 emulation $Id: sys_ia32.c,v 1.32 2002/03/24 13:02:28 ak Exp $\n");  
-	return 0;
-}
-
-__initcall(ia32_init);
-
 extern unsigned long ia32_sys_call_table[];
 EXPORT_SYMBOL(ia32_sys_call_table);
diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c
index a7ad03ee98cf..9ea723bab16c 100644
--- a/arch/x86_64/kernel/aperture.c
+++ b/arch/x86_64/kernel/aperture.c
@@ -8,7 +8,6 @@
  * because only the bootmem allocator can allocate 32+MB. 
  * 
  * Copyright 2002 Andi Kleen, SuSE Labs.
- * $Id: aperture.c,v 1.7 2003/08/01 03:36:18 ak Exp $
  */
 #include <linux/config.h>
 #include <linux/kernel.h>
diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c
index 1ef6028f721e..d8ea250f3925 100644
--- a/arch/x86_64/kernel/e820.c
+++ b/arch/x86_64/kernel/e820.c
@@ -1,7 +1,6 @@
 /* 
  * Handle the memory map.
  * The functions here do the job until bootmem takes over.
- * $Id: e820.c,v 1.4 2002/09/19 19:25:32 ak Exp $
  *
  *  Getting sanitize_e820_map() in sync with i386 version by applying change:
  *  -  Provisions for empty E820 memory regions (reported by certain BIOSes).
diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c
index cea20a66c150..e6a71c9556d9 100644
--- a/arch/x86_64/kernel/head64.c
+++ b/arch/x86_64/kernel/head64.c
@@ -2,8 +2,6 @@
  *  linux/arch/x86_64/kernel/head64.c -- prepare to run common code
  *
  *  Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
- *
- *  $Id: head64.c,v 1.22 2001/07/06 14:28:20 ak Exp $
  */
 
 #include <linux/init.h>
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index fb903e65e079..52c03f6c7731 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -10,7 +10,6 @@
  *	Andi Kleen.
  *
  *	CPU hotplug support - ashok.raj@intel.com
- *  $Id: process.c,v 1.38 2002/01/15 10:08:03 ak Exp $
  */
 
 /*
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 64640c8f5eed..a91f1d94f9e5 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -5,8 +5,6 @@
  *
  *  Nov 2001 Dave Jones <davej@suse.de>
  *  Forked from i386 setup code.
- *
- *  $Id$
  */
 
 /*
diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c
index 8a691fa6d393..e5bf22a01edb 100644
--- a/arch/x86_64/kernel/setup64.c
+++ b/arch/x86_64/kernel/setup64.c
@@ -3,7 +3,6 @@
  * Copyright (C) 1995  Linus Torvalds
  * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen.
  * See setup.c for older changelog.
- * $Id: setup64.c,v 1.12 2002/03/21 10:09:17 ak Exp $
  */ 
 #include <linux/config.h>
 #include <linux/init.h>
diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c
index e5f5ce7909a3..94b4b1128ac8 100644
--- a/arch/x86_64/kernel/signal.c
+++ b/arch/x86_64/kernel/signal.c
@@ -7,8 +7,6 @@
  *  1997-11-28  Modified for POSIX.1b signals by Richard Henderson
  *  2000-06-20  Pentium III FXSR, SSE support by Gareth Hughes
  *  2000-2002   x86-64 support by Andi Kleen
- * 
- *  $Id: signal.c,v 1.18 2001/10/17 22:30:37 ak Exp $
  */
 
 #include <linux/sched.h>
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index cea335e8746c..9f8f1eff4a6c 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -6,8 +6,6 @@
  *
  *  Pentium III FXSR, SSE support
  *	Gareth Hughes <gareth@valinux.com>, May 2000
- *
- *  $Id: traps.c,v 1.36 2002/03/24 11:09:10 ak Exp $
  */
 
 /*
diff --git a/include/asm-x86_64/dma.h b/include/asm-x86_64/dma.h
index c556208d3dd7..a37c16f06289 100644
--- a/include/asm-x86_64/dma.h
+++ b/include/asm-x86_64/dma.h
@@ -1,4 +1,4 @@
-/* $Id: dma.h,v 1.1.1.1 2001/04/19 20:00:38 ak Exp $
+/*
  * linux/include/asm/dma.h: Defines for using and allocating dma channels.
  * Written by Hennus Bergman, 1992.
  * High DMA channel support & info by Hannu Savolainen
diff --git a/include/asm-x86_64/hw_irq.h b/include/asm-x86_64/hw_irq.h
index 3de96fd86a70..1b2ac55d3204 100644
--- a/include/asm-x86_64/hw_irq.h
+++ b/include/asm-x86_64/hw_irq.h
@@ -12,8 +12,6 @@
  *	<tomsoft@informatik.tu-chemnitz.de>
  *
  *	hacked by Andi Kleen for x86-64.
- * 
- *  $Id: hw_irq.h,v 1.24 2001/09/14 20:55:03 vojtech Exp $
  */
 
 #ifndef __ASSEMBLY__
-- 
cgit v1.2.3


From 08cd36570e47176c7b6bd3e80125aa46c4638097 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Mon, 26 Jun 2006 13:57:10 +0200
Subject: [PATCH] x86_64: Optimize bitmap_weight for small bitmaps

Use inline code bitmaps <= BITS_PER_LONG in bitmap_weight. This
gives _much_ better code.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/bitmap.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index d9ed27969855..dcc5de7cc487 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -24,6 +24,9 @@
  * The available bitmap operations and their rough meaning in the
  * case that the bitmap is a single unsigned long are thus:
  *
+ * Note that nbits should be always a compile time evaluable constant.
+ * Otherwise many inlines will generate horrible code.
+ *
  * bitmap_zero(dst, nbits)			*dst = 0UL
  * bitmap_fill(dst, nbits)			*dst = ~0UL
  * bitmap_copy(dst, src, nbits)			*dst = *src
@@ -244,6 +247,8 @@ static inline int bitmap_full(const unsigned long *src, int nbits)
 
 static inline int bitmap_weight(const unsigned long *src, int nbits)
 {
+	if (nbits <= BITS_PER_LONG)
+		return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits));
 	return __bitmap_weight(src, nbits);
 }
 
-- 
cgit v1.2.3


From 357c2b9056df447390b7df3e49960a4c609a89a9 Mon Sep 17 00:00:00 2001
From: Jon Mason <jdmason@us.ibm.com>
Date: Mon, 26 Jun 2006 13:57:13 +0200
Subject: [PATCH] x86_64: remove unused gart header file

include/asm-x86_64/gart-mapping.h is only ever used in
arch/x86_64/kernel/setup.c and none of its contents are referenced.
Looks to be leftover cruft not removed in the dma_ops patch.

Signed-off-by: Jon Mason <jdmason@us.ibm.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/setup.c        |  1 -
 include/asm-x86_64/gart-mapping.h | 16 ----------------
 2 files changed, 17 deletions(-)
 delete mode 100644 include/asm-x86_64/gart-mapping.h

(limited to 'include')

diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index a91f1d94f9e5..9dadb9a1db2a 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -65,7 +65,6 @@
 #include <asm/numa.h>
 #include <asm/swiotlb.h>
 #include <asm/sections.h>
-#include <asm/gart-mapping.h>
 #include <asm/dmi.h>
 
 /*
diff --git a/include/asm-x86_64/gart-mapping.h b/include/asm-x86_64/gart-mapping.h
deleted file mode 100644
index ada497b0b55b..000000000000
--- a/include/asm-x86_64/gart-mapping.h
+++ /dev/null
@@ -1,16 +0,0 @@
-#ifndef _X8664_GART_MAPPING_H
-#define _X8664_GART_MAPPING_H 1
-
-#include <linux/types.h>
-#include <asm/types.h>
-
-struct device;
-
-extern void*
-gart_alloc_coherent(struct device *dev, size_t size,
-        dma_addr_t *dma_handle, gfp_t gfp);
-
-extern int
-gart_dma_supported(struct device *hwdev, u64 mask);
-
-#endif /* _X8664_GART_MAPPING_H */
-- 
cgit v1.2.3


From a813ce432f27c4f5011c7b5ac9d2bbbfeb41d9a7 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Mon, 26 Jun 2006 13:57:22 +0200
Subject: [PATCH] x86_64: Rename IOMMU option, fix help and mark option
 embedded.

 - Rename the GART_IOMMU option to IOMMU to make clear it's not
   just for AMD
 - Rewrite the help text to better emphatise this fact
 - Make it an embedded option because too many people get it wrong.

To my astonishment I discovered the aacraid driver tests this
symbol directly. This looks quite broken to me - it's an internal
implementation detail of the PCI DMA API. Can the maintainer
please clarify what this test was intended to do?

Cc: linux-scsi@vger.kernel.org
Cc: alan@redhat.com
Cc: markh@osdl.org
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/Kconfig             | 30 ++++++++++++++++--------------
 arch/x86_64/Kconfig.debug       |  2 +-
 arch/x86_64/kernel/Makefile     |  2 +-
 arch/x86_64/kernel/io_apic.c    |  2 +-
 arch/x86_64/kernel/pci-dma.c    |  2 +-
 arch/x86_64/kernel/setup.c      |  2 +-
 drivers/char/agp/Kconfig        |  4 ++--
 drivers/char/agp/amd64-agp.c    |  4 ++--
 drivers/scsi/aacraid/comminit.c |  5 ++++-
 include/asm-x86_64/pci.h        |  2 +-
 include/asm-x86_64/proto.h      |  2 +-
 11 files changed, 31 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index fc75275d8c72..e8c52a1eec06 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -386,24 +386,26 @@ config HPET_EMULATE_RTC
 	bool "Provide RTC interrupt"
 	depends on HPET_TIMER && RTC=y
 
-config GART_IOMMU
-	bool "K8 GART IOMMU support"
+# Mark as embedded because too many people got it wrong.
+# The code disables itself when not needed.
+config IOMMU
+	bool "IOMMU support" if EMBEDDED
 	default y
 	select SWIOTLB
 	select AGP
 	depends on PCI
 	help
-	  Support for hardware IOMMU in AMD's Opteron/Athlon64 Processors
-	  and for the bounce buffering software IOMMU.
-	  Needed to run systems with more than 3GB of memory properly with
-	  32-bit PCI devices that do not support DAC (Double Address Cycle).
-	  The IOMMU can be turned off at runtime with the iommu=off parameter.
-  	  Normally the kernel will take the right choice by itself.
-  	  This option includes a driver for the AMD Opteron/Athlon64 IOMMU
-  	  northbridge and a software emulation used on other systems without
-	  hardware IOMMU.  If unsure, say Y.
-
-# need this always selected by GART_IOMMU for the VIA workaround
+	  Support for full DMA access of devices with 32bit memory access only
+	  on systems with more than 3GB. This is usually needed for USB,
+	  sound, many IDE/SATA chipsets and some other devices.
+	  Provides a driver for the AMD Athlon64/Opteron/Turion/Sempron GART
+	  based IOMMU and a software bounce buffer based IOMMU used on Intel
+	  systems and as fallback.
+	  The code is only active when needed (enough memory and limited
+	  device) unless CONFIG_IOMMU_DEBUG or iommu=force is specified
+	  too.
+
+# need this always selected by IOMMU for the VIA workaround
 config SWIOTLB
 	bool
 
@@ -503,7 +505,7 @@ config REORDER
 
 config K8_NB
 	def_bool y
-	depends on AGP_AMD64 || GART_IOMMU || (PCI && NUMA)
+	depends on AGP_AMD64 || IOMMU || (PCI && NUMA)
 
 endmenu
 
diff --git a/arch/x86_64/Kconfig.debug b/arch/x86_64/Kconfig.debug
index ea31b4c62105..0226f08be874 100644
--- a/arch/x86_64/Kconfig.debug
+++ b/arch/x86_64/Kconfig.debug
@@ -13,7 +13,7 @@ config DEBUG_RODATA
 	 If in doubt, say "N".
 
 config IOMMU_DEBUG
-       depends on GART_IOMMU && DEBUG_KERNEL
+       depends on IOMMU && DEBUG_KERNEL
        bool "Enable IOMMU debugging"
        help
          Force the IOMMU to on even when you have less than 4GB of
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
index f927d11065fe..fd106bdddd6d 100644
--- a/arch/x86_64/kernel/Makefile
+++ b/arch/x86_64/kernel/Makefile
@@ -28,7 +28,7 @@ obj-$(CONFIG_PM)		+= suspend.o
 obj-$(CONFIG_SOFTWARE_SUSPEND)	+= suspend_asm.o
 obj-$(CONFIG_CPU_FREQ)		+= cpufreq/
 obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
-obj-$(CONFIG_GART_IOMMU)	+= pci-gart.o aperture.o
+obj-$(CONFIG_IOMMU)		+= pci-gart.o aperture.o
 obj-$(CONFIG_SWIOTLB)		+= pci-swiotlb.o
 obj-$(CONFIG_KPROBES)		+= kprobes.o
 obj-$(CONFIG_X86_PM_TIMER)	+= pmtimer.o
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index 519cd4e6f9e7..c768d8a036d0 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -319,7 +319,7 @@ void __init check_ioapic(void)
 				vendor &= 0xffff;
 				switch (vendor) { 
 				case PCI_VENDOR_ID_VIA:
-#ifdef CONFIG_GART_IOMMU
+#ifdef CONFIG_IOMMU
 					if ((end_pfn > MAX_DMA32_PFN ||
 					     force_iommu) &&
 					    !iommu_aperture_allowed) {
diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c
index 117bcebe9ebc..33926c367250 100644
--- a/arch/x86_64/kernel/pci-dma.c
+++ b/arch/x86_64/kernel/pci-dma.c
@@ -266,7 +266,7 @@ __init int iommu_setup(char *p)
 		    swiotlb = 1;
 #endif
 
-#ifdef CONFIG_GART_IOMMU
+#ifdef CONFIG_IOMMU
 	    gart_parse_options(p);
 #endif
 
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 9dadb9a1db2a..816203d57e11 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -702,7 +702,7 @@ void __init setup_arch(char **cmdline_p)
 
 	e820_setup_gap();
 
-#ifdef CONFIG_GART_IOMMU
+#ifdef CONFIG_IOMMU
 	iommu_hole_init();
 #endif
 
diff --git a/drivers/char/agp/Kconfig b/drivers/char/agp/Kconfig
index 46685a540772..9826a399fa02 100644
--- a/drivers/char/agp/Kconfig
+++ b/drivers/char/agp/Kconfig
@@ -55,9 +55,9 @@ config AGP_AMD
 	  X on AMD Irongate, 761, and 762 chipsets.
 
 config AGP_AMD64
-	tristate "AMD Opteron/Athlon64 on-CPU GART support" if !GART_IOMMU
+	tristate "AMD Opteron/Athlon64 on-CPU GART support" if !IOMMU
 	depends on AGP && X86
-	default y if GART_IOMMU
+	default y if IOMMU
 	help
 	  This option gives you AGP support for the GLX component of
 	  X using the on-CPU northbridge of the AMD Athlon64/Opteron CPUs.
diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c
index 229d015757f9..f690ee8cb732 100644
--- a/drivers/char/agp/amd64-agp.c
+++ b/drivers/char/agp/amd64-agp.c
@@ -292,7 +292,7 @@ static int __devinit aperture_valid(u64 aper, u32 size)
 /*
  * W*s centric BIOS sometimes only set up the aperture in the AGP
  * bridge, not the northbridge. On AMD64 this is handled early
- * in aperture.c, but when GART_IOMMU is not enabled or we run
+ * in aperture.c, but when IOMMU is not enabled or we run
  * on a 32bit kernel this needs to be redone.
  * Unfortunately it is impossible to fix the aperture here because it's too late
  * to allocate that much memory. But at least error out cleanly instead of
@@ -775,7 +775,7 @@ static void __exit agp_amd64_cleanup(void)
 
 /* On AMD64 the PCI driver needs to initialize this driver early
    for the IOMMU, so it has to be called via a backdoor. */
-#ifndef CONFIG_GART_IOMMU
+#ifndef CONFIG_IOMMU
 module_init(agp_amd64_init);
 module_exit(agp_amd64_cleanup);
 #endif
diff --git a/drivers/scsi/aacraid/comminit.c b/drivers/scsi/aacraid/comminit.c
index 35b0a6ebd3f5..7cea514e810a 100644
--- a/drivers/scsi/aacraid/comminit.c
+++ b/drivers/scsi/aacraid/comminit.c
@@ -104,8 +104,11 @@ static int aac_alloc_comm(struct aac_dev *dev, void **commaddr, unsigned long co
 	 * always true on real computers. It also has some slight problems
 	 * with the GART on x86-64. I've btw never tried DMA from PCI space
 	 * on this platform but don't be surprised if its problematic.
+	 * [AK: something is very very wrong when a driver tests this symbol.
+ 	 *  Someone should figure out what the comment writer really meant here and fix
+	 *  the code. Or just remove that bad code. ]
 	 */
-#ifndef CONFIG_GART_IOMMU
+#ifndef CONFIG_IOMMU
 	if ((num_physpages << (PAGE_SHIFT - 12)) <= AAC_MAX_HOSTPHYSMEMPAGES) {
 		init->HostPhysMemPages = 
 			cpu_to_le32(num_physpages << (PAGE_SHIFT-12));
diff --git a/include/asm-x86_64/pci.h b/include/asm-x86_64/pci.h
index 2db0620d5449..3374d34c4acd 100644
--- a/include/asm-x86_64/pci.h
+++ b/include/asm-x86_64/pci.h
@@ -52,7 +52,7 @@ extern int iommu_setup(char *opt);
  */
 #define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
 
-#ifdef CONFIG_GART_IOMMU
+#ifdef CONFIG_IOMMU
 
 /*
  * x86-64 always supports DAC, but sometimes it is useful to force
diff --git a/include/asm-x86_64/proto.h b/include/asm-x86_64/proto.h
index 8abf2a43c944..1064533e0959 100644
--- a/include/asm-x86_64/proto.h
+++ b/include/asm-x86_64/proto.h
@@ -116,7 +116,7 @@ extern int skip_ioapic_setup;
 extern int acpi_ht;
 extern int acpi_disabled;
 
-#ifdef CONFIG_GART_IOMMU
+#ifdef CONFIG_IOMMU
 extern int fallback_aper_order;
 extern int fallback_aper_force;
 extern int iommu_aperture;
-- 
cgit v1.2.3


From 4552d5dc08b79868829b4be8951b29b07284753f Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Mon, 26 Jun 2006 13:57:28 +0200
Subject: [PATCH] x86_64: reliable stack trace support

These are the generic bits needed to enable reliable stack traces based
on Dwarf2-like (.eh_frame) unwind information. Subsequent patches will
enable x86-64 and i386 to make use of this.

Thanks to Andi Kleen and Ingo Molnar, who pointed out several possibilities
for improvement.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/kernel.h |   7 +
 include/linux/module.h |   3 +
 include/linux/unwind.h | 119 +++++++
 init/main.c            |   2 +
 kernel/Makefile        |   1 +
 kernel/module.c        |  16 +-
 kernel/unwind.c        | 915 +++++++++++++++++++++++++++++++++++++++++++++++++
 lib/Kconfig.debug      |  12 +-
 8 files changed, 1072 insertions(+), 3 deletions(-)
 create mode 100644 include/linux/unwind.h
 create mode 100644 kernel/unwind.c

(limited to 'include')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 3c5e4c2e517d..5c1ec1f84eab 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -32,6 +32,7 @@ extern const char linux_banner[];
 
 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
 #define ALIGN(x,a) (((x)+(a)-1)&~((a)-1))
+#define FIELD_SIZEOF(t, f) (sizeof(((t*)0)->f))
 
 #define	KERN_EMERG	"<0>"	/* system is unusable			*/
 #define	KERN_ALERT	"<1>"	/* action must be taken immediately	*/
@@ -336,6 +337,12 @@ struct sysinfo {
 /* Force a compilation error if condition is true */
 #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
 
+/* Force a compilation error if condition is true, but also produce a
+   result (of value 0 and type size_t), so the expression can be used
+   e.g. in a structure initializer (or where-ever else comma expressions
+   aren't permitted). */
+#define BUILD_BUG_ON_ZERO(e) (sizeof(char[1 - 2 * !!(e)]) - 1)
+
 /* Trap pasters of __FUNCTION__ at compile-time */
 #define __FUNCTION__ (__func__)
 
diff --git a/include/linux/module.h b/include/linux/module.h
index 2d366098eab5..9ebbb74b7b72 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -285,6 +285,9 @@ struct module
 	/* The size of the executable code in each section.  */
 	unsigned long init_text_size, core_text_size;
 
+	/* The handle returned from unwind_add_table. */
+	void *unwind_info;
+
 	/* Arch-specific module values */
 	struct mod_arch_specific arch;
 
diff --git a/include/linux/unwind.h b/include/linux/unwind.h
new file mode 100644
index 000000000000..0295aa789ab4
--- /dev/null
+++ b/include/linux/unwind.h
@@ -0,0 +1,119 @@
+#ifndef _LINUX_UNWIND_H
+#define _LINUX_UNWIND_H
+
+/*
+ * Copyright (C) 2002-2006 Novell, Inc.
+ *	Jan Beulich <jbeulich@novell.com>
+ * This code is released under version 2 of the GNU GPL.
+ *
+ * A simple API for unwinding kernel stacks.  This is used for
+ * debugging and error reporting purposes.  The kernel doesn't need
+ * full-blown stack unwinding with all the bells and whistles, so there
+ * is not much point in implementing the full Dwarf2 unwind API.
+ */
+
+#include <linux/config.h>
+
+struct module;
+
+#ifdef CONFIG_STACK_UNWIND
+
+#include <asm/unwind.h>
+
+#ifndef ARCH_UNWIND_SECTION_NAME
+#define ARCH_UNWIND_SECTION_NAME ".eh_frame"
+#endif
+
+/*
+ * Initialize unwind support.
+ */
+extern void unwind_init(void);
+
+extern void *unwind_add_table(struct module *,
+                              const void *table_start,
+                              unsigned long table_size);
+
+extern void unwind_remove_table(void *handle, int init_only);
+
+extern int unwind_init_frame_info(struct unwind_frame_info *,
+                                  struct task_struct *,
+                                  /*const*/ struct pt_regs *);
+
+/*
+ * Prepare to unwind a blocked task.
+ */
+extern int unwind_init_blocked(struct unwind_frame_info *,
+                               struct task_struct *);
+
+/*
+ * Prepare to unwind the currently running thread.
+ */
+extern int unwind_init_running(struct unwind_frame_info *,
+                               asmlinkage void (*callback)(struct unwind_frame_info *,
+                                                           void *arg),
+                               void *arg);
+
+/*
+ * Unwind to previous to frame.  Returns 0 if successful, negative
+ * number in case of an error.
+ */
+extern int unwind(struct unwind_frame_info *);
+
+/*
+ * Unwind until the return pointer is in user-land (or until an error
+ * occurs).  Returns 0 if successful, negative number in case of
+ * error.
+ */
+extern int unwind_to_user(struct unwind_frame_info *);
+
+#else
+
+struct unwind_frame_info {};
+
+static inline void unwind_init(void) {}
+
+static inline void *unwind_add_table(struct module *mod,
+                                     const void *table_start,
+                                     unsigned long table_size)
+{
+	return NULL;
+}
+
+static inline void unwind_remove_table(void *handle, int init_only)
+{
+}
+
+static inline int unwind_init_frame_info(struct unwind_frame_info *info,
+                                         struct task_struct *tsk,
+                                         const struct pt_regs *regs)
+{
+	return -ENOSYS;
+}
+
+static inline int unwind_init_blocked(struct unwind_frame_info *info,
+                                      struct task_struct *tsk)
+{
+	return -ENOSYS;
+}
+
+static inline int unwind_init_running(struct unwind_frame_info *info,
+                                      asmlinkage void (*cb)(struct unwind_frame_info *,
+                                                            void *arg),
+                                      void *arg)
+{
+	return -ENOSYS;
+}
+
+static inline int unwind(struct unwind_frame_info *info)
+{
+	return -ENOSYS;
+}
+
+static inline int unwind_to_user(struct unwind_frame_info *info)
+{
+	return -ENOSYS;
+}
+
+#endif
+
+#endif /* _LINUX_UNWIND_H */
diff --git a/init/main.c b/init/main.c
index f715b9b89753..f556fd0a0b66 100644
--- a/init/main.c
+++ b/init/main.c
@@ -47,6 +47,7 @@
 #include <linux/rmap.h>
 #include <linux/mempolicy.h>
 #include <linux/key.h>
+#include <linux/unwind.h>
 
 #include <asm/io.h>
 #include <asm/bugs.h>
@@ -482,6 +483,7 @@ asmlinkage void __init start_kernel(void)
 		   __stop___param - __start___param,
 		   &unknown_bootoption);
 	sort_main_extable();
+	unwind_init();
 	trap_init();
 	rcu_init();
 	init_IRQ();
diff --git a/kernel/Makefile b/kernel/Makefile
index f6ef00f4f90f..a31276e190f5 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -21,6 +21,7 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
 obj-$(CONFIG_UID16) += uid16.o
 obj-$(CONFIG_MODULES) += module.o
 obj-$(CONFIG_KALLSYMS) += kallsyms.o
+obj-$(CONFIG_STACK_UNWIND) += unwind.o
 obj-$(CONFIG_PM) += power/
 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
 obj-$(CONFIG_KEXEC) += kexec.o
diff --git a/kernel/module.c b/kernel/module.c
index d75275de1c28..08811e26ac9d 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -40,6 +40,7 @@
 #include <linux/string.h>
 #include <linux/sched.h>
 #include <linux/mutex.h>
+#include <linux/unwind.h>
 #include <asm/uaccess.h>
 #include <asm/semaphore.h>
 #include <asm/cacheflush.h>
@@ -1051,6 +1052,8 @@ static void free_module(struct module *mod)
 	remove_sect_attrs(mod);
 	mod_kobject_remove(mod);
 
+	unwind_remove_table(mod->unwind_info, 0);
+
 	/* Arch-specific cleanup. */
 	module_arch_cleanup(mod);
 
@@ -1412,7 +1415,7 @@ static struct module *load_module(void __user *umod,
 	unsigned int i, symindex = 0, strindex = 0, setupindex, exindex,
 		exportindex, modindex, obsparmindex, infoindex, gplindex,
 		crcindex, gplcrcindex, versindex, pcpuindex, gplfutureindex,
-		gplfuturecrcindex;
+		gplfuturecrcindex, unwindex = 0;
 	struct module *mod;
 	long err = 0;
 	void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
@@ -1502,6 +1505,9 @@ static struct module *load_module(void __user *umod,
 	versindex = find_sec(hdr, sechdrs, secstrings, "__versions");
 	infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo");
 	pcpuindex = find_pcpusec(hdr, sechdrs, secstrings);
+#ifdef ARCH_UNWIND_SECTION_NAME
+	unwindex = find_sec(hdr, sechdrs, secstrings, ARCH_UNWIND_SECTION_NAME);
+#endif
 
 	/* Don't keep modinfo section */
 	sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
@@ -1510,6 +1516,8 @@ static struct module *load_module(void __user *umod,
 	sechdrs[symindex].sh_flags |= SHF_ALLOC;
 	sechdrs[strindex].sh_flags |= SHF_ALLOC;
 #endif
+	if (unwindex)
+		sechdrs[unwindex].sh_flags |= SHF_ALLOC;
 
 	/* Check module struct version now, before we try to use module. */
 	if (!check_modstruct_version(sechdrs, versindex, mod)) {
@@ -1738,6 +1746,11 @@ static struct module *load_module(void __user *umod,
 		goto arch_cleanup;
 	add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
 
+	/* Size of section 0 is 0, so this works well if no unwind info. */
+	mod->unwind_info = unwind_add_table(mod,
+	                                    (void *)sechdrs[unwindex].sh_addr,
+	                                    sechdrs[unwindex].sh_size);
+
 	/* Get rid of temporary copy */
 	vfree(hdr);
 
@@ -1836,6 +1849,7 @@ sys_init_module(void __user *umod,
 	mod->state = MODULE_STATE_LIVE;
 	/* Drop initial reference. */
 	module_put(mod);
+	unwind_remove_table(mod->unwind_info, 1);
 	module_free(mod, mod->module_init);
 	mod->module_init = NULL;
 	mod->init_size = 0;
diff --git a/kernel/unwind.c b/kernel/unwind.c
new file mode 100644
index 000000000000..d36bcd3ad3b5
--- /dev/null
+++ b/kernel/unwind.c
@@ -0,0 +1,915 @@
+/*
+ * Copyright (C) 2002-2006 Novell, Inc.
+ *	Jan Beulich <jbeulich@novell.com>
+ * This code is released under version 2 of the GNU GPL.
+ *
+ * A simple API for unwinding kernel stacks.  This is used for
+ * debugging and error reporting purposes.  The kernel doesn't need
+ * full-blown stack unwinding with all the bells and whistles, so there
+ * is not much point in implementing the full Dwarf2 unwind API.
+ */
+
+#include <linux/unwind.h>
+#include <linux/module.h>
+#include <linux/delay.h>
+#include <linux/stop_machine.h>
+#include <asm/sections.h>
+#include <asm/uaccess.h>
+#include <asm/unaligned.h>
+
+extern char __start_unwind[], __end_unwind[];
+
+#define MAX_STACK_DEPTH 8
+
+#define EXTRA_INFO(f) { \
+		BUILD_BUG_ON_ZERO(offsetof(struct unwind_frame_info, f) \
+		                  % FIELD_SIZEOF(struct unwind_frame_info, f)) \
+		+ offsetof(struct unwind_frame_info, f) \
+		  / FIELD_SIZEOF(struct unwind_frame_info, f), \
+		FIELD_SIZEOF(struct unwind_frame_info, f) \
+	}
+#define PTREGS_INFO(f) EXTRA_INFO(regs.f)
+
+static const struct {
+	unsigned offs:BITS_PER_LONG / 2;
+	unsigned width:BITS_PER_LONG / 2;
+} reg_info[] = {
+	UNW_REGISTER_INFO
+};
+
+#undef PTREGS_INFO
+#undef EXTRA_INFO
+
+#ifndef REG_INVALID
+#define REG_INVALID(r) (reg_info[r].width == 0)
+#endif
+
+#define DW_CFA_nop                          0x00
+#define DW_CFA_set_loc                      0x01
+#define DW_CFA_advance_loc1                 0x02
+#define DW_CFA_advance_loc2                 0x03
+#define DW_CFA_advance_loc4                 0x04
+#define DW_CFA_offset_extended              0x05
+#define DW_CFA_restore_extended             0x06
+#define DW_CFA_undefined                    0x07
+#define DW_CFA_same_value                   0x08
+#define DW_CFA_register                     0x09
+#define DW_CFA_remember_state               0x0a
+#define DW_CFA_restore_state                0x0b
+#define DW_CFA_def_cfa                      0x0c
+#define DW_CFA_def_cfa_register             0x0d
+#define DW_CFA_def_cfa_offset               0x0e
+#define DW_CFA_def_cfa_expression           0x0f
+#define DW_CFA_expression                   0x10
+#define DW_CFA_offset_extended_sf           0x11
+#define DW_CFA_def_cfa_sf                   0x12
+#define DW_CFA_def_cfa_offset_sf            0x13
+#define DW_CFA_val_offset                   0x14
+#define DW_CFA_val_offset_sf                0x15
+#define DW_CFA_val_expression               0x16
+#define DW_CFA_lo_user                      0x1c
+#define DW_CFA_GNU_window_save              0x2d
+#define DW_CFA_GNU_args_size                0x2e
+#define DW_CFA_GNU_negative_offset_extended 0x2f
+#define DW_CFA_hi_user                      0x3f
+
+#define DW_EH_PE_FORM     0x07
+#define DW_EH_PE_native   0x00
+#define DW_EH_PE_leb128   0x01
+#define DW_EH_PE_data2    0x02
+#define DW_EH_PE_data4    0x03
+#define DW_EH_PE_data8    0x04
+#define DW_EH_PE_signed   0x08
+#define DW_EH_PE_ADJUST   0x70
+#define DW_EH_PE_abs      0x00
+#define DW_EH_PE_pcrel    0x10
+#define DW_EH_PE_textrel  0x20
+#define DW_EH_PE_datarel  0x30
+#define DW_EH_PE_funcrel  0x40
+#define DW_EH_PE_aligned  0x50
+#define DW_EH_PE_indirect 0x80
+#define DW_EH_PE_omit     0xff
+
+typedef unsigned long uleb128_t;
+typedef   signed long sleb128_t;
+
+static struct unwind_table {
+	struct {
+		unsigned long pc;
+		unsigned long range;
+	} core, init;
+	const void *address;
+	unsigned long size;
+	struct unwind_table *link;
+	const char *name;
+} root_table, *last_table;
+
+struct unwind_item {
+	enum item_location {
+		Nowhere,
+		Memory,
+		Register,
+		Value
+	} where;
+	uleb128_t value;
+};
+
+struct unwind_state {
+	uleb128_t loc, org;
+	const u8 *cieStart, *cieEnd;
+	uleb128_t codeAlign;
+	sleb128_t dataAlign;
+	struct cfa {
+		uleb128_t reg, offs;
+	} cfa;
+	struct unwind_item regs[ARRAY_SIZE(reg_info)];
+	unsigned stackDepth:8;
+	unsigned version:8;
+	const u8 *label;
+	const u8 *stack[MAX_STACK_DEPTH];
+};
+
+static const struct cfa badCFA = { ARRAY_SIZE(reg_info), 1 };
+
+static struct unwind_table *find_table(unsigned long pc)
+{
+	struct unwind_table *table;
+
+	for (table = &root_table; table; table = table->link)
+		if ((pc >= table->core.pc
+		     && pc < table->core.pc + table->core.range)
+		    || (pc >= table->init.pc
+		        && pc < table->init.pc + table->init.range))
+			break;
+
+	return table;
+}
+
+static void init_unwind_table(struct unwind_table *table,
+                              const char *name,
+                              const void *core_start,
+                              unsigned long core_size,
+                              const void *init_start,
+                              unsigned long init_size,
+                              const void *table_start,
+                              unsigned long table_size)
+{
+	table->core.pc = (unsigned long)core_start;
+	table->core.range = core_size;
+	table->init.pc = (unsigned long)init_start;
+	table->init.range = init_size;
+	table->address = table_start;
+	table->size = table_size;
+	table->link = NULL;
+	table->name = name;
+}
+
+void __init unwind_init(void)
+{
+	init_unwind_table(&root_table, "kernel",
+	                  _text, _end - _text,
+	                  NULL, 0,
+	                  __start_unwind, __end_unwind - __start_unwind);
+}
+
+/* Must be called with module_mutex held. */
+void *unwind_add_table(struct module *module,
+                       const void *table_start,
+                       unsigned long table_size)
+{
+	struct unwind_table *table;
+
+	if (table_size <= 0)
+		return NULL;
+
+	table = kmalloc(sizeof(*table), GFP_KERNEL);
+	if (!table)
+		return NULL;
+
+	init_unwind_table(table, module->name,
+	                  module->module_core, module->core_size,
+	                  module->module_init, module->init_size,
+	                  table_start, table_size);
+
+	if (last_table)
+		last_table->link = table;
+	else
+		root_table.link = table;
+	last_table = table;
+
+	return table;
+}
+
+struct unlink_table_info
+{
+	struct unwind_table *table;
+	int init_only;
+};
+
+static int unlink_table(void *arg)
+{
+	struct unlink_table_info *info = arg;
+	struct unwind_table *table = info->table, *prev;
+
+	for (prev = &root_table; prev->link && prev->link != table; prev = prev->link)
+		;
+
+	if (prev->link) {
+		if (info->init_only) {
+			table->init.pc = 0;
+			table->init.range = 0;
+			info->table = NULL;
+		} else {
+			prev->link = table->link;
+			if (!prev->link)
+				last_table = prev;
+		}
+	} else
+		info->table = NULL;
+
+	return 0;
+}
+
+/* Must be called with module_mutex held. */
+void unwind_remove_table(void *handle, int init_only)
+{
+	struct unwind_table *table = handle;
+	struct unlink_table_info info;
+
+	if (!table || table == &root_table)
+		return;
+
+	if (init_only && table == last_table) {
+		table->init.pc = 0;
+		table->init.range = 0;
+		return;
+	}
+
+	info.table = table;
+	info.init_only = init_only;
+	stop_machine_run(unlink_table, &info, NR_CPUS);
+
+	if (info.table)
+		kfree(table);
+}
+
+static uleb128_t get_uleb128(const u8 **pcur, const u8 *end)
+{
+	const u8 *cur = *pcur;
+	uleb128_t value;
+	unsigned shift;
+
+	for (shift = 0, value = 0; cur < end; shift += 7) {
+		if (shift + 7 > 8 * sizeof(value)
+		    && (*cur & 0x7fU) >= (1U << (8 * sizeof(value) - shift))) {
+			cur = end + 1;
+			break;
+		}
+		value |= (uleb128_t)(*cur & 0x7f) << shift;
+		if (!(*cur++ & 0x80))
+			break;
+	}
+	*pcur = cur;
+
+	return value;
+}
+
+static sleb128_t get_sleb128(const u8 **pcur, const u8 *end)
+{
+	const u8 *cur = *pcur;
+	sleb128_t value;
+	unsigned shift;
+
+	for (shift = 0, value = 0; cur < end; shift += 7) {
+		if (shift + 7 > 8 * sizeof(value)
+		    && (*cur & 0x7fU) >= (1U << (8 * sizeof(value) - shift))) {
+			cur = end + 1;
+			break;
+		}
+		value |= (sleb128_t)(*cur & 0x7f) << shift;
+		if (!(*cur & 0x80)) {
+			value |= -(*cur++ & 0x40) << shift;
+			break;
+		}
+	}
+	*pcur = cur;
+
+	return value;
+}
+
+static unsigned long read_pointer(const u8 **pLoc,
+                                  const void *end,
+                                  signed ptrType)
+{
+	unsigned long value = 0;
+	union {
+		const u8 *p8;
+		const u16 *p16u;
+		const s16 *p16s;
+		const u32 *p32u;
+		const s32 *p32s;
+		const unsigned long *pul;
+	} ptr;
+
+	if (ptrType < 0 || ptrType == DW_EH_PE_omit)
+		return 0;
+	ptr.p8 = *pLoc;
+	switch(ptrType & DW_EH_PE_FORM) {
+	case DW_EH_PE_data2:
+		if (end < (const void *)(ptr.p16u + 1))
+			return 0;
+		if(ptrType & DW_EH_PE_signed)
+			value = get_unaligned(ptr.p16s++);
+		else
+			value = get_unaligned(ptr.p16u++);
+		break;
+	case DW_EH_PE_data4:
+#ifdef CONFIG_64BIT
+		if (end < (const void *)(ptr.p32u + 1))
+			return 0;
+		if(ptrType & DW_EH_PE_signed)
+			value = get_unaligned(ptr.p32s++);
+		else
+			value = get_unaligned(ptr.p32u++);
+		break;
+	case DW_EH_PE_data8:
+		BUILD_BUG_ON(sizeof(u64) != sizeof(value));
+#else
+		BUILD_BUG_ON(sizeof(u32) != sizeof(value));
+#endif
+	case DW_EH_PE_native:
+		if (end < (const void *)(ptr.pul + 1))
+			return 0;
+		value = get_unaligned(ptr.pul++);
+		break;
+	case DW_EH_PE_leb128:
+		BUILD_BUG_ON(sizeof(uleb128_t) > sizeof(value));
+		value = ptrType & DW_EH_PE_signed
+		        ? get_sleb128(&ptr.p8, end)
+		        : get_uleb128(&ptr.p8, end);
+		if ((const void *)ptr.p8 > end)
+			return 0;
+		break;
+	default:
+		return 0;
+	}
+	switch(ptrType & DW_EH_PE_ADJUST) {
+	case DW_EH_PE_abs:
+		break;
+	case DW_EH_PE_pcrel:
+		value += (unsigned long)*pLoc;
+		break;
+	default:
+		return 0;
+	}
+	if ((ptrType & DW_EH_PE_indirect)
+	    && __get_user(value, (unsigned long *)value))
+		return 0;
+	*pLoc = ptr.p8;
+
+	return value;
+}
+
+static signed fde_pointer_type(const u32 *cie)
+{
+	const u8 *ptr = (const u8 *)(cie + 2);
+	unsigned version = *ptr;
+
+	if (version != 1)
+		return -1; /* unsupported */
+	if (*++ptr) {
+		const char *aug;
+		const u8 *end = (const u8 *)(cie + 1) + *cie;
+		uleb128_t len;
+
+		/* check if augmentation size is first (and thus present) */
+		if (*ptr != 'z')
+			return -1;
+		/* check if augmentation string is nul-terminated */
+		if ((ptr = memchr(aug = (const void *)ptr, 0, end - ptr)) == NULL)
+			return -1;
+		++ptr; /* skip terminator */
+		get_uleb128(&ptr, end); /* skip code alignment */
+		get_sleb128(&ptr, end); /* skip data alignment */
+		/* skip return address column */
+		version <= 1 ? (void)++ptr : (void)get_uleb128(&ptr, end);
+		len = get_uleb128(&ptr, end); /* augmentation length */
+		if (ptr + len < ptr || ptr + len > end)
+			return -1;
+		end = ptr + len;
+		while (*++aug) {
+			if (ptr >= end)
+				return -1;
+			switch(*aug) {
+			case 'L':
+				++ptr;
+				break;
+			case 'P': {
+					signed ptrType = *ptr++;
+
+					if (!read_pointer(&ptr, end, ptrType) || ptr > end)
+						return -1;
+				}
+				break;
+			case 'R':
+				return *ptr;
+			default:
+				return -1;
+			}
+		}
+	}
+	return DW_EH_PE_native|DW_EH_PE_abs;
+}
+
+static int advance_loc(unsigned long delta, struct unwind_state *state)
+{
+	state->loc += delta * state->codeAlign;
+
+	return delta > 0;
+}
+
+static void set_rule(uleb128_t reg,
+                     enum item_location where,
+                     uleb128_t value,
+                     struct unwind_state *state)
+{
+	if (reg < ARRAY_SIZE(state->regs)) {
+		state->regs[reg].where = where;
+		state->regs[reg].value = value;
+	}
+}
+
+static int processCFI(const u8 *start,
+                      const u8 *end,
+                      unsigned long targetLoc,
+                      signed ptrType,
+                      struct unwind_state *state)
+{
+	union {
+		const u8 *p8;
+		const u16 *p16;
+		const u32 *p32;
+	} ptr;
+	int result = 1;
+
+	if (start != state->cieStart) {
+		state->loc = state->org;
+		result = processCFI(state->cieStart, state->cieEnd, 0, ptrType, state);
+		if (targetLoc == 0 && state->label == NULL)
+			return result;
+	}
+	for (ptr.p8 = start; result && ptr.p8 < end; ) {
+		switch(*ptr.p8 >> 6) {
+			uleb128_t value;
+
+		case 0:
+			switch(*ptr.p8++) {
+			case DW_CFA_nop:
+				break;
+			case DW_CFA_set_loc:
+				if ((state->loc = read_pointer(&ptr.p8, end, ptrType)) == 0)
+					result = 0;
+				break;
+			case DW_CFA_advance_loc1:
+				result = ptr.p8 < end && advance_loc(*ptr.p8++, state);
+				break;
+			case DW_CFA_advance_loc2:
+				result = ptr.p8 <= end + 2
+				         && advance_loc(*ptr.p16++, state);
+				break;
+			case DW_CFA_advance_loc4:
+				result = ptr.p8 <= end + 4
+				         && advance_loc(*ptr.p32++, state);
+				break;
+			case DW_CFA_offset_extended:
+				value = get_uleb128(&ptr.p8, end);
+				set_rule(value, Memory, get_uleb128(&ptr.p8, end), state);
+				break;
+			case DW_CFA_val_offset:
+				value = get_uleb128(&ptr.p8, end);
+				set_rule(value, Value, get_uleb128(&ptr.p8, end), state);
+				break;
+			case DW_CFA_offset_extended_sf:
+				value = get_uleb128(&ptr.p8, end);
+				set_rule(value, Memory, get_sleb128(&ptr.p8, end), state);
+				break;
+			case DW_CFA_val_offset_sf:
+				value = get_uleb128(&ptr.p8, end);
+				set_rule(value, Value, get_sleb128(&ptr.p8, end), state);
+				break;
+			case DW_CFA_restore_extended:
+			case DW_CFA_undefined:
+			case DW_CFA_same_value:
+				set_rule(get_uleb128(&ptr.p8, end), Nowhere, 0, state);
+				break;
+			case DW_CFA_register:
+				value = get_uleb128(&ptr.p8, end);
+				set_rule(value,
+				         Register,
+				         get_uleb128(&ptr.p8, end), state);
+				break;
+			case DW_CFA_remember_state:
+				if (ptr.p8 == state->label) {
+					state->label = NULL;
+					return 1;
+				}
+				if (state->stackDepth >= MAX_STACK_DEPTH)
+					return 0;
+				state->stack[state->stackDepth++] = ptr.p8;
+				break;
+			case DW_CFA_restore_state:
+				if (state->stackDepth) {
+					const uleb128_t loc = state->loc;
+					const u8 *label = state->label;
+
+					state->label = state->stack[state->stackDepth - 1];
+					memcpy(&state->cfa, &badCFA, sizeof(state->cfa));
+					memset(state->regs, 0, sizeof(state->regs));
+					state->stackDepth = 0;
+					result = processCFI(start, end, 0, ptrType, state);
+					state->loc = loc;
+					state->label = label;
+				} else
+					return 0;
+				break;
+			case DW_CFA_def_cfa:
+				state->cfa.reg = get_uleb128(&ptr.p8, end);
+				/*nobreak*/
+			case DW_CFA_def_cfa_offset:
+				state->cfa.offs = get_uleb128(&ptr.p8, end);
+				break;
+			case DW_CFA_def_cfa_sf:
+				state->cfa.reg = get_uleb128(&ptr.p8, end);
+				/*nobreak*/
+			case DW_CFA_def_cfa_offset_sf:
+				state->cfa.offs = get_sleb128(&ptr.p8, end)
+				                  * state->dataAlign;
+				break;
+			case DW_CFA_def_cfa_register:
+				state->cfa.reg = get_uleb128(&ptr.p8, end);
+				break;
+			/*todo case DW_CFA_def_cfa_expression: */
+			/*todo case DW_CFA_expression: */
+			/*todo case DW_CFA_val_expression: */
+			case DW_CFA_GNU_args_size:
+				get_uleb128(&ptr.p8, end);
+				break;
+			case DW_CFA_GNU_negative_offset_extended:
+				value = get_uleb128(&ptr.p8, end);
+				set_rule(value,
+				         Memory,
+				         (uleb128_t)0 - get_uleb128(&ptr.p8, end), state);
+				break;
+			case DW_CFA_GNU_window_save:
+			default:
+				result = 0;
+				break;
+			}
+			break;
+		case 1:
+			result = advance_loc(*ptr.p8++ & 0x3f, state);
+			break;
+		case 2:
+			value = *ptr.p8++ & 0x3f;
+			set_rule(value, Memory, get_uleb128(&ptr.p8, end), state);
+			break;
+		case 3:
+			set_rule(*ptr.p8++ & 0x3f, Nowhere, 0, state);
+			break;
+		}
+		if (ptr.p8 > end)
+			result = 0;
+		if (result && targetLoc != 0 && targetLoc < state->loc)
+			return 1;
+	}
+
+	return result
+	   && ptr.p8 == end
+	   && (targetLoc == 0
+	    || (/*todo While in theory this should apply, gcc in practice omits
+	          everything past the function prolog, and hence the location
+	          never reaches the end of the function.
+	        targetLoc < state->loc &&*/ state->label == NULL));
+}
+
+/* Unwind to previous to frame.  Returns 0 if successful, negative
+ * number in case of an error. */
+int unwind(struct unwind_frame_info *frame)
+{
+#define FRAME_REG(r, t) (((t *)frame)[reg_info[r].offs])
+	const u32 *fde = NULL, *cie = NULL;
+	const u8 *ptr = NULL, *end = NULL;
+	unsigned long startLoc = 0, endLoc = 0, cfa;
+	unsigned i;
+	signed ptrType = -1;
+	uleb128_t retAddrReg = 0;
+	struct unwind_table *table;
+	struct unwind_state state;
+
+	if (UNW_PC(frame) == 0)
+		return -EINVAL;
+	if ((table = find_table(UNW_PC(frame))) != NULL
+	    && !(table->size & (sizeof(*fde) - 1))) {
+		unsigned long tableSize = table->size;
+
+		for (fde = table->address;
+		     tableSize > sizeof(*fde) && tableSize - sizeof(*fde) >= *fde;
+		     tableSize -= sizeof(*fde) + *fde,
+		     fde += 1 + *fde / sizeof(*fde)) {
+			if (!*fde || (*fde & (sizeof(*fde) - 1)))
+				break;
+			if (!fde[1])
+				continue; /* this is a CIE */
+			if ((fde[1] & (sizeof(*fde) - 1))
+			    || fde[1] > (unsigned long)(fde + 1)
+			                - (unsigned long)table->address)
+				continue; /* this is not a valid FDE */
+			cie = fde + 1 - fde[1] / sizeof(*fde);
+			if (*cie <= sizeof(*cie) + 4
+			    || *cie >= fde[1] - sizeof(*fde)
+			    || (*cie & (sizeof(*cie) - 1))
+			    || cie[1]
+			    || (ptrType = fde_pointer_type(cie)) < 0) {
+				cie = NULL; /* this is not a (valid) CIE */
+				continue;
+			}
+			ptr = (const u8 *)(fde + 2);
+			startLoc = read_pointer(&ptr,
+			                        (const u8 *)(fde + 1) + *fde,
+			                        ptrType);
+			endLoc = startLoc
+			         + read_pointer(&ptr,
+			                        (const u8 *)(fde + 1) + *fde,
+			                        ptrType & DW_EH_PE_indirect
+			                        ? ptrType
+			                        : ptrType & (DW_EH_PE_FORM|DW_EH_PE_signed));
+			if (UNW_PC(frame) >= startLoc && UNW_PC(frame) < endLoc)
+				break;
+			cie = NULL;
+		}
+	}
+	if (cie != NULL) {
+		memset(&state, 0, sizeof(state));
+		state.cieEnd = ptr; /* keep here temporarily */
+		ptr = (const u8 *)(cie + 2);
+		end = (const u8 *)(cie + 1) + *cie;
+		if ((state.version = *ptr) != 1)
+			cie = NULL; /* unsupported version */
+		else if (*++ptr) {
+			/* check if augmentation size is first (and thus present) */
+			if (*ptr == 'z') {
+				/* check for ignorable (or already handled)
+				 * nul-terminated augmentation string */
+				while (++ptr < end && *ptr)
+					if (strchr("LPR", *ptr) == NULL)
+						break;
+			}
+			if (ptr >= end || *ptr)
+				cie = NULL;
+		}
+		++ptr;
+	}
+	if (cie != NULL) {
+		/* get code aligment factor */
+		state.codeAlign = get_uleb128(&ptr, end);
+		/* get data aligment factor */
+		state.dataAlign = get_sleb128(&ptr, end);
+		if (state.codeAlign == 0 || state.dataAlign == 0 || ptr >= end)
+			cie = NULL;
+		else {
+			retAddrReg = state.version <= 1 ? *ptr++ : get_uleb128(&ptr, end);
+			/* skip augmentation */
+			if (((const char *)(cie + 2))[1] == 'z')
+				ptr += get_uleb128(&ptr, end);
+			if (ptr > end
+			   || retAddrReg >= ARRAY_SIZE(reg_info)
+			   || REG_INVALID(retAddrReg)
+			   || reg_info[retAddrReg].width != sizeof(unsigned long))
+				cie = NULL;
+		}
+	}
+	if (cie != NULL) {
+		state.cieStart = ptr;
+		ptr = state.cieEnd;
+		state.cieEnd = end;
+		end = (const u8 *)(fde + 1) + *fde;
+		/* skip augmentation */
+		if (((const char *)(cie + 2))[1] == 'z') {
+			uleb128_t augSize = get_uleb128(&ptr, end);
+
+			if ((ptr += augSize) > end)
+				fde = NULL;
+		}
+	}
+	if (cie == NULL || fde == NULL) {
+#ifdef CONFIG_FRAME_POINTER
+		unsigned long top, bottom;
+#endif
+
+#ifdef CONFIG_FRAME_POINTER
+		top = STACK_TOP(frame->task);
+		bottom = STACK_BOTTOM(frame->task);
+# if FRAME_RETADDR_OFFSET < 0
+		if (UNW_SP(frame) < top
+		    && UNW_FP(frame) <= UNW_SP(frame)
+		    && bottom < UNW_FP(frame)
+# else
+		if (UNW_SP(frame) > top
+		    && UNW_FP(frame) >= UNW_SP(frame)
+		    && bottom > UNW_FP(frame)
+# endif
+		   && !((UNW_SP(frame) | UNW_FP(frame))
+		        & (sizeof(unsigned long) - 1))) {
+			unsigned long link;
+
+			if (!__get_user(link,
+			                (unsigned long *)(UNW_FP(frame)
+			                                  + FRAME_LINK_OFFSET))
+# if FRAME_RETADDR_OFFSET < 0
+			   && link > bottom && link < UNW_FP(frame)
+# else
+			   && link > UNW_FP(frame) && link < bottom
+# endif
+			   && !(link & (sizeof(link) - 1))
+			   && !__get_user(UNW_PC(frame),
+			                  (unsigned long *)(UNW_FP(frame)
+			                                    + FRAME_RETADDR_OFFSET))) {
+				UNW_SP(frame) = UNW_FP(frame) + FRAME_RETADDR_OFFSET
+# if FRAME_RETADDR_OFFSET < 0
+					-
+# else
+					+
+# endif
+					  sizeof(UNW_PC(frame));
+				UNW_FP(frame) = link;
+				return 0;
+			}
+		}
+#endif
+		return -ENXIO;
+	}
+	state.org = startLoc;
+	memcpy(&state.cfa, &badCFA, sizeof(state.cfa));
+	/* process instructions */
+	if (!processCFI(ptr, end, UNW_PC(frame), ptrType, &state)
+	   || state.loc > endLoc
+	   || state.regs[retAddrReg].where == Nowhere
+	   || state.cfa.reg >= ARRAY_SIZE(reg_info)
+	   || reg_info[state.cfa.reg].width != sizeof(unsigned long)
+	   || state.cfa.offs % sizeof(unsigned long))
+		return -EIO;
+	/* update frame */
+	cfa = FRAME_REG(state.cfa.reg, unsigned long) + state.cfa.offs;
+	startLoc = min((unsigned long)UNW_SP(frame), cfa);
+	endLoc = max((unsigned long)UNW_SP(frame), cfa);
+	if (STACK_LIMIT(startLoc) != STACK_LIMIT(endLoc)) {
+		startLoc = min(STACK_LIMIT(cfa), cfa);
+		endLoc = max(STACK_LIMIT(cfa), cfa);
+	}
+#ifndef CONFIG_64BIT
+# define CASES CASE(8); CASE(16); CASE(32)
+#else
+# define CASES CASE(8); CASE(16); CASE(32); CASE(64)
+#endif
+	for (i = 0; i < ARRAY_SIZE(state.regs); ++i) {
+		if (REG_INVALID(i)) {
+			if (state.regs[i].where == Nowhere)
+				continue;
+			return -EIO;
+		}
+		switch(state.regs[i].where) {
+		default:
+			break;
+		case Register:
+			if (state.regs[i].value >= ARRAY_SIZE(reg_info)
+			   || REG_INVALID(state.regs[i].value)
+			   || reg_info[i].width > reg_info[state.regs[i].value].width)
+				return -EIO;
+			switch(reg_info[state.regs[i].value].width) {
+#define CASE(n) \
+			case sizeof(u##n): \
+				state.regs[i].value = FRAME_REG(state.regs[i].value, \
+				                                const u##n); \
+				break
+			CASES;
+#undef CASE
+			default:
+				return -EIO;
+			}
+			break;
+		}
+	}
+	for (i = 0; i < ARRAY_SIZE(state.regs); ++i) {
+		if (REG_INVALID(i))
+			continue;
+		switch(state.regs[i].where) {
+		case Nowhere:
+			if (reg_info[i].width != sizeof(UNW_SP(frame))
+			   || &FRAME_REG(i, __typeof__(UNW_SP(frame)))
+			      != &UNW_SP(frame))
+				continue;
+			UNW_SP(frame) = cfa;
+			break;
+		case Register:
+			switch(reg_info[i].width) {
+#define CASE(n) case sizeof(u##n): \
+				FRAME_REG(i, u##n) = state.regs[i].value; \
+				break
+			CASES;
+#undef CASE
+			default:
+				return -EIO;
+			}
+			break;
+		case Value:
+			if (reg_info[i].width != sizeof(unsigned long))
+				return -EIO;
+			FRAME_REG(i, unsigned long) = cfa + state.regs[i].value
+			                                    * state.dataAlign;
+			break;
+		case Memory: {
+				unsigned long addr = cfa + state.regs[i].value
+				                           * state.dataAlign;
+
+				if ((state.regs[i].value * state.dataAlign)
+				    % sizeof(unsigned long)
+				    || addr < startLoc
+				    || addr + sizeof(unsigned long) < addr
+				    || addr + sizeof(unsigned long) > endLoc)
+					return -EIO;
+				switch(reg_info[i].width) {
+#define CASE(n)     case sizeof(u##n): \
+					__get_user(FRAME_REG(i, u##n), (u##n *)addr); \
+					break
+				CASES;
+#undef CASE
+				default:
+					return -EIO;
+				}
+			}
+			break;
+		}
+	}
+
+	return 0;
+#undef CASES
+#undef FRAME_REG
+}
+EXPORT_SYMBOL(unwind);
+
+int unwind_init_frame_info(struct unwind_frame_info *info,
+                           struct task_struct *tsk,
+                           /*const*/ struct pt_regs *regs)
+{
+	info->task = tsk;
+	arch_unw_init_frame_info(info, regs);
+
+	return 0;
+}
+EXPORT_SYMBOL(unwind_init_frame_info);
+
+/*
+ * Prepare to unwind a blocked task.
+ */
+int unwind_init_blocked(struct unwind_frame_info *info,
+                        struct task_struct *tsk)
+{
+	info->task = tsk;
+	arch_unw_init_blocked(info);
+
+	return 0;
+}
+EXPORT_SYMBOL(unwind_init_blocked);
+
+/*
+ * Prepare to unwind the currently running thread.
+ */
+int unwind_init_running(struct unwind_frame_info *info,
+                        asmlinkage void (*callback)(struct unwind_frame_info *,
+                                                    void *arg),
+                        void *arg)
+{
+	info->task = current;
+	arch_unwind_init_running(info, callback, arg);
+
+	return 0;
+}
+EXPORT_SYMBOL(unwind_init_running);
+
+/*
+ * Unwind until the return pointer is in user-land (or until an error
+ * occurs).  Returns 0 if successful, negative number in case of
+ * error.
+ */
+int unwind_to_user(struct unwind_frame_info *info)
+{
+	while (!arch_unw_user_mode(info)) {
+		int err = unwind(info);
+
+		if (err < 0)
+			return err;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(unwind_to_user);
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index ccb0c1fdf1b5..256b3b805c5c 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -188,14 +188,22 @@ config FRAME_POINTER
 
 config UNWIND_INFO
 	bool "Compile the kernel with frame unwind information"
-	depends on !IA64
-	depends on !MODULES || !(MIPS || PARISC || PPC || SUPERH || V850)
+	depends on !IA64 && !PARISC
+	depends on !MODULES || !(MIPS || PPC || SUPERH || V850)
 	help
 	  If you say Y here the resulting kernel image will be slightly larger
 	  but not slower, and it will give very useful debugging information.
 	  If you don't debug the kernel, you can say N, but we may not be able
 	  to solve problems without frame unwind information or frame pointers.
 
+config STACK_UNWIND
+	bool "Stack unwind support"
+	depends on UNWIND_INFO
+	depends on n
+	help
+	  This enables more precise stack traces, omitting all unrelated
+	  occurrences of pointers into kernel code from the dump.
+
 config FORCED_INLINING
 	bool "Force gcc to inline functions marked 'inline'"
 	depends on DEBUG_KERNEL
-- 
cgit v1.2.3


From b538ed278b80641ee6d7b75497d5e14de1bb1137 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Mon, 26 Jun 2006 13:57:32 +0200
Subject: [PATCH] x86_64: reliable stack trace support (x86-64)

These are the x86_64-specific pieces to enable reliable stack traces. The
only restriction with this is that it currently cannot unwind across the
interrupt->normal stack boundary, as that transition is lacking proper
annotation.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/entry.S       |  33 ++++++++++++
 arch/x86_64/kernel/process.c     |   2 +-
 arch/x86_64/kernel/traps.c       |  54 +++++++++++++++++---
 arch/x86_64/kernel/vmlinux.lds.S |   9 ++++
 include/asm-x86_64/proto.h       |   2 +-
 include/asm-x86_64/unwind.h      | 106 +++++++++++++++++++++++++++++++++++++++
 lib/Kconfig.debug                |   2 +-
 7 files changed, 199 insertions(+), 9 deletions(-)
 create mode 100644 include/asm-x86_64/unwind.h

(limited to 'include')

diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S
index 9999d703b6c4..6c68beec51b3 100644
--- a/arch/x86_64/kernel/entry.S
+++ b/arch/x86_64/kernel/entry.S
@@ -1089,3 +1089,36 @@ ENTRY(call_softirq)
 	ret
 	CFI_ENDPROC
 ENDPROC(call_softirq)
+
+#ifdef CONFIG_STACK_UNWIND
+ENTRY(arch_unwind_init_running)
+	CFI_STARTPROC
+	movq	%r15, R15(%rdi)
+	movq	%r14, R14(%rdi)
+	xchgq	%rsi, %rdx
+	movq	%r13, R13(%rdi)
+	movq	%r12, R12(%rdi)
+	xorl	%eax, %eax
+	movq	%rbp, RBP(%rdi)
+	movq	%rbx, RBX(%rdi)
+	movq	(%rsp), %rcx
+	movq	%rax, R11(%rdi)
+	movq	%rax, R10(%rdi)
+	movq	%rax, R9(%rdi)
+	movq	%rax, R8(%rdi)
+	movq	%rax, RAX(%rdi)
+	movq	%rax, RCX(%rdi)
+	movq	%rax, RDX(%rdi)
+	movq	%rax, RSI(%rdi)
+	movq	%rax, RDI(%rdi)
+	movq	%rax, ORIG_RAX(%rdi)
+	movq	%rcx, RIP(%rdi)
+	leaq	8(%rsp), %rcx
+	movq	$__KERNEL_CS, CS(%rdi)
+	movq	%rax, EFLAGS(%rdi)
+	movq	%rcx, RSP(%rdi)
+	movq	$__KERNEL_DS, SS(%rdi)
+	jmpq	*%rdx
+	CFI_ENDPROC
+ENDPROC(arch_unwind_init_running)
+#endif
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index dcb77b462348..d6fa41459c80 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -334,7 +334,7 @@ void show_regs(struct pt_regs *regs)
 {
 	printk("CPU %d:", smp_processor_id());
 	__show_regs(regs);
-	show_trace(&regs->rsp);
+	show_trace(NULL, regs, (void *)(regs + 1));
 }
 
 /*
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index 9f8f1eff4a6c..eb1534ff1f5f 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -29,6 +29,7 @@
 #include <linux/nmi.h>
 #include <linux/kprobes.h>
 #include <linux/kexec.h>
+#include <linux/unwind.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -39,7 +40,7 @@
 #include <asm/i387.h>
 #include <asm/kdebug.h>
 #include <asm/processor.h>
-
+#include <asm/unwind.h>
 #include <asm/smp.h>
 #include <asm/pgalloc.h>
 #include <asm/pda.h>
@@ -189,6 +190,23 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
 	return NULL;
 }
 
+static void show_trace_unwind(struct unwind_frame_info *info, void *context)
+{
+	int i = 11;
+
+	while (unwind(info) == 0 && UNW_PC(info)) {
+		if (i > 50) {
+			printk("\n       ");
+			i = 7;
+		} else
+			i += printk(" ");
+		i += printk_address(UNW_PC(info));
+		if (arch_unw_user_mode(info))
+			break;
+	}
+	printk("\n");
+}
+
 /*
  * x86-64 can have upto three kernel stacks: 
  * process stack
@@ -196,15 +214,34 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
  * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
  */
 
-void show_trace(unsigned long *stack)
+void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * stack)
 {
 	const unsigned cpu = safe_smp_processor_id();
 	unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
 	int i;
 	unsigned used = 0;
+	struct unwind_frame_info info;
 
 	printk("\nCall Trace:");
 
+	if (!tsk)
+		tsk = current;
+
+	if (regs) {
+		if (unwind_init_frame_info(&info, tsk, regs) == 0) {
+			show_trace_unwind(&info, NULL);
+			return;
+		}
+	} else if (tsk == current) {
+		if (unwind_init_running(&info, show_trace_unwind, NULL) == 0)
+			return;
+	} else {
+		if (unwind_init_blocked(&info, tsk) == 0) {
+			show_trace_unwind(&info, NULL);
+			return;
+		}
+	}
+
 #define HANDLE_STACK(cond) \
 	do while (cond) { \
 		unsigned long addr = *stack++; \
@@ -262,7 +299,7 @@ void show_trace(unsigned long *stack)
 	printk("\n");
 }
 
-void show_stack(struct task_struct *tsk, unsigned long * rsp)
+static void _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long * rsp)
 {
 	unsigned long *stack;
 	int i;
@@ -296,7 +333,12 @@ void show_stack(struct task_struct *tsk, unsigned long * rsp)
 		printk("%016lx ", *stack++);
 		touch_nmi_watchdog();
 	}
-	show_trace((unsigned long *)rsp);
+	show_trace(tsk, regs, rsp);
+}
+
+void show_stack(struct task_struct *tsk, unsigned long * rsp)
+{
+	_show_stack(tsk, NULL, rsp);
 }
 
 /*
@@ -305,7 +347,7 @@ void show_stack(struct task_struct *tsk, unsigned long * rsp)
 void dump_stack(void)
 {
 	unsigned long dummy;
-	show_trace(&dummy);
+	show_trace(NULL, NULL, &dummy);
 }
 
 EXPORT_SYMBOL(dump_stack);
@@ -332,7 +374,7 @@ void show_registers(struct pt_regs *regs)
 	if (in_kernel) {
 
 		printk("Stack: ");
-		show_stack(NULL, (unsigned long*)rsp);
+		_show_stack(NULL, regs, (unsigned long*)rsp);
 
 		printk("\nCode: ");
 		if (regs->rip < PAGE_OFFSET)
diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S
index 5968c2415da9..1c6a5f322919 100644
--- a/arch/x86_64/kernel/vmlinux.lds.S
+++ b/arch/x86_64/kernel/vmlinux.lds.S
@@ -45,6 +45,15 @@ SECTIONS
 
   RODATA
 
+#ifdef CONFIG_STACK_UNWIND
+  . = ALIGN(8);
+  .eh_frame : AT(ADDR(.eh_frame) - LOAD_OFFSET) {
+	__start_unwind = .;
+  	*(.eh_frame)
+	__end_unwind = .;
+  }
+#endif
+
 				/* Data */
   .data : AT(ADDR(.data) - LOAD_OFFSET) {
 	*(.data)
diff --git a/include/asm-x86_64/proto.h b/include/asm-x86_64/proto.h
index 1064533e0959..105476ff8039 100644
--- a/include/asm-x86_64/proto.h
+++ b/include/asm-x86_64/proto.h
@@ -75,7 +75,7 @@ extern void main_timer_handler(struct pt_regs *regs);
 
 extern unsigned long end_pfn_map; 
 
-extern void show_trace(unsigned long * rsp);
+extern void show_trace(struct task_struct *, struct pt_regs *, unsigned long * rsp);
 extern void show_registers(struct pt_regs *regs);
 
 extern void exception_table_check(void);
diff --git a/include/asm-x86_64/unwind.h b/include/asm-x86_64/unwind.h
new file mode 100644
index 000000000000..4f61de246179
--- /dev/null
+++ b/include/asm-x86_64/unwind.h
@@ -0,0 +1,106 @@
+#ifndef _ASM_X86_64_UNWIND_H
+#define _ASM_X86_64_UNWIND_H
+
+/*
+ * Copyright (C) 2002-2006 Novell, Inc.
+ *	Jan Beulich <jbeulich@novell.com>
+ * This code is released under version 2 of the GNU GPL.
+ */
+
+#ifdef CONFIG_STACK_UNWIND
+
+#include <linux/sched.h>
+#include <asm/ptrace.h>
+#include <asm/uaccess.h>
+#include <asm/vsyscall.h>
+
+struct unwind_frame_info
+{
+	struct pt_regs regs;
+	struct task_struct *task;
+};
+
+#define UNW_PC(frame)        (frame)->regs.rip
+#define UNW_SP(frame)        (frame)->regs.rsp
+#ifdef CONFIG_FRAME_POINTER
+#define UNW_FP(frame)        (frame)->regs.rbp
+#define FRAME_RETADDR_OFFSET 8
+#define FRAME_LINK_OFFSET    0
+#define STACK_BOTTOM(tsk)    (((tsk)->thread.rsp0 - 1) & ~(THREAD_SIZE - 1))
+#define STACK_TOP(tsk)       ((tsk)->thread.rsp0)
+#endif
+/* Might need to account for the special exception and interrupt handling
+   stacks here, since normally
+	EXCEPTION_STACK_ORDER < THREAD_ORDER < IRQSTACK_ORDER,
+   but the construct is needed only for getting across the stack switch to
+   the interrupt stack - thus considering the IRQ stack itself is unnecessary,
+   and the overhead of comparing against all exception handling stacks seems
+   not desirable. */
+#define STACK_LIMIT(ptr)     (((ptr) - 1) & ~(THREAD_SIZE - 1))
+
+#define UNW_REGISTER_INFO \
+	PTREGS_INFO(rax), \
+	PTREGS_INFO(rdx), \
+	PTREGS_INFO(rcx), \
+	PTREGS_INFO(rbx), \
+	PTREGS_INFO(rsi), \
+	PTREGS_INFO(rdi), \
+	PTREGS_INFO(rbp), \
+	PTREGS_INFO(rsp), \
+	PTREGS_INFO(r8), \
+	PTREGS_INFO(r9), \
+	PTREGS_INFO(r10), \
+	PTREGS_INFO(r11), \
+	PTREGS_INFO(r12), \
+	PTREGS_INFO(r13), \
+	PTREGS_INFO(r14), \
+	PTREGS_INFO(r15), \
+	PTREGS_INFO(rip)
+
+static inline void arch_unw_init_frame_info(struct unwind_frame_info *info,
+                                            /*const*/ struct pt_regs *regs)
+{
+	info->regs = *regs;
+}
+
+static inline void arch_unw_init_blocked(struct unwind_frame_info *info)
+{
+	extern const char thread_return[];
+
+	memset(&info->regs, 0, sizeof(info->regs));
+	info->regs.rip = (unsigned long)thread_return;
+	info->regs.cs = __KERNEL_CS;
+	__get_user(info->regs.rbp, (unsigned long *)info->task->thread.rsp);
+	info->regs.rsp = info->task->thread.rsp;
+	info->regs.ss = __KERNEL_DS;
+}
+
+extern void arch_unwind_init_running(struct unwind_frame_info *,
+                                     void (*callback)(struct unwind_frame_info *,
+                                                      void *arg),
+                                     void *arg);
+
+static inline int arch_unw_user_mode(const struct unwind_frame_info *info)
+{
+#if 0 /* This can only work when selector register saves/restores
+         are properly annotated (and tracked in UNW_REGISTER_INFO). */
+	return user_mode(&info->regs);
+#else
+	return (long)info->regs.rip >= 0
+	       || (info->regs.rip >= VSYSCALL_START && info->regs.rip < VSYSCALL_END)
+	       || (long)info->regs.rsp >= 0;
+#endif
+}
+
+#else
+
+#define UNW_PC(frame) ((void)(frame), 0)
+
+static inline int arch_unw_user_mode(const void *info)
+{
+	return 0;
+}
+
+#endif
+
+#endif /* _ASM_X86_64_UNWIND_H */
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 256b3b805c5c..bbaed84a686b 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -199,7 +199,7 @@ config UNWIND_INFO
 config STACK_UNWIND
 	bool "Stack unwind support"
 	depends on UNWIND_INFO
-	depends on n
+	depends on X86_64
 	help
 	  This enables more precise stack traces, omitting all unrelated
 	  occurrences of pointers into kernel code from the dump.
-- 
cgit v1.2.3


From 176a2718f408ce92788b29127050b04dfd6e4f68 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Mon, 26 Jun 2006 13:57:41 +0200
Subject: [PATCH] i386: reliable stack trace support (i386)

These are the i386-specific pieces to enable reliable stack traces. This is
going to be even more useful once CFI annotations get added to he assembly
code, namely to entry.S.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/entry.S       | 29 +++++++++++++
 arch/i386/kernel/process.c     |  2 +-
 arch/i386/kernel/traps.c       | 50 ++++++++++++++++-----
 arch/i386/kernel/vmlinux.lds.S |  9 ++++
 include/asm-i386/processor.h   |  2 +-
 include/asm-i386/unwind.h      | 98 ++++++++++++++++++++++++++++++++++++++++++
 lib/Kconfig.debug              |  2 +-
 7 files changed, 179 insertions(+), 13 deletions(-)
 create mode 100644 include/asm-i386/unwind.h

(limited to 'include')

diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index cfc683f153b9..e802f3cac7e3 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -663,6 +663,35 @@ ENTRY(spurious_interrupt_bug)
 	pushl $do_spurious_interrupt_bug
 	jmp error_code
 
+#ifdef CONFIG_STACK_UNWIND
+ENTRY(arch_unwind_init_running)
+	movl	4(%esp), %edx
+	movl	(%esp), %ecx
+	leal	4(%esp), %eax
+	movl	%ebx, EBX(%edx)
+	xorl	%ebx, %ebx
+	movl	%ebx, ECX(%edx)
+	movl	%ebx, EDX(%edx)
+	movl	%esi, ESI(%edx)
+	movl	%edi, EDI(%edx)
+	movl	%ebp, EBP(%edx)
+	movl	%ebx, EAX(%edx)
+	movl	$__USER_DS, DS(%edx)
+	movl	$__USER_DS, ES(%edx)
+	movl	%ebx, ORIG_EAX(%edx)
+	movl	%ecx, EIP(%edx)
+	movl	12(%esp), %ecx
+	movl	$__KERNEL_CS, CS(%edx)
+	movl	%ebx, EFLAGS(%edx)
+	movl	%eax, OLDESP(%edx)
+	movl	8(%esp), %eax
+	movl	%ecx, 8(%esp)
+	movl	EBX(%edx), %ebx
+	movl	$__KERNEL_DS, OLDSS(%edx)
+	jmpl	*%eax
+ENDPROC(arch_unwind_init_running)
+#endif
+
 .section .rodata,"a"
 #include "syscall_table.S"
 
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index 6259afea46d1..525432e3fef7 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -312,7 +312,7 @@ void show_regs(struct pt_regs * regs)
 	cr3 = read_cr3();
 	cr4 = read_cr4_safe();
 	printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4);
-	show_trace(NULL, &regs->esp);
+	show_trace(NULL, regs, &regs->esp);
 }
 
 /*
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index dcc14477af1f..286584667865 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -28,6 +28,7 @@
 #include <linux/utsname.h>
 #include <linux/kprobes.h>
 #include <linux/kexec.h>
+#include <linux/unwind.h>
 
 #ifdef CONFIG_EISA
 #include <linux/ioport.h>
@@ -47,7 +48,7 @@
 #include <asm/desc.h>
 #include <asm/i387.h>
 #include <asm/nmi.h>
-
+#include <asm/unwind.h>
 #include <asm/smp.h>
 #include <asm/arch_hooks.h>
 #include <asm/kdebug.h>
@@ -170,14 +171,43 @@ static inline unsigned long print_context_stack(struct thread_info *tinfo,
 	return ebp;
 }
 
-static void show_trace_log_lvl(struct task_struct *task,
+static asmlinkage void show_trace_unwind(struct unwind_frame_info *info, void *log_lvl)
+{
+	int printed = 0; /* nr of entries already printed on current line */
+
+	while (unwind(info) == 0 && UNW_PC(info)) {
+		printed = print_addr_and_symbol(UNW_PC(info), log_lvl, printed);
+		if (arch_unw_user_mode(info))
+			break;
+	}
+	if (printed)
+		printk("\n");
+}
+
+static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
 			       unsigned long *stack, char *log_lvl)
 {
 	unsigned long ebp;
+	struct unwind_frame_info info;
 
 	if (!task)
 		task = current;
 
+	if (regs) {
+		if (unwind_init_frame_info(&info, task, regs) == 0) {
+			show_trace_unwind(&info, log_lvl);
+			return;
+		}
+	} else if (task == current) {
+		if (unwind_init_running(&info, show_trace_unwind, log_lvl) == 0)
+			return;
+	} else {
+		if (unwind_init_blocked(&info, task) == 0) {
+			show_trace_unwind(&info, log_lvl);
+			return;
+		}
+	}
+
 	if (task == current) {
 		/* Grab ebp right from our regs */
 		asm ("movl %%ebp, %0" : "=r" (ebp) : );
@@ -198,13 +228,13 @@ static void show_trace_log_lvl(struct task_struct *task,
 	}
 }
 
-void show_trace(struct task_struct *task, unsigned long * stack)
+void show_trace(struct task_struct *task, struct pt_regs *regs, unsigned long * stack)
 {
-	show_trace_log_lvl(task, stack, "");
+	show_trace_log_lvl(task, regs, stack, "");
 }
 
-static void show_stack_log_lvl(struct task_struct *task, unsigned long *esp,
-			       char *log_lvl)
+static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
+			       unsigned long *esp, char *log_lvl)
 {
 	unsigned long *stack;
 	int i;
@@ -225,13 +255,13 @@ static void show_stack_log_lvl(struct task_struct *task, unsigned long *esp,
 		printk("%08lx ", *stack++);
 	}
 	printk("\n%sCall Trace:\n", log_lvl);
-	show_trace_log_lvl(task, esp, log_lvl);
+	show_trace_log_lvl(task, regs, esp, log_lvl);
 }
 
 void show_stack(struct task_struct *task, unsigned long *esp)
 {
 	printk("       ");
-	show_stack_log_lvl(task, esp, "");
+	show_stack_log_lvl(task, NULL, esp, "");
 }
 
 /*
@@ -241,7 +271,7 @@ void dump_stack(void)
 {
 	unsigned long stack;
 
-	show_trace(current, &stack);
+	show_trace(current, NULL, &stack);
 }
 
 EXPORT_SYMBOL(dump_stack);
@@ -285,7 +315,7 @@ void show_registers(struct pt_regs *regs)
 		u8 __user *eip;
 
 		printk("\n" KERN_EMERG "Stack: ");
-		show_stack_log_lvl(NULL, (unsigned long *)esp, KERN_EMERG);
+		show_stack_log_lvl(NULL, regs, (unsigned long *)esp, KERN_EMERG);
 
 		printk(KERN_EMERG "Code: ");
 
diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S
index 7512f39c9f25..2d4f1386e2b1 100644
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -71,6 +71,15 @@ SECTIONS
   .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { *(.data.read_mostly) }
   _edata = .;			/* End of data section */
 
+#ifdef CONFIG_STACK_UNWIND
+  . = ALIGN(4);
+  .eh_frame : AT(ADDR(.eh_frame) - LOAD_OFFSET) {
+	__start_unwind = .;
+  	*(.eh_frame)
+	__end_unwind = .;
+  }
+#endif
+
   . = ALIGN(THREAD_SIZE);	/* init_task */
   .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
 	*(.data.init_task)
diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h
index b796210c0f5c..55ea992da329 100644
--- a/include/asm-i386/processor.h
+++ b/include/asm-i386/processor.h
@@ -555,7 +555,7 @@ extern void prepare_to_copy(struct task_struct *tsk);
 extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
 
 extern unsigned long thread_saved_pc(struct task_struct *tsk);
-void show_trace(struct task_struct *task, unsigned long *stack);
+void show_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack);
 
 unsigned long get_wchan(struct task_struct *p);
 
diff --git a/include/asm-i386/unwind.h b/include/asm-i386/unwind.h
new file mode 100644
index 000000000000..1c076897ac21
--- /dev/null
+++ b/include/asm-i386/unwind.h
@@ -0,0 +1,98 @@
+#ifndef _ASM_I386_UNWIND_H
+#define _ASM_I386_UNWIND_H
+
+/*
+ * Copyright (C) 2002-2006 Novell, Inc.
+ *	Jan Beulich <jbeulich@novell.com>
+ * This code is released under version 2 of the GNU GPL.
+ */
+
+#ifdef CONFIG_STACK_UNWIND
+
+#include <linux/sched.h>
+#include <asm/fixmap.h>
+#include <asm/ptrace.h>
+#include <asm/uaccess.h>
+
+struct unwind_frame_info
+{
+	struct pt_regs regs;
+	struct task_struct *task;
+};
+
+#define UNW_PC(frame)        (frame)->regs.eip
+#define UNW_SP(frame)        (frame)->regs.esp
+#ifdef CONFIG_FRAME_POINTER
+#define UNW_FP(frame)        (frame)->regs.ebp
+#define FRAME_RETADDR_OFFSET 4
+#define FRAME_LINK_OFFSET    0
+#define STACK_BOTTOM(tsk)    STACK_LIMIT((tsk)->thread.esp0)
+#define STACK_TOP(tsk)       ((tsk)->thread.esp0)
+#endif
+#define STACK_LIMIT(ptr)     (((ptr) - 1) & ~(THREAD_SIZE - 1))
+
+#define UNW_REGISTER_INFO \
+	PTREGS_INFO(eax), \
+	PTREGS_INFO(ecx), \
+	PTREGS_INFO(edx), \
+	PTREGS_INFO(ebx), \
+	PTREGS_INFO(esp), \
+	PTREGS_INFO(ebp), \
+	PTREGS_INFO(esi), \
+	PTREGS_INFO(edi), \
+	PTREGS_INFO(eip)
+
+static inline void arch_unw_init_frame_info(struct unwind_frame_info *info,
+                                            /*const*/ struct pt_regs *regs)
+{
+	if (user_mode_vm(regs))
+		info->regs = *regs;
+	else {
+		memcpy(&info->regs, regs, offsetof(struct pt_regs, esp));
+		info->regs.esp = (unsigned long)&regs->esp;
+		info->regs.xss = __KERNEL_DS;
+	}
+}
+
+static inline void arch_unw_init_blocked(struct unwind_frame_info *info)
+{
+	memset(&info->regs, 0, sizeof(info->regs));
+	info->regs.eip = info->task->thread.eip;
+	info->regs.xcs = __KERNEL_CS;
+	__get_user(info->regs.ebp, (long *)info->task->thread.esp);
+	info->regs.esp = info->task->thread.esp;
+	info->regs.xss = __KERNEL_DS;
+	info->regs.xds = __USER_DS;
+	info->regs.xes = __USER_DS;
+}
+
+extern asmlinkage void arch_unwind_init_running(struct unwind_frame_info *,
+                                                asmlinkage void (*callback)(struct unwind_frame_info *,
+                                                                            void *arg),
+                                                void *arg);
+
+static inline int arch_unw_user_mode(const struct unwind_frame_info *info)
+{
+#if 0 /* This can only work when selector register and EFLAGS saves/restores
+         are properly annotated (and tracked in UNW_REGISTER_INFO). */
+	return user_mode_vm(&info->regs);
+#else
+	return info->regs.eip < PAGE_OFFSET
+	       || (info->regs.eip >= __fix_to_virt(FIX_VSYSCALL)
+	            && info->regs.eip < __fix_to_virt(FIX_VSYSCALL) + PAGE_SIZE)
+	       || info->regs.esp < PAGE_OFFSET;
+#endif
+}
+
+#else
+
+#define UNW_PC(frame) ((void)(frame), 0)
+
+static inline int arch_unw_user_mode(const void *info)
+{
+	return 0;
+}
+
+#endif
+
+#endif /* _ASM_I386_UNWIND_H */
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index bbaed84a686b..8bab0102ac73 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -199,7 +199,7 @@ config UNWIND_INFO
 config STACK_UNWIND
 	bool "Stack unwind support"
 	depends on UNWIND_INFO
-	depends on X86_64
+	depends on X86
 	help
 	  This enables more precise stack traces, omitting all unrelated
 	  occurrences of pointers into kernel code from the dump.
-- 
cgit v1.2.3


From fe7cacc1c25e286872b878c5d46880b620cd1e2d Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Mon, 26 Jun 2006 13:57:44 +0200
Subject: [PATCH] i386: reliable stack trace support i386 entry.S

To increase the usefulness of reliable stack unwinding, this adds CFI
unwind annotations to many low-level i386 routines.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/entry.S  | 234 +++++++++++++++++++++++++++++++++++++++++++---
 include/asm-i386/dwarf2.h |  54 +++++++++++
 2 files changed, 277 insertions(+), 11 deletions(-)
 create mode 100644 include/asm-i386/dwarf2.h

(limited to 'include')

diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index e802f3cac7e3..e6e4506e749a 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -48,6 +48,7 @@
 #include <asm/smp.h>
 #include <asm/page.h>
 #include <asm/desc.h>
+#include <asm/dwarf2.h>
 #include "irq_vectors.h"
 
 #define nr_syscalls ((syscall_table_size)/4)
@@ -85,31 +86,67 @@ VM_MASK		= 0x00020000
 #define SAVE_ALL \
 	cld; \
 	pushl %es; \
+	CFI_ADJUST_CFA_OFFSET 4;\
+	/*CFI_REL_OFFSET es, 0;*/\
 	pushl %ds; \
+	CFI_ADJUST_CFA_OFFSET 4;\
+	/*CFI_REL_OFFSET ds, 0;*/\
 	pushl %eax; \
+	CFI_ADJUST_CFA_OFFSET 4;\
+	CFI_REL_OFFSET eax, 0;\
 	pushl %ebp; \
+	CFI_ADJUST_CFA_OFFSET 4;\
+	CFI_REL_OFFSET ebp, 0;\
 	pushl %edi; \
+	CFI_ADJUST_CFA_OFFSET 4;\
+	CFI_REL_OFFSET edi, 0;\
 	pushl %esi; \
+	CFI_ADJUST_CFA_OFFSET 4;\
+	CFI_REL_OFFSET esi, 0;\
 	pushl %edx; \
+	CFI_ADJUST_CFA_OFFSET 4;\
+	CFI_REL_OFFSET edx, 0;\
 	pushl %ecx; \
+	CFI_ADJUST_CFA_OFFSET 4;\
+	CFI_REL_OFFSET ecx, 0;\
 	pushl %ebx; \
+	CFI_ADJUST_CFA_OFFSET 4;\
+	CFI_REL_OFFSET ebx, 0;\
 	movl $(__USER_DS), %edx; \
 	movl %edx, %ds; \
 	movl %edx, %es;
 
 #define RESTORE_INT_REGS \
 	popl %ebx;	\
+	CFI_ADJUST_CFA_OFFSET -4;\
+	CFI_RESTORE ebx;\
 	popl %ecx;	\
+	CFI_ADJUST_CFA_OFFSET -4;\
+	CFI_RESTORE ecx;\
 	popl %edx;	\
+	CFI_ADJUST_CFA_OFFSET -4;\
+	CFI_RESTORE edx;\
 	popl %esi;	\
+	CFI_ADJUST_CFA_OFFSET -4;\
+	CFI_RESTORE esi;\
 	popl %edi;	\
+	CFI_ADJUST_CFA_OFFSET -4;\
+	CFI_RESTORE edi;\
 	popl %ebp;	\
-	popl %eax
+	CFI_ADJUST_CFA_OFFSET -4;\
+	CFI_RESTORE ebp;\
+	popl %eax;	\
+	CFI_ADJUST_CFA_OFFSET -4;\
+	CFI_RESTORE eax
 
 #define RESTORE_REGS	\
 	RESTORE_INT_REGS; \
 1:	popl %ds;	\
+	CFI_ADJUST_CFA_OFFSET -4;\
+	/*CFI_RESTORE ds;*/\
 2:	popl %es;	\
+	CFI_ADJUST_CFA_OFFSET -4;\
+	/*CFI_RESTORE es;*/\
 .section .fixup,"ax";	\
 3:	movl $0,(%esp);	\
 	jmp 1b;		\
@@ -122,13 +159,43 @@ VM_MASK		= 0x00020000
 	.long 2b,4b;	\
 .previous
 
+#define RING0_INT_FRAME \
+	CFI_STARTPROC simple;\
+	CFI_DEF_CFA esp, 3*4;\
+	/*CFI_OFFSET cs, -2*4;*/\
+	CFI_OFFSET eip, -3*4
+
+#define RING0_EC_FRAME \
+	CFI_STARTPROC simple;\
+	CFI_DEF_CFA esp, 4*4;\
+	/*CFI_OFFSET cs, -2*4;*/\
+	CFI_OFFSET eip, -3*4
+
+#define RING0_PTREGS_FRAME \
+	CFI_STARTPROC simple;\
+	CFI_DEF_CFA esp, OLDESP-EBX;\
+	/*CFI_OFFSET cs, CS-OLDESP;*/\
+	CFI_OFFSET eip, EIP-OLDESP;\
+	/*CFI_OFFSET es, ES-OLDESP;*/\
+	/*CFI_OFFSET ds, DS-OLDESP;*/\
+	CFI_OFFSET eax, EAX-OLDESP;\
+	CFI_OFFSET ebp, EBP-OLDESP;\
+	CFI_OFFSET edi, EDI-OLDESP;\
+	CFI_OFFSET esi, ESI-OLDESP;\
+	CFI_OFFSET edx, EDX-OLDESP;\
+	CFI_OFFSET ecx, ECX-OLDESP;\
+	CFI_OFFSET ebx, EBX-OLDESP
 
 ENTRY(ret_from_fork)
+	CFI_STARTPROC
 	pushl %eax
+	CFI_ADJUST_CFA_OFFSET -4
 	call schedule_tail
 	GET_THREAD_INFO(%ebp)
 	popl %eax
+	CFI_ADJUST_CFA_OFFSET -4
 	jmp syscall_exit
+	CFI_ENDPROC
 
 /*
  * Return to user mode is not as complex as all this looks,
@@ -139,6 +206,7 @@ ENTRY(ret_from_fork)
 
 	# userspace resumption stub bypassing syscall exit tracing
 	ALIGN
+	RING0_PTREGS_FRAME
 ret_from_exception:
 	preempt_stop
 ret_from_intr:
@@ -171,20 +239,33 @@ need_resched:
 	call preempt_schedule_irq
 	jmp need_resched
 #endif
+	CFI_ENDPROC
 
 /* SYSENTER_RETURN points to after the "sysenter" instruction in
    the vsyscall page.  See vsyscall-sysentry.S, which defines the symbol.  */
 
 	# sysenter call handler stub
 ENTRY(sysenter_entry)
+	CFI_STARTPROC simple
+	CFI_DEF_CFA esp, 0
+	CFI_REGISTER esp, ebp
 	movl TSS_sysenter_esp0(%esp),%esp
 sysenter_past_esp:
 	sti
 	pushl $(__USER_DS)
+	CFI_ADJUST_CFA_OFFSET 4
+	/*CFI_REL_OFFSET ss, 0*/
 	pushl %ebp
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET esp, 0
 	pushfl
+	CFI_ADJUST_CFA_OFFSET 4
 	pushl $(__USER_CS)
+	CFI_ADJUST_CFA_OFFSET 4
+	/*CFI_REL_OFFSET cs, 0*/
 	pushl $SYSENTER_RETURN
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET eip, 0
 
 /*
  * Load the potential sixth argument from user stack.
@@ -199,6 +280,7 @@ sysenter_past_esp:
 .previous
 
 	pushl %eax
+	CFI_ADJUST_CFA_OFFSET 4
 	SAVE_ALL
 	GET_THREAD_INFO(%ebp)
 
@@ -219,11 +301,14 @@ sysenter_past_esp:
 	xorl %ebp,%ebp
 	sti
 	sysexit
+	CFI_ENDPROC
 
 
 	# system call handler stub
 ENTRY(system_call)
+	RING0_INT_FRAME			# can't unwind into user space anyway
 	pushl %eax			# save orig_eax
+	CFI_ADJUST_CFA_OFFSET 4
 	SAVE_ALL
 	GET_THREAD_INFO(%ebp)
 	testl $TF_MASK,EFLAGS(%esp)
@@ -256,10 +341,12 @@ restore_all:
 	movb CS(%esp), %al
 	andl $(VM_MASK | (4 << 8) | 3), %eax
 	cmpl $((4 << 8) | 3), %eax
+	CFI_REMEMBER_STATE
 	je ldt_ss			# returning to user-space with LDT SS
 restore_nocheck:
 	RESTORE_REGS
 	addl $4, %esp
+	CFI_ADJUST_CFA_OFFSET -4
 1:	iret
 .section .fixup,"ax"
 iret_exc:
@@ -273,6 +360,7 @@ iret_exc:
 	.long 1b,iret_exc
 .previous
 
+	CFI_RESTORE_STATE
 ldt_ss:
 	larl OLDSS(%esp), %eax
 	jnz restore_nocheck
@@ -285,11 +373,13 @@ ldt_ss:
 	 * CPUs, which we can try to work around to make
 	 * dosemu and wine happy. */
 	subl $8, %esp		# reserve space for switch16 pointer
+	CFI_ADJUST_CFA_OFFSET 8
 	cli
 	movl %esp, %eax
 	/* Set up the 16bit stack frame with switch32 pointer on top,
 	 * and a switch16 pointer on top of the current frame. */
 	call setup_x86_bogus_stack
+	CFI_ADJUST_CFA_OFFSET -8	# frame has moved
 	RESTORE_REGS
 	lss 20+4(%esp), %esp	# switch to 16bit stack
 1:	iret
@@ -297,9 +387,11 @@ ldt_ss:
 	.align 4
 	.long 1b,iret_exc
 .previous
+	CFI_ENDPROC
 
 	# perform work that needs to be done immediately before resumption
 	ALIGN
+	RING0_PTREGS_FRAME		# can't unwind into user space anyway
 work_pending:
 	testb $_TIF_NEED_RESCHED, %cl
 	jz work_notifysig
@@ -329,8 +421,10 @@ work_notifysig:				# deal with pending signals and
 work_notifysig_v86:
 #ifdef CONFIG_VM86
 	pushl %ecx			# save ti_flags for do_notify_resume
+	CFI_ADJUST_CFA_OFFSET 4
 	call save_v86_state		# %eax contains pt_regs pointer
 	popl %ecx
+	CFI_ADJUST_CFA_OFFSET -4
 	movl %eax, %esp
 	xorl %edx, %edx
 	call do_notify_resume
@@ -363,19 +457,21 @@ syscall_exit_work:
 	movl $1, %edx
 	call do_syscall_trace
 	jmp resume_userspace
+	CFI_ENDPROC
 
-	ALIGN
+	RING0_INT_FRAME			# can't unwind into user space anyway
 syscall_fault:
 	pushl %eax			# save orig_eax
+	CFI_ADJUST_CFA_OFFSET 4
 	SAVE_ALL
 	GET_THREAD_INFO(%ebp)
 	movl $-EFAULT,EAX(%esp)
 	jmp resume_userspace
 
-	ALIGN
 syscall_badsys:
 	movl $-ENOSYS,EAX(%esp)
 	jmp resume_userspace
+	CFI_ENDPROC
 
 #define FIXUP_ESPFIX_STACK \
 	movl %esp, %eax; \
@@ -387,16 +483,21 @@ syscall_badsys:
 	movl %eax, %esp;
 #define UNWIND_ESPFIX_STACK \
 	pushl %eax; \
+	CFI_ADJUST_CFA_OFFSET 4; \
 	movl %ss, %eax; \
 	/* see if on 16bit stack */ \
 	cmpw $__ESPFIX_SS, %ax; \
-	jne 28f; \
-	movl $__KERNEL_DS, %edx; \
-	movl %edx, %ds; \
-	movl %edx, %es; \
+	je 28f; \
+27:	popl %eax; \
+	CFI_ADJUST_CFA_OFFSET -4; \
+.section .fixup,"ax"; \
+28:	movl $__KERNEL_DS, %eax; \
+	movl %eax, %ds; \
+	movl %eax, %es; \
 	/* switch to 32bit stack */ \
-	FIXUP_ESPFIX_STACK \
-28:	popl %eax;
+	FIXUP_ESPFIX_STACK; \
+	jmp 27b; \
+.previous
 
 /*
  * Build the entry stubs and pointer table with
@@ -408,9 +509,14 @@ ENTRY(interrupt)
 
 vector=0
 ENTRY(irq_entries_start)
+	RING0_INT_FRAME
 .rept NR_IRQS
 	ALIGN
+ .if vector
+	CFI_ADJUST_CFA_OFFSET -4
+ .endif
 1:	pushl $vector-256
+	CFI_ADJUST_CFA_OFFSET 4
 	jmp common_interrupt
 .data
 	.long 1b
@@ -424,60 +530,99 @@ common_interrupt:
 	movl %esp,%eax
 	call do_IRQ
 	jmp ret_from_intr
+	CFI_ENDPROC
 
 #define BUILD_INTERRUPT(name, nr)	\
 ENTRY(name)				\
+	RING0_INT_FRAME;		\
 	pushl $nr-256;			\
-	SAVE_ALL			\
+	CFI_ADJUST_CFA_OFFSET 4;	\
+	SAVE_ALL;			\
 	movl %esp,%eax;			\
 	call smp_/**/name;		\
-	jmp ret_from_intr;
+	jmp ret_from_intr;	\
+	CFI_ENDPROC
 
 /* The include is where all of the SMP etc. interrupts come from */
 #include "entry_arch.h"
 
 ENTRY(divide_error)
+	RING0_INT_FRAME
 	pushl $0			# no error code
+	CFI_ADJUST_CFA_OFFSET 4
 	pushl $do_divide_error
+	CFI_ADJUST_CFA_OFFSET 4
 	ALIGN
 error_code:
 	pushl %ds
+	CFI_ADJUST_CFA_OFFSET 4
+	/*CFI_REL_OFFSET ds, 0*/
 	pushl %eax
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET eax, 0
 	xorl %eax, %eax
 	pushl %ebp
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET ebp, 0
 	pushl %edi
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET edi, 0
 	pushl %esi
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET esi, 0
 	pushl %edx
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET edx, 0
 	decl %eax			# eax = -1
 	pushl %ecx
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET ecx, 0
 	pushl %ebx
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET ebx, 0
 	cld
 	pushl %es
+	CFI_ADJUST_CFA_OFFSET 4
+	/*CFI_REL_OFFSET es, 0*/
 	UNWIND_ESPFIX_STACK
 	popl %ecx
+	CFI_ADJUST_CFA_OFFSET -4
+	/*CFI_REGISTER es, ecx*/
 	movl ES(%esp), %edi		# get the function address
 	movl ORIG_EAX(%esp), %edx	# get the error code
 	movl %eax, ORIG_EAX(%esp)
 	movl %ecx, ES(%esp)
+	/*CFI_REL_OFFSET es, ES*/
 	movl $(__USER_DS), %ecx
 	movl %ecx, %ds
 	movl %ecx, %es
 	movl %esp,%eax			# pt_regs pointer
 	call *%edi
 	jmp ret_from_exception
+	CFI_ENDPROC
 
 ENTRY(coprocessor_error)
+	RING0_INT_FRAME
 	pushl $0
+	CFI_ADJUST_CFA_OFFSET 4
 	pushl $do_coprocessor_error
+	CFI_ADJUST_CFA_OFFSET 4
 	jmp error_code
+	CFI_ENDPROC
 
 ENTRY(simd_coprocessor_error)
+	RING0_INT_FRAME
 	pushl $0
+	CFI_ADJUST_CFA_OFFSET 4
 	pushl $do_simd_coprocessor_error
+	CFI_ADJUST_CFA_OFFSET 4
 	jmp error_code
+	CFI_ENDPROC
 
 ENTRY(device_not_available)
+	RING0_INT_FRAME
 	pushl $-1			# mark this as an int
+	CFI_ADJUST_CFA_OFFSET 4
 	SAVE_ALL
 	movl %cr0, %eax
 	testl $0x4, %eax		# EM (math emulation bit)
@@ -487,9 +632,12 @@ ENTRY(device_not_available)
 	jmp ret_from_exception
 device_not_available_emulate:
 	pushl $0			# temporary storage for ORIG_EIP
+	CFI_ADJUST_CFA_OFFSET 4
 	call math_emulate
 	addl $4, %esp
+	CFI_ADJUST_CFA_OFFSET -4
 	jmp ret_from_exception
+	CFI_ENDPROC
 
 /*
  * Debug traps and NMI can happen at the one SYSENTER instruction
@@ -514,16 +662,19 @@ label:						\
 	pushl $sysenter_past_esp
 
 KPROBE_ENTRY(debug)
+	RING0_INT_FRAME
 	cmpl $sysenter_entry,(%esp)
 	jne debug_stack_correct
 	FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
 debug_stack_correct:
 	pushl $-1			# mark this as an int
+	CFI_ADJUST_CFA_OFFSET 4
 	SAVE_ALL
 	xorl %edx,%edx			# error code 0
 	movl %esp,%eax			# pt_regs pointer
 	call do_debug
 	jmp ret_from_exception
+	CFI_ENDPROC
 	.previous .text
 /*
  * NMI is doubly nasty. It can happen _while_ we're handling
@@ -534,14 +685,18 @@ debug_stack_correct:
  * fault happened on the sysenter path.
  */
 ENTRY(nmi)
+	RING0_INT_FRAME
 	pushl %eax
+	CFI_ADJUST_CFA_OFFSET 4
 	movl %ss, %eax
 	cmpw $__ESPFIX_SS, %ax
 	popl %eax
+	CFI_ADJUST_CFA_OFFSET -4
 	je nmi_16bit_stack
 	cmpl $sysenter_entry,(%esp)
 	je nmi_stack_fixup
 	pushl %eax
+	CFI_ADJUST_CFA_OFFSET 4
 	movl %esp,%eax
 	/* Do not access memory above the end of our stack page,
 	 * it might not exist.
@@ -549,16 +704,19 @@ ENTRY(nmi)
 	andl $(THREAD_SIZE-1),%eax
 	cmpl $(THREAD_SIZE-20),%eax
 	popl %eax
+	CFI_ADJUST_CFA_OFFSET -4
 	jae nmi_stack_correct
 	cmpl $sysenter_entry,12(%esp)
 	je nmi_debug_stack_check
 nmi_stack_correct:
 	pushl %eax
+	CFI_ADJUST_CFA_OFFSET 4
 	SAVE_ALL
 	xorl %edx,%edx		# zero error code
 	movl %esp,%eax		# pt_regs pointer
 	call do_nmi
 	jmp restore_all
+	CFI_ENDPROC
 
 nmi_stack_fixup:
 	FIX_STACK(12,nmi_stack_correct, 1)
@@ -574,97 +732,150 @@ nmi_debug_stack_check:
 	jmp nmi_stack_correct
 
 nmi_16bit_stack:
+	RING0_INT_FRAME
 	/* create the pointer to lss back */
 	pushl %ss
+	CFI_ADJUST_CFA_OFFSET 4
 	pushl %esp
+	CFI_ADJUST_CFA_OFFSET 4
 	movzwl %sp, %esp
 	addw $4, (%esp)
 	/* copy the iret frame of 12 bytes */
 	.rept 3
 	pushl 16(%esp)
+	CFI_ADJUST_CFA_OFFSET 4
 	.endr
 	pushl %eax
+	CFI_ADJUST_CFA_OFFSET 4
 	SAVE_ALL
 	FIXUP_ESPFIX_STACK		# %eax == %esp
+	CFI_ADJUST_CFA_OFFSET -20	# the frame has now moved
 	xorl %edx,%edx			# zero error code
 	call do_nmi
 	RESTORE_REGS
 	lss 12+4(%esp), %esp		# back to 16bit stack
 1:	iret
+	CFI_ENDPROC
 .section __ex_table,"a"
 	.align 4
 	.long 1b,iret_exc
 .previous
 
 KPROBE_ENTRY(int3)
+	RING0_INT_FRAME
 	pushl $-1			# mark this as an int
+	CFI_ADJUST_CFA_OFFSET 4
 	SAVE_ALL
 	xorl %edx,%edx		# zero error code
 	movl %esp,%eax		# pt_regs pointer
 	call do_int3
 	jmp ret_from_exception
+	CFI_ENDPROC
 	.previous .text
 
 ENTRY(overflow)
+	RING0_INT_FRAME
 	pushl $0
+	CFI_ADJUST_CFA_OFFSET 4
 	pushl $do_overflow
+	CFI_ADJUST_CFA_OFFSET 4
 	jmp error_code
+	CFI_ENDPROC
 
 ENTRY(bounds)
+	RING0_INT_FRAME
 	pushl $0
+	CFI_ADJUST_CFA_OFFSET 4
 	pushl $do_bounds
+	CFI_ADJUST_CFA_OFFSET 4
 	jmp error_code
+	CFI_ENDPROC
 
 ENTRY(invalid_op)
+	RING0_INT_FRAME
 	pushl $0
+	CFI_ADJUST_CFA_OFFSET 4
 	pushl $do_invalid_op
+	CFI_ADJUST_CFA_OFFSET 4
 	jmp error_code
+	CFI_ENDPROC
 
 ENTRY(coprocessor_segment_overrun)
+	RING0_INT_FRAME
 	pushl $0
+	CFI_ADJUST_CFA_OFFSET 4
 	pushl $do_coprocessor_segment_overrun
+	CFI_ADJUST_CFA_OFFSET 4
 	jmp error_code
+	CFI_ENDPROC
 
 ENTRY(invalid_TSS)
+	RING0_EC_FRAME
 	pushl $do_invalid_TSS
+	CFI_ADJUST_CFA_OFFSET 4
 	jmp error_code
+	CFI_ENDPROC
 
 ENTRY(segment_not_present)
+	RING0_EC_FRAME
 	pushl $do_segment_not_present
+	CFI_ADJUST_CFA_OFFSET 4
 	jmp error_code
+	CFI_ENDPROC
 
 ENTRY(stack_segment)
+	RING0_EC_FRAME
 	pushl $do_stack_segment
+	CFI_ADJUST_CFA_OFFSET 4
 	jmp error_code
+	CFI_ENDPROC
 
 KPROBE_ENTRY(general_protection)
+	RING0_EC_FRAME
 	pushl $do_general_protection
+	CFI_ADJUST_CFA_OFFSET 4
 	jmp error_code
+	CFI_ENDPROC
 	.previous .text
 
 ENTRY(alignment_check)
+	RING0_EC_FRAME
 	pushl $do_alignment_check
+	CFI_ADJUST_CFA_OFFSET 4
 	jmp error_code
+	CFI_ENDPROC
 
 KPROBE_ENTRY(page_fault)
+	RING0_EC_FRAME
 	pushl $do_page_fault
+	CFI_ADJUST_CFA_OFFSET 4
 	jmp error_code
+	CFI_ENDPROC
 	.previous .text
 
 #ifdef CONFIG_X86_MCE
 ENTRY(machine_check)
+	RING0_INT_FRAME
 	pushl $0
+	CFI_ADJUST_CFA_OFFSET 4
 	pushl machine_check_vector
+	CFI_ADJUST_CFA_OFFSET 4
 	jmp error_code
+	CFI_ENDPROC
 #endif
 
 ENTRY(spurious_interrupt_bug)
+	RING0_INT_FRAME
 	pushl $0
+	CFI_ADJUST_CFA_OFFSET 4
 	pushl $do_spurious_interrupt_bug
+	CFI_ADJUST_CFA_OFFSET 4
 	jmp error_code
+	CFI_ENDPROC
 
 #ifdef CONFIG_STACK_UNWIND
 ENTRY(arch_unwind_init_running)
+	CFI_STARTPROC
 	movl	4(%esp), %edx
 	movl	(%esp), %ecx
 	leal	4(%esp), %eax
@@ -689,6 +900,7 @@ ENTRY(arch_unwind_init_running)
 	movl	EBX(%edx), %ebx
 	movl	$__KERNEL_DS, OLDSS(%edx)
 	jmpl	*%eax
+	CFI_ENDPROC
 ENDPROC(arch_unwind_init_running)
 #endif
 
diff --git a/include/asm-i386/dwarf2.h b/include/asm-i386/dwarf2.h
new file mode 100644
index 000000000000..2280f6272f80
--- /dev/null
+++ b/include/asm-i386/dwarf2.h
@@ -0,0 +1,54 @@
+#ifndef _DWARF2_H
+#define _DWARF2_H
+
+#include <linux/config.h>
+
+#ifndef __ASSEMBLY__
+#warning "asm/dwarf2.h should be only included in pure assembly files"
+#endif
+
+/*
+   Macros for dwarf2 CFI unwind table entries.
+   See "as.info" for details on these pseudo ops. Unfortunately
+   they are only supported in very new binutils, so define them
+   away for older version.
+ */
+
+#ifdef CONFIG_UNWIND_INFO
+
+#define CFI_STARTPROC .cfi_startproc
+#define CFI_ENDPROC .cfi_endproc
+#define CFI_DEF_CFA .cfi_def_cfa
+#define CFI_DEF_CFA_REGISTER .cfi_def_cfa_register
+#define CFI_DEF_CFA_OFFSET .cfi_def_cfa_offset
+#define CFI_ADJUST_CFA_OFFSET .cfi_adjust_cfa_offset
+#define CFI_OFFSET .cfi_offset
+#define CFI_REL_OFFSET .cfi_rel_offset
+#define CFI_REGISTER .cfi_register
+#define CFI_RESTORE .cfi_restore
+#define CFI_REMEMBER_STATE .cfi_remember_state
+#define CFI_RESTORE_STATE .cfi_restore_state
+
+#else
+
+/* Due to the structure of pre-exisiting code, don't use assembler line
+   comment character # to ignore the arguments. Instead, use a dummy macro. */
+.macro ignore a=0, b=0, c=0, d=0
+.endm
+
+#define CFI_STARTPROC	ignore
+#define CFI_ENDPROC	ignore
+#define CFI_DEF_CFA	ignore
+#define CFI_DEF_CFA_REGISTER	ignore
+#define CFI_DEF_CFA_OFFSET	ignore
+#define CFI_ADJUST_CFA_OFFSET	ignore
+#define CFI_OFFSET	ignore
+#define CFI_REL_OFFSET	ignore
+#define CFI_REGISTER	ignore
+#define CFI_RESTORE	ignore
+#define CFI_REMEMBER_STATE ignore
+#define CFI_RESTORE_STATE ignore
+
+#endif
+
+#endif
-- 
cgit v1.2.3


From c33bd9aac0597eeedaaa01ea5aafe456894b2f2b Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Mon, 26 Jun 2006 13:57:47 +0200
Subject: [PATCH] i386/x86-64: fall back to old-style call trace if no
 unwinding

If no unwinding is possible at all for a certain exception instance,
fall back to the old style call trace instead of not showing any trace
at all.

Also, allow setting the stack trace mode at the command line.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/traps.c    | 46 ++++++++++++++++++++++++++++------------
 arch/x86_64/kernel/traps.c  | 51 +++++++++++++++++++++++++++++++--------------
 include/asm-i386/unwind.h   |  8 +++----
 include/asm-x86_64/unwind.h |  8 +++----
 include/linux/unwind.h      |  8 +++----
 kernel/unwind.c             |  7 +++----
 6 files changed, 83 insertions(+), 45 deletions(-)

(limited to 'include')

diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 286584667865..78464097470a 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -93,6 +93,7 @@ asmlinkage void spurious_interrupt_bug(void);
 asmlinkage void machine_check(void);
 
 static int kstack_depth_to_print = 24;
+static int call_trace = 1;
 ATOMIC_NOTIFIER_HEAD(i386die_chain);
 
 int register_die_notifier(struct notifier_block *nb)
@@ -171,40 +172,47 @@ static inline unsigned long print_context_stack(struct thread_info *tinfo,
 	return ebp;
 }
 
-static asmlinkage void show_trace_unwind(struct unwind_frame_info *info, void *log_lvl)
+static asmlinkage int show_trace_unwind(struct unwind_frame_info *info, void *log_lvl)
 {
+	int n = 0;
 	int printed = 0; /* nr of entries already printed on current line */
 
 	while (unwind(info) == 0 && UNW_PC(info)) {
+		++n;
 		printed = print_addr_and_symbol(UNW_PC(info), log_lvl, printed);
 		if (arch_unw_user_mode(info))
 			break;
 	}
 	if (printed)
 		printk("\n");
+	return n;
 }
 
 static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
 			       unsigned long *stack, char *log_lvl)
 {
 	unsigned long ebp;
-	struct unwind_frame_info info;
 
 	if (!task)
 		task = current;
 
-	if (regs) {
-		if (unwind_init_frame_info(&info, task, regs) == 0) {
-			show_trace_unwind(&info, log_lvl);
-			return;
+	if (call_trace >= 0) {
+		int unw_ret = 0;
+		struct unwind_frame_info info;
+
+		if (regs) {
+			if (unwind_init_frame_info(&info, task, regs) == 0)
+				unw_ret = show_trace_unwind(&info, log_lvl);
+		} else if (task == current)
+			unw_ret = unwind_init_running(&info, show_trace_unwind, log_lvl);
+		else {
+			if (unwind_init_blocked(&info, task) == 0)
+				unw_ret = show_trace_unwind(&info, log_lvl);
 		}
-	} else if (task == current) {
-		if (unwind_init_running(&info, show_trace_unwind, log_lvl) == 0)
-			return;
-	} else {
-		if (unwind_init_blocked(&info, task) == 0) {
-			show_trace_unwind(&info, log_lvl);
-			return;
+		if (unw_ret > 0) {
+			if (call_trace > 0)
+				return;
+			printk("%sLegacy call trace:\n", log_lvl);
 		}
 	}
 
@@ -1245,3 +1253,15 @@ static int __init kstack_setup(char *s)
 	return 1;
 }
 __setup("kstack=", kstack_setup);
+
+static int __init call_trace_setup(char *s)
+{
+	if (strcmp(s, "old") == 0)
+		call_trace = -1;
+	else if (strcmp(s, "both") == 0)
+		call_trace = 0;
+	else if (strcmp(s, "new") == 0)
+		call_trace = 1;
+	return 1;
+}
+__setup("call_trace=", call_trace_setup);
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index eb1534ff1f5f..bd0891f4c2c7 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -107,6 +107,7 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
 }
 
 static int kstack_depth_to_print = 10;
+static int call_trace = 1;
 
 #ifdef CONFIG_KALLSYMS
 #include <linux/kallsyms.h> 
@@ -190,11 +191,12 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
 	return NULL;
 }
 
-static void show_trace_unwind(struct unwind_frame_info *info, void *context)
+static int show_trace_unwind(struct unwind_frame_info *info, void *context)
 {
-	int i = 11;
+	int i = 11, n = 0;
 
 	while (unwind(info) == 0 && UNW_PC(info)) {
+		++n;
 		if (i > 50) {
 			printk("\n       ");
 			i = 7;
@@ -205,6 +207,7 @@ static void show_trace_unwind(struct unwind_frame_info *info, void *context)
 			break;
 	}
 	printk("\n");
+	return n;
 }
 
 /*
@@ -218,27 +221,32 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s
 {
 	const unsigned cpu = safe_smp_processor_id();
 	unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
-	int i;
+	int i = 11;
 	unsigned used = 0;
-	struct unwind_frame_info info;
 
 	printk("\nCall Trace:");
 
 	if (!tsk)
 		tsk = current;
 
-	if (regs) {
-		if (unwind_init_frame_info(&info, tsk, regs) == 0) {
-			show_trace_unwind(&info, NULL);
-			return;
+	if (call_trace >= 0) {
+		int unw_ret = 0;
+		struct unwind_frame_info info;
+
+		if (regs) {
+			if (unwind_init_frame_info(&info, tsk, regs) == 0)
+				unw_ret = show_trace_unwind(&info, NULL);
+		} else if (tsk == current)
+			unw_ret = unwind_init_running(&info, show_trace_unwind, NULL);
+		else {
+			if (unwind_init_blocked(&info, tsk) == 0)
+				unw_ret = show_trace_unwind(&info, NULL);
 		}
-	} else if (tsk == current) {
-		if (unwind_init_running(&info, show_trace_unwind, NULL) == 0)
-			return;
-	} else {
-		if (unwind_init_blocked(&info, tsk) == 0) {
-			show_trace_unwind(&info, NULL);
-			return;
+		if (unw_ret > 0) {
+			if (call_trace > 0)
+				return;
+			printk("Legacy call trace:");
+			i = 18;
 		}
 	}
 
@@ -264,7 +272,7 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s
 		} \
 	} while (0)
 
-	for(i = 11; ; ) {
+	for(; ; ) {
 		const char *id;
 		unsigned long *estack_end;
 		estack_end = in_exception_stack(cpu, (unsigned long)stack,
@@ -1052,3 +1060,14 @@ static int __init kstack_setup(char *s)
 }
 __setup("kstack=", kstack_setup);
 
+static int __init call_trace_setup(char *s)
+{
+	if (strcmp(s, "old") == 0)
+		call_trace = -1;
+	else if (strcmp(s, "both") == 0)
+		call_trace = 0;
+	else if (strcmp(s, "new") == 0)
+		call_trace = 1;
+	return 1;
+}
+__setup("call_trace=", call_trace_setup);
diff --git a/include/asm-i386/unwind.h b/include/asm-i386/unwind.h
index 1c076897ac21..d480f2e38215 100644
--- a/include/asm-i386/unwind.h
+++ b/include/asm-i386/unwind.h
@@ -66,10 +66,10 @@ static inline void arch_unw_init_blocked(struct unwind_frame_info *info)
 	info->regs.xes = __USER_DS;
 }
 
-extern asmlinkage void arch_unwind_init_running(struct unwind_frame_info *,
-                                                asmlinkage void (*callback)(struct unwind_frame_info *,
-                                                                            void *arg),
-                                                void *arg);
+extern asmlinkage int arch_unwind_init_running(struct unwind_frame_info *,
+                                               asmlinkage int (*callback)(struct unwind_frame_info *,
+                                                                          void *arg),
+                                               void *arg);
 
 static inline int arch_unw_user_mode(const struct unwind_frame_info *info)
 {
diff --git a/include/asm-x86_64/unwind.h b/include/asm-x86_64/unwind.h
index 4f61de246179..f3e7124effe3 100644
--- a/include/asm-x86_64/unwind.h
+++ b/include/asm-x86_64/unwind.h
@@ -75,10 +75,10 @@ static inline void arch_unw_init_blocked(struct unwind_frame_info *info)
 	info->regs.ss = __KERNEL_DS;
 }
 
-extern void arch_unwind_init_running(struct unwind_frame_info *,
-                                     void (*callback)(struct unwind_frame_info *,
-                                                      void *arg),
-                                     void *arg);
+extern int arch_unwind_init_running(struct unwind_frame_info *,
+                                    int (*callback)(struct unwind_frame_info *,
+                                                    void *arg),
+                                    void *arg);
 
 static inline int arch_unw_user_mode(const struct unwind_frame_info *info)
 {
diff --git a/include/linux/unwind.h b/include/linux/unwind.h
index 0295aa789ab4..13c7b2cd87ce 100644
--- a/include/linux/unwind.h
+++ b/include/linux/unwind.h
@@ -49,8 +49,8 @@ extern int unwind_init_blocked(struct unwind_frame_info *,
  * Prepare to unwind the currently running thread.
  */
 extern int unwind_init_running(struct unwind_frame_info *,
-                               asmlinkage void (*callback)(struct unwind_frame_info *,
-                                                           void *arg),
+                               asmlinkage int (*callback)(struct unwind_frame_info *,
+                                                          void *arg),
                                void *arg);
 
 /*
@@ -97,8 +97,8 @@ static inline int unwind_init_blocked(struct unwind_frame_info *info,
 }
 
 static inline int unwind_init_running(struct unwind_frame_info *info,
-                                      asmlinkage void (*cb)(struct unwind_frame_info *,
-                                                            void *arg),
+                                      asmlinkage int (*cb)(struct unwind_frame_info *,
+                                                           void *arg),
                                       void *arg)
 {
 	return -ENOSYS;
diff --git a/kernel/unwind.c b/kernel/unwind.c
index d36bcd3ad3b5..0421035272d9 100644
--- a/kernel/unwind.c
+++ b/kernel/unwind.c
@@ -885,14 +885,13 @@ EXPORT_SYMBOL(unwind_init_blocked);
  * Prepare to unwind the currently running thread.
  */
 int unwind_init_running(struct unwind_frame_info *info,
-                        asmlinkage void (*callback)(struct unwind_frame_info *,
-                                                    void *arg),
+                        asmlinkage int (*callback)(struct unwind_frame_info *,
+                                                   void *arg),
                         void *arg)
 {
 	info->task = current;
-	arch_unwind_init_running(info, callback, arg);
 
-	return 0;
+	return arch_unwind_init_running(info, callback, arg);
 }
 EXPORT_SYMBOL(unwind_init_running);
 
-- 
cgit v1.2.3


From 83f4fcce7fdd213bd570b899862c3838871f8cf7 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Mon, 26 Jun 2006 13:57:50 +0200
Subject: [PATCH] x86_64: allow unwinder to build without module support

Add proper conditionals to be able to build with CONFIG_MODULES=n.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/unwind.h | 8 ++++++++
 kernel/unwind.c        | 4 ++++
 2 files changed, 12 insertions(+)

(limited to 'include')

diff --git a/include/linux/unwind.h b/include/linux/unwind.h
index 13c7b2cd87ce..ce48e2cd37a2 100644
--- a/include/linux/unwind.h
+++ b/include/linux/unwind.h
@@ -29,12 +29,16 @@ struct module;
  */
 extern void unwind_init(void);
 
+#ifdef CONFIG_MODULES
+
 extern void *unwind_add_table(struct module *,
                               const void *table_start,
                               unsigned long table_size);
 
 extern void unwind_remove_table(void *handle, int init_only);
 
+#endif
+
 extern int unwind_init_frame_info(struct unwind_frame_info *,
                                   struct task_struct *,
                                   /*const*/ struct pt_regs *);
@@ -72,6 +76,8 @@ struct unwind_frame_info {};
 
 static inline void unwind_init(void) {}
 
+#ifdef CONFIG_MODULES
+
 static inline void *unwind_add_table(struct module *mod,
                                      const void *table_start,
                                      unsigned long table_size)
@@ -79,6 +85,8 @@ static inline void *unwind_add_table(struct module *mod,
 	return NULL;
 }
 
+#endif
+
 static inline void unwind_remove_table(void *handle, int init_only)
 {
 }
diff --git a/kernel/unwind.c b/kernel/unwind.c
index 0421035272d9..f69c804c8e62 100644
--- a/kernel/unwind.c
+++ b/kernel/unwind.c
@@ -172,6 +172,8 @@ void __init unwind_init(void)
 	                  __start_unwind, __end_unwind - __start_unwind);
 }
 
+#ifdef CONFIG_MODULES
+
 /* Must be called with module_mutex held. */
 void *unwind_add_table(struct module *module,
                        const void *table_start,
@@ -253,6 +255,8 @@ void unwind_remove_table(void *handle, int init_only)
 		kfree(table);
 }
 
+#endif /* CONFIG_MODULES */
+
 static uleb128_t get_uleb128(const u8 **pcur, const u8 *end)
 {
 	const u8 *cur = *pcur;
-- 
cgit v1.2.3


From d2ae5b5f6afd3c4caaf82444102a4372135cb994 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Mon, 26 Jun 2006 13:57:56 +0200
Subject: [PATCH] x86_64: Get rid of pud_offset_k / __pud_offset_k

pud_offset_k() equivalent to pud_offset() now.  Pointed out by Jan Beulich
Similar for __pud_offset_ok, which needs a small change in the callers.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/mm/fault.c       | 2 +-
 arch/x86_64/mm/init.c        | 2 +-
 include/asm-x86_64/pgtable.h | 6 ------
 3 files changed, 2 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c
index 55250593d8c9..64728898f869 100644
--- a/arch/x86_64/mm/fault.c
+++ b/arch/x86_64/mm/fault.c
@@ -160,7 +160,7 @@ void dump_pagetable(unsigned long address)
 	printk("PGD %lx ", pgd_val(*pgd));
 	if (!pgd_present(*pgd)) goto ret; 
 
-	pud = __pud_offset_k((pud_t *)pgd_page(*pgd), address);
+	pud = pud_offset(pgd, address);
 	if (bad_address(pud)) goto bad;
 	printk("PUD %lx ", pud_val(*pud));
 	if (!pud_present(*pud))	goto ret;
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index a70a8e3e312d..b83645a2e02d 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -372,7 +372,7 @@ void __meminit init_memory_mapping(unsigned long start, unsigned long end)
 		pud_t *pud;
 
 		if (after_bootmem)
-			pud = pud_offset_k(pgd, start & PGDIR_MASK);
+			pud = pud_offset(pgd, start & PGDIR_MASK);
 		else
 			pud = alloc_low_page(&map, &pud_phys);
 
diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h
index 31e83c3bd022..a31ab4e68a9b 100644
--- a/include/asm-x86_64/pgtable.h
+++ b/include/asm-x86_64/pgtable.h
@@ -337,14 +337,8 @@ static inline int pmd_large(pmd_t pte) {
 /* to find an entry in a page-table-directory. */
 #define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
 #define pud_offset(pgd, address) ((pud_t *) pgd_page(*(pgd)) + pud_index(address))
-#define pud_offset_k(pgd, addr) pud_offset(pgd, addr)
 #define pud_present(pud) (pud_val(pud) & _PAGE_PRESENT)
 
-static inline pud_t *__pud_offset_k(pud_t *pud, unsigned long address)
-{ 
-	return pud + pud_index(address);
-} 
-
 /* PMD  - Level 2 access */
 #define pmd_page_kernel(pmd) ((unsigned long) __va(pmd_val(pmd) & PTE_MASK))
 #define pmd_page(pmd)		(pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
-- 
cgit v1.2.3


From 8d4f6b93a4aaa6b56b600cd1165c971f4395e4b3 Mon Sep 17 00:00:00 2001
From: Jon Mason <jdmason@us.ibm.com>
Date: Mon, 26 Jun 2006 13:58:05 +0200
Subject: [PATCH] x86_64: Calgary IOMMU - introduce iommu_detected

swiotlb relies on the gart specific iommu_aperture variable to know if
we discovered a hardware IOMMU before swiotlb initialization.  Introduce
iommu_detected to do the same thing, but in a HW IOMMU neutral manner,
in preparation for adding the Calgary HW IOMMU.

Signed-Off-By: Muli Ben-Yehuda <muli@il.ibm.com>
Signed-Off-By: Jon Mason <jdmason@us.ibm.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/aperture.c    | 1 +
 arch/x86_64/kernel/pci-dma.c     | 3 +++
 arch/x86_64/kernel/pci-gart.c    | 4 ++++
 arch/x86_64/kernel/pci-swiotlb.c | 2 +-
 include/asm-x86_64/proto.h       | 1 +
 5 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c
index 9ea723bab16c..a195ef06ec55 100644
--- a/arch/x86_64/kernel/aperture.c
+++ b/arch/x86_64/kernel/aperture.c
@@ -209,6 +209,7 @@ void __init iommu_hole_init(void)
 		if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
 			continue;
 
+		iommu_detected = 1;
 		iommu_aperture = 1; 
 
 		aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7; 
diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c
index 33926c367250..7edd1a40fab3 100644
--- a/arch/x86_64/kernel/pci-dma.c
+++ b/arch/x86_64/kernel/pci-dma.c
@@ -33,6 +33,9 @@ int panic_on_overflow __read_mostly = 0;
 int force_iommu __read_mostly= 0;
 #endif
 
+/* Set this to 1 if there is a HW IOMMU in the system */
+int iommu_detected __read_mostly = 0;
+
 /* Dummy device used for NULL arguments (normally ISA). Better would
    be probably a smaller DMA mask, but this is bug-to-bug compatible
    to i386. */
diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c
index 82a346e6e2e4..4f67957d2b42 100644
--- a/arch/x86_64/kernel/pci-gart.c
+++ b/arch/x86_64/kernel/pci-gart.c
@@ -597,6 +597,10 @@ static int __init pci_iommu_init(void)
 	if (swiotlb)
 		return -ENODEV;
 
+	/* Did we detect a different HW IOMMU? */
+	if (iommu_detected && !iommu_aperture)
+		return -1;
+
 	if (no_iommu ||
 	    (!force_iommu && end_pfn <= MAX_DMA32_PFN) ||
 	    !iommu_aperture ||
diff --git a/arch/x86_64/kernel/pci-swiotlb.c b/arch/x86_64/kernel/pci-swiotlb.c
index 990ed67896f2..ebdb77fe2057 100644
--- a/arch/x86_64/kernel/pci-swiotlb.c
+++ b/arch/x86_64/kernel/pci-swiotlb.c
@@ -31,7 +31,7 @@ struct dma_mapping_ops swiotlb_dma_ops = {
 void pci_swiotlb_init(void)
 {
 	/* don't initialize swiotlb if iommu=off (no_iommu=1) */
-	if (!iommu_aperture && !no_iommu &&
+	if (!iommu_detected && !no_iommu &&
 	    (end_pfn > MAX_DMA32_PFN || force_iommu))
 	       swiotlb = 1;
 	if (swiotlb) {
diff --git a/include/asm-x86_64/proto.h b/include/asm-x86_64/proto.h
index 105476ff8039..9d3335b4e9b6 100644
--- a/include/asm-x86_64/proto.h
+++ b/include/asm-x86_64/proto.h
@@ -116,6 +116,7 @@ extern int skip_ioapic_setup;
 extern int acpi_ht;
 extern int acpi_disabled;
 
+extern int iommu_detected;
 #ifdef CONFIG_IOMMU
 extern int fallback_aper_order;
 extern int fallback_aper_force;
-- 
cgit v1.2.3


From a3c042a0f022dade8e02bf6c9be5d2379d7e133c Mon Sep 17 00:00:00 2001
From: Jon Mason <jdmason@us.ibm.com>
Date: Mon, 26 Jun 2006 13:58:08 +0200
Subject: [PATCH] x86_64: Calgary IOMMU - move valid_dma_direction into the
 callers

Based on Andi Kleen's comments on the original Calgary patch, move
valid_dma_direction into the calling functions.

Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com>
Signed-off-by: Jon Mason <jdmason@us.ibm.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-x86_64/dma-mapping.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'include')

diff --git a/include/asm-x86_64/dma-mapping.h b/include/asm-x86_64/dma-mapping.h
index 498f66df36b9..b6da83dcc7a6 100644
--- a/include/asm-x86_64/dma-mapping.h
+++ b/include/asm-x86_64/dma-mapping.h
@@ -55,6 +55,13 @@ extern dma_addr_t bad_dma_address;
 extern struct dma_mapping_ops* dma_ops;
 extern int iommu_merge;
 
+static inline int valid_dma_direction(int dma_direction)
+{
+	return ((dma_direction == DMA_BIDIRECTIONAL) ||
+		(dma_direction == DMA_TO_DEVICE) ||
+		(dma_direction == DMA_FROM_DEVICE));
+}
+
 static inline int dma_mapping_error(dma_addr_t dma_addr)
 {
 	if (dma_ops->mapping_error)
@@ -72,6 +79,7 @@ static inline dma_addr_t
 dma_map_single(struct device *hwdev, void *ptr, size_t size,
 	       int direction)
 {
+	BUG_ON(!valid_dma_direction(direction));
 	return dma_ops->map_single(hwdev, ptr, size, direction);
 }
 
@@ -79,6 +87,7 @@ static inline void
 dma_unmap_single(struct device *dev, dma_addr_t addr,size_t size,
 		 int direction)
 {
+	BUG_ON(!valid_dma_direction(direction));
 	dma_ops->unmap_single(dev, addr, size, direction);
 }
 
@@ -91,6 +100,7 @@ static inline void
 dma_sync_single_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
 			size_t size, int direction)
 {
+	BUG_ON(!valid_dma_direction(direction));
 	if (dma_ops->sync_single_for_cpu)
 		dma_ops->sync_single_for_cpu(hwdev, dma_handle, size,
 					     direction);
@@ -101,6 +111,7 @@ static inline void
 dma_sync_single_for_device(struct device *hwdev, dma_addr_t dma_handle,
 			   size_t size, int direction)
 {
+	BUG_ON(!valid_dma_direction(direction));
 	if (dma_ops->sync_single_for_device)
 		dma_ops->sync_single_for_device(hwdev, dma_handle, size,
 						direction);
@@ -111,6 +122,7 @@ static inline void
 dma_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
 			      unsigned long offset, size_t size, int direction)
 {
+	BUG_ON(!valid_dma_direction(direction));
 	if (dma_ops->sync_single_range_for_cpu) {
 		dma_ops->sync_single_range_for_cpu(hwdev, dma_handle, offset, size, direction);
 	}
@@ -122,6 +134,7 @@ static inline void
 dma_sync_single_range_for_device(struct device *hwdev, dma_addr_t dma_handle,
 				 unsigned long offset, size_t size, int direction)
 {
+	BUG_ON(!valid_dma_direction(direction));
 	if (dma_ops->sync_single_range_for_device)
 		dma_ops->sync_single_range_for_device(hwdev, dma_handle,
 						      offset, size, direction);
@@ -133,6 +146,7 @@ static inline void
 dma_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
 		    int nelems, int direction)
 {
+	BUG_ON(!valid_dma_direction(direction));
 	if (dma_ops->sync_sg_for_cpu)
 		dma_ops->sync_sg_for_cpu(hwdev, sg, nelems, direction);
 	flush_write_buffers();
@@ -142,6 +156,7 @@ static inline void
 dma_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
 		       int nelems, int direction)
 {
+	BUG_ON(!valid_dma_direction(direction));
 	if (dma_ops->sync_sg_for_device) {
 		dma_ops->sync_sg_for_device(hwdev, sg, nelems, direction);
 	}
@@ -152,6 +167,7 @@ dma_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
 static inline int
 dma_map_sg(struct device *hwdev, struct scatterlist *sg, int nents, int direction)
 {
+	BUG_ON(!valid_dma_direction(direction));
 	return dma_ops->map_sg(hwdev, sg, nents, direction);
 }
 
@@ -159,6 +175,7 @@ static inline void
 dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents,
 	     int direction)
 {
+	BUG_ON(!valid_dma_direction(direction));
 	dma_ops->unmap_sg(hwdev, sg, nents, direction);
 }
 
-- 
cgit v1.2.3


From 0dc243ae10c8309c170a3af9f1adad1924a9f217 Mon Sep 17 00:00:00 2001
From: Jon Mason <jdmason@us.ibm.com>
Date: Mon, 26 Jun 2006 13:58:11 +0200
Subject: [PATCH] x86_64: Calgary IOMMU - IOMMU abstractions

This patch creates a new interface for IOMMUs by adding a centralized
location for IOMMU allocation (for translation tables/apertures) and
IOMMU initialization.  In creating these, code was moved around for
abstraction, uniformity, and consiceness.

Take note of the move of the iommu_setup bootarg parsing code to
__setup.  This is enabled by moving back the location of the aperture
allocation/detection to mem init (which while ugly, was already the
location of the swiotlb_init).

While a slight departure from the previous patch, I belive this provides
the true intention of the previous versions of the patch which changed
this code.  It also makes the addition of the upcoming calgary code much
cleaner than previous patches.

[AK: Removed one broken change. iommu_setup still has to be called
early]

Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com>
Signed-off-by: Jon Mason <jdmason@us.ibm.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/pci-dma.c  | 29 +++++++++++++++++++++++++++++
 arch/x86_64/kernel/pci-gart.c | 13 ++++++-------
 arch/x86_64/kernel/setup.c    |  5 -----
 arch/x86_64/mm/init.c         |  7 +------
 include/asm-x86_64/pci.h      |  2 +-
 include/asm-x86_64/proto.h    | 10 +++++-----
 6 files changed, 42 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c
index 7edd1a40fab3..a45844c7e3a3 100644
--- a/arch/x86_64/kernel/pci-dma.c
+++ b/arch/x86_64/kernel/pci-dma.c
@@ -279,3 +279,32 @@ __init int iommu_setup(char *p)
     }
     return 1;
 }
+__setup("iommu=", iommu_setup);
+
+void __init pci_iommu_alloc(void)
+{
+	/*
+	 * The order of these functions is important for
+	 * fall-back/fail-over reasons
+	 */
+#ifdef CONFIG_IOMMU
+	iommu_hole_init();
+#endif
+
+#ifdef CONFIG_SWIOTLB
+	pci_swiotlb_init();
+#endif
+}
+
+static int __init pci_iommu_init(void)
+{
+#ifdef CONFIG_IOMMU
+	gart_iommu_init();
+#endif
+
+	no_iommu_init();
+	return 0;
+}
+
+/* Must execute after PCI subsystem */
+fs_initcall(pci_iommu_init);
diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c
index 4f67957d2b42..9a93954bed37 100644
--- a/arch/x86_64/kernel/pci-gart.c
+++ b/arch/x86_64/kernel/pci-gart.c
@@ -571,7 +571,7 @@ static struct dma_mapping_ops gart_dma_ops = {
 	.unmap_sg = gart_unmap_sg,
 };
 
-static int __init pci_iommu_init(void)
+void __init gart_iommu_init(void)
 { 
 	struct agp_kern_info info;
 	unsigned long aper_size;
@@ -581,7 +581,7 @@ static int __init pci_iommu_init(void)
 
 	if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) {
 		printk(KERN_INFO "PCI-GART: No AMD northbridge found.\n");
-		return -ENODEV;
+		return;
 	}
 
 #ifndef CONFIG_AGP_AMD64
@@ -595,11 +595,11 @@ static int __init pci_iommu_init(void)
 #endif	
 
 	if (swiotlb)
-		return -ENODEV;
+		return;
 
 	/* Did we detect a different HW IOMMU? */
 	if (iommu_detected && !iommu_aperture)
-		return -1;
+		return;
 
 	if (no_iommu ||
 	    (!force_iommu && end_pfn <= MAX_DMA32_PFN) ||
@@ -611,7 +611,7 @@ static int __init pci_iommu_init(void)
 					"but IOMMU not available.\n"
 			       KERN_ERR "WARNING 32bit PCI may malfunction.\n");
 		}
-		return -ENODEV;
+		return;
 	}
 
 	printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n");
@@ -678,11 +678,10 @@ static int __init pci_iommu_init(void)
 
 	flush_gart();
 	dma_ops = &gart_dma_ops;
-	return 0;
 } 
 
 /* Must execute after PCI subsystem */
-fs_initcall(pci_iommu_init);
+fs_initcall(gart_iommu_init);
 
 void gart_parse_options(char *p)
 {
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index a9de8f02671f..04b2d7b92d17 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -63,7 +63,6 @@
 #include <asm/setup.h>
 #include <asm/mach_apic.h>
 #include <asm/numa.h>
-#include <asm/swiotlb.h>
 #include <asm/sections.h>
 #include <asm/dmi.h>
 
@@ -702,10 +701,6 @@ void __init setup_arch(char **cmdline_p)
 
 	e820_setup_gap();
 
-#ifdef CONFIG_IOMMU
-	iommu_hole_init();
-#endif
-
 #ifdef CONFIG_VT
 #if defined(CONFIG_VGA_CONSOLE)
 	conswitchp = &vga_con;
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index b83645a2e02d..2f5f5b11e9d0 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -41,8 +41,6 @@
 #include <asm/proto.h>
 #include <asm/smp.h>
 #include <asm/sections.h>
-#include <asm/dma-mapping.h>
-#include <asm/swiotlb.h>
 
 #ifndef Dprintk
 #define Dprintk(x...)
@@ -587,10 +585,7 @@ void __init mem_init(void)
 {
 	long codesize, reservedpages, datasize, initsize;
 
-#ifdef CONFIG_SWIOTLB
-	pci_swiotlb_init();
-#endif
-	no_iommu_init();
+	pci_iommu_alloc();
 
 	/* How many end-of-memory variables you have, grandma! */
 	max_low_pfn = end_pfn;
diff --git a/include/asm-x86_64/pci.h b/include/asm-x86_64/pci.h
index 3374d34c4acd..4dbc07c54f7a 100644
--- a/include/asm-x86_64/pci.h
+++ b/include/asm-x86_64/pci.h
@@ -39,8 +39,8 @@ int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq);
 #include <asm/scatterlist.h>
 #include <linux/string.h>
 #include <asm/page.h>
-#include <linux/dma-mapping.h> /* for have_iommu */
 
+extern void pci_iommu_alloc(void);
 extern int iommu_setup(char *opt);
 
 /* The PCI address space does equal the physical memory
diff --git a/include/asm-x86_64/proto.h b/include/asm-x86_64/proto.h
index 9d3335b4e9b6..038fe1f47e6f 100644
--- a/include/asm-x86_64/proto.h
+++ b/include/asm-x86_64/proto.h
@@ -37,7 +37,6 @@ extern void ia32_sysenter_target(void);
 
 extern void config_acpi_tables(void);
 extern void ia32_syscall(void);
-extern void iommu_hole_init(void);
 
 extern int pmtimer_mark_offset(void);
 extern void pmtimer_resume(void);
@@ -101,13 +100,9 @@ extern int unsynchronized_tsc(void);
 
 extern void select_idle_routine(const struct cpuinfo_x86 *c);
 
-extern void gart_parse_options(char *);
-extern void __init no_iommu_init(void);
-
 extern unsigned long table_start, table_end;
 
 extern int exception_trace;
-extern int force_iommu, no_iommu;
 extern int using_apic_timer;
 extern int disable_apic;
 extern unsigned cpu_khz;
@@ -116,8 +111,13 @@ extern int skip_ioapic_setup;
 extern int acpi_ht;
 extern int acpi_disabled;
 
+extern void no_iommu_init(void);
+extern int force_iommu, no_iommu;
 extern int iommu_detected;
 #ifdef CONFIG_IOMMU
+extern void gart_iommu_init(void);
+extern void gart_parse_options(char *);
+extern void iommu_hole_init(void);
 extern int fallback_aper_order;
 extern int fallback_aper_force;
 extern int iommu_aperture;
-- 
cgit v1.2.3


From e465058d55a88feb4c7ecabe63eea7ea7147e206 Mon Sep 17 00:00:00 2001
From: Jon Mason <jdmason@us.ibm.com>
Date: Mon, 26 Jun 2006 13:58:14 +0200
Subject: [PATCH] x86_64: Calgary IOMMU - Calgary specific bits

This patch hooks Calgary into the build, the x86-64 IOMMU
initialization paths, and introduces the Calgary specific bits.  The
implementation draws inspiration from both PPC (which has support for
the same chip but requires firmware support which we don't have on
x86-64) and gart. Calgary is different from gart in that it support a
translation table per PHB, as opposed to the single gart aperture.

Changes from previous version:
 * Addition of boot-time disablement for bus-level translation/isolation
   (e.g, enable userspace DMA for things like X)
 * Usage of newer IOMMU abstraction functions

Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com>
Signed-off-by: Jon Mason <jdmason@us.ibm.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/x86_64/boot-options.txt |   21 +
 arch/x86_64/Kconfig                   |   19 +
 arch/x86_64/kernel/Makefile           |    1 +
 arch/x86_64/kernel/pci-calgary.c      | 1018 +++++++++++++++++++++++++++++++++
 arch/x86_64/kernel/pci-dma.c          |    9 +
 arch/x86_64/kernel/tce.c              |  202 +++++++
 include/asm-x86_64/calgary.h          |   66 +++
 include/asm-x86_64/pci.h              |    2 +-
 include/asm-x86_64/tce.h              |   47 ++
 9 files changed, 1384 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86_64/kernel/pci-calgary.c
 create mode 100644 arch/x86_64/kernel/tce.c
 create mode 100644 include/asm-x86_64/calgary.h
 create mode 100644 include/asm-x86_64/tce.h

(limited to 'include')

diff --git a/Documentation/x86_64/boot-options.txt b/Documentation/x86_64/boot-options.txt
index f2cd6ef53ff3..6887d44d2661 100644
--- a/Documentation/x86_64/boot-options.txt
+++ b/Documentation/x86_64/boot-options.txt
@@ -205,6 +205,27 @@ IOMMU
   pages  Prereserve that many 128K pages for the software IO bounce buffering.
   force  Force all IO through the software TLB.
 
+  calgary=[64k,128k,256k,512k,1M,2M,4M,8M]
+  calgary=[translate_empty_slots]
+  calgary=[disable=<PCI bus number>]
+
+    64k,...,8M - Set the size of each PCI slot's translation table
+    when using the Calgary IOMMU. This is the size of the translation
+    table itself in main memory. The smallest table, 64k, covers an IO
+    space of 32MB; the largest, 8MB table, can cover an IO space of
+    4GB. Normally the kernel will make the right choice by itself.
+
+    translate_empty_slots - Enable translation even on slots that have
+    no devices attached to them, in case a device will be hotplugged
+    in the future.
+
+    disable=<PCI bus number> - Disable translation on a given PHB. For
+    example, the built-in graphics adapter resides on the first bridge
+    (PCI bus number 0); if translation (isolation) is enabled on this
+    bridge, X servers that access the hardware directly from user
+    space might stop working. Use this option if you have devices that
+    are accessed from userspace directly on some PCI host bridge.
+
 Debugging
 
   oops=panic Always panic on oopses. Default is to just kill the process,
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index e8c52a1eec06..ccc4a7fb97a3 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -405,6 +405,25 @@ config IOMMU
 	  device) unless CONFIG_IOMMU_DEBUG or iommu=force is specified
 	  too.
 
+config CALGARY_IOMMU
+	bool "IBM Calgary IOMMU support"
+	default y
+	select SWIOTLB
+	depends on PCI && EXPERIMENTAL
+	help
+	  Support for hardware IOMMUs in IBM's xSeries x366 and x460
+	  systems. Needed to run systems with more than 3GB of memory
+	  properly with 32-bit PCI devices that do not support DAC
+	  (Double Address Cycle). Calgary also supports bus level
+	  isolation, where all DMAs pass through the IOMMU.  This
+	  prevents them from going anywhere except their intended
+	  destination. This catches hard-to-find kernel bugs and
+	  mis-behaving drivers and devices that do not use the DMA-API
+	  properly to set up their DMA buffers.  The IOMMU can be
+	  turned off at boot time with the iommu=off parameter.
+	  Normally the kernel will make the right choice by itself.
+	  If unsure, say Y.
+
 # need this always selected by IOMMU for the VIA workaround
 config SWIOTLB
 	bool
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
index fd106bdddd6d..aeb9c560be88 100644
--- a/arch/x86_64/kernel/Makefile
+++ b/arch/x86_64/kernel/Makefile
@@ -29,6 +29,7 @@ obj-$(CONFIG_SOFTWARE_SUSPEND)	+= suspend_asm.o
 obj-$(CONFIG_CPU_FREQ)		+= cpufreq/
 obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
 obj-$(CONFIG_IOMMU)		+= pci-gart.o aperture.o
+obj-$(CONFIG_CALGARY_IOMMU)	+= pci-calgary.o tce.o
 obj-$(CONFIG_SWIOTLB)		+= pci-swiotlb.o
 obj-$(CONFIG_KPROBES)		+= kprobes.o
 obj-$(CONFIG_X86_PM_TIMER)	+= pmtimer.o
diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c
new file mode 100644
index 000000000000..d91cb843f54d
--- /dev/null
+++ b/arch/x86_64/kernel/pci-calgary.c
@@ -0,0 +1,1018 @@
+/*
+ * Derived from arch/powerpc/kernel/iommu.c
+ *
+ * Copyright (C) 2006 Jon Mason <jdmason@us.ibm.com>, IBM Corporation
+ * Copyright (C) 2006 Muli Ben-Yehuda <muli@il.ibm.com>, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/dma-mapping.h>
+#include <linux/init.h>
+#include <linux/bitops.h>
+#include <linux/pci_ids.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <asm/proto.h>
+#include <asm/calgary.h>
+#include <asm/tce.h>
+#include <asm/pci-direct.h>
+#include <asm/system.h>
+#include <asm/dma.h>
+
+#define PCI_DEVICE_ID_IBM_CALGARY 0x02a1
+#define PCI_VENDOR_DEVICE_ID_CALGARY \
+	(PCI_VENDOR_ID_IBM | PCI_DEVICE_ID_IBM_CALGARY << 16)
+
+/* we need these for register space address calculation */
+#define START_ADDRESS           0xfe000000
+#define CHASSIS_BASE            0
+#define ONE_BASED_CHASSIS_NUM   1
+
+/* register offsets inside the host bridge space */
+#define PHB_CSR_OFFSET		0x0110
+#define PHB_PLSSR_OFFSET	0x0120
+#define PHB_CONFIG_RW_OFFSET	0x0160
+#define PHB_IOBASE_BAR_LOW	0x0170
+#define PHB_IOBASE_BAR_HIGH	0x0180
+#define PHB_MEM_1_LOW		0x0190
+#define PHB_MEM_1_HIGH		0x01A0
+#define PHB_IO_ADDR_SIZE	0x01B0
+#define PHB_MEM_1_SIZE		0x01C0
+#define PHB_MEM_ST_OFFSET	0x01D0
+#define PHB_AER_OFFSET		0x0200
+#define PHB_CONFIG_0_HIGH	0x0220
+#define PHB_CONFIG_0_LOW	0x0230
+#define PHB_CONFIG_0_END	0x0240
+#define PHB_MEM_2_LOW		0x02B0
+#define PHB_MEM_2_HIGH		0x02C0
+#define PHB_MEM_2_SIZE_HIGH	0x02D0
+#define PHB_MEM_2_SIZE_LOW	0x02E0
+#define PHB_DOSHOLE_OFFSET	0x08E0
+
+/* PHB_CONFIG_RW */
+#define PHB_TCE_ENABLE		0x20000000
+#define PHB_SLOT_DISABLE	0x1C000000
+#define PHB_DAC_DISABLE		0x01000000
+#define PHB_MEM2_ENABLE		0x00400000
+#define PHB_MCSR_ENABLE		0x00100000
+/* TAR (Table Address Register) */
+#define TAR_SW_BITS		0x0000ffffffff800fUL
+#define TAR_VALID		0x0000000000000008UL
+/* CSR (Channel/DMA Status Register) */
+#define CSR_AGENT_MASK		0xffe0ffff
+
+#define MAX_NUM_OF_PHBS		8 /* how many PHBs in total? */
+#define MAX_PHB_BUS_NUM		(MAX_NUM_OF_PHBS * 2) /* max dev->bus->number */
+#define PHBS_PER_CALGARY	4
+
+/* register offsets in Calgary's internal register space */
+static const unsigned long tar_offsets[] = {
+	0x0580 /* TAR0 */,
+	0x0588 /* TAR1 */,
+	0x0590 /* TAR2 */,
+	0x0598 /* TAR3 */
+};
+
+static const unsigned long split_queue_offsets[] = {
+	0x4870 /* SPLIT QUEUE 0 */,
+	0x5870 /* SPLIT QUEUE 1 */,
+	0x6870 /* SPLIT QUEUE 2 */,
+	0x7870 /* SPLIT QUEUE 3 */
+};
+
+static const unsigned long phb_offsets[] = {
+	0x8000 /* PHB0 */,
+	0x9000 /* PHB1 */,
+	0xA000 /* PHB2 */,
+	0xB000 /* PHB3 */
+};
+
+void* tce_table_kva[MAX_NUM_OF_PHBS * MAX_NUMNODES];
+unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED;
+static int translate_empty_slots __read_mostly = 0;
+static int calgary_detected __read_mostly = 0;
+
+/*
+ * the bitmap of PHBs the user requested that we disable
+ * translation on.
+ */
+static DECLARE_BITMAP(translation_disabled, MAX_NUMNODES * MAX_PHB_BUS_NUM);
+
+static void tce_cache_blast(struct iommu_table *tbl);
+
+/* enable this to stress test the chip's TCE cache */
+#ifdef CONFIG_IOMMU_DEBUG
+static inline void tce_cache_blast_stress(struct iommu_table *tbl)
+{
+	tce_cache_blast(tbl);
+}
+#else
+static inline void tce_cache_blast_stress(struct iommu_table *tbl)
+{
+}
+#endif /* BLAST_TCE_CACHE_ON_UNMAP */
+
+static inline unsigned int num_dma_pages(unsigned long dma, unsigned int dmalen)
+{
+	unsigned int npages;
+
+	npages = PAGE_ALIGN(dma + dmalen) - (dma & PAGE_MASK);
+	npages >>= PAGE_SHIFT;
+
+	return npages;
+}
+
+static inline int translate_phb(struct pci_dev* dev)
+{
+	int disabled = test_bit(dev->bus->number, translation_disabled);
+	return !disabled;
+}
+
+static void iommu_range_reserve(struct iommu_table *tbl,
+        unsigned long start_addr, unsigned int npages)
+{
+	unsigned long index;
+	unsigned long end;
+
+	index = start_addr >> PAGE_SHIFT;
+
+	/* bail out if we're asked to reserve a region we don't cover */
+	if (index >= tbl->it_size)
+		return;
+
+	end = index + npages;
+	if (end > tbl->it_size) /* don't go off the table */
+		end = tbl->it_size;
+
+	while (index < end) {
+		if (test_bit(index, tbl->it_map))
+			printk(KERN_ERR "Calgary: entry already allocated at "
+			       "0x%lx tbl %p dma 0x%lx npages %u\n",
+			       index, tbl, start_addr, npages);
+		++index;
+	}
+	set_bit_string(tbl->it_map, start_addr >> PAGE_SHIFT, npages);
+}
+
+static unsigned long iommu_range_alloc(struct iommu_table *tbl,
+	unsigned int npages)
+{
+	unsigned long offset;
+
+	BUG_ON(npages == 0);
+
+	offset = find_next_zero_string(tbl->it_map, tbl->it_hint,
+				       tbl->it_size, npages);
+	if (offset == ~0UL) {
+		tce_cache_blast(tbl);
+		offset = find_next_zero_string(tbl->it_map, 0,
+					       tbl->it_size, npages);
+		if (offset == ~0UL) {
+			printk(KERN_WARNING "Calgary: IOMMU full.\n");
+			if (panic_on_overflow)
+				panic("Calgary: fix the allocator.\n");
+			else
+				return bad_dma_address;
+		}
+	}
+
+	set_bit_string(tbl->it_map, offset, npages);
+	tbl->it_hint = offset + npages;
+	BUG_ON(tbl->it_hint > tbl->it_size);
+
+	return offset;
+}
+
+static dma_addr_t iommu_alloc(struct iommu_table *tbl, void *vaddr,
+	unsigned int npages, int direction)
+{
+	unsigned long entry, flags;
+	dma_addr_t ret = bad_dma_address;
+
+	spin_lock_irqsave(&tbl->it_lock, flags);
+
+	entry = iommu_range_alloc(tbl, npages);
+
+	if (unlikely(entry == bad_dma_address))
+		goto error;
+
+	/* set the return dma address */
+	ret = (entry << PAGE_SHIFT) | ((unsigned long)vaddr & ~PAGE_MASK);
+
+	/* put the TCEs in the HW table */
+	tce_build(tbl, entry, npages, (unsigned long)vaddr & PAGE_MASK,
+		  direction);
+
+	spin_unlock_irqrestore(&tbl->it_lock, flags);
+
+	return ret;
+
+error:
+	spin_unlock_irqrestore(&tbl->it_lock, flags);
+	printk(KERN_WARNING "Calgary: failed to allocate %u pages in "
+	       "iommu %p\n", npages, tbl);
+	return bad_dma_address;
+}
+
+static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
+	unsigned int npages)
+{
+	unsigned long entry;
+	unsigned long i;
+
+	entry = dma_addr >> PAGE_SHIFT;
+
+	BUG_ON(entry + npages > tbl->it_size);
+
+	tce_free(tbl, entry, npages);
+
+	for (i = 0; i < npages; ++i) {
+		if (!test_bit(entry + i, tbl->it_map))
+			printk(KERN_ERR "Calgary: bit is off at 0x%lx "
+			       "tbl %p dma 0x%Lx entry 0x%lx npages %u\n",
+			       entry + i, tbl, dma_addr, entry, npages);
+	}
+
+	__clear_bit_string(tbl->it_map, entry, npages);
+
+	tce_cache_blast_stress(tbl);
+}
+
+static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
+	unsigned int npages)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&tbl->it_lock, flags);
+
+	__iommu_free(tbl, dma_addr, npages);
+
+	spin_unlock_irqrestore(&tbl->it_lock, flags);
+}
+
+static void __calgary_unmap_sg(struct iommu_table *tbl,
+	struct scatterlist *sglist, int nelems, int direction)
+{
+	while (nelems--) {
+		unsigned int npages;
+		dma_addr_t dma = sglist->dma_address;
+		unsigned int dmalen = sglist->dma_length;
+
+		if (dmalen == 0)
+			break;
+
+		npages = num_dma_pages(dma, dmalen);
+		__iommu_free(tbl, dma, npages);
+		sglist++;
+	}
+}
+
+void calgary_unmap_sg(struct device *dev, struct scatterlist *sglist,
+		      int nelems, int direction)
+{
+	unsigned long flags;
+	struct iommu_table *tbl = to_pci_dev(dev)->bus->self->sysdata;
+
+	if (!translate_phb(to_pci_dev(dev)))
+		return;
+
+	spin_lock_irqsave(&tbl->it_lock, flags);
+
+	__calgary_unmap_sg(tbl, sglist, nelems, direction);
+
+	spin_unlock_irqrestore(&tbl->it_lock, flags);
+}
+
+static int calgary_nontranslate_map_sg(struct device* dev,
+	struct scatterlist *sg, int nelems, int direction)
+{
+	int i;
+
+ 	for (i = 0; i < nelems; i++ ) {
+		struct scatterlist *s = &sg[i];
+		BUG_ON(!s->page);
+		s->dma_address = virt_to_bus(page_address(s->page) +s->offset);
+		s->dma_length = s->length;
+	}
+	return nelems;
+}
+
+int calgary_map_sg(struct device *dev, struct scatterlist *sg,
+	int nelems, int direction)
+{
+	struct iommu_table *tbl = to_pci_dev(dev)->bus->self->sysdata;
+	unsigned long flags;
+	unsigned long vaddr;
+	unsigned int npages;
+	unsigned long entry;
+	int i;
+
+	if (!translate_phb(to_pci_dev(dev)))
+		return calgary_nontranslate_map_sg(dev, sg, nelems, direction);
+
+	spin_lock_irqsave(&tbl->it_lock, flags);
+
+	for (i = 0; i < nelems; i++ ) {
+		struct scatterlist *s = &sg[i];
+		BUG_ON(!s->page);
+
+		vaddr = (unsigned long)page_address(s->page) + s->offset;
+		npages = num_dma_pages(vaddr, s->length);
+
+		entry = iommu_range_alloc(tbl, npages);
+		if (entry == bad_dma_address) {
+			/* makes sure unmap knows to stop */
+			s->dma_length = 0;
+			goto error;
+		}
+
+		s->dma_address = (entry << PAGE_SHIFT) | s->offset;
+
+		/* insert into HW table */
+		tce_build(tbl, entry, npages, vaddr & PAGE_MASK,
+			  direction);
+
+		s->dma_length = s->length;
+	}
+
+	spin_unlock_irqrestore(&tbl->it_lock, flags);
+
+	return nelems;
+error:
+	__calgary_unmap_sg(tbl, sg, nelems, direction);
+	for (i = 0; i < nelems; i++) {
+		sg[i].dma_address = bad_dma_address;
+		sg[i].dma_length = 0;
+	}
+	spin_unlock_irqrestore(&tbl->it_lock, flags);
+	return 0;
+}
+
+dma_addr_t calgary_map_single(struct device *dev, void *vaddr,
+	size_t size, int direction)
+{
+	dma_addr_t dma_handle = bad_dma_address;
+	unsigned long uaddr;
+	unsigned int npages;
+	struct iommu_table *tbl = to_pci_dev(dev)->bus->self->sysdata;
+
+	uaddr = (unsigned long)vaddr;
+	npages = num_dma_pages(uaddr, size);
+
+	if (translate_phb(to_pci_dev(dev)))
+		dma_handle = iommu_alloc(tbl, vaddr, npages, direction);
+	else
+		dma_handle = virt_to_bus(vaddr);
+
+	return dma_handle;
+}
+
+void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle,
+	size_t size, int direction)
+{
+	struct iommu_table *tbl = to_pci_dev(dev)->bus->self->sysdata;
+	unsigned int npages;
+
+	if (!translate_phb(to_pci_dev(dev)))
+		return;
+
+	npages = num_dma_pages(dma_handle, size);
+	iommu_free(tbl, dma_handle, npages);
+}
+
+void* calgary_alloc_coherent(struct device *dev, size_t size,
+	dma_addr_t *dma_handle, gfp_t flag)
+{
+	void *ret = NULL;
+	dma_addr_t mapping;
+	unsigned int npages, order;
+	struct iommu_table *tbl;
+
+	tbl = to_pci_dev(dev)->bus->self->sysdata;
+
+	size = PAGE_ALIGN(size); /* size rounded up to full pages */
+	npages = size >> PAGE_SHIFT;
+	order = get_order(size);
+
+	/* alloc enough pages (and possibly more) */
+	ret = (void *)__get_free_pages(flag, order);
+	if (!ret)
+		goto error;
+	memset(ret, 0, size);
+
+	if (translate_phb(to_pci_dev(dev))) {
+		/* set up tces to cover the allocated range */
+		mapping = iommu_alloc(tbl, ret, npages, DMA_BIDIRECTIONAL);
+		if (mapping == bad_dma_address)
+			goto free;
+
+		*dma_handle = mapping;
+	} else /* non translated slot */
+		*dma_handle = virt_to_bus(ret);
+
+	return ret;
+
+free:
+	free_pages((unsigned long)ret, get_order(size));
+	ret = NULL;
+error:
+	return ret;
+}
+
+static struct dma_mapping_ops calgary_dma_ops = {
+	.alloc_coherent = calgary_alloc_coherent,
+	.map_single = calgary_map_single,
+	.unmap_single = calgary_unmap_single,
+	.map_sg = calgary_map_sg,
+	.unmap_sg = calgary_unmap_sg,
+};
+
+static inline int busno_to_phbid(unsigned char num)
+{
+	return bus_to_phb(num) % PHBS_PER_CALGARY;
+}
+
+static inline unsigned long split_queue_offset(unsigned char num)
+{
+	size_t idx = busno_to_phbid(num);
+
+	return split_queue_offsets[idx];
+}
+
+static inline unsigned long tar_offset(unsigned char num)
+{
+	size_t idx = busno_to_phbid(num);
+
+	return tar_offsets[idx];
+}
+
+static inline unsigned long phb_offset(unsigned char num)
+{
+	size_t idx = busno_to_phbid(num);
+
+	return phb_offsets[idx];
+}
+
+static inline void __iomem* calgary_reg(void __iomem *bar, unsigned long offset)
+{
+	unsigned long target = ((unsigned long)bar) | offset;
+	return (void __iomem*)target;
+}
+
+static void tce_cache_blast(struct iommu_table *tbl)
+{
+	u64 val;
+	u32 aer;
+	int i = 0;
+	void __iomem *bbar = tbl->bbar;
+	void __iomem *target;
+
+	/* disable arbitration on the bus */
+	target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_AER_OFFSET);
+	aer = readl(target);
+	writel(0, target);
+
+	/* read plssr to ensure it got there */
+	target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_PLSSR_OFFSET);
+	val = readl(target);
+
+	/* poll split queues until all DMA activity is done */
+	target = calgary_reg(bbar, split_queue_offset(tbl->it_busno));
+	do {
+		val = readq(target);
+		i++;
+	} while ((val & 0xff) != 0xff && i < 100);
+	if (i == 100)
+		printk(KERN_WARNING "Calgary: PCI bus not quiesced, "
+		       "continuing anyway\n");
+
+	/* invalidate TCE cache */
+	target = calgary_reg(bbar, tar_offset(tbl->it_busno));
+	writeq(tbl->tar_val, target);
+
+	/* enable arbitration */
+	target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_AER_OFFSET);
+	writel(aer, target);
+	(void)readl(target); /* flush */
+}
+
+static void __init calgary_reserve_mem_region(struct pci_dev *dev, u64 start,
+	u64 limit)
+{
+	unsigned int numpages;
+
+	limit = limit | 0xfffff;
+	limit++;
+
+	numpages = ((limit - start) >> PAGE_SHIFT);
+	iommu_range_reserve(dev->sysdata, start, numpages);
+}
+
+static void __init calgary_reserve_peripheral_mem_1(struct pci_dev *dev)
+{
+	void __iomem *target;
+	u64 low, high, sizelow;
+	u64 start, limit;
+	struct iommu_table *tbl = dev->sysdata;
+	unsigned char busnum = dev->bus->number;
+	void __iomem *bbar = tbl->bbar;
+
+	/* peripheral MEM_1 region */
+	target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_LOW);
+	low = be32_to_cpu(readl(target));
+	target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_HIGH);
+	high = be32_to_cpu(readl(target));
+	target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_SIZE);
+	sizelow = be32_to_cpu(readl(target));
+
+	start = (high << 32) | low;
+	limit = sizelow;
+
+	calgary_reserve_mem_region(dev, start, limit);
+}
+
+static void __init calgary_reserve_peripheral_mem_2(struct pci_dev *dev)
+{
+	void __iomem *target;
+	u32 val32;
+	u64 low, high, sizelow, sizehigh;
+	u64 start, limit;
+	struct iommu_table *tbl = dev->sysdata;
+	unsigned char busnum = dev->bus->number;
+	void __iomem *bbar = tbl->bbar;
+
+	/* is it enabled? */
+	target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET);
+	val32 = be32_to_cpu(readl(target));
+	if (!(val32 & PHB_MEM2_ENABLE))
+		return;
+
+	target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_LOW);
+	low = be32_to_cpu(readl(target));
+	target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_HIGH);
+	high = be32_to_cpu(readl(target));
+	target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_SIZE_LOW);
+	sizelow = be32_to_cpu(readl(target));
+	target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_SIZE_HIGH);
+	sizehigh = be32_to_cpu(readl(target));
+
+	start = (high << 32) | low;
+	limit = (sizehigh << 32) | sizelow;
+
+	calgary_reserve_mem_region(dev, start, limit);
+}
+
+/*
+ * some regions of the IO address space do not get translated, so we
+ * must not give devices IO addresses in those regions. The regions
+ * are the 640KB-1MB region and the two PCI peripheral memory holes.
+ * Reserve all of them in the IOMMU bitmap to avoid giving them out
+ * later.
+ */
+static void __init calgary_reserve_regions(struct pci_dev *dev)
+{
+	unsigned int npages;
+	void __iomem *bbar;
+	unsigned char busnum;
+	u64 start;
+	struct iommu_table *tbl = dev->sysdata;
+
+	bbar = tbl->bbar;
+	busnum = dev->bus->number;
+
+	/* reserve bad_dma_address in case it's a legal address */
+	iommu_range_reserve(tbl, bad_dma_address, 1);
+
+	/* avoid the BIOS/VGA first 640KB-1MB region */
+	start = (640 * 1024);
+	npages = ((1024 - 640) * 1024) >> PAGE_SHIFT;
+	iommu_range_reserve(tbl, start, npages);
+
+	/* reserve the two PCI peripheral memory regions in IO space */
+	calgary_reserve_peripheral_mem_1(dev);
+	calgary_reserve_peripheral_mem_2(dev);
+}
+
+static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar)
+{
+	u64 val64;
+	u64 table_phys;
+	void __iomem *target;
+	int ret;
+	struct iommu_table *tbl;
+
+	/* build TCE tables for each PHB */
+	ret = build_tce_table(dev, bbar);
+	if (ret)
+		return ret;
+
+	calgary_reserve_regions(dev);
+
+	/* set TARs for each PHB */
+	target = calgary_reg(bbar, tar_offset(dev->bus->number));
+	val64 = be64_to_cpu(readq(target));
+
+	/* zero out all TAR bits under sw control */
+	val64 &= ~TAR_SW_BITS;
+
+	tbl = dev->sysdata;
+	table_phys = (u64)__pa(tbl->it_base);
+	val64 |= table_phys;
+
+	BUG_ON(specified_table_size > TCE_TABLE_SIZE_8M);
+	val64 |= (u64) specified_table_size;
+
+	tbl->tar_val = cpu_to_be64(val64);
+	writeq(tbl->tar_val, target);
+	readq(target); /* flush */
+
+	return 0;
+}
+
+static void __init calgary_free_tar(struct pci_dev *dev)
+{
+	u64 val64;
+	struct iommu_table *tbl = dev->sysdata;
+	void __iomem *target;
+
+	target = calgary_reg(tbl->bbar, tar_offset(dev->bus->number));
+	val64 = be64_to_cpu(readq(target));
+	val64 &= ~TAR_SW_BITS;
+	writeq(cpu_to_be64(val64), target);
+	readq(target); /* flush */
+
+	kfree(tbl);
+	dev->sysdata = NULL;
+}
+
+static void calgary_watchdog(unsigned long data)
+{
+	struct pci_dev *dev = (struct pci_dev *)data;
+	struct iommu_table *tbl = dev->sysdata;
+	void __iomem *bbar = tbl->bbar;
+	u32 val32;
+	void __iomem *target;
+
+	target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_CSR_OFFSET);
+	val32 = be32_to_cpu(readl(target));
+
+	/* If no error, the agent ID in the CSR is not valid */
+	if (val32 & CSR_AGENT_MASK) {
+		printk(KERN_EMERG "calgary_watchdog: DMA error on bus %d, "
+				  "CSR = %#x\n", dev->bus->number, val32);
+		writel(0, target);
+
+		/* Disable bus that caused the error */
+		target = calgary_reg(bbar, phb_offset(tbl->it_busno) |
+					   PHB_CONFIG_RW_OFFSET);
+		val32 = be32_to_cpu(readl(target));
+		val32 |= PHB_SLOT_DISABLE;
+		writel(cpu_to_be32(val32), target);
+		readl(target); /* flush */
+	} else {
+		/* Reset the timer */
+		mod_timer(&tbl->watchdog_timer, jiffies + 2 * HZ);
+	}
+}
+
+static void __init calgary_enable_translation(struct pci_dev *dev)
+{
+	u32 val32;
+	unsigned char busnum;
+	void __iomem *target;
+	void __iomem *bbar;
+	struct iommu_table *tbl;
+
+	busnum = dev->bus->number;
+	tbl = dev->sysdata;
+	bbar = tbl->bbar;
+
+	/* enable TCE in PHB Config Register */
+	target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET);
+	val32 = be32_to_cpu(readl(target));
+	val32 |= PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE;
+
+	printk(KERN_INFO "Calgary: enabling translation on PHB %d\n", busnum);
+	printk(KERN_INFO "Calgary: errant DMAs will now be prevented on this "
+	       "bus.\n");
+
+	writel(cpu_to_be32(val32), target);
+	readl(target); /* flush */
+
+	init_timer(&tbl->watchdog_timer);
+	tbl->watchdog_timer.function = &calgary_watchdog;
+	tbl->watchdog_timer.data = (unsigned long)dev;
+	mod_timer(&tbl->watchdog_timer, jiffies);
+}
+
+static void __init calgary_disable_translation(struct pci_dev *dev)
+{
+	u32 val32;
+	unsigned char busnum;
+	void __iomem *target;
+	void __iomem *bbar;
+	struct iommu_table *tbl;
+
+	busnum = dev->bus->number;
+	tbl = dev->sysdata;
+	bbar = tbl->bbar;
+
+	/* disable TCE in PHB Config Register */
+	target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET);
+	val32 = be32_to_cpu(readl(target));
+	val32 &= ~(PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE);
+
+	printk(KERN_INFO "Calgary: disabling translation on PHB %d!\n", busnum);
+	writel(cpu_to_be32(val32), target);
+	readl(target); /* flush */
+
+	del_timer_sync(&tbl->watchdog_timer);
+}
+
+static inline unsigned int __init locate_register_space(struct pci_dev *dev)
+{
+	int rionodeid;
+	u32 address;
+
+	rionodeid = (dev->bus->number % 15 > 4) ? 3 : 2;
+	/*
+	 * register space address calculation as follows:
+	 * FE0MB-8MB*OneBasedChassisNumber+1MB*(RioNodeId-ChassisBase)
+	 * ChassisBase is always zero for x366/x260/x460
+	 * RioNodeId is 2 for first Calgary, 3 for second Calgary
+	 */
+	address = START_ADDRESS	-
+		(0x800000 * (ONE_BASED_CHASSIS_NUM + dev->bus->number / 15)) +
+		(0x100000) * (rionodeid - CHASSIS_BASE);
+	return address;
+}
+
+static int __init calgary_init_one_nontraslated(struct pci_dev *dev)
+{
+	dev->sysdata = NULL;
+	dev->bus->self = dev;
+
+	return 0;
+}
+
+static int __init calgary_init_one(struct pci_dev *dev)
+{
+	u32 address;
+	void __iomem *bbar;
+	int ret;
+
+	address = locate_register_space(dev);
+	/* map entire 1MB of Calgary config space */
+	bbar = ioremap_nocache(address, 1024 * 1024);
+	if (!bbar) {
+		ret = -ENODATA;
+		goto done;
+	}
+
+	ret = calgary_setup_tar(dev, bbar);
+	if (ret)
+		goto iounmap;
+
+	dev->bus->self = dev;
+	calgary_enable_translation(dev);
+
+	return 0;
+
+iounmap:
+	iounmap(bbar);
+done:
+	return ret;
+}
+
+static int __init calgary_init(void)
+{
+	int i, ret = -ENODEV;
+	struct pci_dev *dev = NULL;
+
+	for (i = 0; i <= num_online_nodes() * MAX_NUM_OF_PHBS; i++) {
+		dev = pci_get_device(PCI_VENDOR_ID_IBM,
+				     PCI_DEVICE_ID_IBM_CALGARY,
+				     dev);
+		if (!dev)
+			break;
+		if (!translate_phb(dev)) {
+			calgary_init_one_nontraslated(dev);
+			continue;
+		}
+		if (!tce_table_kva[i] && !translate_empty_slots) {
+			pci_dev_put(dev);
+			continue;
+		}
+		ret = calgary_init_one(dev);
+		if (ret)
+			goto error;
+	}
+
+	return ret;
+
+error:
+	for (i--; i >= 0; i--) {
+		dev = pci_find_device_reverse(PCI_VENDOR_ID_IBM,
+					      PCI_DEVICE_ID_IBM_CALGARY,
+					      dev);
+		if (!translate_phb(dev)) {
+			pci_dev_put(dev);
+			continue;
+		}
+		if (!tce_table_kva[i] && !translate_empty_slots)
+			continue;
+		calgary_disable_translation(dev);
+		calgary_free_tar(dev);
+		pci_dev_put(dev);
+	}
+
+	return ret;
+}
+
+static inline int __init determine_tce_table_size(u64 ram)
+{
+	int ret;
+
+	if (specified_table_size != TCE_TABLE_SIZE_UNSPECIFIED)
+		return specified_table_size;
+
+	/*
+	 * Table sizes are from 0 to 7 (TCE_TABLE_SIZE_64K to
+	 * TCE_TABLE_SIZE_8M). Table size 0 has 8K entries and each
+	 * larger table size has twice as many entries, so shift the
+	 * max ram address by 13 to divide by 8K and then look at the
+	 * order of the result to choose between 0-7.
+	 */
+	ret = get_order(ram >> 13);
+	if (ret > TCE_TABLE_SIZE_8M)
+		ret = TCE_TABLE_SIZE_8M;
+
+	return ret;
+}
+
+void __init detect_calgary(void)
+{
+	u32 val;
+	int bus, table_idx;
+	void *tbl;
+	int detected = 0;
+
+	/*
+	 * if the user specified iommu=off or iommu=soft or we found
+	 * another HW IOMMU already, bail out.
+	 */
+	if (swiotlb || no_iommu || iommu_detected)
+		return;
+
+	specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE);
+
+	for (bus = 0, table_idx = 0;
+	     bus <= num_online_nodes() * MAX_PHB_BUS_NUM;
+	     bus++) {
+		BUG_ON(bus > MAX_NUMNODES * MAX_PHB_BUS_NUM);
+		if (read_pci_config(bus, 0, 0, 0) != PCI_VENDOR_DEVICE_ID_CALGARY)
+			continue;
+		if (test_bit(bus, translation_disabled)) {
+			printk(KERN_INFO "Calgary: translation is disabled for "
+			       "PHB 0x%x\n", bus);
+			/* skip this phb, don't allocate a tbl for it */
+			tce_table_kva[table_idx] = NULL;
+			table_idx++;
+			continue;
+		}
+		/*
+		 * scan the first slot of the PCI bus to see if there
+		 * are any devices present
+		 */
+		val = read_pci_config(bus, 1, 0, 0);
+		if (val != 0xffffffff || translate_empty_slots) {
+			tbl = alloc_tce_table();
+			if (!tbl)
+				goto cleanup;
+			detected = 1;
+		} else
+			tbl = NULL;
+
+		tce_table_kva[table_idx] = tbl;
+		table_idx++;
+	}
+
+	if (detected) {
+		iommu_detected = 1;
+		calgary_detected = 1;
+		printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected. "
+		       "TCE table spec is %d.\n", specified_table_size);
+	}
+	return;
+
+cleanup:
+	for (--table_idx; table_idx >= 0; --table_idx)
+		if (tce_table_kva[table_idx])
+			free_tce_table(tce_table_kva[table_idx]);
+}
+
+int __init calgary_iommu_init(void)
+{
+	int ret;
+
+	if (no_iommu || swiotlb)
+		return -ENODEV;
+
+	if (!calgary_detected)
+		return -ENODEV;
+
+	/* ok, we're trying to use Calgary - let's roll */
+	printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n");
+
+	ret = calgary_init();
+	if (ret) {
+		printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
+		       "falling back to no_iommu\n", ret);
+		if (end_pfn > MAX_DMA32_PFN)
+			printk(KERN_ERR "WARNING more than 4GB of memory, "
+					"32bit PCI may malfunction.\n");
+		return ret;
+	}
+
+	force_iommu = 1;
+	dma_ops = &calgary_dma_ops;
+
+	return 0;
+}
+
+static int __init calgary_parse_options(char *p)
+{
+	unsigned int bridge;
+	size_t len;
+	char* endp;
+
+	while (*p) {
+		if (!strncmp(p, "64k", 3))
+			specified_table_size = TCE_TABLE_SIZE_64K;
+		else if (!strncmp(p, "128k", 4))
+			specified_table_size = TCE_TABLE_SIZE_128K;
+		else if (!strncmp(p, "256k", 4))
+			specified_table_size = TCE_TABLE_SIZE_256K;
+		else if (!strncmp(p, "512k", 4))
+			specified_table_size = TCE_TABLE_SIZE_512K;
+		else if (!strncmp(p, "1M", 2))
+			specified_table_size = TCE_TABLE_SIZE_1M;
+		else if (!strncmp(p, "2M", 2))
+			specified_table_size = TCE_TABLE_SIZE_2M;
+		else if (!strncmp(p, "4M", 2))
+			specified_table_size = TCE_TABLE_SIZE_4M;
+		else if (!strncmp(p, "8M", 2))
+			specified_table_size = TCE_TABLE_SIZE_8M;
+
+		len = strlen("translate_empty_slots");
+		if (!strncmp(p, "translate_empty_slots", len))
+			translate_empty_slots = 1;
+
+		len = strlen("disable");
+		if (!strncmp(p, "disable", len)) {
+			p += len;
+			if (*p == '=')
+				++p;
+			if (*p == '\0')
+				break;
+			bridge = simple_strtol(p, &endp, 0);
+			if (p == endp)
+				break;
+
+			if (bridge <= (num_online_nodes() * MAX_PHB_BUS_NUM)) {
+				printk(KERN_INFO "Calgary: disabling "
+				       "translation for PHB 0x%x\n", bridge);
+				set_bit(bridge, translation_disabled);
+			}
+		}
+
+		p = strpbrk(p, ",");
+		if (!p)
+			break;
+
+		p++; /* skip ',' */
+	}
+	return 1;
+}
+__setup("calgary=", calgary_parse_options);
diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c
index a45844c7e3a3..9c44f4f2433d 100644
--- a/arch/x86_64/kernel/pci-dma.c
+++ b/arch/x86_64/kernel/pci-dma.c
@@ -9,6 +9,7 @@
 #include <linux/module.h>
 #include <asm/io.h>
 #include <asm/proto.h>
+#include <asm/calgary.h>
 
 int iommu_merge __read_mostly = 0;
 EXPORT_SYMBOL(iommu_merge);
@@ -291,6 +292,10 @@ void __init pci_iommu_alloc(void)
 	iommu_hole_init();
 #endif
 
+#ifdef CONFIG_CALGARY_IOMMU
+	detect_calgary();
+#endif
+
 #ifdef CONFIG_SWIOTLB
 	pci_swiotlb_init();
 #endif
@@ -298,6 +303,10 @@ void __init pci_iommu_alloc(void)
 
 static int __init pci_iommu_init(void)
 {
+#ifdef CONFIG_CALGARY_IOMMU
+	calgary_iommu_init();
+#endif
+
 #ifdef CONFIG_IOMMU
 	gart_iommu_init();
 #endif
diff --git a/arch/x86_64/kernel/tce.c b/arch/x86_64/kernel/tce.c
new file mode 100644
index 000000000000..8d4c67f61b8e
--- /dev/null
+++ b/arch/x86_64/kernel/tce.c
@@ -0,0 +1,202 @@
+/*
+ * Derived from arch/powerpc/platforms/pseries/iommu.c
+ *
+ * Copyright (C) 2006 Jon Mason <jdmason@us.ibm.com>, IBM Corporation
+ * Copyright (C) 2006 Muli Ben-Yehuda <muli@il.ibm.com>, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/bootmem.h>
+#include <asm/tce.h>
+#include <asm/calgary.h>
+#include <asm/proto.h>
+
+/* flush a tce at 'tceaddr' to main memory */
+static inline void flush_tce(void* tceaddr)
+{
+	/* a single tce can't cross a cache line */
+	if (cpu_has_clflush)
+		asm volatile("clflush (%0)" :: "r" (tceaddr));
+	else
+		asm volatile("wbinvd":::"memory");
+}
+
+void tce_build(struct iommu_table *tbl, unsigned long index,
+	unsigned int npages, unsigned long uaddr, int direction)
+{
+	u64* tp;
+	u64 t;
+	u64 rpn;
+
+	t = (1 << TCE_READ_SHIFT);
+	if (direction != DMA_TO_DEVICE)
+		t |= (1 << TCE_WRITE_SHIFT);
+
+	tp = ((u64*)tbl->it_base) + index;
+
+	while (npages--) {
+		rpn = (virt_to_bus((void*)uaddr)) >> PAGE_SHIFT;
+		t &= ~TCE_RPN_MASK;
+		t |= (rpn << TCE_RPN_SHIFT);
+
+		*tp = cpu_to_be64(t);
+		flush_tce(tp);
+
+		uaddr += PAGE_SIZE;
+		tp++;
+	}
+}
+
+void tce_free(struct iommu_table *tbl, long index, unsigned int npages)
+{
+	u64* tp;
+
+	tp  = ((u64*)tbl->it_base) + index;
+
+	while (npages--) {
+		*tp = cpu_to_be64(0);
+		flush_tce(tp);
+		tp++;
+	}
+}
+
+static inline unsigned int table_size_to_number_of_entries(unsigned char size)
+{
+	/*
+	 * size is the order of the table, 0-7
+	 * smallest table is 8K entries, so shift result by 13 to
+	 * multiply by 8K
+	 */
+	return (1 << size) << 13;
+}
+
+static int tce_table_setparms(struct pci_dev *dev, struct iommu_table *tbl)
+{
+	unsigned int bitmapsz;
+	unsigned int tce_table_index;
+	unsigned long bmppages;
+	int ret;
+
+	tbl->it_busno = dev->bus->number;
+
+	/* set the tce table size - measured in entries */
+	tbl->it_size = table_size_to_number_of_entries(specified_table_size);
+
+	tce_table_index = bus_to_phb(tbl->it_busno);
+	tbl->it_base = (unsigned long)tce_table_kva[tce_table_index];
+	if (!tbl->it_base) {
+		printk(KERN_ERR "Calgary: iommu_table_setparms: "
+		       "no table allocated?!\n");
+		ret = -ENOMEM;
+		goto done;
+	}
+
+	/*
+	 * number of bytes needed for the bitmap size in number of
+	 * entries; we need one bit per entry
+	 */
+	bitmapsz = tbl->it_size / BITS_PER_BYTE;
+	bmppages = __get_free_pages(GFP_KERNEL, get_order(bitmapsz));
+	if (!bmppages) {
+		printk(KERN_ERR "Calgary: cannot allocate bitmap\n");
+		ret = -ENOMEM;
+		goto done;
+	}
+
+	tbl->it_map = (unsigned long*)bmppages;
+
+	memset(tbl->it_map, 0, bitmapsz);
+
+	tbl->it_hint = 0;
+
+	spin_lock_init(&tbl->it_lock);
+
+	return 0;
+
+done:
+	return ret;
+}
+
+int build_tce_table(struct pci_dev *dev, void __iomem *bbar)
+{
+	struct iommu_table *tbl;
+	int ret;
+
+	if (dev->sysdata) {
+		printk(KERN_ERR "Calgary: dev %p has sysdata %p\n",
+		       dev, dev->sysdata);
+		BUG();
+	}
+
+	tbl = kzalloc(sizeof(struct iommu_table), GFP_KERNEL);
+	if (!tbl) {
+		printk(KERN_ERR "Calgary: error allocating iommu_table\n");
+		ret = -ENOMEM;
+		goto done;
+	}
+
+	ret = tce_table_setparms(dev, tbl);
+	if (ret)
+		goto free_tbl;
+
+	tce_free(tbl, 0, tbl->it_size);
+
+	tbl->bbar = bbar;
+
+	/*
+	 * NUMA is already using the bus's sysdata pointer, so we use
+	 * the bus's pci_dev's sysdata instead.
+	 */
+	dev->sysdata = tbl;
+
+	return 0;
+
+free_tbl:
+	kfree(tbl);
+done:
+	return ret;
+}
+
+void* alloc_tce_table(void)
+{
+	unsigned int size;
+
+	size = table_size_to_number_of_entries(specified_table_size);
+	size *= TCE_ENTRY_SIZE;
+
+	return __alloc_bootmem_low(size, size, 0);
+}
+
+void free_tce_table(void *tbl)
+{
+	unsigned int size;
+
+	if (!tbl)
+		return;
+
+	size = table_size_to_number_of_entries(specified_table_size);
+	size *= TCE_ENTRY_SIZE;
+
+	free_bootmem(__pa(tbl), size);
+}
diff --git a/include/asm-x86_64/calgary.h b/include/asm-x86_64/calgary.h
new file mode 100644
index 000000000000..6e1654f30986
--- /dev/null
+++ b/include/asm-x86_64/calgary.h
@@ -0,0 +1,66 @@
+/*
+ * Derived from include/asm-powerpc/iommu.h
+ *
+ * Copyright (C) 2006 Jon Mason <jdmason@us.ibm.com>, IBM Corporation
+ * Copyright (C) 2006 Muli Ben-Yehuda <muli@il.ibm.com>, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#ifndef _ASM_X86_64_CALGARY_H
+#define _ASM_X86_64_CALGARY_H
+
+#include <linux/config.h>
+#include <linux/spinlock.h>
+#include <linux/device.h>
+#include <linux/dma-mapping.h>
+#include <asm/types.h>
+
+struct iommu_table {
+	unsigned long  it_base;      /* mapped address of tce table */
+	unsigned long  it_hint;      /* Hint for next alloc */
+	unsigned long *it_map;       /* A simple allocation bitmap for now */
+	spinlock_t     it_lock;      /* Protects it_map */
+	unsigned int   it_size;      /* Size of iommu table in entries */
+	unsigned char  it_busno;     /* Bus number this table belongs to */
+	void __iomem  *bbar;
+	u64	       tar_val;
+	struct timer_list watchdog_timer;
+};
+
+#define TCE_TABLE_SIZE_UNSPECIFIED	~0
+#define TCE_TABLE_SIZE_64K		0
+#define TCE_TABLE_SIZE_128K		1
+#define TCE_TABLE_SIZE_256K		2
+#define TCE_TABLE_SIZE_512K		3
+#define TCE_TABLE_SIZE_1M		4
+#define TCE_TABLE_SIZE_2M		5
+#define TCE_TABLE_SIZE_4M		6
+#define TCE_TABLE_SIZE_8M		7
+
+#ifdef CONFIG_CALGARY_IOMMU
+extern int calgary_iommu_init(void);
+extern void detect_calgary(void);
+#else
+static inline int calgary_iommu_init(void) { return 1; }
+static inline void detect_calgary(void) { return; }
+#endif
+
+static inline unsigned int bus_to_phb(unsigned char busno)
+{
+	return ((busno % 15 == 0) ? 0 : busno / 2 + 1);
+}
+
+#endif /* _ASM_X86_64_CALGARY_H */
diff --git a/include/asm-x86_64/pci.h b/include/asm-x86_64/pci.h
index 4dbc07c54f7a..49c5e9280598 100644
--- a/include/asm-x86_64/pci.h
+++ b/include/asm-x86_64/pci.h
@@ -52,7 +52,7 @@ extern int iommu_setup(char *opt);
  */
 #define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
 
-#ifdef CONFIG_IOMMU
+#if defined(CONFIG_IOMMU) || defined(CONFIG_CALGARY_IOMMU)
 
 /*
  * x86-64 always supports DAC, but sometimes it is useful to force
diff --git a/include/asm-x86_64/tce.h b/include/asm-x86_64/tce.h
new file mode 100644
index 000000000000..ee51d31528d6
--- /dev/null
+++ b/include/asm-x86_64/tce.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (C) 2006 Muli Ben-Yehuda <muli@il.ibm.com>, IBM Corporation
+ * Copyright (C) 2006 Jon Mason <jdmason@us.ibm.com>, IBM Corporation
+ *
+ * This file is derived from asm-powerpc/tce.h.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#ifndef _ASM_X86_64_TCE_H
+#define _ASM_X86_64_TCE_H
+
+extern void* tce_table_kva[];
+extern unsigned int specified_table_size;
+struct iommu_table;
+
+#define TCE_ENTRY_SIZE   8   /* in bytes */
+
+#define TCE_READ_SHIFT   0
+#define TCE_WRITE_SHIFT  1
+#define TCE_HUBID_SHIFT  2   /* unused */
+#define TCE_RSVD_SHIFT   8   /* unused */
+#define TCE_RPN_SHIFT    12
+#define TCE_UNUSED_SHIFT 48  /* unused */
+
+#define TCE_RPN_MASK     0x0000fffffffff000ULL
+
+extern void tce_build(struct iommu_table *tbl, unsigned long index,
+        unsigned int npages, unsigned long uaddr, int direction);
+extern void tce_free(struct iommu_table *tbl, long index, unsigned int npages);
+extern void* alloc_tce_table(void);
+extern void free_tce_table(void *tbl);
+extern int build_tce_table(struct pci_dev *dev, void __iomem *bbar);
+
+#endif /* _ASM_X86_64_TCE_H */
-- 
cgit v1.2.3


From f3fa8ebc25129bb69929e20b0c84049c39029d8d Mon Sep 17 00:00:00 2001
From: Rohit Seth <rohitseth@google.com>
Date: Mon, 26 Jun 2006 13:58:17 +0200
Subject: [PATCH] x86_64: moving phys_proc_id and cpu_core_id to cpuinfo_x86

Most of the fields of cpuinfo are defined in cpuinfo_x86 structure.
This patch moves the phys_proc_id and cpu_core_id for each processor to
cpuinfo_x86 structure as well.

Signed-off-by: Rohit Seth <rohitseth@google.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/mce_amd.c   | 10 +++++-----
 arch/x86_64/kernel/pci-gart.c  |  3 ---
 arch/x86_64/kernel/setup.c     | 27 +++++++++++++--------------
 arch/x86_64/kernel/smpboot.c   | 14 +++++---------
 include/asm-x86_64/processor.h |  4 ++++
 include/asm-x86_64/smp.h       |  2 --
 include/asm-x86_64/topology.h  |  6 ++----
 7 files changed, 29 insertions(+), 37 deletions(-)

(limited to 'include')

diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c
index d13b241ad094..86e1e022b20e 100644
--- a/arch/x86_64/kernel/mce_amd.c
+++ b/arch/x86_64/kernel/mce_amd.c
@@ -115,7 +115,7 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)
 		per_cpu(bank_map, cpu) |= (1 << bank);
 
 #ifdef CONFIG_SMP
-		if (shared_bank[bank] && cpu_core_id[cpu])
+		if (shared_bank[bank] && c->cpu_core_id)
 			continue;
 #endif
 
@@ -323,10 +323,10 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, int bank)
 	struct threshold_bank *b = NULL;
 
 #ifdef CONFIG_SMP
-	if (cpu_core_id[cpu] && shared_bank[bank]) {	/* symlink */
+	if (cpu_data[cpu].cpu_core_id && shared_bank[bank]) {	/* symlink */
 		char name[16];
 		unsigned lcpu = first_cpu(cpu_core_map[cpu]);
-		if (cpu_core_id[lcpu])
+		if (cpu_data[lcpu].cpu_core_id)
 			goto out;	/* first core not up yet */
 
 		b = per_cpu(threshold_banks, lcpu)[bank];
@@ -434,7 +434,7 @@ static __cpuinit int threshold_create_symlinks(unsigned int cpu)
 	int bank, err = 0;
 	unsigned int lcpu = 0;
 
-	if (cpu_core_id[cpu])
+	if (cpu_data[cpu].cpu_core_id)
 		return 0;
 	for_each_cpu_mask(lcpu, cpu_core_map[cpu]) {
 		if (lcpu == cpu)
@@ -455,7 +455,7 @@ static __cpuinit void threshold_remove_symlinks(unsigned int cpu)
 {
 	int bank;
 	unsigned int lcpu = 0;
-	if (cpu_core_id[cpu])
+	if (cpu_data[cpu].cpu_core_id)
 		return;
 	for_each_cpu_mask(lcpu, cpu_core_map[cpu]) {
 		if (lcpu == cpu)
diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c
index 9a93954bed37..4ca674d16b09 100644
--- a/arch/x86_64/kernel/pci-gart.c
+++ b/arch/x86_64/kernel/pci-gart.c
@@ -680,9 +680,6 @@ void __init gart_iommu_init(void)
 	dma_ops = &gart_dma_ops;
 } 
 
-/* Must execute after PCI subsystem */
-fs_initcall(gart_iommu_init);
-
 void gart_parse_options(char *p)
 {
 	int arg;
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 04b2d7b92d17..24aa25ee0d7d 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -785,9 +785,9 @@ static int nearby_node(int apicid)
 static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
 {
 #ifdef CONFIG_SMP
-	int cpu = smp_processor_id();
 	unsigned bits;
 #ifdef CONFIG_NUMA
+	int cpu = smp_processor_id();
 	int node = 0;
 	unsigned apicid = hard_smp_processor_id();
 #endif
@@ -805,12 +805,12 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
 	}
 
 	/* Low order bits define the core id (index of core in socket) */
-	cpu_core_id[cpu] = phys_proc_id[cpu] & ((1 << bits)-1);
+	c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1);
 	/* Convert the APIC ID into the socket ID */
-	phys_proc_id[cpu] = phys_pkg_id(bits);
+	c->phys_proc_id = phys_pkg_id(bits);
 
 #ifdef CONFIG_NUMA
-  	node = phys_proc_id[cpu];
+  	node = c->phys_proc_id;
  	if (apicid_to_node[apicid] != NUMA_NO_NODE)
  		node = apicid_to_node[apicid];
  	if (!node_online(node)) {
@@ -823,7 +823,7 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
  		   but in the same order as the HT nodeids.
  		   If that doesn't result in a usable node fall back to the
  		   path for the previous case.  */
- 		int ht_nodeid = apicid - (phys_proc_id[0] << bits);
+ 		int ht_nodeid = apicid - (cpu_data[0].phys_proc_id << bits);
  		if (ht_nodeid >= 0 &&
  		    apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
  			node = apicid_to_node[ht_nodeid];
@@ -834,7 +834,7 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
 	numa_set_node(cpu, node);
 
   	printk(KERN_INFO "CPU %d/%x(%d) -> Node %d -> Core %d\n",
-  			cpu, apicid, c->x86_max_cores, node, cpu_core_id[cpu]);
+  			cpu, apicid, c->x86_max_cores, node, c->cpu_core_id);
 #endif
 #endif
 }
@@ -905,7 +905,6 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
 #ifdef CONFIG_SMP
 	u32 	eax, ebx, ecx, edx;
 	int 	index_msb, core_bits;
-	int 	cpu = smp_processor_id();
 
 	cpuid(1, &eax, &ebx, &ecx, &edx);
 
@@ -926,10 +925,10 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
 		}
 
 		index_msb = get_count_order(smp_num_siblings);
-		phys_proc_id[cpu] = phys_pkg_id(index_msb);
+		c->phys_proc_id = phys_pkg_id(index_msb);
 
 		printk(KERN_INFO  "CPU: Physical Processor ID: %d\n",
-		       phys_proc_id[cpu]);
+		       c->phys_proc_id);
 
 		smp_num_siblings = smp_num_siblings / c->x86_max_cores;
 
@@ -937,12 +936,12 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
 
 		core_bits = get_count_order(c->x86_max_cores);
 
-		cpu_core_id[cpu] = phys_pkg_id(index_msb) &
+		c->cpu_core_id = phys_pkg_id(index_msb) &
 					       ((1 << core_bits) - 1);
 
 		if (c->x86_max_cores > 1)
 			printk(KERN_INFO  "CPU: Processor Core ID: %d\n",
-			       cpu_core_id[cpu]);
+			       c->cpu_core_id);
 	}
 #endif
 }
@@ -1080,7 +1079,7 @@ void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
 	}
 
 #ifdef CONFIG_SMP
-	phys_proc_id[smp_processor_id()] = (cpuid_ebx(1) >> 24) & 0xff;
+	c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
 #endif
 }
 
@@ -1288,9 +1287,9 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 #ifdef CONFIG_SMP
 	if (smp_num_siblings * c->x86_max_cores > 1) {
 		int cpu = c - cpu_data;
-		seq_printf(m, "physical id\t: %d\n", phys_proc_id[cpu]);
+		seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
 		seq_printf(m, "siblings\t: %d\n", cpus_weight(cpu_core_map[cpu]));
-		seq_printf(m, "core id\t\t: %d\n", cpu_core_id[cpu]);
+		seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
 		seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
 	}
 #endif	
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index 06535e7687ce..b1c10b154bfe 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -63,10 +63,6 @@
 
 /* Number of siblings per CPU package */
 int smp_num_siblings = 1;
-/* Package ID of each logical CPU */
-u8 phys_proc_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
-/* core ID of each logical CPU */
-u8 cpu_core_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
 
 /* Last level cache ID of each logical CPU */
 u8 cpu_llc_id[NR_CPUS] __cpuinitdata  = {[0 ... NR_CPUS-1] = BAD_APICID};
@@ -472,8 +468,8 @@ static inline void set_cpu_sibling_map(int cpu)
 
 	if (smp_num_siblings > 1) {
 		for_each_cpu_mask(i, cpu_sibling_setup_map) {
-			if (phys_proc_id[cpu] == phys_proc_id[i] &&
-			    cpu_core_id[cpu] == cpu_core_id[i]) {
+			if (c[cpu].phys_proc_id == c[i].phys_proc_id &&
+			    c[cpu].cpu_core_id == c[i].cpu_core_id) {
 				cpu_set(i, cpu_sibling_map[cpu]);
 				cpu_set(cpu, cpu_sibling_map[i]);
 				cpu_set(i, cpu_core_map[cpu]);
@@ -500,7 +496,7 @@ static inline void set_cpu_sibling_map(int cpu)
 			cpu_set(i, c[cpu].llc_shared_map);
 			cpu_set(cpu, c[i].llc_shared_map);
 		}
-		if (phys_proc_id[cpu] == phys_proc_id[i]) {
+		if (c[cpu].phys_proc_id == c[i].phys_proc_id) {
 			cpu_set(i, cpu_core_map[cpu]);
 			cpu_set(cpu, cpu_core_map[i]);
 			/*
@@ -1201,8 +1197,8 @@ static void remove_siblinginfo(int cpu)
 		cpu_clear(cpu, cpu_sibling_map[sibling]);
 	cpus_clear(cpu_sibling_map[cpu]);
 	cpus_clear(cpu_core_map[cpu]);
-	phys_proc_id[cpu] = BAD_APICID;
-	cpu_core_id[cpu] = BAD_APICID;
+	c[cpu].phys_proc_id = 0;
+	c[cpu].cpu_core_id = 0;
 	cpu_clear(cpu, cpu_sibling_setup_map);
 }
 
diff --git a/include/asm-x86_64/processor.h b/include/asm-x86_64/processor.h
index e583f0d95209..3b3c1217fe61 100644
--- a/include/asm-x86_64/processor.h
+++ b/include/asm-x86_64/processor.h
@@ -69,7 +69,11 @@ struct cpuinfo_x86 {
 	cpumask_t llc_shared_map;	/* cpus sharing the last level cache */
 #endif
 	__u8	apicid;
+#ifdef CONFIG_SMP
 	__u8	booted_cores;	/* number of cores as seen by OS */
+	__u8	phys_proc_id;	/* Physical Processor id. */
+	__u8	cpu_core_id;	/* Core id. */
+#endif
 } ____cacheline_aligned;
 
 #define X86_VENDOR_INTEL 0
diff --git a/include/asm-x86_64/smp.h b/include/asm-x86_64/smp.h
index 7686b9b25aef..6805e1feb300 100644
--- a/include/asm-x86_64/smp.h
+++ b/include/asm-x86_64/smp.h
@@ -53,8 +53,6 @@ extern int smp_call_function_single(int cpuid, void (*func) (void *info),
 
 extern cpumask_t cpu_sibling_map[NR_CPUS];
 extern cpumask_t cpu_core_map[NR_CPUS];
-extern u8 phys_proc_id[NR_CPUS];
-extern u8 cpu_core_id[NR_CPUS];
 extern u8 cpu_llc_id[NR_CPUS];
 
 #define SMP_TRAMPOLINE_BASE 0x6000
diff --git a/include/asm-x86_64/topology.h b/include/asm-x86_64/topology.h
index d4009f8dade0..c4e46e7fa7ba 100644
--- a/include/asm-x86_64/topology.h
+++ b/include/asm-x86_64/topology.h
@@ -55,10 +55,8 @@ extern int __node_distance(int, int);
 #endif
 
 #ifdef CONFIG_SMP
-#define topology_physical_package_id(cpu)				\
-	(phys_proc_id[cpu] == BAD_APICID ? -1 : phys_proc_id[cpu])
-#define topology_core_id(cpu)						\
-	(cpu_core_id[cpu] == BAD_APICID ? 0 : cpu_core_id[cpu])
+#define topology_physical_package_id(cpu)	(cpu_data[cpu].phys_proc_id)
+#define topology_core_id(cpu)			(cpu_data[cpu].cpu_core_id)
 #define topology_core_siblings(cpu)		(cpu_core_map[cpu])
 #define topology_thread_siblings(cpu)		(cpu_sibling_map[cpu])
 #endif
-- 
cgit v1.2.3


From 05ebb76109f302b949e745724bbf0f0634dba43f Mon Sep 17 00:00:00 2001
From: Vojtech Pavlik <vojtech@suse.cz>
Date: Mon, 26 Jun 2006 13:58:20 +0200
Subject: [PATCH] x86_64: Add useful constants to time.h

In timekeeping code, one often does need to use conversion constants. Naming
these leads to code that's easier to understand, showing the reader between
which units the conversion is made.

Signed-off-by: Vojtech Pavlik <vojtech@suse.cz>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/time.h | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/time.h b/include/linux/time.h
index 0cd696cee998..2fa2987c9a07 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -28,10 +28,13 @@ struct timezone {
 #ifdef __KERNEL__
 
 /* Parameters used to convert the timespec values: */
-#define MSEC_PER_SEC		1000L
-#define USEC_PER_SEC		1000000L
-#define NSEC_PER_SEC		1000000000L
-#define NSEC_PER_USEC		1000L
+#define MSEC_PER_SEC	1000L
+#define USEC_PER_MSEC	1000L
+#define NSEC_PER_USEC	1000L
+#define NSEC_PER_MSEC	1000000L
+#define USEC_PER_SEC	1000000L
+#define NSEC_PER_SEC	1000000000L
+#define FSEC_PER_SEC	1000000000000000L
 
 static inline int timespec_equal(struct timespec *a, struct timespec *b)
 {
-- 
cgit v1.2.3


From f8bf3c65a962530821fa9a9b4bad43d8c7e13574 Mon Sep 17 00:00:00 2001
From: Vojtech Pavlik <vojtech@suse.cz>
Date: Mon, 26 Jun 2006 13:58:23 +0200
Subject: [PATCH] x86_64: Rename oem_force_hpet_timer to apic_is_clustered_box

Rename oem_force_hpet_timer to apic_is_clustered_box, to give the
function a better fitting name - it really isn't at all about HPET.

Signed-off-by: Vojtech Pavlik <vojtech@suse.cz>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/apic.c | 7 +++----
 arch/x86_64/kernel/time.c | 4 ++--
 include/asm-x86_64/hpet.h | 2 +-
 3 files changed, 6 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c
index 29ef99001e05..53fc17d894e8 100644
--- a/arch/x86_64/kernel/apic.c
+++ b/arch/x86_64/kernel/apic.c
@@ -983,7 +983,7 @@ void smp_apic_timer_interrupt(struct pt_regs *regs)
 }
 
 /*
- * oem_force_hpet_timer -- force HPET mode for some boxes.
+ * apic_is_clustered_box() -- Check if we can expect good TSC
  *
  * Thus far, the major user of this is IBM's Summit2 series:
  *
@@ -991,7 +991,7 @@ void smp_apic_timer_interrupt(struct pt_regs *regs)
  * multi-chassis. Use available data to take a good guess.
  * If in doubt, go HPET.
  */
-__cpuinit int oem_force_hpet_timer(void)
+__cpuinit int apic_is_clustered_box(void)
 {
 	int i, clusters, zeros;
 	unsigned id;
@@ -1022,8 +1022,7 @@ __cpuinit int oem_force_hpet_timer(void)
 	}
 
 	/*
-	 * If clusters > 2, then should be multi-chassis.  Return 1 for HPET.
-	 * Else return 0 to use TSC.
+	 * If clusters > 2, then should be multi-chassis.
 	 * May have to revisit this when multi-core + hyperthreaded CPUs come
 	 * out, but AFAIK this will work even for them.
 	 */
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index 7392570f975d..54b14e3ef66f 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -956,10 +956,10 @@ void __init time_init(void)
 __cpuinit int unsynchronized_tsc(void)
 {
 #ifdef CONFIG_SMP
-	if (oem_force_hpet_timer())
+	if (apic_is_clustered_box())
 		return 1;
  	/* Intel systems are normally all synchronized. Exceptions
- 	   are handled in the OEM check above. */
+ 	   are handled in the check above. */
  	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
  		return 0;
 #endif
diff --git a/include/asm-x86_64/hpet.h b/include/asm-x86_64/hpet.h
index 18ff7ee9e774..b39098408b69 100644
--- a/include/asm-x86_64/hpet.h
+++ b/include/asm-x86_64/hpet.h
@@ -55,7 +55,7 @@
 
 extern int is_hpet_enabled(void);
 extern int hpet_rtc_timer_init(void);
-extern int oem_force_hpet_timer(void);
+extern int apic_is_clustered_box(void);
 
 extern int hpet_use_timer;
 
-- 
cgit v1.2.3


From 7b0e85012581a9a8b12a8191d365716bf3ca3d3c Mon Sep 17 00:00:00 2001
From: Vojtech Pavlik <vojtech@suse.cz>
Date: Mon, 26 Jun 2006 13:58:26 +0200
Subject: [PATCH] x86_64: Add X86_FEATURE_RDTSCP, fix rdtscp in /proc/cpuinfo

This patch adds the X86_FEATURE_RDTSCP #define, so that kernel code can
check for the feature easily and also fixes the location of the "rdtscp"
string in the cpuinfo tables.

Signed-off-by: Vojtech Pavlik <vojtech@suse.cz>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/setup.c      | 2 +-
 include/asm-x86_64/cpufeature.h | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 24aa25ee0d7d..923b76fb0aa6 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -1206,7 +1206,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 		NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
 		NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL,
-		NULL, "fxsr_opt", "rdtscp", NULL, NULL, "lm", "3dnowext", "3dnow",
+		NULL, "fxsr_opt", NULL, "rdtscp", NULL, "lm", "3dnowext", "3dnow",
 
 		/* Transmeta-defined */
 		"recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
diff --git a/include/asm-x86_64/cpufeature.h b/include/asm-x86_64/cpufeature.h
index afc44e557400..fbf5a19edbb3 100644
--- a/include/asm-x86_64/cpufeature.h
+++ b/include/asm-x86_64/cpufeature.h
@@ -46,6 +46,7 @@
 #define X86_FEATURE_SYSCALL	(1*32+11) /* SYSCALL/SYSRET */
 #define X86_FEATURE_MMXEXT	(1*32+22) /* AMD MMX extensions */
 #define X86_FEATURE_FXSR_OPT	(1*32+25) /* FXSR optimizations */
+#define X86_FEATURE_RDTSCP	(1*32+27) /* RDTSCP */
 #define X86_FEATURE_LM		(1*32+29) /* Long Mode (x86-64) */
 #define X86_FEATURE_3DNOWEXT	(1*32+30) /* AMD 3DNow! extensions */
 #define X86_FEATURE_3DNOW	(1*32+31) /* 3DNow! */
-- 
cgit v1.2.3


From 17fc14ff1bdbc393e1cf4f6fd1e1e53d72ab9fe5 Mon Sep 17 00:00:00 2001
From: Jacob Shin <jacob.w.shin@gmail.com>
Date: Mon, 26 Jun 2006 13:58:47 +0200
Subject: [PATCH] x86_64: apic support for extended apic interrupt

Add support for extended APIC LVT found in future AMD processors.

Signed-off-by: Jacob Shin <jacob.shin@amd.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/apic.c | 10 ++++------
 include/asm-x86_64/apic.h | 11 ++++++++++-
 2 files changed, 14 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c
index 53fc17d894e8..396e125cb212 100644
--- a/arch/x86_64/kernel/apic.c
+++ b/arch/x86_64/kernel/apic.c
@@ -909,15 +909,13 @@ int setup_profiling_timer(unsigned int multiplier)
 	return -EINVAL;
 }
 
-#ifdef CONFIG_X86_MCE_AMD
-void setup_threshold_lvt(unsigned long lvt_off)
+void setup_APIC_extened_lvt(unsigned char lvt_off, unsigned char vector,
+			    unsigned char msg_type, unsigned char mask)
 {
-	unsigned int v = 0;
-	unsigned long reg = (lvt_off << 4) + 0x500;
-	v |= THRESHOLD_APIC_VECTOR;
+	unsigned long reg = (lvt_off << 4) + K8_APIC_EXT_LVT_BASE;
+	unsigned int  v   = (mask << 16) | (msg_type << 8) | vector;
 	apic_write(reg, v);
 }
-#endif /* CONFIG_X86_MCE_AMD */
 
 #undef APIC_DIVISOR
 
diff --git a/include/asm-x86_64/apic.h b/include/asm-x86_64/apic.h
index c9e6c2501a48..9d43ac8519bf 100644
--- a/include/asm-x86_64/apic.h
+++ b/include/asm-x86_64/apic.h
@@ -84,9 +84,18 @@ extern void disable_APIC_timer(void);
 extern void enable_APIC_timer(void);
 extern void clustered_apic_check(void);
 
+extern void setup_APIC_extened_lvt(unsigned char lvt_off, unsigned char vector,
+				   unsigned char msg_type, unsigned char mask);
+
+#define K8_APIC_EXT_LVT_BASE    0x500
+#define K8_APIC_EXT_INT_MSG_FIX 0x0
+#define K8_APIC_EXT_INT_MSG_SMI 0x2
+#define K8_APIC_EXT_INT_MSG_NMI 0x4
+#define K8_APIC_EXT_INT_MSG_EXT 0x7
+#define K8_APIC_EXT_LVT_ENTRY_THRESHOLD    0
+
 extern int disable_timer_pin_1;
 
-extern void setup_threshold_lvt(unsigned long lvt_off);
 
 void smp_send_timer_broadcast_ipi(void);
 void switch_APIC_timer_to_ipi(void *cpumask);
-- 
cgit v1.2.3


From fff2e89f11dd9b9b45e9212bc543154ca3d028a1 Mon Sep 17 00:00:00 2001
From: Jacob Shin <jacob.w.shin@gmail.com>
Date: Mon, 26 Jun 2006 13:58:50 +0200
Subject: [PATCH] x86_64: mce_amd relocate sysfs files

Get rid of /sys/devices/system/threshold directory and move
mce_amd thresholding files into the machine sysfs directory --
/sys/devices/system/machinecheck.

AK: Fixed warning

Signed-off-by: Jacob Shin <jacob.shin@amd.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/mce.c     |  2 +-
 arch/x86_64/kernel/mce_amd.c | 40 ++++++++++------------------------------
 include/asm-x86_64/mce.h     |  2 ++
 3 files changed, 13 insertions(+), 31 deletions(-)

(limited to 'include')

diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index c69fc43cee7b..acd5816b1a6f 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -562,7 +562,7 @@ static struct sysdev_class mce_sysclass = {
 	set_kset_name("machinecheck"),
 };
 
-static DEFINE_PER_CPU(struct sys_device, device_mce);
+DEFINE_PER_CPU(struct sys_device, device_mce);
 
 /* Why are there no generic functions for this? */
 #define ACCESSOR(name, var, start) \
diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c
index 86e1e022b20e..b96682e5ff77 100644
--- a/arch/x86_64/kernel/mce_amd.c
+++ b/arch/x86_64/kernel/mce_amd.c
@@ -30,7 +30,7 @@
 #include <asm/idle.h>
 
 #define PFX "mce_threshold: "
-#define VERSION "version 1.00.9"
+#define VERSION "version 1.0.10"
 #define NR_BANKS 5
 #define THRESHOLD_MAX 0xFFF
 #define INT_TYPE_APIC 0x00020000
@@ -166,12 +166,6 @@ asmlinkage void mce_threshold_interrupt(void)
  * Sysfs Interface
  */
 
-static struct sysdev_class threshold_sysclass = {
-	set_kset_name("threshold"),
-};
-
-static DEFINE_PER_CPU(struct sys_device, device_threshold);
-
 struct threshold_attr {
         struct attribute attr;
         ssize_t(*show) (struct threshold_bank *, char *);
@@ -332,8 +326,8 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, int bank)
 		b = per_cpu(threshold_banks, lcpu)[bank];
 		if (!b)
 			goto out;
-		sprintf(name, "bank%i", bank);
-		err = sysfs_create_link(&per_cpu(device_threshold, cpu).kobj,
+		sprintf(name, "threshold_bank%i", bank);
+		err = sysfs_create_link(&per_cpu(device_mce, cpu).kobj,
 					&b->kobj, name);
 		if (err)
 			goto out;
@@ -353,8 +347,8 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, int bank)
 	b->bank = bank;
 	b->interrupt_enable = 0;
 	b->threshold_limit = THRESHOLD_MAX;
-	kobject_set_name(&b->kobj, "bank%i", bank);
-	b->kobj.parent = &per_cpu(device_threshold, cpu).kobj;
+	kobject_set_name(&b->kobj, "threshold_bank%i", bank);
+	b->kobj.parent = &per_cpu(device_mce, cpu).kobj;
 	b->kobj.ktype = &threshold_ktype;
 
 	err = kobject_register(&b->kobj);
@@ -373,12 +367,6 @@ static __cpuinit int threshold_create_device(unsigned int cpu)
 	int bank;
 	int err = 0;
 
-	per_cpu(device_threshold, cpu).id = cpu;
-	per_cpu(device_threshold, cpu).cls = &threshold_sysclass;
-	err = sysdev_register(&per_cpu(device_threshold, cpu));
-	if (err)
-		goto out;
-
 	for (bank = 0; bank < NR_BANKS; ++bank) {
 		if (!(per_cpu(bank_map, cpu) & 1 << bank))
 			continue;
@@ -407,8 +395,8 @@ static __cpuinit void threshold_remove_bank(unsigned int cpu, int bank)
 	if (!b)
 		return;
 	if (shared_bank[bank] && atomic_read(&b->kobj.kref.refcount) > 2) {
-		sprintf(name, "bank%i", bank);
-		sysfs_remove_link(&per_cpu(device_threshold, cpu).kobj, name);
+		sprintf(name, "threshold_bank%i", bank);
+		sysfs_remove_link(&per_cpu(device_mce, cpu).kobj, name);
 		per_cpu(threshold_banks, cpu)[bank] = NULL;
 	} else {
 		kobject_unregister(&b->kobj);
@@ -425,7 +413,6 @@ static __cpuinit void threshold_remove_device(unsigned int cpu)
 			continue;
 		threshold_remove_bank(cpu, bank);
 	}
-	sysdev_unregister(&per_cpu(device_threshold, cpu));
 }
 
 /* link all existing siblings when first core comes up */
@@ -518,23 +505,16 @@ static struct notifier_block threshold_cpu_notifier = {
 
 static __init int threshold_init_device(void)
 {
-	int err;
 	int lcpu = 0;
 
-	err = sysdev_class_register(&threshold_sysclass);
-	if (err)
-		goto out;
-
 	/* to hit CPUs online before the notifier is up */
 	for_each_online_cpu(lcpu) {
-		err = threshold_create_device(lcpu);
+		int err = threshold_create_device(lcpu);
 		if (err)
-			goto out;
+			return err;
 	}
 	register_cpu_notifier(&threshold_cpu_notifier);
-
-      out:
-	return err;
+	return 0;
 }
 
 device_initcall(threshold_init_device);
diff --git a/include/asm-x86_64/mce.h b/include/asm-x86_64/mce.h
index 7229785094e3..08f721e1b7e8 100644
--- a/include/asm-x86_64/mce.h
+++ b/include/asm-x86_64/mce.h
@@ -74,6 +74,8 @@ struct mce_log {
 #include <asm/atomic.h>
 
 void mce_log(struct mce *m);
+DECLARE_PER_CPU(struct sys_device, device_mce);
+
 #ifdef CONFIG_X86_MCE_INTEL
 void mce_intel_feature_init(struct cpuinfo_x86 *c);
 #else
-- 
cgit v1.2.3


From 95268664390b19962ed41a3506c5bc8149db71e8 Mon Sep 17 00:00:00 2001
From: Jacob Shin <jacob.w.shin@gmail.com>
Date: Mon, 26 Jun 2006 13:58:53 +0200
Subject: [PATCH] x86_64: mce_amd support for family 0x10 processors

Add support for mce threshold registers found in future
AMD family 0x10 processors.  Backwards compatible with
family 0xF hardware.

AK: fixed build on !SMP

Signed-off-by: Jacob Shin <jacob.shin@amd.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/mce_amd.c | 362 ++++++++++++++++++++++++++++++++++---------
 include/asm-x86_64/mce.h     |  11 +-
 2 files changed, 297 insertions(+), 76 deletions(-)

(limited to 'include')

diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c
index b96682e5ff77..10ffbe52939c 100644
--- a/arch/x86_64/kernel/mce_amd.c
+++ b/arch/x86_64/kernel/mce_amd.c
@@ -1,5 +1,5 @@
 /*
- *  (c) 2005 Advanced Micro Devices, Inc.
+ *  (c) 2005, 2006 Advanced Micro Devices, Inc.
  *  Your use of this code is subject to the terms and conditions of the
  *  GNU general public license version 2. See "COPYING" or
  *  http://www.gnu.org/licenses/gpl.html
@@ -8,9 +8,10 @@
  *
  *  Support : jacob.shin@amd.com
  *
- *  MC4_MISC0 DRAM ECC Error Threshold available under AMD K8 Rev F.
- *  MC4_MISC0 exists per physical processor.
+ *  April 2006
+ *     - added support for AMD Family 0x10 processors
  *
+ *  All MC4_MISCi registers are shared between multi-cores
  */
 
 #include <linux/cpu.h>
@@ -30,8 +31,9 @@
 #include <asm/idle.h>
 
 #define PFX "mce_threshold: "
-#define VERSION "version 1.0.10"
-#define NR_BANKS 5
+#define VERSION "version 1.1.0"
+#define NR_BANKS 6
+#define NR_BLOCKS 9
 #define THRESHOLD_MAX 0xFFF
 #define INT_TYPE_APIC 0x00020000
 #define MASK_VALID_HI 0x80000000
@@ -40,21 +42,33 @@
 #define MASK_INT_TYPE_HI 0x00060000
 #define MASK_OVERFLOW_HI 0x00010000
 #define MASK_ERR_COUNT_HI 0x00000FFF
-#define MASK_OVERFLOW 0x0001000000000000L
+#define MASK_BLKPTR_LO    0xFF000000
+#define MCG_XBLK_ADDR     0xC0000400
 
-struct threshold_bank {
+struct threshold_block {
+	unsigned int block;
+	unsigned int bank;
 	unsigned int cpu;
-	u8 bank;
-	u8 interrupt_enable;
+	u32 address;
+	u16 interrupt_enable;
 	u16 threshold_limit;
 	struct kobject kobj;
+	struct list_head miscj;
 };
 
-static struct threshold_bank threshold_defaults = {
+/* defaults used early on boot */
+static struct threshold_block threshold_defaults = {
 	.interrupt_enable = 0,
 	.threshold_limit = THRESHOLD_MAX,
 };
 
+struct threshold_bank {
+	struct kobject kobj;
+	struct threshold_block *blocks;
+	cpumask_t cpus;
+};
+static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]);
+
 #ifdef CONFIG_SMP
 static unsigned char shared_bank[NR_BANKS] = {
 	0, 0, 0, 0, 1
@@ -68,12 +82,12 @@ static DEFINE_PER_CPU(unsigned char, bank_map);	/* see which banks are on */
  */
 
 /* must be called with correct cpu affinity */
-static void threshold_restart_bank(struct threshold_bank *b,
+static void threshold_restart_bank(struct threshold_block *b,
 				   int reset, u16 old_limit)
 {
 	u32 mci_misc_hi, mci_misc_lo;
 
-	rdmsr(MSR_IA32_MC0_MISC + b->bank * 4, mci_misc_lo, mci_misc_hi);
+	rdmsr(b->address, mci_misc_lo, mci_misc_hi);
 
 	if (b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX))
 		reset = 1;	/* limit cannot be lower than err count */
@@ -94,35 +108,57 @@ static void threshold_restart_bank(struct threshold_bank *b,
 	    (mci_misc_hi &= ~MASK_INT_TYPE_HI);
 
 	mci_misc_hi |= MASK_COUNT_EN_HI;
-	wrmsr(MSR_IA32_MC0_MISC + b->bank * 4, mci_misc_lo, mci_misc_hi);
+	wrmsr(b->address, mci_misc_lo, mci_misc_hi);
 }
 
+/* cpu init entry point, called from mce.c with preempt off */
 void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)
 {
-	int bank;
-	u32 mci_misc_lo, mci_misc_hi;
+	unsigned int bank, block;
 	unsigned int cpu = smp_processor_id();
+	u32 low = 0, high = 0, address = 0;
 
 	for (bank = 0; bank < NR_BANKS; ++bank) {
-		rdmsr(MSR_IA32_MC0_MISC + bank * 4, mci_misc_lo, mci_misc_hi);
+		for (block = 0; block < NR_BLOCKS; ++block) {
+			if (block == 0)
+				address = MSR_IA32_MC0_MISC + bank * 4;
+			else if (block == 1)
+				address = MCG_XBLK_ADDR
+					+ ((low & MASK_BLKPTR_LO) >> 21);
+			else
+				++address;
+
+			if (rdmsr_safe(address, &low, &high))
+				continue;
 
-		/* !valid, !counter present, bios locked */
-		if (!(mci_misc_hi & MASK_VALID_HI) ||
-		    !(mci_misc_hi & MASK_VALID_HI >> 1) ||
-		    (mci_misc_hi & MASK_VALID_HI >> 2))
-			continue;
+			if (!(high & MASK_VALID_HI)) {
+				if (block)
+					continue;
+				else
+					break;
+			}
 
-		per_cpu(bank_map, cpu) |= (1 << bank);
+			if (!(high & MASK_VALID_HI >> 1)  ||
+			     (high & MASK_VALID_HI >> 2))
+				continue;
 
+			if (!block)
+				per_cpu(bank_map, cpu) |= (1 << bank);
 #ifdef CONFIG_SMP
-		if (shared_bank[bank] && c->cpu_core_id)
-			continue;
+			if (shared_bank[bank] && c->cpu_core_id)
+				break;
 #endif
+			high &= ~MASK_LVTOFF_HI;
+			high |= K8_APIC_EXT_LVT_ENTRY_THRESHOLD << 20;
+			wrmsr(address, low, high);
+
+			setup_APIC_extened_lvt(K8_APIC_EXT_LVT_ENTRY_THRESHOLD,
+					       THRESHOLD_APIC_VECTOR,
+					       K8_APIC_EXT_INT_MSG_FIX, 0);
 
-		setup_threshold_lvt((mci_misc_hi & MASK_LVTOFF_HI) >> 20);
-		threshold_defaults.cpu = cpu;
-		threshold_defaults.bank = bank;
-		threshold_restart_bank(&threshold_defaults, 0, 0);
+			threshold_defaults.address = address;
+			threshold_restart_bank(&threshold_defaults, 0, 0);
+		}
 	}
 }
 
@@ -137,8 +173,9 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)
  */
 asmlinkage void mce_threshold_interrupt(void)
 {
-	int bank;
+	unsigned int bank, block;
 	struct mce m;
+	u32 low = 0, high = 0, address = 0;
 
 	ack_APIC_irq();
 	exit_idle();
@@ -150,12 +187,39 @@ asmlinkage void mce_threshold_interrupt(void)
 
 	/* assume first bank caused it */
 	for (bank = 0; bank < NR_BANKS; ++bank) {
-		m.bank = MCE_THRESHOLD_BASE + bank;
-		rdmsrl(MSR_IA32_MC0_MISC + bank * 4, m.misc);
+		for (block = 0; block < NR_BLOCKS; ++block) {
+			if (block == 0)
+				address = MSR_IA32_MC0_MISC + bank * 4;
+			else if (block == 1)
+				address = MCG_XBLK_ADDR
+					+ ((low & MASK_BLKPTR_LO) >> 21);
+			else
+				++address;
+
+			if (rdmsr_safe(address, &low, &high))
+				continue;
 
-		if (m.misc & MASK_OVERFLOW) {
-			mce_log(&m);
-			goto out;
+			if (!(high & MASK_VALID_HI)) {
+				if (block)
+					continue;
+				else
+					break;
+			}
+
+			if (!(high & MASK_VALID_HI >> 1)  ||
+			     (high & MASK_VALID_HI >> 2))
+				continue;
+
+			if (high & MASK_OVERFLOW_HI) {
+				rdmsrl(address, m.misc);
+				rdmsrl(MSR_IA32_MC0_STATUS + bank * 4,
+				       m.status);
+				m.bank = K8_MCE_THRESHOLD_BASE
+				       + bank * NR_BLOCKS
+				       + block;
+				mce_log(&m);
+				goto out;
+			}
 		}
 	}
       out:
@@ -168,12 +232,10 @@ asmlinkage void mce_threshold_interrupt(void)
 
 struct threshold_attr {
         struct attribute attr;
-        ssize_t(*show) (struct threshold_bank *, char *);
-        ssize_t(*store) (struct threshold_bank *, const char *, size_t count);
+	ssize_t(*show) (struct threshold_block *, char *);
+	ssize_t(*store) (struct threshold_block *, const char *, size_t count);
 };
 
-static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]);
-
 static cpumask_t affinity_set(unsigned int cpu)
 {
 	cpumask_t oldmask = current->cpus_allowed;
@@ -189,14 +251,14 @@ static void affinity_restore(cpumask_t oldmask)
 }
 
 #define SHOW_FIELDS(name) \
-        static ssize_t show_ ## name(struct threshold_bank * b, char *buf) \
+        static ssize_t show_ ## name(struct threshold_block * b, char *buf) \
         { \
                 return sprintf(buf, "%lx\n", (unsigned long) b->name); \
         }
 SHOW_FIELDS(interrupt_enable)
 SHOW_FIELDS(threshold_limit)
 
-static ssize_t store_interrupt_enable(struct threshold_bank *b,
+static ssize_t store_interrupt_enable(struct threshold_block *b,
 				      const char *buf, size_t count)
 {
 	char *end;
@@ -213,7 +275,7 @@ static ssize_t store_interrupt_enable(struct threshold_bank *b,
 	return end - buf;
 }
 
-static ssize_t store_threshold_limit(struct threshold_bank *b,
+static ssize_t store_threshold_limit(struct threshold_block *b,
 				     const char *buf, size_t count)
 {
 	char *end;
@@ -236,18 +298,18 @@ static ssize_t store_threshold_limit(struct threshold_bank *b,
 	return end - buf;
 }
 
-static ssize_t show_error_count(struct threshold_bank *b, char *buf)
+static ssize_t show_error_count(struct threshold_block *b, char *buf)
 {
 	u32 high, low;
 	cpumask_t oldmask;
 	oldmask = affinity_set(b->cpu);
-	rdmsr(MSR_IA32_MC0_MISC + b->bank * 4, low, high); /* ignore low 32 */
+	rdmsr(b->address, low, high);
 	affinity_restore(oldmask);
 	return sprintf(buf, "%x\n",
 		       (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit));
 }
 
-static ssize_t store_error_count(struct threshold_bank *b,
+static ssize_t store_error_count(struct threshold_block *b,
 				 const char *buf, size_t count)
 {
 	cpumask_t oldmask;
@@ -278,12 +340,12 @@ static struct attribute *default_attrs[] = {
 	NULL
 };
 
-#define to_bank(k) container_of(k,struct threshold_bank,kobj)
+#define to_block(k) container_of(k, struct threshold_block, kobj)
 #define to_attr(a) container_of(a,struct threshold_attr,attr)
 
 static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
 {
-	struct threshold_bank *b = to_bank(kobj);
+	struct threshold_block *b = to_block(kobj);
 	struct threshold_attr *a = to_attr(attr);
 	ssize_t ret;
 	ret = a->show ? a->show(b, buf) : -EIO;
@@ -293,7 +355,7 @@ static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
 static ssize_t store(struct kobject *kobj, struct attribute *attr,
 		     const char *buf, size_t count)
 {
-	struct threshold_bank *b = to_bank(kobj);
+	struct threshold_block *b = to_block(kobj);
 	struct threshold_attr *a = to_attr(attr);
 	ssize_t ret;
 	ret = a->store ? a->store(b, buf, count) : -EIO;
@@ -310,53 +372,164 @@ static struct kobj_type threshold_ktype = {
 	.default_attrs = default_attrs,
 };
 
+static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
+					       unsigned int bank,
+					       unsigned int block,
+					       u32 address)
+{
+	int err;
+	u32 low, high;
+	struct threshold_block *b = NULL;
+
+	if ((bank >= NR_BANKS) || (block >= NR_BLOCKS))
+		return 0;
+
+	if (rdmsr_safe(address, &low, &high))
+		goto recurse;
+
+	if (!(high & MASK_VALID_HI)) {
+		if (block)
+			goto recurse;
+		else
+			return 0;
+	}
+
+	if (!(high & MASK_VALID_HI >> 1)  ||
+	     (high & MASK_VALID_HI >> 2))
+		goto recurse;
+
+	b = kzalloc(sizeof(struct threshold_block), GFP_KERNEL);
+	if (!b)
+		return -ENOMEM;
+	memset(b, 0, sizeof(struct threshold_block));
+
+	b->block = block;
+	b->bank = bank;
+	b->cpu = cpu;
+	b->address = address;
+	b->interrupt_enable = 0;
+	b->threshold_limit = THRESHOLD_MAX;
+
+	INIT_LIST_HEAD(&b->miscj);
+
+	if (per_cpu(threshold_banks, cpu)[bank]->blocks)
+		list_add(&b->miscj,
+			 &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj);
+	else
+		per_cpu(threshold_banks, cpu)[bank]->blocks = b;
+
+	kobject_set_name(&b->kobj, "misc%i", block);
+	b->kobj.parent = &per_cpu(threshold_banks, cpu)[bank]->kobj;
+	b->kobj.ktype = &threshold_ktype;
+	err = kobject_register(&b->kobj);
+	if (err)
+		goto out_free;
+recurse:
+	if (!block) {
+		address = (low & MASK_BLKPTR_LO) >> 21;
+		if (!address)
+			return 0;
+		address += MCG_XBLK_ADDR;
+	} else
+		++address;
+
+	err = allocate_threshold_blocks(cpu, bank, ++block, address);
+	if (err)
+		goto out_free;
+
+	return err;
+
+out_free:
+	if (b) {
+		kobject_unregister(&b->kobj);
+		kfree(b);
+	}
+	return err;
+}
+
 /* symlinks sibling shared banks to first core.  first core owns dir/files. */
-static __cpuinit int threshold_create_bank(unsigned int cpu, int bank)
+static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 {
-	int err = 0;
+	int i, err = 0;
 	struct threshold_bank *b = NULL;
+	cpumask_t oldmask = CPU_MASK_NONE;
+	char name[32];
+
+	sprintf(name, "threshold_bank%i", bank);
 
 #ifdef CONFIG_SMP
 	if (cpu_data[cpu].cpu_core_id && shared_bank[bank]) {	/* symlink */
-		char name[16];
-		unsigned lcpu = first_cpu(cpu_core_map[cpu]);
-		if (cpu_data[lcpu].cpu_core_id)
-			goto out;	/* first core not up yet */
+		i = first_cpu(cpu_core_map[cpu]);
+
+		/* first core not up yet */
+		if (cpu_data[i].cpu_core_id)
+			goto out;
+
+		/* already linked */
+		if (per_cpu(threshold_banks, cpu)[bank])
+			goto out;
+
+		b = per_cpu(threshold_banks, i)[bank];
 
-		b = per_cpu(threshold_banks, lcpu)[bank];
 		if (!b)
 			goto out;
-		sprintf(name, "threshold_bank%i", bank);
+
 		err = sysfs_create_link(&per_cpu(device_mce, cpu).kobj,
 					&b->kobj, name);
 		if (err)
 			goto out;
+
+		b->cpus = cpu_core_map[cpu];
 		per_cpu(threshold_banks, cpu)[bank] = b;
 		goto out;
 	}
 #endif
 
-	b = kmalloc(sizeof(struct threshold_bank), GFP_KERNEL);
+	b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL);
 	if (!b) {
 		err = -ENOMEM;
 		goto out;
 	}
 	memset(b, 0, sizeof(struct threshold_bank));
 
-	b->cpu = cpu;
-	b->bank = bank;
-	b->interrupt_enable = 0;
-	b->threshold_limit = THRESHOLD_MAX;
 	kobject_set_name(&b->kobj, "threshold_bank%i", bank);
 	b->kobj.parent = &per_cpu(device_mce, cpu).kobj;
-	b->kobj.ktype = &threshold_ktype;
-
+#ifndef CONFIG_SMP
+	b->cpus = CPU_MASK_ALL;
+#else
+	b->cpus = cpu_core_map[cpu];
+#endif
 	err = kobject_register(&b->kobj);
-	if (err) {
-		kfree(b);
-		goto out;
-	}
+	if (err)
+		goto out_free;
+
 	per_cpu(threshold_banks, cpu)[bank] = b;
+
+	oldmask = affinity_set(cpu);
+	err = allocate_threshold_blocks(cpu, bank, 0,
+					MSR_IA32_MC0_MISC + bank * 4);
+	affinity_restore(oldmask);
+
+	if (err)
+		goto out_free;
+
+	for_each_cpu_mask(i, b->cpus) {
+		if (i == cpu)
+			continue;
+
+		err = sysfs_create_link(&per_cpu(device_mce, i).kobj,
+					&b->kobj, name);
+		if (err)
+			goto out;
+
+		per_cpu(threshold_banks, i)[bank] = b;
+	}
+
+	goto out;
+
+out_free:
+	per_cpu(threshold_banks, cpu)[bank] = NULL;
+	kfree(b);
       out:
 	return err;
 }
@@ -385,23 +558,64 @@ static __cpuinit int threshold_create_device(unsigned int cpu)
  *   of shared sysfs dir/files, and rest of the cores will be symlinked to it.
  */
 
-/* cpu hotplug call removes all symlinks before first core dies */
+static __cpuinit void deallocate_threshold_block(unsigned int cpu,
+						 unsigned int bank)
+{
+	struct threshold_block *pos = NULL;
+	struct threshold_block *tmp = NULL;
+	struct threshold_bank *head = per_cpu(threshold_banks, cpu)[bank];
+
+	if (!head)
+		return;
+
+	list_for_each_entry_safe(pos, tmp, &head->blocks->miscj, miscj) {
+		kobject_unregister(&pos->kobj);
+		list_del(&pos->miscj);
+		kfree(pos);
+	}
+
+	kfree(per_cpu(threshold_banks, cpu)[bank]->blocks);
+	per_cpu(threshold_banks, cpu)[bank]->blocks = NULL;
+}
+
 static __cpuinit void threshold_remove_bank(unsigned int cpu, int bank)
 {
+	int i = 0;
 	struct threshold_bank *b;
-	char name[16];
+	char name[32];
 
 	b = per_cpu(threshold_banks, cpu)[bank];
+
 	if (!b)
 		return;
-	if (shared_bank[bank] && atomic_read(&b->kobj.kref.refcount) > 2) {
-		sprintf(name, "threshold_bank%i", bank);
+
+	if (!b->blocks)
+		goto free_out;
+
+	sprintf(name, "threshold_bank%i", bank);
+
+	/* sibling symlink */
+	if (shared_bank[bank] && b->blocks->cpu != cpu) {
 		sysfs_remove_link(&per_cpu(device_mce, cpu).kobj, name);
-		per_cpu(threshold_banks, cpu)[bank] = NULL;
-	} else {
-		kobject_unregister(&b->kobj);
-		kfree(per_cpu(threshold_banks, cpu)[bank]);
+		per_cpu(threshold_banks, i)[bank] = NULL;
+		return;
 	}
+
+	/* remove all sibling symlinks before unregistering */
+	for_each_cpu_mask(i, b->cpus) {
+		if (i == cpu)
+			continue;
+
+		sysfs_remove_link(&per_cpu(device_mce, i).kobj, name);
+		per_cpu(threshold_banks, i)[bank] = NULL;
+	}
+
+	deallocate_threshold_block(cpu, bank);
+
+free_out:
+	kobject_unregister(&b->kobj);
+	kfree(b);
+	per_cpu(threshold_banks, cpu)[bank] = NULL;
 }
 
 static __cpuinit void threshold_remove_device(unsigned int cpu)
diff --git a/include/asm-x86_64/mce.h b/include/asm-x86_64/mce.h
index 08f721e1b7e8..d13687dfd691 100644
--- a/include/asm-x86_64/mce.h
+++ b/include/asm-x86_64/mce.h
@@ -67,8 +67,15 @@ struct mce_log {
 /* Software defined banks */
 #define MCE_EXTENDED_BANK	128
 #define MCE_THERMAL_BANK	MCE_EXTENDED_BANK + 0
-#define MCE_THRESHOLD_BASE      MCE_EXTENDED_BANK + 1 /* MCE_AMD */
-#define MCE_THRESHOLD_DRAM_ECC  MCE_THRESHOLD_BASE + 4
+
+#define K8_MCE_THRESHOLD_BASE      (MCE_EXTENDED_BANK + 1)      /* MCE_AMD */
+#define K8_MCE_THRESHOLD_BANK_0    (MCE_THRESHOLD_BASE + 0 * 9)
+#define K8_MCE_THRESHOLD_BANK_1    (MCE_THRESHOLD_BASE + 1 * 9)
+#define K8_MCE_THRESHOLD_BANK_2    (MCE_THRESHOLD_BASE + 2 * 9)
+#define K8_MCE_THRESHOLD_BANK_3    (MCE_THRESHOLD_BASE + 3 * 9)
+#define K8_MCE_THRESHOLD_BANK_4    (MCE_THRESHOLD_BASE + 4 * 9)
+#define K8_MCE_THRESHOLD_BANK_5    (MCE_THRESHOLD_BASE + 5 * 9)
+#define K8_MCE_THRESHOLD_DRAM_ECC  (MCE_THRESHOLD_BANK_4 + 0)
 
 #ifdef __KERNEL__
 #include <asm/atomic.h>
-- 
cgit v1.2.3


From 495ab9c045e1b0e5c82951b762257fe1c9d81564 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Mon, 26 Jun 2006 13:59:11 +0200
Subject: [PATCH] i386/x86-64/ia64: Move polling flag into thread_info_status

During some profiling I noticed that default_idle causes a lot of
memory traffic. I think that is caused by the atomic operations
to clear/set the polling flag in thread_info. There is actually
no reason to make this atomic - only the idle thread does it
to itself, other CPUs only read it. So I moved it into ti->status.

Converted i386/x86-64/ia64 for now because that was the easiest
way to fix ACPI which also manipulates these flags in its idle
function.

Cc: Nick Piggin <npiggin@novell.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Len Brown <len.brown@intel.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/apm.c           |  6 +++---
 arch/i386/kernel/process.c       |  6 +++---
 arch/ia64/kernel/process.c       |  4 ++--
 arch/x86_64/kernel/process.c     |  7 +++----
 drivers/acpi/processor_idle.c    | 12 ++++++------
 include/asm-i386/thread_info.h   |  7 ++++---
 include/asm-ia64/thread_info.h   |  5 +++++
 include/asm-x86_64/thread_info.h |  6 ++++--
 kernel/sched.c                   |  9 +++++++--
 9 files changed, 37 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c
index 9e819eb68229..7c5729d1fd06 100644
--- a/arch/i386/kernel/apm.c
+++ b/arch/i386/kernel/apm.c
@@ -764,9 +764,9 @@ static int apm_do_idle(void)
 	int	idled = 0;
 	int	polling;
 
-	polling = test_thread_flag(TIF_POLLING_NRFLAG);
+	polling = !!(current_thread_info()->status & TS_POLLING);
 	if (polling) {
-		clear_thread_flag(TIF_POLLING_NRFLAG);
+		current_thread_info()->status &= ~TS_POLLING;
 		smp_mb__after_clear_bit();
 	}
 	if (!need_resched()) {
@@ -774,7 +774,7 @@ static int apm_do_idle(void)
 		ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax);
 	}
 	if (polling)
-		set_thread_flag(TIF_POLLING_NRFLAG);
+		current_thread_info()->status |= TS_POLLING;
 
 	if (!idled)
 		return 0;
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index 525432e3fef7..6946b06e2784 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -102,7 +102,7 @@ void default_idle(void)
 	local_irq_enable();
 
 	if (!hlt_counter && boot_cpu_data.hlt_works_ok) {
-		clear_thread_flag(TIF_POLLING_NRFLAG);
+		current_thread_info()->status &= ~TS_POLLING;
 		smp_mb__after_clear_bit();
 		while (!need_resched()) {
 			local_irq_disable();
@@ -111,7 +111,7 @@ void default_idle(void)
 			else
 				local_irq_enable();
 		}
-		set_thread_flag(TIF_POLLING_NRFLAG);
+		current_thread_info()->status |= TS_POLLING;
 	} else {
 		while (!need_resched())
 			cpu_relax();
@@ -174,7 +174,7 @@ void cpu_idle(void)
 {
 	int cpu = smp_processor_id();
 
-	set_thread_flag(TIF_POLLING_NRFLAG);
+	current_thread_info()->status |= TS_POLLING;
 
 	/* endless idle loop with no priority at all */
 	while (1) {
diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c
index 355d57970ba3..b045c279136c 100644
--- a/arch/ia64/kernel/process.c
+++ b/arch/ia64/kernel/process.c
@@ -272,9 +272,9 @@ cpu_idle (void)
 	/* endless idle loop with no priority at all */
 	while (1) {
 		if (can_do_pal_halt)
-			clear_thread_flag(TIF_POLLING_NRFLAG);
+			current_thread_info()->status &= ~TS_POLLING;
 		else
-			set_thread_flag(TIF_POLLING_NRFLAG);
+			current_thread_info()->status |= TS_POLLING;
 
 		if (!need_resched()) {
 			void (*idle)(void);
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index d6fa41459c80..b596837a1527 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -110,7 +110,7 @@ static void default_idle(void)
 {
 	local_irq_enable();
 
-	clear_thread_flag(TIF_POLLING_NRFLAG);
+	current_thread_info()->status &= ~TS_POLLING;
 	smp_mb__after_clear_bit();
 	while (!need_resched()) {
 		local_irq_disable();
@@ -119,7 +119,7 @@ static void default_idle(void)
 		else
 			local_irq_enable();
 	}
-	set_thread_flag(TIF_POLLING_NRFLAG);
+	current_thread_info()->status |= TS_POLLING;
 }
 
 /*
@@ -202,8 +202,7 @@ static inline void play_dead(void)
  */
 void cpu_idle (void)
 {
-	set_thread_flag(TIF_POLLING_NRFLAG);
-
+	current_thread_info()->status |= TS_POLLING;
 	/* endless idle loop with no priority at all */
 	while (1) {
 		while (!need_resched()) {
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index 3b97a5eae9e8..74173ce6aaf4 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -206,11 +206,11 @@ acpi_processor_power_activate(struct acpi_processor *pr,
 
 static void acpi_safe_halt(void)
 {
-	clear_thread_flag(TIF_POLLING_NRFLAG);
+	current_thread_info()->status &= ~TS_POLLING;
 	smp_mb__after_clear_bit();
 	if (!need_resched())
 		safe_halt();
-	set_thread_flag(TIF_POLLING_NRFLAG);
+	current_thread_info()->status |= TS_POLLING;
 }
 
 static atomic_t c3_cpu_count;
@@ -330,10 +330,10 @@ static void acpi_processor_idle(void)
 	 * Invoke the current Cx state to put the processor to sleep.
 	 */
 	if (cx->type == ACPI_STATE_C2 || cx->type == ACPI_STATE_C3) {
-		clear_thread_flag(TIF_POLLING_NRFLAG);
+		current_thread_info()->status &= ~TS_POLLING;
 		smp_mb__after_clear_bit();
 		if (need_resched()) {
-			set_thread_flag(TIF_POLLING_NRFLAG);
+			current_thread_info()->status |= TS_POLLING;
 			local_irq_enable();
 			return;
 		}
@@ -371,7 +371,7 @@ static void acpi_processor_idle(void)
 		t2 = inl(acpi_fadt.xpm_tmr_blk.address);
 		/* Re-enable interrupts */
 		local_irq_enable();
-		set_thread_flag(TIF_POLLING_NRFLAG);
+		current_thread_info()->status |= TS_POLLING;
 		/* Compute time (ticks) that we were actually asleep */
 		sleep_ticks =
 		    ticks_elapsed(t1, t2) - cx->latency_ticks - C2_OVERHEAD;
@@ -411,7 +411,7 @@ static void acpi_processor_idle(void)
 
 		/* Re-enable interrupts */
 		local_irq_enable();
-		set_thread_flag(TIF_POLLING_NRFLAG);
+		current_thread_info()->status |= TS_POLLING;
 		/* Compute time (ticks) that we were actually asleep */
 		sleep_ticks =
 		    ticks_elapsed(t1, t2) - cx->latency_ticks - C3_OVERHEAD;
diff --git a/include/asm-i386/thread_info.h b/include/asm-i386/thread_info.h
index 8420ed12491e..fdbc7f422ea5 100644
--- a/include/asm-i386/thread_info.h
+++ b/include/asm-i386/thread_info.h
@@ -140,8 +140,7 @@ register unsigned long current_stack_pointer asm("esp") __attribute_used__;
 #define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
 #define TIF_SECCOMP		8	/* secure computing */
 #define TIF_RESTORE_SIGMASK	9	/* restore signal mask in do_signal() */
-#define TIF_POLLING_NRFLAG	16	/* true if poll_idle() is polling TIF_NEED_RESCHED */
-#define TIF_MEMDIE		17
+#define TIF_MEMDIE		16
 
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1<<TIF_NOTIFY_RESUME)
@@ -153,7 +152,6 @@ register unsigned long current_stack_pointer asm("esp") __attribute_used__;
 #define _TIF_SYSCALL_AUDIT	(1<<TIF_SYSCALL_AUDIT)
 #define _TIF_SECCOMP		(1<<TIF_SECCOMP)
 #define _TIF_RESTORE_SIGMASK	(1<<TIF_RESTORE_SIGMASK)
-#define _TIF_POLLING_NRFLAG	(1<<TIF_POLLING_NRFLAG)
 
 /* work to do on interrupt/exception return */
 #define _TIF_WORK_MASK \
@@ -170,6 +168,9 @@ register unsigned long current_stack_pointer asm("esp") __attribute_used__;
  * have to worry about atomic accesses.
  */
 #define TS_USEDFPU		0x0001	/* FPU was used by this task this quantum (SMP) */
+#define TS_POLLING		0x0002	/* True if in idle loop and not sleeping */
+
+#define tsk_is_polling(t) ((t)->thread_info->status & TS_POLLING)
 
 #endif /* __KERNEL__ */
 
diff --git a/include/asm-ia64/thread_info.h b/include/asm-ia64/thread_info.h
index e5392c4d30c6..8bc9869e5765 100644
--- a/include/asm-ia64/thread_info.h
+++ b/include/asm-ia64/thread_info.h
@@ -27,6 +27,7 @@ struct thread_info {
 	__u32 flags;			/* thread_info flags (see TIF_*) */
 	__u32 cpu;			/* current CPU */
 	__u32 last_cpu;			/* Last CPU thread ran on */
+	__u32 status;			/* Thread synchronous flags */
 	mm_segment_t addr_limit;	/* user-level address space limit */
 	int preempt_count;		/* 0=premptable, <0=BUG; will also serve as bh-counter */
 	struct restart_block restart_block;
@@ -103,4 +104,8 @@ struct thread_info {
 /* like TIF_ALLWORK_BITS but sans TIF_SYSCALL_TRACE or TIF_SYSCALL_AUDIT */
 #define TIF_WORK_MASK		(TIF_ALLWORK_MASK&~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT))
 
+#define TS_POLLING		1 	/* true if in idle loop and not sleeping */
+
+#define tsk_is_polling(t) ((t)->thread_info->status & TS_POLLING)
+
 #endif /* _ASM_IA64_THREAD_INFO_H */
diff --git a/include/asm-x86_64/thread_info.h b/include/asm-x86_64/thread_info.h
index 4ac0e0a36934..b5e88216fd80 100644
--- a/include/asm-x86_64/thread_info.h
+++ b/include/asm-x86_64/thread_info.h
@@ -101,7 +101,7 @@ static inline struct thread_info *stack_thread_info(void)
 #define TIF_IRET		5	/* force IRET */
 #define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
 #define TIF_SECCOMP		8	/* secure computing */
-#define TIF_POLLING_NRFLAG	16	/* true if poll_idle() is polling TIF_NEED_RESCHED */
+/* 16 free */
 #define TIF_IA32		17	/* 32bit process */ 
 #define TIF_FORK		18	/* ret_from_fork */
 #define TIF_ABI_PENDING		19
@@ -115,7 +115,6 @@ static inline struct thread_info *stack_thread_info(void)
 #define _TIF_IRET		(1<<TIF_IRET)
 #define _TIF_SYSCALL_AUDIT	(1<<TIF_SYSCALL_AUDIT)
 #define _TIF_SECCOMP		(1<<TIF_SECCOMP)
-#define _TIF_POLLING_NRFLAG	(1<<TIF_POLLING_NRFLAG)
 #define _TIF_IA32		(1<<TIF_IA32)
 #define _TIF_FORK		(1<<TIF_FORK)
 #define _TIF_ABI_PENDING	(1<<TIF_ABI_PENDING)
@@ -137,6 +136,9 @@ static inline struct thread_info *stack_thread_info(void)
  */
 #define TS_USEDFPU		0x0001	/* FPU was used by this task this quantum (SMP) */
 #define TS_COMPAT		0x0002	/* 32bit syscall active */
+#define TS_POLLING		0x0004	/* true if in idle loop and not sleeping */
+
+#define tsk_is_polling(t) ((t)->thread_info->status & TS_POLLING)
 
 #endif /* __KERNEL__ */
 
diff --git a/kernel/sched.c b/kernel/sched.c
index f06d059edef5..7d1027a4dd21 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -818,6 +818,11 @@ static void deactivate_task(struct task_struct *p, runqueue_t *rq)
  * the target CPU.
  */
 #ifdef CONFIG_SMP
+
+#ifndef tsk_is_polling
+#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
+#endif
+
 static void resched_task(task_t *p)
 {
 	int cpu;
@@ -833,9 +838,9 @@ static void resched_task(task_t *p)
 	if (cpu == smp_processor_id())
 		return;
 
-	/* NEED_RESCHED must be visible before we test POLLING_NRFLAG */
+	/* NEED_RESCHED must be visible before we test polling */
 	smp_mb();
-	if (!test_tsk_thread_flag(p, TIF_POLLING_NRFLAG))
+	if (!tsk_is_polling(p))
 		smp_send_reschedule(cpu);
 }
 #else
-- 
cgit v1.2.3


From da5311258d2afb96fc592c8b11e818facfa46dc3 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Mon, 26 Jun 2006 13:59:20 +0200
Subject: [PATCH] x86_64: Fix race in cpu_local_* on preemptible kernels

When a process changes CPUs while doing the non atomic cpu_local_*
operations it might operate on the local_t of a different CPUs.

Fix that by disabling preemption.

Pointed out by Christopher Lameter

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-i386/local.h   | 26 ++++++++++++++++++++------
 include/asm-x86_64/local.h | 26 ++++++++++++++++++++------
 2 files changed, 40 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/asm-i386/local.h b/include/asm-i386/local.h
index e67fa08260fe..3b4998c51d08 100644
--- a/include/asm-i386/local.h
+++ b/include/asm-i386/local.h
@@ -55,12 +55,26 @@ static __inline__ void local_sub(long i, local_t *v)
  * much more efficient than these naive implementations.  Note they take
  * a variable, not an address.
  */
-#define cpu_local_read(v)	local_read(&__get_cpu_var(v))
-#define cpu_local_set(v, i)	local_set(&__get_cpu_var(v), (i))
-#define cpu_local_inc(v)	local_inc(&__get_cpu_var(v))
-#define cpu_local_dec(v)	local_dec(&__get_cpu_var(v))
-#define cpu_local_add(i, v)	local_add((i), &__get_cpu_var(v))
-#define cpu_local_sub(i, v)	local_sub((i), &__get_cpu_var(v))
+
+/* Need to disable preemption for the cpu local counters otherwise we could
+   still access a variable of a previous CPU in a non atomic way. */
+#define cpu_local_wrap_v(v)	 	\
+	({ local_t res__;		\
+	   preempt_disable(); 		\
+	   res__ = (v);			\
+	   preempt_enable();		\
+	   res__; })
+#define cpu_local_wrap(v)		\
+	({ preempt_disable();		\
+	   v;				\
+	   preempt_enable(); })		\
+
+#define cpu_local_read(v)    cpu_local_wrap_v(local_read(&__get_cpu_var(v)))
+#define cpu_local_set(v, i)  cpu_local_wrap(local_set(&__get_cpu_var(v), (i)))
+#define cpu_local_inc(v)     cpu_local_wrap(local_inc(&__get_cpu_var(v)))
+#define cpu_local_dec(v)     cpu_local_wrap(local_dec(&__get_cpu_var(v)))
+#define cpu_local_add(i, v)  cpu_local_wrap(local_add((i), &__get_cpu_var(v)))
+#define cpu_local_sub(i, v)  cpu_local_wrap(local_sub((i), &__get_cpu_var(v)))
 
 #define __cpu_local_inc(v)	cpu_local_inc(v)
 #define __cpu_local_dec(v)	cpu_local_dec(v)
diff --git a/include/asm-x86_64/local.h b/include/asm-x86_64/local.h
index cd17945bf218..e769e6200225 100644
--- a/include/asm-x86_64/local.h
+++ b/include/asm-x86_64/local.h
@@ -59,12 +59,26 @@ static inline void local_sub(long i, local_t *v)
  * This could be done better if we moved the per cpu data directly
  * after GS.
  */
-#define cpu_local_read(v)	local_read(&__get_cpu_var(v))
-#define cpu_local_set(v, i)	local_set(&__get_cpu_var(v), (i))
-#define cpu_local_inc(v)	local_inc(&__get_cpu_var(v))
-#define cpu_local_dec(v)	local_dec(&__get_cpu_var(v))
-#define cpu_local_add(i, v)	local_add((i), &__get_cpu_var(v))
-#define cpu_local_sub(i, v)	local_sub((i), &__get_cpu_var(v))
+
+/* Need to disable preemption for the cpu local counters otherwise we could
+   still access a variable of a previous CPU in a non atomic way. */
+#define cpu_local_wrap_v(v)	 	\
+	({ local_t res__;		\
+	   preempt_disable(); 		\
+	   res__ = (v);			\
+	   preempt_enable();		\
+	   res__; })
+#define cpu_local_wrap(v)		\
+	({ preempt_disable();		\
+	   v;				\
+	   preempt_enable(); })		\
+
+#define cpu_local_read(v)    cpu_local_wrap_v(local_read(&__get_cpu_var(v)))
+#define cpu_local_set(v, i)  cpu_local_wrap(local_set(&__get_cpu_var(v), (i)))
+#define cpu_local_inc(v)     cpu_local_wrap(local_inc(&__get_cpu_var(v)))
+#define cpu_local_dec(v)     cpu_local_wrap(local_dec(&__get_cpu_var(v)))
+#define cpu_local_add(i, v)  cpu_local_wrap(local_add((i), &__get_cpu_var(v)))
+#define cpu_local_sub(i, v)  cpu_local_wrap(local_sub((i), &__get_cpu_var(v)))
 
 #define __cpu_local_inc(v)	cpu_local_inc(v)
 #define __cpu_local_dec(v)	cpu_local_dec(v)
-- 
cgit v1.2.3


From 899ced0dd9457b9c349663ca4cb4ec09167728a6 Mon Sep 17 00:00:00 2001
From: Andreas Mohr <andi@rhlx01.fht-esslingen.de>
Date: Mon, 26 Jun 2006 13:59:29 +0200
Subject: [PATCH] x86_64: Add cpu_relax to apic_wait_icr_idle

This one is adding a cpu_relax() that already existed in the i386 version.

Signed-off-by: Andreas Mohr <andi@lisas.de>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-x86_64/apic.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/asm-x86_64/apic.h b/include/asm-x86_64/apic.h
index 9d43ac8519bf..9c96a0a8d1bd 100644
--- a/include/asm-x86_64/apic.h
+++ b/include/asm-x86_64/apic.h
@@ -49,7 +49,8 @@ static __inline unsigned int apic_read(unsigned long reg)
 
 static __inline__ void apic_wait_icr_idle(void)
 {
-	while ( apic_read( APIC_ICR ) & APIC_ICR_BUSY );
+	while (apic_read( APIC_ICR ) & APIC_ICR_BUSY)
+		cpu_relax();
 }
 
 static inline void ack_APIC_irq(void)
-- 
cgit v1.2.3


From 45486f81c9aa07218b73a38cbcf62ffa66e99088 Mon Sep 17 00:00:00 2001
From: Keith Owens <kaos@sgi.com>
Date: Mon, 26 Jun 2006 13:59:41 +0200
Subject: [PATCH] x86_64: Standardize i386/x86_64 handling of NMI_VECTOR

x86_64 and i386 behave inconsistently when sending an IPI on vector 2
(NMI_VECTOR).  Make both behave the same, so IPI 2 is sent as NMI.

The crash code was abusing send_IPI_allbutself() by passing a code
instead of a vector, it only worked because crash knew about the
internal code of send_IPI_allbutself().  Change crash to use NMI_VECTOR
instead, and remove the comment about how crash was abusing the function.

This patch is a pre-requisite for fixing the problem where sending an
IPI as NMI would reboot some Dell Xeon systems.  I cannot fix that
problem while crash continus to abuse send_IPI_allbutself().

It also removes the inconsistency between i386 and x86_64 for
NMI_VECTOR.  That will simplify all the RAS code that needs to bring
all the cpus to a clean stop, even when one or more cpus are spinning
disabled.

Signed-off-by: Keith Owens <kaos@sgi.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/crash.c   |  7 +------
 arch/i386/kernel/smp.c     | 12 +++++++++++-
 arch/x86_64/kernel/crash.c |  2 +-
 include/asm-i386/hw_irq.h  |  2 ++
 4 files changed, 15 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/arch/i386/kernel/crash.c b/arch/i386/kernel/crash.c
index 21dc1bbb8067..0c88d3ec8c18 100644
--- a/arch/i386/kernel/crash.c
+++ b/arch/i386/kernel/crash.c
@@ -120,14 +120,9 @@ static int crash_nmi_callback(struct pt_regs *regs, int cpu)
 	return 1;
 }
 
-/*
- * By using the NMI code instead of a vector we just sneak thru the
- * word generator coming out with just what we want.  AND it does
- * not matter if clustered_apic_mode is set or not.
- */
 static void smp_send_nmi_allbutself(void)
 {
-	send_IPI_allbutself(APIC_DM_NMI);
+	send_IPI_allbutself(NMI_VECTOR);
 }
 
 static void nmi_shootdown_cpus(void)
diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c
index d134e9643a58..c10789d7a9d3 100644
--- a/arch/i386/kernel/smp.c
+++ b/arch/i386/kernel/smp.c
@@ -114,7 +114,17 @@ DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_m
 
 static inline int __prepare_ICR (unsigned int shortcut, int vector)
 {
-	return APIC_DM_FIXED | shortcut | vector | APIC_DEST_LOGICAL;
+	unsigned int icr = shortcut | APIC_DEST_LOGICAL;
+
+	switch (vector) {
+	default:
+		icr |= APIC_DM_FIXED | vector;
+		break;
+	case NMI_VECTOR:
+		icr |= APIC_DM_NMI;
+		break;
+	}
+	return icr;
 }
 
 static inline int __prepare_ICR2 (unsigned int mask)
diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c
index ec1c7431d5af..8ca04912b1cc 100644
--- a/arch/x86_64/kernel/crash.c
+++ b/arch/x86_64/kernel/crash.c
@@ -118,7 +118,7 @@ static int crash_nmi_callback(struct pt_regs *regs, int cpu)
 
 static void smp_send_nmi_allbutself(void)
 {
-	send_IPI_allbutself(APIC_DM_NMI);
+	send_IPI_allbutself(NMI_VECTOR);
 }
 
 /*
diff --git a/include/asm-i386/hw_irq.h b/include/asm-i386/hw_irq.h
index 95d3fd090298..a4c0a5a9ffd8 100644
--- a/include/asm-i386/hw_irq.h
+++ b/include/asm-i386/hw_irq.h
@@ -19,6 +19,8 @@
 
 struct hw_interrupt_type;
 
+#define NMI_VECTOR		0x02
+
 /*
  * Various low-level irq details needed by irq.c, process.c,
  * time.c, io_apic.c and smp.c
-- 
cgit v1.2.3


From e77deacb7b078156fcadf27b838a4ce1a65eda04 Mon Sep 17 00:00:00 2001
From: Keith Owens <kaos@sgi.com>
Date: Mon, 26 Jun 2006 13:59:56 +0200
Subject: [PATCH] x86_64: Avoid broadcasting NMI IPIs

On some i386/x86_64 systems, sending an NMI IPI as a broadcast will
reset the system.  This seems to be a BIOS bug which affects machines
where one or more cpus are not under OS control.  It occurs on HT
systems with a version of the OS that is not compiled without HT
support.  It also occurs when a system is booted with max_cpus=n where
2 <= n < cpus known to the BIOS.  The fix is to always send NMI IPI as
a mask instead of as a broadcast.

Signed-off-by: Keith Owens <kaos@sgi.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/genapic_flat.c        | 25 ++++++++++++++++---------
 include/asm-i386/mach-default/mach_ipi.h |  7 +++++--
 2 files changed, 21 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c
index c66ca7b1d31a..21c7066e236a 100644
--- a/arch/x86_64/kernel/genapic_flat.c
+++ b/arch/x86_64/kernel/genapic_flat.c
@@ -78,22 +78,29 @@ static void flat_send_IPI_mask(cpumask_t cpumask, int vector)
 
 static void flat_send_IPI_allbutself(int vector)
 {
-#ifndef CONFIG_HOTPLUG_CPU
-	if (((num_online_cpus()) - 1) >= 1)
-		__send_IPI_shortcut(APIC_DEST_ALLBUT, vector,APIC_DEST_LOGICAL);
+#ifdef	CONFIG_HOTPLUG_CPU
+	int hotplug = 1;
 #else
-	cpumask_t allbutme = cpu_online_map;
+	int hotplug = 0;
+#endif
+	if (hotplug || vector == NMI_VECTOR) {
+		cpumask_t allbutme = cpu_online_map;
 
-	cpu_clear(smp_processor_id(), allbutme);
+		cpu_clear(smp_processor_id(), allbutme);
 
-	if (!cpus_empty(allbutme))
-		flat_send_IPI_mask(allbutme, vector);
-#endif
+		if (!cpus_empty(allbutme))
+			flat_send_IPI_mask(allbutme, vector);
+	} else if (num_online_cpus() > 1) {
+		__send_IPI_shortcut(APIC_DEST_ALLBUT, vector,APIC_DEST_LOGICAL);
+	}
 }
 
 static void flat_send_IPI_all(int vector)
 {
-	__send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
+	if (vector == NMI_VECTOR)
+		flat_send_IPI_mask(cpu_online_map, vector);
+	else
+		__send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
 }
 
 static int flat_apic_id_registered(void)
diff --git a/include/asm-i386/mach-default/mach_ipi.h b/include/asm-i386/mach-default/mach_ipi.h
index a1d0072e36bc..0dba244c86db 100644
--- a/include/asm-i386/mach-default/mach_ipi.h
+++ b/include/asm-i386/mach-default/mach_ipi.h
@@ -1,6 +1,9 @@
 #ifndef __ASM_MACH_IPI_H
 #define __ASM_MACH_IPI_H
 
+/* Avoid include hell */
+#define NMI_VECTOR 0x02
+
 void send_IPI_mask_bitmask(cpumask_t mask, int vector);
 void __send_IPI_shortcut(unsigned int shortcut, int vector);
 
@@ -13,7 +16,7 @@ static inline void send_IPI_mask(cpumask_t mask, int vector)
 
 static inline void __local_send_IPI_allbutself(int vector)
 {
-	if (no_broadcast) {
+	if (no_broadcast || vector == NMI_VECTOR) {
 		cpumask_t mask = cpu_online_map;
 
 		cpu_clear(smp_processor_id(), mask);
@@ -24,7 +27,7 @@ static inline void __local_send_IPI_allbutself(int vector)
 
 static inline void __local_send_IPI_all(int vector)
 {
-	if (no_broadcast)
+	if (no_broadcast || vector == NMI_VECTOR)
 		send_IPI_mask(cpu_online_map, vector);
 	else
 		__send_IPI_shortcut(APIC_DEST_ALLINC, vector);
-- 
cgit v1.2.3


From 0080e667550db5ae8c9318181500c413b99ff164 Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Date: Mon, 26 Jun 2006 13:59:59 +0200
Subject: [PATCH] x86_64: i386/x86-64 Add nmi watchdog support for new Intel
 CPUs

Intel now has support for Architectural Performance Monitoring Counters
( Refer to IA-32 Intel Architecture Software Developer's Manual
http://www.intel.com/design/pentium4/manuals/253669.htm ). This
feature is present starting from Intel Core Duo and Intel Core Solo processors.

What this means is, the performance monitoring counters and some performance
monitoring events are now defined in an architectural way (using cpuid).
And there will be no need to check for family/model etc for these architectural
events.

Below is the patch to use this performance counters in nmi watchdog driver.
Patch handles both i386 and x86-64 kernels.

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/cpu/intel.c            |  6 +++
 arch/i386/kernel/nmi.c                  | 65 +++++++++++++++++++++++++-
 arch/x86_64/kernel/nmi.c                | 81 +++++++++++++++++++++++++++++++--
 arch/x86_64/kernel/setup.c              |  7 +++
 include/asm-i386/cpufeature.h           |  1 +
 include/asm-i386/intel_arch_perfmon.h   | 19 ++++++++
 include/asm-x86_64/cpufeature.h         |  2 +-
 include/asm-x86_64/intel_arch_perfmon.h | 19 ++++++++
 8 files changed, 193 insertions(+), 7 deletions(-)
 create mode 100644 include/asm-i386/intel_arch_perfmon.h
 create mode 100644 include/asm-x86_64/intel_arch_perfmon.h

(limited to 'include')

diff --git a/arch/i386/kernel/cpu/intel.c b/arch/i386/kernel/cpu/intel.c
index 5386b29bb5a5..10afc645c540 100644
--- a/arch/i386/kernel/cpu/intel.c
+++ b/arch/i386/kernel/cpu/intel.c
@@ -122,6 +122,12 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
 
 	select_idle_routine(c);
 	l2 = init_intel_cacheinfo(c);
+	if (c->cpuid_level > 9 ) {
+		unsigned eax = cpuid_eax(10);
+		/* Check for version and the number of counters */
+		if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
+			set_bit(X86_FEATURE_ARCH_PERFMON, c->x86_capability);
+	}
 
 	/* SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until model 3 mask 3 */
 	if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633)
diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index bd3875419630..a76e93146585 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -24,6 +24,7 @@
 
 #include <asm/smp.h>
 #include <asm/nmi.h>
+#include <asm/intel_arch_perfmon.h>
 
 #include "mach_traps.h"
 
@@ -95,6 +96,9 @@ int nmi_active;
 	(P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT|	\
 	 P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE)
 
+#define ARCH_PERFMON_NMI_EVENT_SEL	ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
+#define ARCH_PERFMON_NMI_EVENT_UMASK	ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
+
 #ifdef CONFIG_SMP
 /* The performance counters used by NMI_LOCAL_APIC don't trigger when
  * the CPU is idle. To make sure the NMI watchdog really ticks on all
@@ -207,6 +211,8 @@ static int __init setup_nmi_watchdog(char *str)
 
 __setup("nmi_watchdog=", setup_nmi_watchdog);
 
+static void disable_intel_arch_watchdog(void);
+
 static void disable_lapic_nmi_watchdog(void)
 {
 	if (nmi_active <= 0)
@@ -216,6 +222,10 @@ static void disable_lapic_nmi_watchdog(void)
 		wrmsr(MSR_K7_EVNTSEL0, 0, 0);
 		break;
 	case X86_VENDOR_INTEL:
+		if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
+			disable_intel_arch_watchdog();
+			break;
+		}
 		switch (boot_cpu_data.x86) {
 		case 6:
 			if (boot_cpu_data.x86_model > 0xd)
@@ -444,6 +454,53 @@ static int setup_p4_watchdog(void)
 	return 1;
 }
 
+static void disable_intel_arch_watchdog(void)
+{
+	unsigned ebx;
+
+	/*
+	 * Check whether the Architectural PerfMon supports
+	 * Unhalted Core Cycles Event or not.
+	 * NOTE: Corresponding bit = 0 in ebp indicates event present.
+	 */
+	ebx = cpuid_ebx(10);
+	if (!(ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
+		wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, 0, 0);
+}
+
+static int setup_intel_arch_watchdog(void)
+{
+	unsigned int evntsel;
+	unsigned ebx;
+
+	/*
+	 * Check whether the Architectural PerfMon supports
+	 * Unhalted Core Cycles Event or not.
+	 * NOTE: Corresponding bit = 0 in ebp indicates event present.
+	 */
+	ebx = cpuid_ebx(10);
+	if ((ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
+		return 0;
+
+	nmi_perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0;
+
+	clear_msr_range(MSR_ARCH_PERFMON_EVENTSEL0, 2);
+	clear_msr_range(MSR_ARCH_PERFMON_PERFCTR0, 2);
+
+	evntsel = ARCH_PERFMON_EVENTSEL_INT
+		| ARCH_PERFMON_EVENTSEL_OS
+		| ARCH_PERFMON_EVENTSEL_USR
+		| ARCH_PERFMON_NMI_EVENT_SEL
+		| ARCH_PERFMON_NMI_EVENT_UMASK;
+
+	wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, evntsel, 0);
+	write_watchdog_counter("INTEL_ARCH_PERFCTR0");
+	apic_write(APIC_LVTPC, APIC_DM_NMI);
+	evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+	wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, evntsel, 0);
+	return 1;
+}
+
 void setup_apic_nmi_watchdog (void)
 {
 	switch (boot_cpu_data.x86_vendor) {
@@ -453,6 +510,11 @@ void setup_apic_nmi_watchdog (void)
 		setup_k7_watchdog();
 		break;
 	case X86_VENDOR_INTEL:
+		if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
+			if (!setup_intel_arch_watchdog())
+				return;
+			break;
+		}
 		switch (boot_cpu_data.x86) {
 		case 6:
 			if (boot_cpu_data.x86_model > 0xd)
@@ -556,7 +618,8 @@ void nmi_watchdog_tick (struct pt_regs * regs)
 			wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0);
 			apic_write(APIC_LVTPC, APIC_DM_NMI);
 		}
-		else if (nmi_perfctr_msr == MSR_P6_PERFCTR0) {
+		else if (nmi_perfctr_msr == MSR_P6_PERFCTR0 ||
+		         nmi_perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
 			/* Only P6 based Pentium M need to re-unmask
 			 * the apic vector but it doesn't hurt
 			 * other P6 variant */
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index ab421e22fa67..399489c93132 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -27,6 +27,7 @@
 #include <asm/proto.h>
 #include <asm/kdebug.h>
 #include <asm/mce.h>
+#include <asm/intel_arch_perfmon.h>
 
 /*
  * lapic_nmi_owner tracks the ownership of the lapic NMI hardware:
@@ -66,6 +67,9 @@ static unsigned int nmi_p4_cccr_val;
 #define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING	0x76
 #define K7_NMI_EVENT		K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
 
+#define ARCH_PERFMON_NMI_EVENT_SEL	ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
+#define ARCH_PERFMON_NMI_EVENT_UMASK	ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
+
 #define MSR_P4_MISC_ENABLE	0x1A0
 #define MSR_P4_MISC_ENABLE_PERF_AVAIL	(1<<7)
 #define MSR_P4_MISC_ENABLE_PEBS_UNAVAIL	(1<<12)
@@ -97,7 +101,10 @@ static __cpuinit inline int nmi_known_cpu(void)
 	case X86_VENDOR_AMD:
 		return boot_cpu_data.x86 == 15;
 	case X86_VENDOR_INTEL:
-		return boot_cpu_data.x86 == 15;
+		if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
+			return 1;
+		else
+			return (boot_cpu_data.x86 == 15);
 	}
 	return 0;
 }
@@ -203,6 +210,8 @@ int __init setup_nmi_watchdog(char *str)
 
 __setup("nmi_watchdog=", setup_nmi_watchdog);
 
+static void disable_intel_arch_watchdog(void);
+
 static void disable_lapic_nmi_watchdog(void)
 {
 	if (nmi_active <= 0)
@@ -215,6 +224,8 @@ static void disable_lapic_nmi_watchdog(void)
 		if (boot_cpu_data.x86 == 15) {
 			wrmsr(MSR_P4_IQ_CCCR0, 0, 0);
 			wrmsr(MSR_P4_CRU_ESCR0, 0, 0);
+		} else if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
+			disable_intel_arch_watchdog();
 		}
 		break;
 	}
@@ -367,6 +378,53 @@ static void setup_k7_watchdog(void)
 	wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
 }
 
+static void disable_intel_arch_watchdog(void)
+{
+	unsigned ebx;
+
+	/*
+	 * Check whether the Architectural PerfMon supports
+	 * Unhalted Core Cycles Event or not.
+	 * NOTE: Corresponding bit = 0 in ebp indicates event present.
+	 */
+	ebx = cpuid_ebx(10);
+	if (!(ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
+		wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, 0, 0);
+}
+
+static int setup_intel_arch_watchdog(void)
+{
+	unsigned int evntsel;
+	unsigned ebx;
+
+	/*
+	 * Check whether the Architectural PerfMon supports
+	 * Unhalted Core Cycles Event or not.
+	 * NOTE: Corresponding bit = 0 in ebp indicates event present.
+	 */
+	ebx = cpuid_ebx(10);
+	if ((ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
+		return 0;
+
+	nmi_perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0;
+
+	clear_msr_range(MSR_ARCH_PERFMON_EVENTSEL0, 2);
+	clear_msr_range(MSR_ARCH_PERFMON_PERFCTR0, 2);
+
+	evntsel = ARCH_PERFMON_EVENTSEL_INT
+		| ARCH_PERFMON_EVENTSEL_OS
+		| ARCH_PERFMON_EVENTSEL_USR
+		| ARCH_PERFMON_NMI_EVENT_SEL
+		| ARCH_PERFMON_NMI_EVENT_UMASK;
+
+	wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, evntsel, 0);
+	wrmsrl(MSR_ARCH_PERFMON_PERFCTR0, -((u64)cpu_khz * 1000 / nmi_hz));
+	apic_write(APIC_LVTPC, APIC_DM_NMI);
+	evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+	wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, evntsel, 0);
+	return 1;
+}
+
 
 static int setup_p4_watchdog(void)
 {
@@ -420,10 +478,16 @@ void setup_apic_nmi_watchdog(void)
 		setup_k7_watchdog();
 		break;
 	case X86_VENDOR_INTEL:
-		if (boot_cpu_data.x86 != 15)
-			return;
-		if (!setup_p4_watchdog())
+		if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
+			if (!setup_intel_arch_watchdog())
+				return;
+		} else if (boot_cpu_data.x86 == 15) {
+			if (!setup_p4_watchdog())
+				return;
+		} else {
 			return;
+		}
+
 		break;
 
 	default:
@@ -508,7 +572,14 @@ void __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
  			 */
  			wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0);
  			apic_write(APIC_LVTPC, APIC_DM_NMI);
- 		}
+ 		} else if (nmi_perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
+			/*
+			 * For Intel based architectural perfmon
+			 * - LVTPC is masked on interrupt and must be
+			 *   unmasked by the LVTPC handler.
+			 */
+			apic_write(APIC_LVTPC, APIC_DM_NMI);
+		}
 		wrmsrl(nmi_perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz));
 	}
 }
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 2a5fce0fd1c4..594642a6c037 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -988,6 +988,13 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
 	unsigned n;
 
 	init_intel_cacheinfo(c);
+	if (c->cpuid_level > 9 ) {
+		unsigned eax = cpuid_eax(10);
+		/* Check for version and the number of counters */
+		if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
+			set_bit(X86_FEATURE_ARCH_PERFMON, &c->x86_capability);
+	}
+
 	n = c->extended_cpuid_level;
 	if (n >= 0x80000008) {
 		unsigned eax = cpuid_eax(0x80000008);
diff --git a/include/asm-i386/cpufeature.h b/include/asm-i386/cpufeature.h
index 3ecedbafa8ce..d314ebb3d59e 100644
--- a/include/asm-i386/cpufeature.h
+++ b/include/asm-i386/cpufeature.h
@@ -72,6 +72,7 @@
 #define X86_FEATURE_CONSTANT_TSC (3*32+ 8) /* TSC ticks at a constant rate */
 #define X86_FEATURE_UP		(3*32+ 9) /* smp kernel running on up */
 #define X86_FEATURE_FXSAVE_LEAK (3*32+10) /* FXSAVE leaks FOP/FIP/FOP */
+#define X86_FEATURE_ARCH_PERFMON (3*32+11) /* Intel Architectural PerfMon */
 
 /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
 #define X86_FEATURE_XMM3	(4*32+ 0) /* Streaming SIMD Extensions-3 */
diff --git a/include/asm-i386/intel_arch_perfmon.h b/include/asm-i386/intel_arch_perfmon.h
new file mode 100644
index 000000000000..134ea9cc5283
--- /dev/null
+++ b/include/asm-i386/intel_arch_perfmon.h
@@ -0,0 +1,19 @@
+#ifndef X86_INTEL_ARCH_PERFMON_H
+#define X86_INTEL_ARCH_PERFMON_H 1
+
+#define MSR_ARCH_PERFMON_PERFCTR0		0xc1
+#define MSR_ARCH_PERFMON_PERFCTR1		0xc2
+
+#define MSR_ARCH_PERFMON_EVENTSEL0		0x186
+#define MSR_ARCH_PERFMON_EVENTSEL1		0x187
+
+#define ARCH_PERFMON_EVENTSEL0_ENABLE      (1 << 22)
+#define ARCH_PERFMON_EVENTSEL_INT          (1 << 20)
+#define ARCH_PERFMON_EVENTSEL_OS           (1 << 17)
+#define ARCH_PERFMON_EVENTSEL_USR          (1 << 16)
+
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL	(0x3c)
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK	(0x00 << 8)
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT (1 << 0)
+
+#endif	/* X86_INTEL_ARCH_PERFMON_H */
diff --git a/include/asm-x86_64/cpufeature.h b/include/asm-x86_64/cpufeature.h
index fbf5a19edbb3..ee792faaca01 100644
--- a/include/asm-x86_64/cpufeature.h
+++ b/include/asm-x86_64/cpufeature.h
@@ -67,7 +67,7 @@
 #define X86_FEATURE_SYNC_RDTSC  (3*32+6)  /* RDTSC syncs CPU core */
 #define X86_FEATURE_FXSAVE_LEAK (3*32+7)  /* FIP/FOP/FDP leaks through FXSAVE */
 #define X86_FEATURE_UP		(3*32+8) /* SMP kernel running on UP */
-
+#define X86_FEATURE_ARCH_PERFMON (3*32+9) /* Intel Architectural PerfMon */
 
 /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
 #define X86_FEATURE_XMM3	(4*32+ 0) /* Streaming SIMD Extensions-3 */
diff --git a/include/asm-x86_64/intel_arch_perfmon.h b/include/asm-x86_64/intel_arch_perfmon.h
new file mode 100644
index 000000000000..59c396431569
--- /dev/null
+++ b/include/asm-x86_64/intel_arch_perfmon.h
@@ -0,0 +1,19 @@
+#ifndef X86_64_INTEL_ARCH_PERFMON_H
+#define X86_64_INTEL_ARCH_PERFMON_H 1
+
+#define MSR_ARCH_PERFMON_PERFCTR0		0xc1
+#define MSR_ARCH_PERFMON_PERFCTR1		0xc2
+
+#define MSR_ARCH_PERFMON_EVENTSEL0		0x186
+#define MSR_ARCH_PERFMON_EVENTSEL1		0x187
+
+#define ARCH_PERFMON_EVENTSEL0_ENABLE      (1 << 22)
+#define ARCH_PERFMON_EVENTSEL_INT          (1 << 20)
+#define ARCH_PERFMON_EVENTSEL_OS           (1 << 17)
+#define ARCH_PERFMON_EVENTSEL_USR          (1 << 16)
+
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL	(0x3c)
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK	(0x00 << 8)
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT (1 << 0)
+
+#endif	/* X86_64_INTEL_ARCH_PERFMON_H */
-- 
cgit v1.2.3


From 8501a2fbe762b21d2504ed3aca3b52be61b5e6e4 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sgi.com>
Date: Mon, 26 Jun 2006 14:00:08 +0200
Subject: [PATCH] x86_64: x86_64 stack usage debugging

Applies to git & 2.6.17-rc6 after CONFIG_DEBUG_STACKOVERFLOW patch

uses same stack-zeroing mechanism as on i386 to discover maximum stack
excursions.

Signed-off-by: Eric Sandeen <sandeen@sgi.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/Kconfig.debug        |  9 +++++++++
 include/asm-x86_64/thread_info.h | 13 +++++++++++++
 2 files changed, 22 insertions(+)

(limited to 'include')

diff --git a/arch/x86_64/Kconfig.debug b/arch/x86_64/Kconfig.debug
index 087a04868b25..1d92ab56c0f9 100644
--- a/arch/x86_64/Kconfig.debug
+++ b/arch/x86_64/Kconfig.debug
@@ -42,6 +42,15 @@ config DEBUG_STACKOVERFLOW
 	  This option will cause messages to be printed if free stack space
 	  drops below a certain limit.
 
+config DEBUG_STACK_USAGE
+        bool "Stack utilization instrumentation"
+        depends on DEBUG_KERNEL
+        help
+	  Enables the display of the minimum amount of free stack which each
+	  task has ever had available in the sysrq-T and sysrq-P debug output.
+
+	  This option will slow down process creation somewhat.
+
 #config X86_REMOTE_DEBUG
 #       bool "kgdb debugging stub"
 
diff --git a/include/asm-x86_64/thread_info.h b/include/asm-x86_64/thread_info.h
index b5e88216fd80..2029b00351f3 100644
--- a/include/asm-x86_64/thread_info.h
+++ b/include/asm-x86_64/thread_info.h
@@ -73,8 +73,21 @@ static inline struct thread_info *stack_thread_info(void)
 }
 
 /* thread information allocation */
+#ifdef CONFIG_DEBUG_STACK_USAGE
+#define alloc_thread_info(tsk)					\
+    ({								\
+	struct thread_info *ret;				\
+								\
+	ret = ((struct thread_info *) __get_free_pages(GFP_KERNEL,THREAD_ORDER)); \
+	if (ret)						\
+		memset(ret, 0, THREAD_SIZE);			\
+	ret;							\
+    })
+#else
 #define alloc_thread_info(tsk) \
 	((struct thread_info *) __get_free_pages(GFP_KERNEL,THREAD_ORDER))
+#endif
+
 #define free_thread_info(ti) free_pages((unsigned long) (ti), THREAD_ORDER)
 
 #else /* !__ASSEMBLY__ */
-- 
cgit v1.2.3