From c1d2f1bccf4259384e581b937e694ee8a350fe55 Mon Sep 17 00:00:00 2001
From: Prarit Bhargava <prarit@redhat.com>
Date: Mon, 6 Feb 2012 13:28:55 -0500
Subject: x86/microcode: Remove noisy AMD microcode warning

AMD processors will never support /dev/cpu/microcode updating so
just silently fail instead of printing out a warning for every
cpu.

Signed-off-by: Prarit Bhargava <prarit@redhat.com>
Cc: Borislav Petkov <borislav.petkov@amd.com>
Link: http://lkml.kernel.org/r/1328552935-965-1-git-send-email-prarit@redhat.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/microcode_amd.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index ac0417be9131..73465aab28f8 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -360,7 +360,6 @@ out:
 static enum ucode_state
 request_microcode_user(int cpu, const void __user *buf, size_t size)
 {
-	pr_info("AMD microcode update via /dev/cpu/microcode not supported\n");
 	return UCODE_ERROR;
 }
 
-- 
cgit v1.2.3


From 32c3233885eb10ac9cb9410f2f8cd64b8df2b2a1 Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <andreas.herrmann3@amd.com>
Date: Wed, 8 Feb 2012 20:52:29 +0100
Subject: x86/amd: Fix L1i and L2 cache sharing information for AMD family 15h
 processors

For L1 instruction cache and L2 cache the shared CPU information
is wrong. On current AMD family 15h CPUs those caches are shared
between both cores of a compute unit.

This fixes https://bugzilla.kernel.org/show_bug.cgi?id=42607

Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Petkov Borislav <Borislav.Petkov@amd.com>
Cc: Dave Jones <davej@redhat.com>
Cc: <stable@kernel.org>
Link: http://lkml.kernel.org/r/20120208195229.GA17523@alberich.amd.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/intel_cacheinfo.c | 44 ++++++++++++++++++++++++++++-------
 1 file changed, 36 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 6b45e5e7a901..73d08ed98a64 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -326,8 +326,7 @@ static void __cpuinit amd_calc_l3_indices(struct amd_northbridge *nb)
 	l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1;
 }
 
-static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf,
-					int index)
+static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index)
 {
 	int node;
 
@@ -725,14 +724,16 @@ static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info);
 #define CPUID4_INFO_IDX(x, y)	(&((per_cpu(ici_cpuid4_info, x))[y]))
 
 #ifdef CONFIG_SMP
-static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
+
+static int __cpuinit cache_shared_amd_cpu_map_setup(unsigned int cpu, int index)
 {
-	struct _cpuid4_info	*this_leaf, *sibling_leaf;
-	unsigned long num_threads_sharing;
-	int index_msb, i, sibling;
+	struct _cpuid4_info *this_leaf;
+	int ret, i, sibling;
 	struct cpuinfo_x86 *c = &cpu_data(cpu);
 
-	if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) {
+	ret = 0;
+	if (index == 3) {
+		ret = 1;
 		for_each_cpu(i, cpu_llc_shared_mask(cpu)) {
 			if (!per_cpu(ici_cpuid4_info, i))
 				continue;
@@ -743,8 +744,35 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
 				set_bit(sibling, this_leaf->shared_cpu_map);
 			}
 		}
-		return;
+	} else if ((c->x86 == 0x15) && ((index == 1) || (index == 2))) {
+		ret = 1;
+		for_each_cpu(i, cpu_sibling_mask(cpu)) {
+			if (!per_cpu(ici_cpuid4_info, i))
+				continue;
+			this_leaf = CPUID4_INFO_IDX(i, index);
+			for_each_cpu(sibling, cpu_sibling_mask(cpu)) {
+				if (!cpu_online(sibling))
+					continue;
+				set_bit(sibling, this_leaf->shared_cpu_map);
+			}
+		}
 	}
+
+	return ret;
+}
+
+static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
+{
+	struct _cpuid4_info *this_leaf, *sibling_leaf;
+	unsigned long num_threads_sharing;
+	int index_msb, i;
+	struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+	if (c->x86_vendor == X86_VENDOR_AMD) {
+		if (cache_shared_amd_cpu_map_setup(cpu, index))
+			return;
+	}
+
 	this_leaf = CPUID4_INFO_IDX(cpu, index);
 	num_threads_sharing = 1 + this_leaf->base.eax.split.num_threads_sharing;
 
-- 
cgit v1.2.3


From 45d5a1683c04be28abdf5c04c27b1417e0374486 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Sun, 19 Feb 2012 16:43:37 -0500
Subject: x86/nmi: Test saved %cs in NMI to determine nested NMI case

Currently, the NMI handler tests if it is nested by checking the
special variable saved on the stack (set during NMI handling)
and whether the saved stack is the NMI stack as well (to prevent
the race when the variable is set to zero).

But userspace may set their %rsp to any value as long as they do
not derefence it, and it may make it point to the NMI stack,
which will prevent NMIs from triggering while the userspace app
is running. (I tested this, and it is indeed the case)

Add another check to determine nested NMIs by looking at the
saved %cs (code segment register) and making sure that it is the
kernel code segment.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: <stable@kernel.org>
Link: http://lkml.kernel.org/r/1329687817.1561.27.camel@acer.local.home
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/entry_64.S | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 3fe8239fd8fb..debd851de6ff 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1531,6 +1531,13 @@ ENTRY(nmi)
 	/* Use %rdx as out temp variable throughout */
 	pushq_cfi %rdx
 
+	/*
+	 * If %cs was not the kernel segment, then the NMI triggered in user
+	 * space, which means it is definitely not nested.
+	 */
+	cmp $__KERNEL_CS, 16(%rsp)
+	jne first_nmi
+
 	/*
 	 * Check the special variable on the stack to see if NMIs are
 	 * executing.
-- 
cgit v1.2.3


From 416d7214741daba3acd6d328289858390bef37bc Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Fri, 10 Feb 2012 09:24:08 -0500
Subject: xen/setup: Remove redundant filtering of PTE masks.

commit 7347b4082e55ac4a673f06a0a0ce25c37273c9ec "xen: Allow
unprivileged Xen domains to create iomap pages" added a redundant
line in the early bootup code to filter out the PTE. That
filtering is already done a bit earlier so this extra processing
is not required.

Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/enlighten.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 12eb07bfb267..7c44e1bf981e 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1204,10 +1204,6 @@ asmlinkage void __init xen_start_kernel(void)
 
 	pgd = (pgd_t *)xen_start_info->pt_base;
 
-	if (!xen_initial_domain())
-		__supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
-
-	__supported_pte_mask |= _PAGE_IOMAP;
 	/* Don't do the full vcpu_info placement stuff until we have a
 	   possible map and a non-dummy shared_info. */
 	per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
-- 
cgit v1.2.3


From 8eaffa67b43e99ae581622c5133e20b0f48bcef1 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Fri, 10 Feb 2012 09:16:27 -0500
Subject: xen/pat: Disable PAT support for now.

[Pls also look at https://lkml.org/lkml/2012/2/10/228]

Using of PAT to change pages from WB to WC works quite nicely.
Changing it back to WB - not so much. The crux of the matter is
that the code that does this (__page_change_att_set_clr) has only
limited information so when it tries to the change it gets
the "raw" unfiltered information instead of the properly filtered one -
and the "raw" one tell it that PSE bit is on (while infact it
is not).  As a result when the PTE is set to be WB from WC, we get
tons of:

:WARNING: at arch/x86/xen/mmu.c:475 xen_make_pte+0x67/0xa0()
:Hardware name: HP xw4400 Workstation
.. snip..
:Pid: 27, comm: kswapd0 Tainted: G        W    3.2.2-1.fc16.x86_64 #1
:Call Trace:
: [<ffffffff8106dd1f>] warn_slowpath_common+0x7f/0xc0
: [<ffffffff8106dd7a>] warn_slowpath_null+0x1a/0x20
: [<ffffffff81005a17>] xen_make_pte+0x67/0xa0
: [<ffffffff810051bd>] __raw_callee_save_xen_make_pte+0x11/0x1e
: [<ffffffff81040e15>] ? __change_page_attr_set_clr+0x9d5/0xc00
: [<ffffffff8114c2e8>] ? __purge_vmap_area_lazy+0x158/0x1d0
: [<ffffffff8114cca5>] ? vm_unmap_aliases+0x175/0x190
: [<ffffffff81041168>] change_page_attr_set_clr+0x128/0x4c0
: [<ffffffff81041542>] set_pages_array_wb+0x42/0xa0
: [<ffffffff8100a9b2>] ? check_events+0x12/0x20
: [<ffffffffa0074d4c>] ttm_pages_put+0x1c/0x70 [ttm]
: [<ffffffffa0074e98>] ttm_page_pool_free+0xf8/0x180 [ttm]
: [<ffffffffa0074f78>] ttm_pool_mm_shrink+0x58/0x90 [ttm]
: [<ffffffff8112ba04>] shrink_slab+0x154/0x310
: [<ffffffff8112f17a>] balance_pgdat+0x4fa/0x6c0
: [<ffffffff8112f4b8>] kswapd+0x178/0x3d0
: [<ffffffff815df134>] ? __schedule+0x3d4/0x8c0
: [<ffffffff81090410>] ? remove_wait_queue+0x50/0x50
: [<ffffffff8112f340>] ? balance_pgdat+0x6c0/0x6c0
: [<ffffffff8108fb6c>] kthread+0x8c/0xa0

for every page. The proper fix for this is has been posted
and is https://lkml.org/lkml/2012/2/10/228
"x86/cpa: Use pte_attrs instead of pte_flags on CPA/set_p.._wb/wc operations."
along with a detailed description of the problem and solution.

But since that posting has gone nowhere I am proposing
this band-aid solution so that at least users don't get
the page corruption (the pages that are WC don't get changed to WB
and end up being recycled for filesystem or other things causing
mysterious crashes).

The negative impact of this patch is that users of WC flag
(which are InfiniBand, radeon, nouveau drivers) won't be able
to set that flag - so they are going to see performance degradation.
But stability is more important here.

Fixes RH BZ# 742032, 787403, and 745574
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/enlighten.c | 2 ++
 arch/x86/xen/mmu.c       | 8 ++++----
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 7c44e1bf981e..4172af8ceeb3 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1141,7 +1141,9 @@ asmlinkage void __init xen_start_kernel(void)
 
 	/* Prevent unwanted bits from being set in PTEs. */
 	__supported_pte_mask &= ~_PAGE_GLOBAL;
+#if 0
 	if (!xen_initial_domain())
+#endif
 		__supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
 
 	__supported_pte_mask |= _PAGE_IOMAP;
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 58a0e46c404d..95c1cf60c669 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -415,13 +415,13 @@ static pteval_t iomap_pte(pteval_t val)
 static pteval_t xen_pte_val(pte_t pte)
 {
 	pteval_t pteval = pte.pte;
-
+#if 0
 	/* If this is a WC pte, convert back from Xen WC to Linux WC */
 	if ((pteval & (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)) == _PAGE_PAT) {
 		WARN_ON(!pat_enabled);
 		pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT;
 	}
-
+#endif
 	if (xen_initial_domain() && (pteval & _PAGE_IOMAP))
 		return pteval;
 
@@ -463,7 +463,7 @@ void xen_set_pat(u64 pat)
 static pte_t xen_make_pte(pteval_t pte)
 {
 	phys_addr_t addr = (pte & PTE_PFN_MASK);
-
+#if 0
 	/* If Linux is trying to set a WC pte, then map to the Xen WC.
 	 * If _PAGE_PAT is set, then it probably means it is really
 	 * _PAGE_PSE, so avoid fiddling with the PAT mapping and hope
@@ -476,7 +476,7 @@ static pte_t xen_make_pte(pteval_t pte)
 		if ((pte & (_PAGE_PCD | _PAGE_PWT)) == _PAGE_PWT)
 			pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT;
 	}
-
+#endif
 	/*
 	 * Unprivileged domains are allowed to do IOMAPpings for
 	 * PCI passthrough, but not map ISA space.  The ISA
-- 
cgit v1.2.3


From cea20ca3f3181fc36788a15bc65d1062b96a0a6c Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 20 Feb 2012 10:24:09 -0800
Subject: i387: fix up some fpu_counter confusion

This makes sure we clear the FPU usage counter for newly created tasks,
just so that we start off in a known state (for example, don't try to
preload the FPU state on the first task switch etc).

It also fixes a thinko in when we increment the fpu_counter at task
switch time, introduced by commit 34ddc81a230b ("i387: re-introduce FPU
state preloading at context switch time").  We should increment the
*new* task fpu_counter, not the old task, and only if we decide to use
that state (whether lazily or preloaded).

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/i387.h  | 3 ++-
 arch/x86/kernel/process_32.c | 1 +
 arch/x86/kernel/process_64.c | 1 +
 3 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index a850b4d8d14d..8df95849721d 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -348,10 +348,10 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta
 		if (__save_init_fpu(old))
 			fpu_lazy_state_intact(old);
 		__thread_clear_has_fpu(old);
-		old->fpu_counter++;
 
 		/* Don't change CR0.TS if we just switch! */
 		if (fpu.preload) {
+			new->fpu_counter++;
 			__thread_set_has_fpu(new);
 			prefetch(new->thread.fpu.state);
 		} else
@@ -359,6 +359,7 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta
 	} else {
 		old->fpu_counter = 0;
 		if (fpu.preload) {
+			new->fpu_counter++;
 			if (fpu_lazy_restore(new))
 				fpu.preload = 0;
 			else
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 80bfe1ab0031..bc32761bc27a 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -214,6 +214,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 
 	task_user_gs(p) = get_user_gs(regs);
 
+	p->fpu_counter = 0;
 	p->thread.io_bitmap_ptr = NULL;
 	tsk = current;
 	err = -ENOMEM;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 1fd94bc4279d..8ad880b3bc1c 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -286,6 +286,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 
 	set_tsk_thread_flag(p, TIF_FORK);
 
+	p->fpu_counter = 0;
 	p->thread.io_bitmap_ptr = NULL;
 
 	savesegment(gs, p->thread.gsindex);
-- 
cgit v1.2.3


From 80ab6f1e8c981b1b6604b2f22e36c917526235cd Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 19 Feb 2012 11:48:44 -0800
Subject: i387: use 'restore_fpu_checking()' directly in task switching code

This inlines what is usually just a couple of instructions, but more
importantly it also fixes the theoretical error case (can that FPU
restore really ever fail? Maybe we should remove the checking).

We can't start sending signals from within the scheduler, we're much too
deep in the kernel and are holding the runqueue lock etc.  So don't
bother even trying.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/i387.h | 17 ++++++++++++++---
 arch/x86/kernel/traps.c     | 40 ++++++++--------------------------------
 2 files changed, 22 insertions(+), 35 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 8df95849721d..74c607b37e87 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -29,7 +29,6 @@ extern unsigned int sig_xstate_size;
 extern void fpu_init(void);
 extern void mxcsr_feature_mask_init(void);
 extern int init_fpu(struct task_struct *child);
-extern void __math_state_restore(struct task_struct *);
 extern void math_state_restore(void);
 extern int dump_fpu(struct pt_regs *, struct user_i387_struct *);
 
@@ -269,6 +268,16 @@ static inline int fpu_restore_checking(struct fpu *fpu)
 
 static inline int restore_fpu_checking(struct task_struct *tsk)
 {
+	/* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
+	   is pending.  Clear the x87 state here by setting it to fixed
+	   values. "m" is a random variable that should be in L1 */
+	alternative_input(
+		ASM_NOP8 ASM_NOP2,
+		"emms\n\t"	  	/* clear stack tags */
+		"fildl %P[addr]",	/* set F?P to defined value */
+		X86_FEATURE_FXSAVE_LEAK,
+		[addr] "m" (tsk->thread.has_fpu));
+
 	return fpu_restore_checking(&tsk->thread.fpu);
 }
 
@@ -378,8 +387,10 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta
  */
 static inline void switch_fpu_finish(struct task_struct *new, fpu_switch_t fpu)
 {
-	if (fpu.preload)
-		__math_state_restore(new);
+	if (fpu.preload) {
+		if (unlikely(restore_fpu_checking(new)))
+			__thread_fpu_end(new);
+	}
 }
 
 /*
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 77da5b475ad2..4bbe04d96744 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -570,37 +570,6 @@ asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void)
 {
 }
 
-/*
- * This gets called with the process already owning the
- * FPU state, and with CR0.TS cleared. It just needs to
- * restore the FPU register state.
- */
-void __math_state_restore(struct task_struct *tsk)
-{
-	/* We need a safe address that is cheap to find and that is already
-	   in L1. We've just brought in "tsk->thread.has_fpu", so use that */
-#define safe_address (tsk->thread.has_fpu)
-
-	/* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
-	   is pending.  Clear the x87 state here by setting it to fixed
-	   values. safe_address is a random variable that should be in L1 */
-	alternative_input(
-		ASM_NOP8 ASM_NOP2,
-		"emms\n\t"	  	/* clear stack tags */
-		"fildl %P[addr]",	/* set F?P to defined value */
-		X86_FEATURE_FXSAVE_LEAK,
-		[addr] "m" (safe_address));
-
-	/*
-	 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
-	 */
-	if (unlikely(restore_fpu_checking(tsk))) {
-		__thread_fpu_end(tsk);
-		force_sig(SIGSEGV, tsk);
-		return;
-	}
-}
-
 /*
  * 'math_state_restore()' saves the current math information in the
  * old math state array, and gets the new ones from the current task
@@ -631,7 +600,14 @@ void math_state_restore(void)
 	}
 
 	__thread_fpu_begin(tsk);
-	__math_state_restore(tsk);
+	/*
+	 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
+	 */
+	if (unlikely(restore_fpu_checking(tsk))) {
+		__thread_fpu_end(tsk);
+		force_sig(SIGSEGV, tsk);
+		return;
+	}
 
 	tsk->fpu_counter++;
 }
-- 
cgit v1.2.3


From 7e16838d94b566a17b65231073d179bc04d590c8 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 19 Feb 2012 13:27:00 -0800
Subject: i387: support lazy restore of FPU state

This makes us recognize when we try to restore FPU state that matches
what we already have in the FPU on this CPU, and avoids the restore
entirely if so.

To do this, we add two new data fields:

 - a percpu 'fpu_owner_task' variable that gets written any time we
   update the "has_fpu" field, and thus acts as a kind of back-pointer
   to the task that owns the CPU.  The exception is when we save the FPU
   state as part of a context switch - if the save can keep the FPU
   state around, we leave the 'fpu_owner_task' variable pointing at the
   task whose FP state still remains on the CPU.

 - a per-thread 'last_cpu' field, that indicates which CPU that thread
   used its FPU on last.  We update this on every context switch
   (writing an invalid CPU number if the last context switch didn't
   leave the FPU in a lazily usable state), so we know that *that*
   thread has done nothing else with the FPU since.

These two fields together can be used when next switching back to the
task to see if the CPU still matches: if 'fpu_owner_task' matches the
task we are switching to, we know that no other task (or kernel FPU
usage) touched the FPU on this CPU in the meantime, and if the current
CPU number matches the 'last_cpu' field, we know that this thread did no
other FP work on any other CPU, so the FPU state on the CPU must match
what was saved on last context switch.

In that case, we can avoid the 'f[x]rstor' entirely, and just clear the
CR0.TS bit.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/i387.h      | 35 +++++++++++++++++++++++------------
 arch/x86/include/asm/processor.h |  3 ++-
 arch/x86/kernel/cpu/common.c     |  2 ++
 arch/x86/kernel/process_32.c     |  2 +-
 arch/x86/kernel/process_64.c     |  2 +-
 5 files changed, 29 insertions(+), 15 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 74c607b37e87..247904945d3f 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -32,6 +32,8 @@ extern int init_fpu(struct task_struct *child);
 extern void math_state_restore(void);
 extern int dump_fpu(struct pt_regs *, struct user_i387_struct *);
 
+DECLARE_PER_CPU(struct task_struct *, fpu_owner_task);
+
 extern user_regset_active_fn fpregs_active, xfpregs_active;
 extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get,
 				xstateregs_get;
@@ -276,7 +278,7 @@ static inline int restore_fpu_checking(struct task_struct *tsk)
 		"emms\n\t"	  	/* clear stack tags */
 		"fildl %P[addr]",	/* set F?P to defined value */
 		X86_FEATURE_FXSAVE_LEAK,
-		[addr] "m" (tsk->thread.has_fpu));
+		[addr] "m" (tsk->thread.fpu.has_fpu));
 
 	return fpu_restore_checking(&tsk->thread.fpu);
 }
@@ -288,19 +290,21 @@ static inline int restore_fpu_checking(struct task_struct *tsk)
  */
 static inline int __thread_has_fpu(struct task_struct *tsk)
 {
-	return tsk->thread.has_fpu;
+	return tsk->thread.fpu.has_fpu;
 }
 
 /* Must be paired with an 'stts' after! */
 static inline void __thread_clear_has_fpu(struct task_struct *tsk)
 {
-	tsk->thread.has_fpu = 0;
+	tsk->thread.fpu.has_fpu = 0;
+	percpu_write(fpu_owner_task, NULL);
 }
 
 /* Must be paired with a 'clts' before! */
 static inline void __thread_set_has_fpu(struct task_struct *tsk)
 {
-	tsk->thread.has_fpu = 1;
+	tsk->thread.fpu.has_fpu = 1;
+	percpu_write(fpu_owner_task, tsk);
 }
 
 /*
@@ -345,18 +349,22 @@ typedef struct { int preload; } fpu_switch_t;
  * We don't do that yet, so "fpu_lazy_restore()" always returns
  * false, but some day..
  */
-#define fpu_lazy_restore(tsk) (0)
-#define fpu_lazy_state_intact(tsk) do { } while (0)
+static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu)
+{
+	return new == percpu_read_stable(fpu_owner_task) &&
+		cpu == new->thread.fpu.last_cpu;
+}
 
-static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new)
+static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new, int cpu)
 {
 	fpu_switch_t fpu;
 
 	fpu.preload = tsk_used_math(new) && new->fpu_counter > 5;
 	if (__thread_has_fpu(old)) {
-		if (__save_init_fpu(old))
-			fpu_lazy_state_intact(old);
-		__thread_clear_has_fpu(old);
+		if (!__save_init_fpu(old))
+			cpu = ~0;
+		old->thread.fpu.last_cpu = cpu;
+		old->thread.fpu.has_fpu = 0;	/* But leave fpu_owner_task! */
 
 		/* Don't change CR0.TS if we just switch! */
 		if (fpu.preload) {
@@ -367,9 +375,10 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta
 			stts();
 	} else {
 		old->fpu_counter = 0;
+		old->thread.fpu.last_cpu = ~0;
 		if (fpu.preload) {
 			new->fpu_counter++;
-			if (fpu_lazy_restore(new))
+			if (fpu_lazy_restore(new, cpu))
 				fpu.preload = 0;
 			else
 				prefetch(new->thread.fpu.state);
@@ -463,8 +472,10 @@ static inline void kernel_fpu_begin(void)
 		__save_init_fpu(me);
 		__thread_clear_has_fpu(me);
 		/* We do 'stts()' in kernel_fpu_end() */
-	} else
+	} else {
+		percpu_write(fpu_owner_task, NULL);
 		clts();
+	}
 }
 
 static inline void kernel_fpu_end(void)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index f7c89e231c6c..58545c97d071 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -374,6 +374,8 @@ union thread_xstate {
 };
 
 struct fpu {
+	unsigned int last_cpu;
+	unsigned int has_fpu;
 	union thread_xstate *state;
 };
 
@@ -454,7 +456,6 @@ struct thread_struct {
 	unsigned long		trap_no;
 	unsigned long		error_code;
 	/* floating point and extended processor state */
-	unsigned long		has_fpu;
 	struct fpu		fpu;
 #ifdef CONFIG_X86_32
 	/* Virtual 86 mode info */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index d43cad74f166..b667148dfad7 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1044,6 +1044,8 @@ DEFINE_PER_CPU(char *, irq_stack_ptr) =
 
 DEFINE_PER_CPU(unsigned int, irq_count) = -1;
 
+DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
+
 /*
  * Special IST stacks which the CPU switches to when it calls
  * an IST-marked descriptor entry. Up to 7 stacks (hardware
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index bc32761bc27a..c08d1ff12b7c 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -304,7 +304,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 
 	/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
 
-	fpu = switch_fpu_prepare(prev_p, next_p);
+	fpu = switch_fpu_prepare(prev_p, next_p, cpu);
 
 	/*
 	 * Reload esp0.
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 8ad880b3bc1c..cfa5c90c01db 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -389,7 +389,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	unsigned fsindex, gsindex;
 	fpu_switch_t fpu;
 
-	fpu = switch_fpu_prepare(prev_p, next_p);
+	fpu = switch_fpu_prepare(prev_p, next_p, cpu);
 
 	/*
 	 * Reload esp0, LDT and the page table pointer:
-- 
cgit v1.2.3


From a38449ef596b345e13a8f9b7d5cd9fedb8fcf921 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 20 Feb 2012 15:29:34 -0500
Subject: x86: Specify a size for the cmp in the NMI handler

Linus noticed that the cmp used to check if the code segment is
__KERNEL_CS or not did not specify a size. Perhaps it does not matter
as H. Peter Anvin noted that user space can not set the bottom two
bits of the %cs register. But it's best not to let the assembly choose
and change things between different versions of gas, but instead just
pick the size.

Four bytes are used to compare the saved code segment against
__KERNEL_CS. Perhaps this might mess up Xen, but we can fix that when
the time comes.

Also I noticed that there was another non-specified cmp that checks
the special stack variable if it is 1 or 0. This too probably doesn't
matter what cmp is used, but this patch uses cmpl just to make it non
ambiguous.

Link: http://lkml.kernel.org/r/CA+55aFxfAn9MWRgS3O5k2tqN5ys1XrhSFVO5_9ZAoZKDVgNfGA@mail.gmail.com

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 arch/x86/kernel/entry_64.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index debd851de6ff..1333d9851778 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1535,14 +1535,14 @@ ENTRY(nmi)
 	 * If %cs was not the kernel segment, then the NMI triggered in user
 	 * space, which means it is definitely not nested.
 	 */
-	cmp $__KERNEL_CS, 16(%rsp)
+	cmpl $__KERNEL_CS, 16(%rsp)
 	jne first_nmi
 
 	/*
 	 * Check the special variable on the stack to see if NMIs are
 	 * executing.
 	 */
-	cmp $1, -8(%rsp)
+	cmpl $1, -8(%rsp)
 	je nested_nmi
 
 	/*
-- 
cgit v1.2.3


From 27e74da9800289e69ba907777df1e2085231eff7 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 20 Feb 2012 19:34:10 -0800
Subject: i387: export 'fpu_owner_task' per-cpu variable

(And define it properly for x86-32, which had its 'current_task'
declaration in separate from x86-64)

Bitten by my dislike for modules on the machines I use, and the fact
that apparently nobody else actually wanted to test the patches I sent
out.

Snif. Nobody else cares.

Anyway, we probably should uninline the 'kernel_fpu_begin()' function
that is what modules actually use and that references this, but this is
the minimal fix for now.

Reported-by: Josh Boyer <jwboyer@gmail.com>
Reported-and-tested-by: Jongman Heo <jongman.heo@samsung.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/cpu/common.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index b667148dfad7..c0f7d68d318f 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1045,6 +1045,7 @@ DEFINE_PER_CPU(char *, irq_stack_ptr) =
 DEFINE_PER_CPU(unsigned int, irq_count) = -1;
 
 DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
+EXPORT_PER_CPU_SYMBOL(fpu_owner_task);
 
 /*
  * Special IST stacks which the CPU switches to when it calls
@@ -1113,6 +1114,8 @@ void debug_stack_reset(void)
 
 DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
 EXPORT_PER_CPU_SYMBOL(current_task);
+DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
+EXPORT_PER_CPU_SYMBOL(fpu_owner_task);
 
 #ifdef CONFIG_CC_STACKPROTECTOR
 DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
-- 
cgit v1.2.3


From 3f806e50981825fa56a7f1938f24c0680816be45 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@alien8.de>
Date: Fri, 3 Feb 2012 20:18:01 +0100
Subject: x86/mce/AMD: Fix UP build error

141168c36cde ("x86: Simplify code by removing a !SMP #ifdefs
from 'struct cpuinfo_x86'") removed a bunch of CONFIG_SMP ifdefs
around code touching struct cpuinfo_x86 members but also caused
the following build error with Randy's randconfigs:

mce_amd.c:(.cpuinit.text+0x4723): undefined reference to `cpu_llc_shared_map'

Restore the #ifdef in threshold_create_bank() which creates
symlinks on the non-BSP CPUs.

There's a better patch series being worked on by Kevin Winchester
which will solve this in a cleaner fashion, but that series is
too ambitious for v3.3 merging - so we first queue up this trivial
fix and then do the rest for v3.4.

Signed-off-by: Borislav Petkov <bp@alien8.de>
Acked-by: Kevin Winchester <kjwinchester@gmail.com>
Cc: Randy Dunlap <rdunlap@xenotime.net>
Cc: Nick Bowler <nbowler@elliptictech.com>
Link: http://lkml.kernel.org/r/20120203191801.GA2846@x1.osrc.amd.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/mcheck/mce_amd.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 786e76a86322..e4eeaaf58a47 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -528,6 +528,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 
 	sprintf(name, "threshold_bank%i", bank);
 
+#ifdef CONFIG_SMP
 	if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) {	/* symlink */
 		i = cpumask_first(cpu_llc_shared_mask(cpu));
 
@@ -553,6 +554,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 
 		goto out;
 	}
+#endif
 
 	b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL);
 	if (!b) {
-- 
cgit v1.2.3


From 1cc1c96c1658bfaf85d06d764bd7ac00640ae90f Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 20 Feb 2012 17:23:47 -0800
Subject: PCI: fix memleak when ACPI _CRS is not used.

warning:
unreferenced object 0xffff8801f6914200 (size 512):
  comm "swapper/0", pid 1, jiffies 4294893643 (age 2664.644s)
  hex dump (first 32 bytes):
    00 00 c0 fe 00 00 00 00 ff ff ff ff 00 00 00 00  ................
    60 58 2f f6 03 88 ff ff 00 02 00 00 00 00 00 00  `X/.............
  backtrace:
    [<ffffffff81c2408c>] kmemleak_alloc+0x26/0x43
    [<ffffffff8113764f>] __kmalloc+0x121/0x183
    [<ffffffff81ca8d93>] get_current_resources+0x5a/0xc6
    [<ffffffff81c5bedd>] pci_acpi_scan_root+0x13c/0x21c
    [<ffffffff81c2a745>] acpi_pci_root_add+0x1e1/0x421
    [<ffffffff81408f50>] acpi_device_probe+0x50/0x190
    [<ffffffff8149edc7>] really_probe+0x99/0x126
    [<ffffffff8149ef83>] driver_probe_device+0x3b/0x56
    [<ffffffff8149effd>] __driver_attach+0x5f/0x82
    [<ffffffff8149d860>] bus_for_each_dev+0x5c/0x88
    [<ffffffff8149eb87>] driver_attach+0x1e/0x20
    [<ffffffff8149e7cc>] bus_add_driver+0xca/0x21d
    [<ffffffff8149f47b>] driver_register+0x91/0xfe
    [<ffffffff81409d09>] acpi_bus_register_driver+0x43/0x45
    [<ffffffff8278bdc9>] acpi_pci_root_init+0x20/0x28
    [<ffffffff810001e7>] do_one_initcall+0x57/0x134

The system has _CRS for root buses, but they are not used because the machine
date is before the cutoff date for _CRS usage.

Try to free those unused resource arrays and names.

Reviewed-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/pci/acpi.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index a312e76063a7..c33e0970ee9f 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -282,9 +282,6 @@ static void add_resources(struct pci_root_info *info)
 	int i;
 	struct resource *res, *root, *conflict;
 
-	if (!pci_use_crs)
-		return;
-
 	coalesce_windows(info, IORESOURCE_MEM);
 	coalesce_windows(info, IORESOURCE_IO);
 
@@ -336,8 +333,13 @@ get_current_resources(struct acpi_device *device, int busnum,
 	acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource,
 				&info);
 
-	add_resources(&info);
-	return;
+	if (pci_use_crs) {
+		add_resources(&info);
+
+		return;
+	}
+
+	kfree(info.name);
 
 name_alloc_fail:
 	kfree(info.res);
-- 
cgit v1.2.3


From 8411371709610c826bf65684f886bfdfb5780ca1 Mon Sep 17 00:00:00 2001
From: Jonathan Nieder <jrnieder@gmail.com>
Date: Tue, 28 Feb 2012 11:51:10 -0700
Subject: x86/PCI: use host bridge _CRS info on MSI MS-7253

In the spirit of commit 29cf7a30f8a0 ("x86/PCI: use host bridge _CRS
info on ASUS M2V-MX SE"), this DMI quirk turns on "pci_use_crs" by
default on a board that needs it.

This fixes boot failures and oopses introduced in 3e3da00c01d0
("x86/pci: AMD one chain system to use pci read out res").  The quirk
is quite targetted (to a specific board and BIOS version) for two
reasons:

 (1) to emphasize that this method of tackling the problem one quirk
     at a time is a little insane

 (2) to give BIOS vendors an opportunity to use simpler tables and
     allow us to return to generic behavior (whatever that happens to
     be) with a later BIOS update

In other words, I am not at all happy with having quirks like this.
But it is even worse for the kernel not to work out of the box on
these machines, so...

Reference: https://bugzilla.kernel.org/show_bug.cgi?id=42619
Reported-by: Svante Signell <svante.signell@telia.com>
Signed-off-by: Jonathan Nieder <jrnieder@gmail.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/pci/acpi.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index c33e0970ee9f..7034c081b226 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -60,6 +60,17 @@ static const struct dmi_system_id pci_use_crs_table[] __initconst = {
 			DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."),
 		},
 	},
+	/* https://bugzilla.kernel.org/show_bug.cgi?id=42619 */
+	{
+		.callback = set_use_crs,
+		.ident = "MSI MS-7253",
+		.matches = {
+			DMI_MATCH(DMI_BOARD_VENDOR, "MICRO-STAR INTERNATIONAL CO., LTD"),
+			DMI_MATCH(DMI_BOARD_NAME, "MS-7253"),
+			DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies, LTD"),
+			DMI_MATCH(DMI_BIOS_VERSION, "V1.6"),
+		},
+	},
 
 	/* Now for the blacklist.. */
 
-- 
cgit v1.2.3


From a97f4f5e524bcd09a85ef0b8821a14d35e69335f Mon Sep 17 00:00:00 2001
From: Jonathan Nieder <jrnieder@gmail.com>
Date: Tue, 28 Feb 2012 15:31:35 -0600
Subject: x86/PCI: do not tie MSI MS-7253 use_crs quirk to BIOS version

Carlos was getting

	WARNING: at drivers/pci/pci.c:118 pci_ioremap_bar+0x24/0x52()

when probing his sound card, and sound did not work.  After adding
pci=use_crs to the kernel command line, no more trouble.

Ok, we can add a quirk.  dmidecode output reveals that this is an MSI
MS-7253, for which we already have a quirk, but the short-sighted
author tied the quirk to a single BIOS version, making it not kick in
on Carlos's machine with BIOS V1.2.  If a later BIOS update makes it
no longer necessary to look at the _CRS info it will still be
harmless, so let's stop trying to guess which versions have and don't
have accurate _CRS tables.

Addresses https://bugtrack.alsa-project.org/alsa-bug/view.php?id=5533
Also see <https://bugzilla.kernel.org/show_bug.cgi?id=42619>.

Reported-by: Carlos Luna <caralu74@gmail.com>
Reviewed-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Jonathan Nieder <jrnieder@gmail.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/pci/acpi.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 7034c081b226..49a5cb55429b 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -68,7 +68,6 @@ static const struct dmi_system_id pci_use_crs_table[] __initconst = {
 			DMI_MATCH(DMI_BOARD_VENDOR, "MICRO-STAR INTERNATIONAL CO., LTD"),
 			DMI_MATCH(DMI_BOARD_NAME, "MS-7253"),
 			DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies, LTD"),
-			DMI_MATCH(DMI_BIOS_VERSION, "V1.6"),
 		},
 	},
 
-- 
cgit v1.2.3


From 1018faa6cf23b256bf25919ef203cd7c129f06f2 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 29 Feb 2012 14:57:32 +0100
Subject: perf/x86/kvm: Fix Host-Only/Guest-Only counting with SVM disabled

It turned out that a performance counter on AMD does not
count at all when the GO or HO bit is set in the control
register and SVM is disabled in EFER.

This patch works around this issue by masking out the HO bit
in the performance counter control register when SVM is not
enabled.

The GO bit is not touched because it is only set when the
user wants to count in guest-mode only. So when SVM is
disabled the counter should not run at all and the
not-counting is the intended behaviour.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Avi Kivity <avi@redhat.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Gleb Natapov <gleb@redhat.com>
Cc: Robert Richter <robert.richter@amd.com>
Cc: stable@vger.kernel.org # v3.2
Link: http://lkml.kernel.org/r/1330523852-19566-1-git-send-email-joerg.roedel@amd.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/perf_event.h    |  8 ++++++++
 arch/x86/kernel/cpu/perf_event.h     |  8 ++++++--
 arch/x86/kernel/cpu/perf_event_amd.c | 37 ++++++++++++++++++++++++++++++++++--
 arch/x86/kvm/svm.c                   |  5 +++++
 4 files changed, 54 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 096c975e099f..461ce432b1c2 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -242,4 +242,12 @@ static inline void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
 static inline void perf_events_lapic_init(void)	{ }
 #endif
 
+#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD)
+ extern void amd_pmu_enable_virt(void);
+ extern void amd_pmu_disable_virt(void);
+#else
+ static inline void amd_pmu_enable_virt(void) { }
+ static inline void amd_pmu_disable_virt(void) { }
+#endif
+
 #endif /* _ASM_X86_PERF_EVENT_H */
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 8944062f46e2..c30c807ddc72 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -147,7 +147,9 @@ struct cpu_hw_events {
 	/*
 	 * AMD specific bits
 	 */
-	struct amd_nb		*amd_nb;
+	struct amd_nb			*amd_nb;
+	/* Inverted mask of bits to clear in the perf_ctr ctrl registers */
+	u64				perf_ctr_virt_mask;
 
 	void				*kfree_on_online;
 };
@@ -417,9 +419,11 @@ void x86_pmu_disable_all(void);
 static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
 					  u64 enable_mask)
 {
+	u64 disable_mask = __this_cpu_read(cpu_hw_events.perf_ctr_virt_mask);
+
 	if (hwc->extra_reg.reg)
 		wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config);
-	wrmsrl(hwc->config_base, hwc->config | enable_mask);
+	wrmsrl(hwc->config_base, (hwc->config | enable_mask) & ~disable_mask);
 }
 
 void x86_pmu_enable_all(int added);
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 0397b23be8e9..67250a52430b 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -1,4 +1,5 @@
 #include <linux/perf_event.h>
+#include <linux/export.h>
 #include <linux/types.h>
 #include <linux/init.h>
 #include <linux/slab.h>
@@ -357,7 +358,9 @@ static void amd_pmu_cpu_starting(int cpu)
 	struct amd_nb *nb;
 	int i, nb_id;
 
-	if (boot_cpu_data.x86_max_cores < 2)
+	cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY;
+
+	if (boot_cpu_data.x86_max_cores < 2 || boot_cpu_data.x86 == 0x15)
 		return;
 
 	nb_id = amd_get_nb_id(cpu);
@@ -587,9 +590,9 @@ static __initconst const struct x86_pmu amd_pmu_f15h = {
 	.put_event_constraints	= amd_put_event_constraints,
 
 	.cpu_prepare		= amd_pmu_cpu_prepare,
-	.cpu_starting		= amd_pmu_cpu_starting,
 	.cpu_dead		= amd_pmu_cpu_dead,
 #endif
+	.cpu_starting		= amd_pmu_cpu_starting,
 };
 
 __init int amd_pmu_init(void)
@@ -621,3 +624,33 @@ __init int amd_pmu_init(void)
 
 	return 0;
 }
+
+void amd_pmu_enable_virt(void)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+	cpuc->perf_ctr_virt_mask = 0;
+
+	/* Reload all events */
+	x86_pmu_disable_all();
+	x86_pmu_enable_all(0);
+}
+EXPORT_SYMBOL_GPL(amd_pmu_enable_virt);
+
+void amd_pmu_disable_virt(void)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+	/*
+	 * We only mask out the Host-only bit so that host-only counting works
+	 * when SVM is disabled. If someone sets up a guest-only counter when
+	 * SVM is disabled the Guest-only bits still gets set and the counter
+	 * will not count anything.
+	 */
+	cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY;
+
+	/* Reload all events */
+	x86_pmu_disable_all();
+	x86_pmu_enable_all(0);
+}
+EXPORT_SYMBOL_GPL(amd_pmu_disable_virt);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 5fa553babe56..e385214711cb 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -29,6 +29,7 @@
 #include <linux/ftrace_event.h>
 #include <linux/slab.h>
 
+#include <asm/perf_event.h>
 #include <asm/tlbflush.h>
 #include <asm/desc.h>
 #include <asm/kvm_para.h>
@@ -575,6 +576,8 @@ static void svm_hardware_disable(void *garbage)
 		wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
 
 	cpu_svm_disable();
+
+	amd_pmu_disable_virt();
 }
 
 static int svm_hardware_enable(void *garbage)
@@ -622,6 +625,8 @@ static int svm_hardware_enable(void *garbage)
 
 	svm_init_erratum_383();
 
+	amd_pmu_enable_virt();
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From 6414fa6a150111750011f477899d370244da4171 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Mon, 5 Mar 2012 06:38:42 +0000
Subject: aout: move setup_arg_pages() prior to reading/mapping the binary

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/ia32/ia32_aout.c | 14 +++++++-------
 fs/binfmt_aout.c          | 14 +++++++-------
 2 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index fd843877e841..39e49091f648 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -315,6 +315,13 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 	current->mm->free_area_cache = TASK_UNMAPPED_BASE;
 	current->mm->cached_hole_size = 0;
 
+	retval = setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT);
+	if (retval < 0) {
+		/* Someone check-me: is this error path enough? */
+		send_sig(SIGKILL, current, 0);
+		return retval;
+	}
+
 	install_exec_creds(bprm);
 	current->flags &= ~PF_FORKNOEXEC;
 
@@ -410,13 +417,6 @@ beyond_if:
 
 	set_brk(current->mm->start_brk, current->mm->brk);
 
-	retval = setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT);
-	if (retval < 0) {
-		/* Someone check-me: is this error path enough? */
-		send_sig(SIGKILL, current, 0);
-		return retval;
-	}
-
 	current->mm->start_stack =
 		(unsigned long)create_aout_tables((char __user *)bprm->p, bprm);
 	/* start thread */
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index a6395bdb26ae..1ff94054d35a 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -259,6 +259,13 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	current->mm->free_area_cache = current->mm->mmap_base;
 	current->mm->cached_hole_size = 0;
 
+	retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT);
+	if (retval < 0) {
+		/* Someone check-me: is this error path enough? */
+		send_sig(SIGKILL, current, 0);
+		return retval;
+	}
+
 	install_exec_creds(bprm);
  	current->flags &= ~PF_FORKNOEXEC;
 
@@ -352,13 +359,6 @@ beyond_if:
 		return retval;
 	}
 
-	retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT);
-	if (retval < 0) { 
-		/* Someone check-me: is this error path enough? */ 
-		send_sig(SIGKILL, current, 0); 
-		return retval;
-	}
-
 	current->mm->start_stack =
 		(unsigned long) create_aout_tables((char __user *) bprm->p, bprm);
 #ifdef __alpha__
-- 
cgit v1.2.3


From 097d59106a8e4b42d07c9892fdd7790f1659c6ff Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 6 Mar 2012 18:23:36 -0800
Subject: vm: avoid using find_vma_prev() unnecessarily

Several users of "find_vma_prev()" were not in fact interested in the
previous vma if there was no primary vma to be found either.  And in
those cases, we're much better off just using the regular "find_vma()",
and then "prev" can be looked up by just checking vma->vm_prev.

The find_vma_prev() semantics are fairly subtle (see Mikulas' recent
commit 83cd904d271b: "mm: fix find_vma_prev"), and the whole "return
prev by reference" means that it generates worse code too.

Thus this "let's avoid using this inconvenient and clearly too subtle
interface when we don't really have to" patch.

Cc: Mikulas Patocka <mpatocka@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/mm/hugetlbpage.c | 4 +++-
 mm/mempolicy.c            | 3 ++-
 mm/mlock.c                | 3 ++-
 mm/mprotect.c             | 3 ++-
 4 files changed, 9 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index f581a18c0d4d..83e7141c3982 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -333,13 +333,15 @@ try_again:
 		 * Lookup failure means no vma is above this address,
 		 * i.e. return with success:
 		 */
-		if (!(vma = find_vma_prev(mm, addr, &prev_vma)))
+		vma = find_vma(mm, add);
+		if (!vma)
 			return addr;
 
 		/*
 		 * new region fits between prev_vma->vm_end and
 		 * vma->vm_start, use it:
 		 */
+		prev_vma = vma->vm_prev;
 		if (addr + len <= vma->vm_start &&
 		            (!prev_vma || (addr >= prev_vma->vm_end))) {
 			/* remember the address as a hint for next time */
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 06b145fb64ab..47296fee23db 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -640,10 +640,11 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
 	unsigned long vmstart;
 	unsigned long vmend;
 
-	vma = find_vma_prev(mm, start, &prev);
+	vma = find_vma(mm, start);
 	if (!vma || vma->vm_start > start)
 		return -EFAULT;
 
+	prev = vma->vm_prev;
 	if (start > vma->vm_start)
 		prev = vma;
 
diff --git a/mm/mlock.c b/mm/mlock.c
index 4f4f53bdc65d..ef726e8aa8e9 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -385,10 +385,11 @@ static int do_mlock(unsigned long start, size_t len, int on)
 		return -EINVAL;
 	if (end == start)
 		return 0;
-	vma = find_vma_prev(current->mm, start, &prev);
+	vma = find_vma(current->mm, start);
 	if (!vma || vma->vm_start > start)
 		return -ENOMEM;
 
+	prev = vma->vm_prev;
 	if (start > vma->vm_start)
 		prev = vma;
 
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 5a688a2756be..f437d054c3bf 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -262,10 +262,11 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
 
 	down_write(&current->mm->mmap_sem);
 
-	vma = find_vma_prev(current->mm, start, &prev);
+	vma = find_vma(current->mm, start);
 	error = -ENOMEM;
 	if (!vma)
 		goto out;
+	prev = vma->vm_prev;
 	if (unlikely(grows & PROT_GROWSDOWN)) {
 		if (vma->vm_start >= end)
 			goto out;
-- 
cgit v1.2.3


From 55062d061790b43aee01ab3f9ac57b8596254f19 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 6 Mar 2012 18:48:13 -0800
Subject: x86: fix typo in recent find_vma_prev purge

It turns out that test-compiling this file on x86-64 doesn't really
help, because much of it is x86-32-specific.  And so I hadn't noticed
the slightly over-eager removal of the 'r' from 'addr' variable despite
thinking I had tested it.

Signed-off-by: Linus "oopsie" Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/mm/hugetlbpage.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 83e7141c3982..8ecbb4bba4b3 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -333,7 +333,7 @@ try_again:
 		 * Lookup failure means no vma is above this address,
 		 * i.e. return with success:
 		 */
-		vma = find_vma(mm, add);
+		vma = find_vma(mm, addr);
 		if (!vma)
 			return addr;
 
-- 
cgit v1.2.3


From a7f4255f906f60f72e00aad2fb000939449ff32e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 9 Mar 2012 20:55:10 +0100
Subject: x86: Derandom delay_tsc for 64 bit

Commit f0fbf0abc093 ("x86: integrate delay functions") converted
delay_tsc() into a random delay generator for 64 bit.  The reason is
that it merged the mostly identical versions of delay_32.c and
delay_64.c.  Though the subtle difference of the result was:

 static void delay_tsc(unsigned long loops)
 {
-	unsigned bclock, now;
+	unsigned long bclock, now;

Now the function uses rdtscl() which returns the lower 32bit of the
TSC. On 32bit that's not problematic as unsigned long is 32bit. On 64
bit this fails when the lower 32bit are close to wrap around when
bclock is read, because the following check

       if ((now - bclock) >= loops)
       	  	break;

evaluated to true on 64bit for e.g. bclock = 0xffffffff and now = 0
because the unsigned long (now - bclock) of these values results in
0xffffffff00000001 which is definitely larger than the loops
value. That explains Tvortkos observation:

"Because I am seeing udelay(500) (_occasionally_) being short, and
 that by delaying for some duration between 0us (yep) and 491us."

Make those variables explicitely u32 again, so this works for both 32
and 64 bit.

Reported-by: Tvrtko Ursulin <tvrtko.ursulin@onelan.co.uk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org # >= 2.6.27
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/lib/delay.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index fc45ba887d05..e395693abdb1 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -48,9 +48,9 @@ static void delay_loop(unsigned long loops)
 }
 
 /* TSC based delay: */
-static void delay_tsc(unsigned long loops)
+static void delay_tsc(unsigned long __loops)
 {
-	unsigned long bclock, now;
+	u32 bclock, now, loops = __loops;
 	int cpu;
 
 	preempt_disable();
-- 
cgit v1.2.3