From c1d2f1bccf4259384e581b937e694ee8a350fe55 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Mon, 6 Feb 2012 13:28:55 -0500 Subject: x86/microcode: Remove noisy AMD microcode warning AMD processors will never support /dev/cpu/microcode updating so just silently fail instead of printing out a warning for every cpu. Signed-off-by: Prarit Bhargava Cc: Borislav Petkov Link: http://lkml.kernel.org/r/1328552935-965-1-git-send-email-prarit@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index ac0417be9131..73465aab28f8 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -360,7 +360,6 @@ out: static enum ucode_state request_microcode_user(int cpu, const void __user *buf, size_t size) { - pr_info("AMD microcode update via /dev/cpu/microcode not supported\n"); return UCODE_ERROR; } -- cgit v1.2.3 From 32c3233885eb10ac9cb9410f2f8cd64b8df2b2a1 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Wed, 8 Feb 2012 20:52:29 +0100 Subject: x86/amd: Fix L1i and L2 cache sharing information for AMD family 15h processors For L1 instruction cache and L2 cache the shared CPU information is wrong. On current AMD family 15h CPUs those caches are shared between both cores of a compute unit. This fixes https://bugzilla.kernel.org/show_bug.cgi?id=42607 Signed-off-by: Andreas Herrmann Cc: Petkov Borislav Cc: Dave Jones Cc: Link: http://lkml.kernel.org/r/20120208195229.GA17523@alberich.amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_cacheinfo.c | 44 ++++++++++++++++++++++++++++------- 1 file changed, 36 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 6b45e5e7a901..73d08ed98a64 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -326,8 +326,7 @@ static void __cpuinit amd_calc_l3_indices(struct amd_northbridge *nb) l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1; } -static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, - int index) +static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index) { int node; @@ -725,14 +724,16 @@ static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info); #define CPUID4_INFO_IDX(x, y) (&((per_cpu(ici_cpuid4_info, x))[y])) #ifdef CONFIG_SMP -static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) + +static int __cpuinit cache_shared_amd_cpu_map_setup(unsigned int cpu, int index) { - struct _cpuid4_info *this_leaf, *sibling_leaf; - unsigned long num_threads_sharing; - int index_msb, i, sibling; + struct _cpuid4_info *this_leaf; + int ret, i, sibling; struct cpuinfo_x86 *c = &cpu_data(cpu); - if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) { + ret = 0; + if (index == 3) { + ret = 1; for_each_cpu(i, cpu_llc_shared_mask(cpu)) { if (!per_cpu(ici_cpuid4_info, i)) continue; @@ -743,8 +744,35 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) set_bit(sibling, this_leaf->shared_cpu_map); } } - return; + } else if ((c->x86 == 0x15) && ((index == 1) || (index == 2))) { + ret = 1; + for_each_cpu(i, cpu_sibling_mask(cpu)) { + if (!per_cpu(ici_cpuid4_info, i)) + continue; + this_leaf = CPUID4_INFO_IDX(i, index); + for_each_cpu(sibling, cpu_sibling_mask(cpu)) { + if (!cpu_online(sibling)) + continue; + set_bit(sibling, this_leaf->shared_cpu_map); + } + } } + + return ret; +} + +static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) +{ + struct _cpuid4_info *this_leaf, *sibling_leaf; + unsigned long num_threads_sharing; + int index_msb, i; + struct cpuinfo_x86 *c = &cpu_data(cpu); + + if (c->x86_vendor == X86_VENDOR_AMD) { + if (cache_shared_amd_cpu_map_setup(cpu, index)) + return; + } + this_leaf = CPUID4_INFO_IDX(cpu, index); num_threads_sharing = 1 + this_leaf->base.eax.split.num_threads_sharing; -- cgit v1.2.3 From 45d5a1683c04be28abdf5c04c27b1417e0374486 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sun, 19 Feb 2012 16:43:37 -0500 Subject: x86/nmi: Test saved %cs in NMI to determine nested NMI case Currently, the NMI handler tests if it is nested by checking the special variable saved on the stack (set during NMI handling) and whether the saved stack is the NMI stack as well (to prevent the race when the variable is set to zero). But userspace may set their %rsp to any value as long as they do not derefence it, and it may make it point to the NMI stack, which will prevent NMIs from triggering while the userspace app is running. (I tested this, and it is indeed the case) Add another check to determine nested NMIs by looking at the saved %cs (code segment register) and making sure that it is the kernel code segment. Signed-off-by: Steven Rostedt Cc: H. Peter Anvin Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Link: http://lkml.kernel.org/r/1329687817.1561.27.camel@acer.local.home Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 3fe8239fd8fb..debd851de6ff 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1531,6 +1531,13 @@ ENTRY(nmi) /* Use %rdx as out temp variable throughout */ pushq_cfi %rdx + /* + * If %cs was not the kernel segment, then the NMI triggered in user + * space, which means it is definitely not nested. + */ + cmp $__KERNEL_CS, 16(%rsp) + jne first_nmi + /* * Check the special variable on the stack to see if NMIs are * executing. -- cgit v1.2.3 From 416d7214741daba3acd6d328289858390bef37bc Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 10 Feb 2012 09:24:08 -0500 Subject: xen/setup: Remove redundant filtering of PTE masks. commit 7347b4082e55ac4a673f06a0a0ce25c37273c9ec "xen: Allow unprivileged Xen domains to create iomap pages" added a redundant line in the early bootup code to filter out the PTE. That filtering is already done a bit earlier so this extra processing is not required. Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 12eb07bfb267..7c44e1bf981e 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1204,10 +1204,6 @@ asmlinkage void __init xen_start_kernel(void) pgd = (pgd_t *)xen_start_info->pt_base; - if (!xen_initial_domain()) - __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); - - __supported_pte_mask |= _PAGE_IOMAP; /* Don't do the full vcpu_info placement stuff until we have a possible map and a non-dummy shared_info. */ per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; -- cgit v1.2.3 From 8eaffa67b43e99ae581622c5133e20b0f48bcef1 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 10 Feb 2012 09:16:27 -0500 Subject: xen/pat: Disable PAT support for now. [Pls also look at https://lkml.org/lkml/2012/2/10/228] Using of PAT to change pages from WB to WC works quite nicely. Changing it back to WB - not so much. The crux of the matter is that the code that does this (__page_change_att_set_clr) has only limited information so when it tries to the change it gets the "raw" unfiltered information instead of the properly filtered one - and the "raw" one tell it that PSE bit is on (while infact it is not). As a result when the PTE is set to be WB from WC, we get tons of: :WARNING: at arch/x86/xen/mmu.c:475 xen_make_pte+0x67/0xa0() :Hardware name: HP xw4400 Workstation .. snip.. :Pid: 27, comm: kswapd0 Tainted: G W 3.2.2-1.fc16.x86_64 #1 :Call Trace: : [] warn_slowpath_common+0x7f/0xc0 : [] warn_slowpath_null+0x1a/0x20 : [] xen_make_pte+0x67/0xa0 : [] __raw_callee_save_xen_make_pte+0x11/0x1e : [] ? __change_page_attr_set_clr+0x9d5/0xc00 : [] ? __purge_vmap_area_lazy+0x158/0x1d0 : [] ? vm_unmap_aliases+0x175/0x190 : [] change_page_attr_set_clr+0x128/0x4c0 : [] set_pages_array_wb+0x42/0xa0 : [] ? check_events+0x12/0x20 : [] ttm_pages_put+0x1c/0x70 [ttm] : [] ttm_page_pool_free+0xf8/0x180 [ttm] : [] ttm_pool_mm_shrink+0x58/0x90 [ttm] : [] shrink_slab+0x154/0x310 : [] balance_pgdat+0x4fa/0x6c0 : [] kswapd+0x178/0x3d0 : [] ? __schedule+0x3d4/0x8c0 : [] ? remove_wait_queue+0x50/0x50 : [] ? balance_pgdat+0x6c0/0x6c0 : [] kthread+0x8c/0xa0 for every page. The proper fix for this is has been posted and is https://lkml.org/lkml/2012/2/10/228 "x86/cpa: Use pte_attrs instead of pte_flags on CPA/set_p.._wb/wc operations." along with a detailed description of the problem and solution. But since that posting has gone nowhere I am proposing this band-aid solution so that at least users don't get the page corruption (the pages that are WC don't get changed to WB and end up being recycled for filesystem or other things causing mysterious crashes). The negative impact of this patch is that users of WC flag (which are InfiniBand, radeon, nouveau drivers) won't be able to set that flag - so they are going to see performance degradation. But stability is more important here. Fixes RH BZ# 742032, 787403, and 745574 Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 2 ++ arch/x86/xen/mmu.c | 8 ++++---- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 7c44e1bf981e..4172af8ceeb3 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1141,7 +1141,9 @@ asmlinkage void __init xen_start_kernel(void) /* Prevent unwanted bits from being set in PTEs. */ __supported_pte_mask &= ~_PAGE_GLOBAL; +#if 0 if (!xen_initial_domain()) +#endif __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); __supported_pte_mask |= _PAGE_IOMAP; diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 58a0e46c404d..95c1cf60c669 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -415,13 +415,13 @@ static pteval_t iomap_pte(pteval_t val) static pteval_t xen_pte_val(pte_t pte) { pteval_t pteval = pte.pte; - +#if 0 /* If this is a WC pte, convert back from Xen WC to Linux WC */ if ((pteval & (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)) == _PAGE_PAT) { WARN_ON(!pat_enabled); pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT; } - +#endif if (xen_initial_domain() && (pteval & _PAGE_IOMAP)) return pteval; @@ -463,7 +463,7 @@ void xen_set_pat(u64 pat) static pte_t xen_make_pte(pteval_t pte) { phys_addr_t addr = (pte & PTE_PFN_MASK); - +#if 0 /* If Linux is trying to set a WC pte, then map to the Xen WC. * If _PAGE_PAT is set, then it probably means it is really * _PAGE_PSE, so avoid fiddling with the PAT mapping and hope @@ -476,7 +476,7 @@ static pte_t xen_make_pte(pteval_t pte) if ((pte & (_PAGE_PCD | _PAGE_PWT)) == _PAGE_PWT) pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT; } - +#endif /* * Unprivileged domains are allowed to do IOMAPpings for * PCI passthrough, but not map ISA space. The ISA -- cgit v1.2.3 From cea20ca3f3181fc36788a15bc65d1062b96a0a6c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 20 Feb 2012 10:24:09 -0800 Subject: i387: fix up some fpu_counter confusion This makes sure we clear the FPU usage counter for newly created tasks, just so that we start off in a known state (for example, don't try to preload the FPU state on the first task switch etc). It also fixes a thinko in when we increment the fpu_counter at task switch time, introduced by commit 34ddc81a230b ("i387: re-introduce FPU state preloading at context switch time"). We should increment the *new* task fpu_counter, not the old task, and only if we decide to use that state (whether lazily or preloaded). Signed-off-by: Linus Torvalds --- arch/x86/include/asm/i387.h | 3 ++- arch/x86/kernel/process_32.c | 1 + arch/x86/kernel/process_64.c | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index a850b4d8d14d..8df95849721d 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -348,10 +348,10 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta if (__save_init_fpu(old)) fpu_lazy_state_intact(old); __thread_clear_has_fpu(old); - old->fpu_counter++; /* Don't change CR0.TS if we just switch! */ if (fpu.preload) { + new->fpu_counter++; __thread_set_has_fpu(new); prefetch(new->thread.fpu.state); } else @@ -359,6 +359,7 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta } else { old->fpu_counter = 0; if (fpu.preload) { + new->fpu_counter++; if (fpu_lazy_restore(new)) fpu.preload = 0; else diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 80bfe1ab0031..bc32761bc27a 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -214,6 +214,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, task_user_gs(p) = get_user_gs(regs); + p->fpu_counter = 0; p->thread.io_bitmap_ptr = NULL; tsk = current; err = -ENOMEM; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 1fd94bc4279d..8ad880b3bc1c 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -286,6 +286,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, set_tsk_thread_flag(p, TIF_FORK); + p->fpu_counter = 0; p->thread.io_bitmap_ptr = NULL; savesegment(gs, p->thread.gsindex); -- cgit v1.2.3 From 80ab6f1e8c981b1b6604b2f22e36c917526235cd Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 19 Feb 2012 11:48:44 -0800 Subject: i387: use 'restore_fpu_checking()' directly in task switching code This inlines what is usually just a couple of instructions, but more importantly it also fixes the theoretical error case (can that FPU restore really ever fail? Maybe we should remove the checking). We can't start sending signals from within the scheduler, we're much too deep in the kernel and are holding the runqueue lock etc. So don't bother even trying. Signed-off-by: Linus Torvalds --- arch/x86/include/asm/i387.h | 17 ++++++++++++++--- arch/x86/kernel/traps.c | 40 ++++++++-------------------------------- 2 files changed, 22 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 8df95849721d..74c607b37e87 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -29,7 +29,6 @@ extern unsigned int sig_xstate_size; extern void fpu_init(void); extern void mxcsr_feature_mask_init(void); extern int init_fpu(struct task_struct *child); -extern void __math_state_restore(struct task_struct *); extern void math_state_restore(void); extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); @@ -269,6 +268,16 @@ static inline int fpu_restore_checking(struct fpu *fpu) static inline int restore_fpu_checking(struct task_struct *tsk) { + /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception + is pending. Clear the x87 state here by setting it to fixed + values. "m" is a random variable that should be in L1 */ + alternative_input( + ASM_NOP8 ASM_NOP2, + "emms\n\t" /* clear stack tags */ + "fildl %P[addr]", /* set F?P to defined value */ + X86_FEATURE_FXSAVE_LEAK, + [addr] "m" (tsk->thread.has_fpu)); + return fpu_restore_checking(&tsk->thread.fpu); } @@ -378,8 +387,10 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta */ static inline void switch_fpu_finish(struct task_struct *new, fpu_switch_t fpu) { - if (fpu.preload) - __math_state_restore(new); + if (fpu.preload) { + if (unlikely(restore_fpu_checking(new))) + __thread_fpu_end(new); + } } /* diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 77da5b475ad2..4bbe04d96744 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -570,37 +570,6 @@ asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void) { } -/* - * This gets called with the process already owning the - * FPU state, and with CR0.TS cleared. It just needs to - * restore the FPU register state. - */ -void __math_state_restore(struct task_struct *tsk) -{ - /* We need a safe address that is cheap to find and that is already - in L1. We've just brought in "tsk->thread.has_fpu", so use that */ -#define safe_address (tsk->thread.has_fpu) - - /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception - is pending. Clear the x87 state here by setting it to fixed - values. safe_address is a random variable that should be in L1 */ - alternative_input( - ASM_NOP8 ASM_NOP2, - "emms\n\t" /* clear stack tags */ - "fildl %P[addr]", /* set F?P to defined value */ - X86_FEATURE_FXSAVE_LEAK, - [addr] "m" (safe_address)); - - /* - * Paranoid restore. send a SIGSEGV if we fail to restore the state. - */ - if (unlikely(restore_fpu_checking(tsk))) { - __thread_fpu_end(tsk); - force_sig(SIGSEGV, tsk); - return; - } -} - /* * 'math_state_restore()' saves the current math information in the * old math state array, and gets the new ones from the current task @@ -631,7 +600,14 @@ void math_state_restore(void) } __thread_fpu_begin(tsk); - __math_state_restore(tsk); + /* + * Paranoid restore. send a SIGSEGV if we fail to restore the state. + */ + if (unlikely(restore_fpu_checking(tsk))) { + __thread_fpu_end(tsk); + force_sig(SIGSEGV, tsk); + return; + } tsk->fpu_counter++; } -- cgit v1.2.3 From 7e16838d94b566a17b65231073d179bc04d590c8 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 19 Feb 2012 13:27:00 -0800 Subject: i387: support lazy restore of FPU state This makes us recognize when we try to restore FPU state that matches what we already have in the FPU on this CPU, and avoids the restore entirely if so. To do this, we add two new data fields: - a percpu 'fpu_owner_task' variable that gets written any time we update the "has_fpu" field, and thus acts as a kind of back-pointer to the task that owns the CPU. The exception is when we save the FPU state as part of a context switch - if the save can keep the FPU state around, we leave the 'fpu_owner_task' variable pointing at the task whose FP state still remains on the CPU. - a per-thread 'last_cpu' field, that indicates which CPU that thread used its FPU on last. We update this on every context switch (writing an invalid CPU number if the last context switch didn't leave the FPU in a lazily usable state), so we know that *that* thread has done nothing else with the FPU since. These two fields together can be used when next switching back to the task to see if the CPU still matches: if 'fpu_owner_task' matches the task we are switching to, we know that no other task (or kernel FPU usage) touched the FPU on this CPU in the meantime, and if the current CPU number matches the 'last_cpu' field, we know that this thread did no other FP work on any other CPU, so the FPU state on the CPU must match what was saved on last context switch. In that case, we can avoid the 'f[x]rstor' entirely, and just clear the CR0.TS bit. Signed-off-by: Linus Torvalds --- arch/x86/include/asm/i387.h | 35 +++++++++++++++++++++++------------ arch/x86/include/asm/processor.h | 3 ++- arch/x86/kernel/cpu/common.c | 2 ++ arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 2 +- 5 files changed, 29 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 74c607b37e87..247904945d3f 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -32,6 +32,8 @@ extern int init_fpu(struct task_struct *child); extern void math_state_restore(void); extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); +DECLARE_PER_CPU(struct task_struct *, fpu_owner_task); + extern user_regset_active_fn fpregs_active, xfpregs_active; extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get, xstateregs_get; @@ -276,7 +278,7 @@ static inline int restore_fpu_checking(struct task_struct *tsk) "emms\n\t" /* clear stack tags */ "fildl %P[addr]", /* set F?P to defined value */ X86_FEATURE_FXSAVE_LEAK, - [addr] "m" (tsk->thread.has_fpu)); + [addr] "m" (tsk->thread.fpu.has_fpu)); return fpu_restore_checking(&tsk->thread.fpu); } @@ -288,19 +290,21 @@ static inline int restore_fpu_checking(struct task_struct *tsk) */ static inline int __thread_has_fpu(struct task_struct *tsk) { - return tsk->thread.has_fpu; + return tsk->thread.fpu.has_fpu; } /* Must be paired with an 'stts' after! */ static inline void __thread_clear_has_fpu(struct task_struct *tsk) { - tsk->thread.has_fpu = 0; + tsk->thread.fpu.has_fpu = 0; + percpu_write(fpu_owner_task, NULL); } /* Must be paired with a 'clts' before! */ static inline void __thread_set_has_fpu(struct task_struct *tsk) { - tsk->thread.has_fpu = 1; + tsk->thread.fpu.has_fpu = 1; + percpu_write(fpu_owner_task, tsk); } /* @@ -345,18 +349,22 @@ typedef struct { int preload; } fpu_switch_t; * We don't do that yet, so "fpu_lazy_restore()" always returns * false, but some day.. */ -#define fpu_lazy_restore(tsk) (0) -#define fpu_lazy_state_intact(tsk) do { } while (0) +static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu) +{ + return new == percpu_read_stable(fpu_owner_task) && + cpu == new->thread.fpu.last_cpu; +} -static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new) +static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new, int cpu) { fpu_switch_t fpu; fpu.preload = tsk_used_math(new) && new->fpu_counter > 5; if (__thread_has_fpu(old)) { - if (__save_init_fpu(old)) - fpu_lazy_state_intact(old); - __thread_clear_has_fpu(old); + if (!__save_init_fpu(old)) + cpu = ~0; + old->thread.fpu.last_cpu = cpu; + old->thread.fpu.has_fpu = 0; /* But leave fpu_owner_task! */ /* Don't change CR0.TS if we just switch! */ if (fpu.preload) { @@ -367,9 +375,10 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta stts(); } else { old->fpu_counter = 0; + old->thread.fpu.last_cpu = ~0; if (fpu.preload) { new->fpu_counter++; - if (fpu_lazy_restore(new)) + if (fpu_lazy_restore(new, cpu)) fpu.preload = 0; else prefetch(new->thread.fpu.state); @@ -463,8 +472,10 @@ static inline void kernel_fpu_begin(void) __save_init_fpu(me); __thread_clear_has_fpu(me); /* We do 'stts()' in kernel_fpu_end() */ - } else + } else { + percpu_write(fpu_owner_task, NULL); clts(); + } } static inline void kernel_fpu_end(void) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index f7c89e231c6c..58545c97d071 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -374,6 +374,8 @@ union thread_xstate { }; struct fpu { + unsigned int last_cpu; + unsigned int has_fpu; union thread_xstate *state; }; @@ -454,7 +456,6 @@ struct thread_struct { unsigned long trap_no; unsigned long error_code; /* floating point and extended processor state */ - unsigned long has_fpu; struct fpu fpu; #ifdef CONFIG_X86_32 /* Virtual 86 mode info */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index d43cad74f166..b667148dfad7 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1044,6 +1044,8 @@ DEFINE_PER_CPU(char *, irq_stack_ptr) = DEFINE_PER_CPU(unsigned int, irq_count) = -1; +DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); + /* * Special IST stacks which the CPU switches to when it calls * an IST-marked descriptor entry. Up to 7 stacks (hardware diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index bc32761bc27a..c08d1ff12b7c 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -304,7 +304,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ - fpu = switch_fpu_prepare(prev_p, next_p); + fpu = switch_fpu_prepare(prev_p, next_p, cpu); /* * Reload esp0. diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 8ad880b3bc1c..cfa5c90c01db 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -389,7 +389,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) unsigned fsindex, gsindex; fpu_switch_t fpu; - fpu = switch_fpu_prepare(prev_p, next_p); + fpu = switch_fpu_prepare(prev_p, next_p, cpu); /* * Reload esp0, LDT and the page table pointer: -- cgit v1.2.3 From a38449ef596b345e13a8f9b7d5cd9fedb8fcf921 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 20 Feb 2012 15:29:34 -0500 Subject: x86: Specify a size for the cmp in the NMI handler Linus noticed that the cmp used to check if the code segment is __KERNEL_CS or not did not specify a size. Perhaps it does not matter as H. Peter Anvin noted that user space can not set the bottom two bits of the %cs register. But it's best not to let the assembly choose and change things between different versions of gas, but instead just pick the size. Four bytes are used to compare the saved code segment against __KERNEL_CS. Perhaps this might mess up Xen, but we can fix that when the time comes. Also I noticed that there was another non-specified cmp that checks the special stack variable if it is 1 or 0. This too probably doesn't matter what cmp is used, but this patch uses cmpl just to make it non ambiguous. Link: http://lkml.kernel.org/r/CA+55aFxfAn9MWRgS3O5k2tqN5ys1XrhSFVO5_9ZAoZKDVgNfGA@mail.gmail.com Suggested-by: Linus Torvalds Cc: H. Peter Anvin Signed-off-by: Steven Rostedt --- arch/x86/kernel/entry_64.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index debd851de6ff..1333d9851778 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1535,14 +1535,14 @@ ENTRY(nmi) * If %cs was not the kernel segment, then the NMI triggered in user * space, which means it is definitely not nested. */ - cmp $__KERNEL_CS, 16(%rsp) + cmpl $__KERNEL_CS, 16(%rsp) jne first_nmi /* * Check the special variable on the stack to see if NMIs are * executing. */ - cmp $1, -8(%rsp) + cmpl $1, -8(%rsp) je nested_nmi /* -- cgit v1.2.3 From 27e74da9800289e69ba907777df1e2085231eff7 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 20 Feb 2012 19:34:10 -0800 Subject: i387: export 'fpu_owner_task' per-cpu variable (And define it properly for x86-32, which had its 'current_task' declaration in separate from x86-64) Bitten by my dislike for modules on the machines I use, and the fact that apparently nobody else actually wanted to test the patches I sent out. Snif. Nobody else cares. Anyway, we probably should uninline the 'kernel_fpu_begin()' function that is what modules actually use and that references this, but this is the minimal fix for now. Reported-by: Josh Boyer Reported-and-tested-by: Jongman Heo Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/common.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b667148dfad7..c0f7d68d318f 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1045,6 +1045,7 @@ DEFINE_PER_CPU(char *, irq_stack_ptr) = DEFINE_PER_CPU(unsigned int, irq_count) = -1; DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); +EXPORT_PER_CPU_SYMBOL(fpu_owner_task); /* * Special IST stacks which the CPU switches to when it calls @@ -1113,6 +1114,8 @@ void debug_stack_reset(void) DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; EXPORT_PER_CPU_SYMBOL(current_task); +DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); +EXPORT_PER_CPU_SYMBOL(fpu_owner_task); #ifdef CONFIG_CC_STACKPROTECTOR DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); -- cgit v1.2.3 From 3f806e50981825fa56a7f1938f24c0680816be45 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 3 Feb 2012 20:18:01 +0100 Subject: x86/mce/AMD: Fix UP build error 141168c36cde ("x86: Simplify code by removing a !SMP #ifdefs from 'struct cpuinfo_x86'") removed a bunch of CONFIG_SMP ifdefs around code touching struct cpuinfo_x86 members but also caused the following build error with Randy's randconfigs: mce_amd.c:(.cpuinit.text+0x4723): undefined reference to `cpu_llc_shared_map' Restore the #ifdef in threshold_create_bank() which creates symlinks on the non-BSP CPUs. There's a better patch series being worked on by Kevin Winchester which will solve this in a cleaner fashion, but that series is too ambitious for v3.3 merging - so we first queue up this trivial fix and then do the rest for v3.4. Signed-off-by: Borislav Petkov Acked-by: Kevin Winchester Cc: Randy Dunlap Cc: Nick Bowler Link: http://lkml.kernel.org/r/20120203191801.GA2846@x1.osrc.amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 786e76a86322..e4eeaaf58a47 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -528,6 +528,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) sprintf(name, "threshold_bank%i", bank); +#ifdef CONFIG_SMP if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */ i = cpumask_first(cpu_llc_shared_mask(cpu)); @@ -553,6 +554,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) goto out; } +#endif b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL); if (!b) { -- cgit v1.2.3 From 1cc1c96c1658bfaf85d06d764bd7ac00640ae90f Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 20 Feb 2012 17:23:47 -0800 Subject: PCI: fix memleak when ACPI _CRS is not used. warning: unreferenced object 0xffff8801f6914200 (size 512): comm "swapper/0", pid 1, jiffies 4294893643 (age 2664.644s) hex dump (first 32 bytes): 00 00 c0 fe 00 00 00 00 ff ff ff ff 00 00 00 00 ................ 60 58 2f f6 03 88 ff ff 00 02 00 00 00 00 00 00 `X/............. backtrace: [] kmemleak_alloc+0x26/0x43 [] __kmalloc+0x121/0x183 [] get_current_resources+0x5a/0xc6 [] pci_acpi_scan_root+0x13c/0x21c [] acpi_pci_root_add+0x1e1/0x421 [] acpi_device_probe+0x50/0x190 [] really_probe+0x99/0x126 [] driver_probe_device+0x3b/0x56 [] __driver_attach+0x5f/0x82 [] bus_for_each_dev+0x5c/0x88 [] driver_attach+0x1e/0x20 [] bus_add_driver+0xca/0x21d [] driver_register+0x91/0xfe [] acpi_bus_register_driver+0x43/0x45 [] acpi_pci_root_init+0x20/0x28 [] do_one_initcall+0x57/0x134 The system has _CRS for root buses, but they are not used because the machine date is before the cutoff date for _CRS usage. Try to free those unused resource arrays and names. Reviewed-by: Bjorn Helgaas Signed-off-by: Yinghai Lu Signed-off-by: Jesse Barnes --- arch/x86/pci/acpi.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index a312e76063a7..c33e0970ee9f 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -282,9 +282,6 @@ static void add_resources(struct pci_root_info *info) int i; struct resource *res, *root, *conflict; - if (!pci_use_crs) - return; - coalesce_windows(info, IORESOURCE_MEM); coalesce_windows(info, IORESOURCE_IO); @@ -336,8 +333,13 @@ get_current_resources(struct acpi_device *device, int busnum, acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource, &info); - add_resources(&info); - return; + if (pci_use_crs) { + add_resources(&info); + + return; + } + + kfree(info.name); name_alloc_fail: kfree(info.res); -- cgit v1.2.3 From 8411371709610c826bf65684f886bfdfb5780ca1 Mon Sep 17 00:00:00 2001 From: Jonathan Nieder Date: Tue, 28 Feb 2012 11:51:10 -0700 Subject: x86/PCI: use host bridge _CRS info on MSI MS-7253 In the spirit of commit 29cf7a30f8a0 ("x86/PCI: use host bridge _CRS info on ASUS M2V-MX SE"), this DMI quirk turns on "pci_use_crs" by default on a board that needs it. This fixes boot failures and oopses introduced in 3e3da00c01d0 ("x86/pci: AMD one chain system to use pci read out res"). The quirk is quite targetted (to a specific board and BIOS version) for two reasons: (1) to emphasize that this method of tackling the problem one quirk at a time is a little insane (2) to give BIOS vendors an opportunity to use simpler tables and allow us to return to generic behavior (whatever that happens to be) with a later BIOS update In other words, I am not at all happy with having quirks like this. But it is even worse for the kernel not to work out of the box on these machines, so... Reference: https://bugzilla.kernel.org/show_bug.cgi?id=42619 Reported-by: Svante Signell Signed-off-by: Jonathan Nieder Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/pci/acpi.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index c33e0970ee9f..7034c081b226 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -60,6 +60,17 @@ static const struct dmi_system_id pci_use_crs_table[] __initconst = { DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."), }, }, + /* https://bugzilla.kernel.org/show_bug.cgi?id=42619 */ + { + .callback = set_use_crs, + .ident = "MSI MS-7253", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "MICRO-STAR INTERNATIONAL CO., LTD"), + DMI_MATCH(DMI_BOARD_NAME, "MS-7253"), + DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies, LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "V1.6"), + }, + }, /* Now for the blacklist.. */ -- cgit v1.2.3 From a97f4f5e524bcd09a85ef0b8821a14d35e69335f Mon Sep 17 00:00:00 2001 From: Jonathan Nieder Date: Tue, 28 Feb 2012 15:31:35 -0600 Subject: x86/PCI: do not tie MSI MS-7253 use_crs quirk to BIOS version Carlos was getting WARNING: at drivers/pci/pci.c:118 pci_ioremap_bar+0x24/0x52() when probing his sound card, and sound did not work. After adding pci=use_crs to the kernel command line, no more trouble. Ok, we can add a quirk. dmidecode output reveals that this is an MSI MS-7253, for which we already have a quirk, but the short-sighted author tied the quirk to a single BIOS version, making it not kick in on Carlos's machine with BIOS V1.2. If a later BIOS update makes it no longer necessary to look at the _CRS info it will still be harmless, so let's stop trying to guess which versions have and don't have accurate _CRS tables. Addresses https://bugtrack.alsa-project.org/alsa-bug/view.php?id=5533 Also see . Reported-by: Carlos Luna Reviewed-by: Bjorn Helgaas Signed-off-by: Jonathan Nieder Signed-off-by: Jesse Barnes --- arch/x86/pci/acpi.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 7034c081b226..49a5cb55429b 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -68,7 +68,6 @@ static const struct dmi_system_id pci_use_crs_table[] __initconst = { DMI_MATCH(DMI_BOARD_VENDOR, "MICRO-STAR INTERNATIONAL CO., LTD"), DMI_MATCH(DMI_BOARD_NAME, "MS-7253"), DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies, LTD"), - DMI_MATCH(DMI_BIOS_VERSION, "V1.6"), }, }, -- cgit v1.2.3 From 1018faa6cf23b256bf25919ef203cd7c129f06f2 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Feb 2012 14:57:32 +0100 Subject: perf/x86/kvm: Fix Host-Only/Guest-Only counting with SVM disabled It turned out that a performance counter on AMD does not count at all when the GO or HO bit is set in the control register and SVM is disabled in EFER. This patch works around this issue by masking out the HO bit in the performance counter control register when SVM is not enabled. The GO bit is not touched because it is only set when the user wants to count in guest-mode only. So when SVM is disabled the counter should not run at all and the not-counting is the intended behaviour. Signed-off-by: Joerg Roedel Signed-off-by: Peter Zijlstra Cc: Avi Kivity Cc: Stephane Eranian Cc: David Ahern Cc: Gleb Natapov Cc: Robert Richter Cc: stable@vger.kernel.org # v3.2 Link: http://lkml.kernel.org/r/1330523852-19566-1-git-send-email-joerg.roedel@amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_event.h | 8 ++++++++ arch/x86/kernel/cpu/perf_event.h | 8 ++++++-- arch/x86/kernel/cpu/perf_event_amd.c | 37 ++++++++++++++++++++++++++++++++++-- arch/x86/kvm/svm.c | 5 +++++ 4 files changed, 54 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 096c975e099f..461ce432b1c2 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -242,4 +242,12 @@ static inline void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) static inline void perf_events_lapic_init(void) { } #endif +#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) + extern void amd_pmu_enable_virt(void); + extern void amd_pmu_disable_virt(void); +#else + static inline void amd_pmu_enable_virt(void) { } + static inline void amd_pmu_disable_virt(void) { } +#endif + #endif /* _ASM_X86_PERF_EVENT_H */ diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 8944062f46e2..c30c807ddc72 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -147,7 +147,9 @@ struct cpu_hw_events { /* * AMD specific bits */ - struct amd_nb *amd_nb; + struct amd_nb *amd_nb; + /* Inverted mask of bits to clear in the perf_ctr ctrl registers */ + u64 perf_ctr_virt_mask; void *kfree_on_online; }; @@ -417,9 +419,11 @@ void x86_pmu_disable_all(void); static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, u64 enable_mask) { + u64 disable_mask = __this_cpu_read(cpu_hw_events.perf_ctr_virt_mask); + if (hwc->extra_reg.reg) wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config); - wrmsrl(hwc->config_base, hwc->config | enable_mask); + wrmsrl(hwc->config_base, (hwc->config | enable_mask) & ~disable_mask); } void x86_pmu_enable_all(int added); diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 0397b23be8e9..67250a52430b 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -357,7 +358,9 @@ static void amd_pmu_cpu_starting(int cpu) struct amd_nb *nb; int i, nb_id; - if (boot_cpu_data.x86_max_cores < 2) + cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY; + + if (boot_cpu_data.x86_max_cores < 2 || boot_cpu_data.x86 == 0x15) return; nb_id = amd_get_nb_id(cpu); @@ -587,9 +590,9 @@ static __initconst const struct x86_pmu amd_pmu_f15h = { .put_event_constraints = amd_put_event_constraints, .cpu_prepare = amd_pmu_cpu_prepare, - .cpu_starting = amd_pmu_cpu_starting, .cpu_dead = amd_pmu_cpu_dead, #endif + .cpu_starting = amd_pmu_cpu_starting, }; __init int amd_pmu_init(void) @@ -621,3 +624,33 @@ __init int amd_pmu_init(void) return 0; } + +void amd_pmu_enable_virt(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + cpuc->perf_ctr_virt_mask = 0; + + /* Reload all events */ + x86_pmu_disable_all(); + x86_pmu_enable_all(0); +} +EXPORT_SYMBOL_GPL(amd_pmu_enable_virt); + +void amd_pmu_disable_virt(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + /* + * We only mask out the Host-only bit so that host-only counting works + * when SVM is disabled. If someone sets up a guest-only counter when + * SVM is disabled the Guest-only bits still gets set and the counter + * will not count anything. + */ + cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY; + + /* Reload all events */ + x86_pmu_disable_all(); + x86_pmu_enable_all(0); +} +EXPORT_SYMBOL_GPL(amd_pmu_disable_virt); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 5fa553babe56..e385214711cb 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -29,6 +29,7 @@ #include #include +#include #include #include #include @@ -575,6 +576,8 @@ static void svm_hardware_disable(void *garbage) wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT); cpu_svm_disable(); + + amd_pmu_disable_virt(); } static int svm_hardware_enable(void *garbage) @@ -622,6 +625,8 @@ static int svm_hardware_enable(void *garbage) svm_init_erratum_383(); + amd_pmu_enable_virt(); + return 0; } -- cgit v1.2.3 From 6414fa6a150111750011f477899d370244da4171 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 5 Mar 2012 06:38:42 +0000 Subject: aout: move setup_arg_pages() prior to reading/mapping the binary Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- arch/x86/ia32/ia32_aout.c | 14 +++++++------- fs/binfmt_aout.c | 14 +++++++------- 2 files changed, 14 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index fd843877e841..39e49091f648 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -315,6 +315,13 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs) current->mm->free_area_cache = TASK_UNMAPPED_BASE; current->mm->cached_hole_size = 0; + retval = setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT); + if (retval < 0) { + /* Someone check-me: is this error path enough? */ + send_sig(SIGKILL, current, 0); + return retval; + } + install_exec_creds(bprm); current->flags &= ~PF_FORKNOEXEC; @@ -410,13 +417,6 @@ beyond_if: set_brk(current->mm->start_brk, current->mm->brk); - retval = setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT); - if (retval < 0) { - /* Someone check-me: is this error path enough? */ - send_sig(SIGKILL, current, 0); - return retval; - } - current->mm->start_stack = (unsigned long)create_aout_tables((char __user *)bprm->p, bprm); /* start thread */ diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index a6395bdb26ae..1ff94054d35a 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -259,6 +259,13 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) current->mm->free_area_cache = current->mm->mmap_base; current->mm->cached_hole_size = 0; + retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT); + if (retval < 0) { + /* Someone check-me: is this error path enough? */ + send_sig(SIGKILL, current, 0); + return retval; + } + install_exec_creds(bprm); current->flags &= ~PF_FORKNOEXEC; @@ -352,13 +359,6 @@ beyond_if: return retval; } - retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT); - if (retval < 0) { - /* Someone check-me: is this error path enough? */ - send_sig(SIGKILL, current, 0); - return retval; - } - current->mm->start_stack = (unsigned long) create_aout_tables((char __user *) bprm->p, bprm); #ifdef __alpha__ -- cgit v1.2.3 From 097d59106a8e4b42d07c9892fdd7790f1659c6ff Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 6 Mar 2012 18:23:36 -0800 Subject: vm: avoid using find_vma_prev() unnecessarily Several users of "find_vma_prev()" were not in fact interested in the previous vma if there was no primary vma to be found either. And in those cases, we're much better off just using the regular "find_vma()", and then "prev" can be looked up by just checking vma->vm_prev. The find_vma_prev() semantics are fairly subtle (see Mikulas' recent commit 83cd904d271b: "mm: fix find_vma_prev"), and the whole "return prev by reference" means that it generates worse code too. Thus this "let's avoid using this inconvenient and clearly too subtle interface when we don't really have to" patch. Cc: Mikulas Patocka Cc: KOSAKI Motohiro Signed-off-by: Linus Torvalds --- arch/x86/mm/hugetlbpage.c | 4 +++- mm/mempolicy.c | 3 ++- mm/mlock.c | 3 ++- mm/mprotect.c | 3 ++- 4 files changed, 9 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index f581a18c0d4d..83e7141c3982 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -333,13 +333,15 @@ try_again: * Lookup failure means no vma is above this address, * i.e. return with success: */ - if (!(vma = find_vma_prev(mm, addr, &prev_vma))) + vma = find_vma(mm, add); + if (!vma) return addr; /* * new region fits between prev_vma->vm_end and * vma->vm_start, use it: */ + prev_vma = vma->vm_prev; if (addr + len <= vma->vm_start && (!prev_vma || (addr >= prev_vma->vm_end))) { /* remember the address as a hint for next time */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 06b145fb64ab..47296fee23db 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -640,10 +640,11 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, unsigned long vmstart; unsigned long vmend; - vma = find_vma_prev(mm, start, &prev); + vma = find_vma(mm, start); if (!vma || vma->vm_start > start) return -EFAULT; + prev = vma->vm_prev; if (start > vma->vm_start) prev = vma; diff --git a/mm/mlock.c b/mm/mlock.c index 4f4f53bdc65d..ef726e8aa8e9 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -385,10 +385,11 @@ static int do_mlock(unsigned long start, size_t len, int on) return -EINVAL; if (end == start) return 0; - vma = find_vma_prev(current->mm, start, &prev); + vma = find_vma(current->mm, start); if (!vma || vma->vm_start > start) return -ENOMEM; + prev = vma->vm_prev; if (start > vma->vm_start) prev = vma; diff --git a/mm/mprotect.c b/mm/mprotect.c index 5a688a2756be..f437d054c3bf 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -262,10 +262,11 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, down_write(¤t->mm->mmap_sem); - vma = find_vma_prev(current->mm, start, &prev); + vma = find_vma(current->mm, start); error = -ENOMEM; if (!vma) goto out; + prev = vma->vm_prev; if (unlikely(grows & PROT_GROWSDOWN)) { if (vma->vm_start >= end) goto out; -- cgit v1.2.3 From 55062d061790b43aee01ab3f9ac57b8596254f19 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 6 Mar 2012 18:48:13 -0800 Subject: x86: fix typo in recent find_vma_prev purge It turns out that test-compiling this file on x86-64 doesn't really help, because much of it is x86-32-specific. And so I hadn't noticed the slightly over-eager removal of the 'r' from 'addr' variable despite thinking I had tested it. Signed-off-by: Linus "oopsie" Torvalds --- arch/x86/mm/hugetlbpage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 83e7141c3982..8ecbb4bba4b3 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -333,7 +333,7 @@ try_again: * Lookup failure means no vma is above this address, * i.e. return with success: */ - vma = find_vma(mm, add); + vma = find_vma(mm, addr); if (!vma) return addr; -- cgit v1.2.3 From a7f4255f906f60f72e00aad2fb000939449ff32e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 9 Mar 2012 20:55:10 +0100 Subject: x86: Derandom delay_tsc for 64 bit Commit f0fbf0abc093 ("x86: integrate delay functions") converted delay_tsc() into a random delay generator for 64 bit. The reason is that it merged the mostly identical versions of delay_32.c and delay_64.c. Though the subtle difference of the result was: static void delay_tsc(unsigned long loops) { - unsigned bclock, now; + unsigned long bclock, now; Now the function uses rdtscl() which returns the lower 32bit of the TSC. On 32bit that's not problematic as unsigned long is 32bit. On 64 bit this fails when the lower 32bit are close to wrap around when bclock is read, because the following check if ((now - bclock) >= loops) break; evaluated to true on 64bit for e.g. bclock = 0xffffffff and now = 0 because the unsigned long (now - bclock) of these values results in 0xffffffff00000001 which is definitely larger than the loops value. That explains Tvortkos observation: "Because I am seeing udelay(500) (_occasionally_) being short, and that by delaying for some duration between 0us (yep) and 491us." Make those variables explicitely u32 again, so this works for both 32 and 64 bit. Reported-by: Tvrtko Ursulin Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org # >= 2.6.27 Signed-off-by: Linus Torvalds --- arch/x86/lib/delay.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index fc45ba887d05..e395693abdb1 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -48,9 +48,9 @@ static void delay_loop(unsigned long loops) } /* TSC based delay: */ -static void delay_tsc(unsigned long loops) +static void delay_tsc(unsigned long __loops) { - unsigned long bclock, now; + u32 bclock, now, loops = __loops; int cpu; preempt_disable(); -- cgit v1.2.3