From b8989db9d82465bf38a48a4d3ef32e7d8afc4d08 Mon Sep 17 00:00:00 2001 From: Alan Date: Mon, 20 Jan 2014 18:01:56 +0000 Subject: x86, doc, kconfig: Fix dud URL for Microcode data The actual data lives in the Intel download center, and that ought to also be a reliable way to continue to find it. Unfortunately the actual URL needed for doing it directly is about a foot long so give instructions. Signed-off-by: Alan Cox Link: http://lkml.kernel.org/r/20140120180056.7173.62222.stgit@alan.etchedpixels.co.uk Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0952ecd60eca..64199bc08d66 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1064,9 +1064,9 @@ config MICROCODE_INTEL This options enables microcode patch loading support for Intel processors. - For latest news and information on obtaining all the required - Intel ingredients for this driver, check: - . + For the current Intel microcode data package go to + and search for + 'Linux Processor Microcode Data File'. config MICROCODE_AMD bool "AMD microcode loading support" -- cgit v1.2.3 From fb53a1ab88d14848dc292842e35c3bda3a665997 Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Thu, 23 Jan 2014 16:13:32 -0600 Subject: x86/quirks: Add workaround for AMD F16h Erratum792 The workaround for this Erratum is included in AGESA. But BIOSes spun only after Jan2014 will have the fix (atleast server versions of the chip). The erratum affects both embedded and server platforms and since we cannot say with certainity that ALL BIOSes on systems out in the field will have the fix, we should probably insulate ourselves in case BIOS does not do the right thing or someone is using old BIOSes. Refer to Revision Guide for AMD F16h models 00h-0fh, document 51810 Rev. 3.04, November2013 for details on the Erratum. Tested the patch on Fam16h server platform and it works fine. Signed-off-by: Aravind Gopalakrishnan Cc: Cc: Cc: Cc: Cc: Link: http://lkml.kernel.org/r/1390515212-1824-1-git-send-email-Aravind.Gopalakrishnan@amd.com [ Minor edits. ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/quirks.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 04ee1e2e4c02..7c6acd4b8995 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -571,3 +571,40 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F5, quirk_amd_nb_node); #endif + +#ifdef CONFIG_PCI +/* + * Processor does not ensure DRAM scrub read/write sequence + * is atomic wrt accesses to CC6 save state area. Therefore + * if a concurrent scrub read/write access is to same address + * the entry may appear as if it is not written. This quirk + * applies to Fam16h models 00h-0Fh + * + * See "Revision Guide" for AMD F16h models 00h-0fh, + * document 51810 rev. 3.04, Nov 2013 + */ +static void amd_disable_seq_and_redirect_scrub(struct pci_dev *dev) +{ + u32 val; + + /* + * Suggested workaround: + * set D18F3x58[4:0] = 00h and set D18F3x5C[0] = 0b + */ + pci_read_config_dword(dev, 0x58, &val); + if (val & 0x1F) { + val &= ~(0x1F); + pci_write_config_dword(dev, 0x58, val); + } + + pci_read_config_dword(dev, 0x5C, &val); + if (val & BIT(0)) { + val &= ~BIT(0); + pci_write_config_dword(dev, 0x5c, val); + } +} + +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3, + amd_disable_seq_and_redirect_scrub); + +#endif -- cgit v1.2.3 From 2993ae3305ad10b41e0d0bc2662f7754ee8e30fa Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 21 Jan 2014 10:22:09 +0300 Subject: x86/AMD/NB: Fix amd_set_subcaches() parameter type This is under CAP_SYS_ADMIN, but Smatch complains that mask comes from the user and the test for "mask > 0xf" can underflow. The fix is simple: amd_set_subcaches() should hand down not an 'int' but an 'unsigned long' like it was originally indended to do. Signed-off-by: Dan Carpenter Acked-by: Borislav Petkov Cc: Borislav Petkov Cc: Daniel J Blueman Link: http://lkml.kernel.org/r/20140121072209.GA22095@elgon.mountain Signed-off-by: Ingo Molnar --- arch/x86/include/asm/amd_nb.h | 2 +- arch/x86/kernel/amd_nb.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index a54ee1d054d9..aaac3b2fb746 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -19,7 +19,7 @@ extern int amd_cache_northbridges(void); extern void amd_flush_garts(void); extern int amd_numa_init(void); extern int amd_get_subcaches(int); -extern int amd_set_subcaches(int, int); +extern int amd_set_subcaches(int, unsigned long); struct amd_l3_cache { unsigned indices; diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 59554dca96ec..dec8de4e1663 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -179,7 +179,7 @@ int amd_get_subcaches(int cpu) return (mask >> (4 * cuid)) & 0xf; } -int amd_set_subcaches(int cpu, int mask) +int amd_set_subcaches(int cpu, unsigned long mask) { static unsigned int reset, ban; struct amd_northbridge *nb = node_to_amd_nb(amd_get_nb_id(cpu)); -- cgit v1.2.3 From ec65993443736a5091b68e80ff1734548944a4b8 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 21 Jan 2014 14:33:16 -0800 Subject: mm, x86: Account for TLB flushes only when debugging Bisection between 3.11 and 3.12 fingered commit 9824cf97 ("mm: vmstats: tlb flush counters") to cause overhead problems. The counters are undeniably useful but how often do we really need to debug TLB flush related issues? It does not justify taking the penalty everywhere so make it a debugging option. Signed-off-by: Mel Gorman Tested-by: Davidlohr Bueso Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Cc: Hugh Dickins Cc: Alex Shi Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-XzxjntugxuwpxXhcrxqqh53b@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/tlbflush.h | 6 +++--- arch/x86/kernel/cpu/mtrr/generic.c | 4 ++-- arch/x86/mm/tlb.c | 14 +++++++------- include/linux/vm_event_item.h | 4 +++- include/linux/vmstat.h | 8 ++++++++ mm/vmstat.c | 4 +++- 6 files changed, 26 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index e6d90babc245..04905bfc508b 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -62,7 +62,7 @@ static inline void __flush_tlb_all(void) static inline void __flush_tlb_one(unsigned long addr) { - count_vm_event(NR_TLB_LOCAL_FLUSH_ONE); + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); __flush_tlb_single(addr); } @@ -93,13 +93,13 @@ static inline void __flush_tlb_one(unsigned long addr) */ static inline void __flush_tlb_up(void) { - count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); __flush_tlb(); } static inline void flush_tlb_all(void) { - count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); __flush_tlb_all(); } diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index ce2d0a2c3e4f..0e25a1bc5ab5 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -683,7 +683,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock) } /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */ - count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); __flush_tlb(); /* Save MTRR state */ @@ -697,7 +697,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock) static void post_set(void) __releases(set_atomicity_lock) { /* Flush TLBs (no need to flush caches - they are disabled) */ - count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); __flush_tlb(); /* Intel (P6) standard MTRRs */ diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index ae699b3bbac8..05446c1cccfe 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -103,7 +103,7 @@ static void flush_tlb_func(void *info) if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm)) return; - count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED); + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { if (f->flush_end == TLB_FLUSH_ALL) local_flush_tlb(); @@ -131,7 +131,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask, info.flush_start = start; info.flush_end = end; - count_vm_event(NR_TLB_REMOTE_FLUSH); + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); if (is_uv_system()) { unsigned int cpu; @@ -151,7 +151,7 @@ void flush_tlb_current_task(void) preempt_disable(); - count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); local_flush_tlb(); if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL); @@ -215,7 +215,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, /* tlb_flushall_shift is on balance point, details in commit log */ if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) { - count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); local_flush_tlb(); } else { if (has_large_page(mm, start, end)) { @@ -224,7 +224,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, } /* flush range by one by one 'invlpg' */ for (addr = start; addr < end; addr += PAGE_SIZE) { - count_vm_event(NR_TLB_LOCAL_FLUSH_ONE); + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); __flush_tlb_single(addr); } @@ -262,7 +262,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start) static void do_flush_tlb_all(void *info) { - count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED); + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); __flush_tlb_all(); if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) leave_mm(smp_processor_id()); @@ -270,7 +270,7 @@ static void do_flush_tlb_all(void *info) void flush_tlb_all(void) { - count_vm_event(NR_TLB_REMOTE_FLUSH); + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); on_each_cpu(do_flush_tlb_all, NULL, 1); } diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index c557c6d096de..3a712e2e7d76 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -71,12 +71,14 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, THP_ZERO_PAGE_ALLOC, THP_ZERO_PAGE_ALLOC_FAILED, #endif +#ifdef CONFIG_DEBUG_TLBFLUSH #ifdef CONFIG_SMP NR_TLB_REMOTE_FLUSH, /* cpu tried to flush others' tlbs */ NR_TLB_REMOTE_FLUSH_RECEIVED,/* cpu received ipi for flush */ -#endif +#endif /* CONFIG_SMP */ NR_TLB_LOCAL_FLUSH_ALL, NR_TLB_LOCAL_FLUSH_ONE, +#endif /* CONFIG_DEBUG_TLBFLUSH */ NR_VM_EVENT_ITEMS }; diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index e4b948080d20..80ebba9c2e87 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -83,6 +83,14 @@ static inline void vm_events_fold_cpu(int cpu) #define count_vm_numa_events(x, y) do { (void)(y); } while (0) #endif /* CONFIG_NUMA_BALANCING */ +#ifdef CONFIG_DEBUG_TLBFLUSH +#define count_vm_tlb_event(x) count_vm_event(x) +#define count_vm_tlb_events(x, y) count_vm_events(x, y) +#else +#define count_vm_tlb_event(x) do {} while (0) +#define count_vm_tlb_events(x, y) do { (void)(y); } while (0) +#endif + #define __count_zone_vm_events(item, zone, delta) \ __count_vm_events(item##_NORMAL - ZONE_NORMAL + \ zone_idx(zone), delta) diff --git a/mm/vmstat.c b/mm/vmstat.c index 72496140ac08..def5dd2fbe61 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -851,12 +851,14 @@ const char * const vmstat_text[] = { "thp_zero_page_alloc", "thp_zero_page_alloc_failed", #endif +#ifdef CONFIG_DEBUG_TLBFLUSH #ifdef CONFIG_SMP "nr_tlb_remote_flush", "nr_tlb_remote_flush_received", -#endif +#endif /* CONFIG_SMP */ "nr_tlb_local_flush_all", "nr_tlb_local_flush_one", +#endif /* CONFIG_DEBUG_TLBFLUSH */ #endif /* CONFIG_VM_EVENTS_COUNTERS */ }; -- cgit v1.2.3 From 15aa368255f249df0b2af630c9487bb5471bd7da Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 21 Jan 2014 14:33:18 -0800 Subject: x86/mm: Clean up inconsistencies when flushing TLB ranges NR_TLB_LOCAL_FLUSH_ALL is not always accounted for correctly and the comparison with total_vm is done before taking tlb_flushall_shift into account. Clean it up. Signed-off-by: Mel Gorman Tested-by: Davidlohr Bueso Reviewed-by: Alex Shi Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Hugh Dickins Link: http://lkml.kernel.org/n/tip-Iz5gcahrgskIldvukulzi0hh@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/tlb.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 05446c1cccfe..5176526ddd59 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -189,6 +189,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, { unsigned long addr; unsigned act_entries, tlb_entries = 0; + unsigned long nr_base_pages; preempt_disable(); if (current->active_mm != mm) @@ -210,18 +211,17 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, tlb_entries = tlb_lli_4k[ENTRIES]; else tlb_entries = tlb_lld_4k[ENTRIES]; + /* Assume all of TLB entries was occupied by this task */ - act_entries = mm->total_vm > tlb_entries ? tlb_entries : mm->total_vm; + act_entries = tlb_entries >> tlb_flushall_shift; + act_entries = mm->total_vm > act_entries ? act_entries : mm->total_vm; + nr_base_pages = (end - start) >> PAGE_SHIFT; /* tlb_flushall_shift is on balance point, details in commit log */ - if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) { + if (nr_base_pages > act_entries || has_large_page(mm, start, end)) { count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); local_flush_tlb(); } else { - if (has_large_page(mm, start, end)) { - local_flush_tlb(); - goto flush_all; - } /* flush range by one by one 'invlpg' */ for (addr = start; addr < end; addr += PAGE_SIZE) { count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); -- cgit v1.2.3 From 71b54f8263860a37dd9f50f81880a9d681fd9c10 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 21 Jan 2014 14:33:19 -0800 Subject: x86/mm: Eliminate redundant page table walk during TLB range flushing When choosing between doing an address space or ranged flush, the x86 implementation of flush_tlb_mm_range takes into account whether there are any large pages in the range. A per-page flush typically requires fewer entries than would covered by a single large page and the check is redundant. There is one potential exception. THP migration flushes single THP entries and it conceivably would benefit from flushing a single entry instead of the mm. However, this flush is after a THP allocation, copy and page table update potentially with any other threads serialised behind it. In comparison to that, the flush is noise. It makes more sense to optimise balancing to require fewer flushes than to optimise the flush itself. This patch deletes the redundant huge page check. Signed-off-by: Mel Gorman Tested-by: Davidlohr Bueso Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Cc: Alex Shi Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-sgei1drpOcburujPsfh6ovmo@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/tlb.c | 28 +--------------------------- 1 file changed, 1 insertion(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 5176526ddd59..dd8dda167a24 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -158,32 +158,6 @@ void flush_tlb_current_task(void) preempt_enable(); } -/* - * It can find out the THP large page, or - * HUGETLB page in tlb_flush when THP disabled - */ -static inline unsigned long has_large_page(struct mm_struct *mm, - unsigned long start, unsigned long end) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - unsigned long addr = ALIGN(start, HPAGE_SIZE); - for (; addr < end; addr += HPAGE_SIZE) { - pgd = pgd_offset(mm, addr); - if (likely(!pgd_none(*pgd))) { - pud = pud_offset(pgd, addr); - if (likely(!pud_none(*pud))) { - pmd = pmd_offset(pud, addr); - if (likely(!pmd_none(*pmd))) - if (pmd_large(*pmd)) - return addr; - } - } - } - return 0; -} - void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, unsigned long end, unsigned long vmflag) { @@ -218,7 +192,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, nr_base_pages = (end - start) >> PAGE_SHIFT; /* tlb_flushall_shift is on balance point, details in commit log */ - if (nr_base_pages > act_entries || has_large_page(mm, start, end)) { + if (nr_base_pages > act_entries) { count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); local_flush_tlb(); } else { -- cgit v1.2.3 From f98b7a772ab51b52ca4d2a14362fc0e0c8a2e0f3 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 21 Jan 2014 14:33:21 -0800 Subject: x86: mm: change tlb_flushall_shift for IvyBridge There was a large performance regression that was bisected to commit 611ae8e3 ("x86/tlb: enable tlb flush range support for x86"). This patch simply changes the default balance point between a local and global flush for IvyBridge. In the interest of allowing the tests to be reproduced, this patch was tested using mmtests 0.15 with the following configurations configs/config-global-dhp__tlbflush-performance configs/config-global-dhp__scheduler-performance configs/config-global-dhp__network-performance Results are from two machines Ivybridge 4 threads: Intel(R) Core(TM) i3-3240 CPU @ 3.40GHz Ivybridge 8 threads: Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz Page fault microbenchmark showed nothing interesting. Ebizzy was configured to run multiple iterations and threads. Thread counts ranged from 1 to NR_CPUS*2. For each thread count, it ran 100 iterations and each iteration lasted 10 seconds. Ivybridge 4 threads 3.13.0-rc7 3.13.0-rc7 vanilla altshift-v3 Mean 1 6395.44 ( 0.00%) 6789.09 ( 6.16%) Mean 2 7012.85 ( 0.00%) 8052.16 ( 14.82%) Mean 3 6403.04 ( 0.00%) 6973.74 ( 8.91%) Mean 4 6135.32 ( 0.00%) 6582.33 ( 7.29%) Mean 5 6095.69 ( 0.00%) 6526.68 ( 7.07%) Mean 6 6114.33 ( 0.00%) 6416.64 ( 4.94%) Mean 7 6085.10 ( 0.00%) 6448.51 ( 5.97%) Mean 8 6120.62 ( 0.00%) 6462.97 ( 5.59%) Ivybridge 8 threads 3.13.0-rc7 3.13.0-rc7 vanilla altshift-v3 Mean 1 7336.65 ( 0.00%) 7787.02 ( 6.14%) Mean 2 8218.41 ( 0.00%) 9484.13 ( 15.40%) Mean 3 7973.62 ( 0.00%) 8922.01 ( 11.89%) Mean 4 7798.33 ( 0.00%) 8567.03 ( 9.86%) Mean 5 7158.72 ( 0.00%) 8214.23 ( 14.74%) Mean 6 6852.27 ( 0.00%) 7952.45 ( 16.06%) Mean 7 6774.65 ( 0.00%) 7536.35 ( 11.24%) Mean 8 6510.50 ( 0.00%) 6894.05 ( 5.89%) Mean 12 6182.90 ( 0.00%) 6661.29 ( 7.74%) Mean 16 6100.09 ( 0.00%) 6608.69 ( 8.34%) Ebizzy hits the worst case scenario for TLB range flushing every time and it shows for these Ivybridge CPUs at least that the default choice is a poor on. The patch addresses the problem. Next was a tlbflush microbenchmark written by Alex Shi at http://marc.info/?l=linux-kernel&m=133727348217113 . It measures access costs while the TLB is being flushed. The expectation is that if there are always full TLB flushes that the benchmark would suffer and it benefits from range flushing There are 320 iterations of the test per thread count. The number of entries is randomly selected with a min of 1 and max of 512. To ensure a reasonably even spread of entries, the full range is broken up into 8 sections and a random number selected within that section. iteration 1, random number between 0-64 iteration 2, random number between 64-128 etc This is still a very weak methodology. When you do not know what are typical ranges, random is a reasonable choice but it can be easily argued that the opimisation was for smaller ranges and an even spread is not representative of any workload that matters. To improve this, we'd need to know the probability distribution of TLB flush range sizes for a set of workloads that are considered "common", build a synthetic trace and feed that into this benchmark. Even that is not perfect because it would not account for the time between flushes but there are limits of what can be reasonably done and still be doing something useful. If a representative synthetic trace is provided then this benchmark could be revisited and the shift values retuned. Ivybridge 4 threads 3.13.0-rc7 3.13.0-rc7 vanilla altshift-v3 Mean 1 10.50 ( 0.00%) 10.50 ( 0.03%) Mean 2 17.59 ( 0.00%) 17.18 ( 2.34%) Mean 3 22.98 ( 0.00%) 21.74 ( 5.41%) Mean 5 47.13 ( 0.00%) 46.23 ( 1.92%) Mean 8 43.30 ( 0.00%) 42.56 ( 1.72%) Ivybridge 8 threads 3.13.0-rc7 3.13.0-rc7 vanilla altshift-v3 Mean 1 9.45 ( 0.00%) 9.36 ( 0.93%) Mean 2 9.37 ( 0.00%) 9.70 ( -3.54%) Mean 3 9.36 ( 0.00%) 9.29 ( 0.70%) Mean 5 14.49 ( 0.00%) 15.04 ( -3.75%) Mean 8 41.08 ( 0.00%) 38.73 ( 5.71%) Mean 13 32.04 ( 0.00%) 31.24 ( 2.49%) Mean 16 40.05 ( 0.00%) 39.04 ( 2.51%) For both CPUs, average access time is reduced which is good as this is the benchmark that was used to tune the shift values in the first place albeit it is now known *how* the benchmark was used. The scheduler benchmarks were somewhat inconclusive. They showed gains and losses and makes me reconsider how stable those benchmarks really are or if something else might be interfering with the test results recently. Network benchmarks were inconclusive. Almost all results were flat except for netperf-udp tests on the 4 thread machine. These results were unstable and showed large variations between reboots. It is unknown if this is a recent problems but I've noticed before that netperf-udp results tend to vary. Based on these results, changing the default for Ivybridge seems like a logical choice. Signed-off-by: Mel Gorman Tested-by: Davidlohr Bueso Reviewed-by: Alex Shi Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-cqnadffh1tiqrshthRj3Esge@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index ea04b342c026..bbe1b8b1f1c4 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -628,7 +628,7 @@ static void intel_tlb_flushall_shift_set(struct cpuinfo_x86 *c) tlb_flushall_shift = 5; break; case 0x63a: /* Ivybridge */ - tlb_flushall_shift = 1; + tlb_flushall_shift = 2; break; default: tlb_flushall_shift = 6; -- cgit v1.2.3 From b9a3b4c976c1209957326537ad5c0bb633dfd764 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 21 Jan 2014 14:33:22 -0800 Subject: mm, x86: Revisit tlb_flushall_shift tuning for page flushes except on IvyBridge There was a large ebizzy performance regression that was bisected to commit 611ae8e3 (x86/tlb: enable tlb flush range support for x86). The problem was related to the tlb_flushall_shift tuning for IvyBridge which was altered. The problem is that it is not clear if the tuning values for each CPU family is correct as the methodology used to tune the values is unclear. This patch uses a conservative tlb_flushall_shift value for all CPU families except IvyBridge so the decision can be revisited if any regression is found as a result of this change. IvyBridge is an exception as testing with one methodology determined that the value of 2 is acceptable. Details are in the changelog for the patch "x86: mm: Change tlb_flushall_shift for IvyBridge". One important aspect of this to watch out for is Xen. The original commit log mentioned large performance gains on Xen. It's possible Xen is more sensitive to this value if it flushes small ranges of pages more frequently than workloads on bare metal typically do. Signed-off-by: Mel Gorman Tested-by: Davidlohr Bueso Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Cc: Alex Shi Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-dyzMww3fqugnhbhgo6Gxmtkw@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 5 +---- arch/x86/kernel/cpu/intel.c | 10 +++------- 2 files changed, 4 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 59bfebc8c805..96abccaada33 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -768,10 +768,7 @@ static unsigned int amd_size_cache(struct cpuinfo_x86 *c, unsigned int size) static void cpu_set_tlb_flushall_shift(struct cpuinfo_x86 *c) { - tlb_flushall_shift = 5; - - if (c->x86 <= 0x11) - tlb_flushall_shift = 4; + tlb_flushall_shift = 6; } static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index bbe1b8b1f1c4..d358a3928b8f 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -615,21 +615,17 @@ static void intel_tlb_flushall_shift_set(struct cpuinfo_x86 *c) case 0x61d: /* six-core 45 nm xeon "Dunnington" */ tlb_flushall_shift = -1; break; + case 0x63a: /* Ivybridge */ + tlb_flushall_shift = 2; + break; case 0x61a: /* 45 nm nehalem, "Bloomfield" */ case 0x61e: /* 45 nm nehalem, "Lynnfield" */ case 0x625: /* 32 nm nehalem, "Clarkdale" */ case 0x62c: /* 32 nm nehalem, "Gulftown" */ case 0x62e: /* 45 nm nehalem-ex, "Beckton" */ case 0x62f: /* 32 nm Xeon E7 */ - tlb_flushall_shift = 6; - break; case 0x62a: /* SandyBridge */ case 0x62d: /* SandyBridge, "Romely-EP" */ - tlb_flushall_shift = 5; - break; - case 0x63a: /* Ivybridge */ - tlb_flushall_shift = 2; - break; default: tlb_flushall_shift = 6; } -- cgit v1.2.3 From a85eba8814631d0d48361c8b9a7ee0984e80c03c Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 21 Jan 2014 14:33:15 -0800 Subject: arch/x86/mm/srat: Skip NUMA_NO_NODE while parsing SLIT When ACPI SLIT table has an I/O locality (i.e. a locality unique to an I/O device), numa_set_distance() emits this warning message: NUMA: Warning: node ids are out of bound, from=-1 to=-1 distance=10 acpi_numa_slit_init() calls numa_set_distance() with pxm_to_node(), which assumes that all localities have been parsed with SRAT previously. SRAT does not list I/O localities, where as SLIT lists all localities including I/Os. Hence, pxm_to_node() returns NUMA_NO_NODE (-1) for an I/O locality. I/O localities are not supported and are ignored today, but emitting such warning message leads to unnecessary confusion. Change acpi_numa_slit_init() to avoid calling numa_set_distance() with NUMA_NO_NODE. Signed-off-by: Toshi Kani Acked-by: David Rientjes Signed-off-by: Andrew Morton Cc: Yinghai Lu Link: http://lkml.kernel.org/n/tip-dSvpjjvp8aMzs1ybkftxohlh@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/srat.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index 266ca912f62e..5ecf65117e6f 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -42,15 +42,25 @@ static __init inline int srat_disabled(void) return acpi_numa < 0; } -/* Callback for SLIT parsing */ +/* + * Callback for SLIT parsing. pxm_to_node() returns NUMA_NO_NODE for + * I/O localities since SRAT does not list them. I/O localities are + * not supported at this point. + */ void __init acpi_numa_slit_init(struct acpi_table_slit *slit) { int i, j; - for (i = 0; i < slit->locality_count; i++) - for (j = 0; j < slit->locality_count; j++) + for (i = 0; i < slit->locality_count; i++) { + if (pxm_to_node(i) == NUMA_NO_NODE) + continue; + for (j = 0; j < slit->locality_count; j++) { + if (pxm_to_node(j) == NUMA_NO_NODE) + continue; numa_set_distance(pxm_to_node(i), pxm_to_node(j), slit->entry[slit->locality_count * i + j]); + } + } } /* Callback for Proximity Domain -> x2APIC mapping */ -- cgit v1.2.3 From edc6bc784028f2ef80dff0ff697363ff5a06e3a3 Mon Sep 17 00:00:00 2001 From: David Cohen Date: Tue, 21 Jan 2014 10:41:39 -0800 Subject: x86/intel/mid: Fix X86_INTEL_MID dependencies This patch fixes the following warning: warning: (X86_INTEL_MID) selects INTEL_SCU_IPC which has unmet direct dependencies (X86 && X86_PLATFORM_DEVICES && X86_INTEL_MID) It happens because when selected, X86_INTEL_MID tries to select INTEL_SCU_IPC regardless all its dependencies are met or not. This patch fixes it by adding the missing X86_PLATFORM_DEVICES dependency to X86_INTEL_MID. Reported-by: kbuild test robot Signed-off-by: David Cohen Link: http://lkml.kernel.org/r/1390329699-20782-1-git-send-email-david.a.cohen@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cb9af474dfca..3b6922ebf170 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -443,6 +443,7 @@ config X86_INTEL_MID bool "Intel MID platform support" depends on X86_32 depends on X86_EXTENDED_PLATFORM + depends on X86_PLATFORM_DEVICES depends on PCI depends on PCI_GOANY depends on X86_IO_APIC -- cgit v1.2.3 From 39424e89d64661faa0a2e00c5ad1e6dbeebfa972 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Tue, 28 Jan 2014 08:22:11 -0500 Subject: x86, cpu hotplug: Fix stack frame warning in check_irq_vectors_for_cpu_disable() Further discussion here: http://marc.info/?l=linux-kernel&m=139073901101034&w=2 kbuild, 0day kernel build service, outputs the warning: arch/x86/kernel/irq.c:333:1: warning: the frame size of 2056 bytes is larger than 2048 bytes [-Wframe-larger-than=] because check_irq_vectors_for_cpu_disable() allocates two cpumasks on the stack. Fix this by moving the two cpumasks to a global file context. Reported-by: Fengguang Wu Tested-by: David Rientjes Signed-off-by: Prarit Bhargava Link: http://lkml.kernel.org/r/1390915331-27375-1-git-send-email-prarit@redhat.com Cc: Andi Kleen Cc: Michel Lespinasse Cc: Seiji Aguchi Cc: Yang Zhang Cc: Paul Gortmaker Cc: Janet Morgan Cc: Tony Luck Cc: Ruiv Wang Cc: Gong Chen Cc: Yinghai Lu Signed-off-by: H. Peter Anvin --- arch/x86/kernel/irq.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index dbb60878b744..d99f31d9a750 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -266,6 +266,14 @@ __visible void smp_trace_x86_platform_ipi(struct pt_regs *regs) EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); #ifdef CONFIG_HOTPLUG_CPU + +/* These two declarations are only used in check_irq_vectors_for_cpu_disable() + * below, which is protected by stop_machine(). Putting them on the stack + * results in a stack frame overflow. Dynamically allocating could result in a + * failure so declare these two cpumasks as global. + */ +static struct cpumask affinity_new, online_new; + /* * This cpu is going to be removed and its vectors migrated to the remaining * online cpus. Check to see if there are enough vectors in the remaining cpus. @@ -277,7 +285,6 @@ int check_irq_vectors_for_cpu_disable(void) unsigned int this_cpu, vector, this_count, count; struct irq_desc *desc; struct irq_data *data; - struct cpumask affinity_new, online_new; this_cpu = smp_processor_id(); cpumask_copy(&online_new, cpu_online_mask); -- cgit v1.2.3 From 85fc73a2cdf10cf42bc36fb3bca3896b2095a1c2 Mon Sep 17 00:00:00 2001 From: Petr Tesarik Date: Sat, 1 Feb 2014 13:30:19 +0100 Subject: x86: Fix the initialization of physnode_map With DISCONTIGMEM, the mapping between a pfn and its owning node is initialized using data provided by the BIOS. However, the initialization may fail if the extents are not aligned to section boundary (64M). The symptom of this bug is an early boot failure in pfn_to_page(), as it tries to access NODE_DATA(__nid) using index from an unitialized element of the physnode_map[] array. While the bug is always present, it is more likely to be hit in kdump kernels on large machines, because: 1. The memory map for a kdump kernel is specified as exactmap, and exactmap is more likely to be unaligned. 2. Large reservations are more likely to span across a 64M boundary. [ hpa: fixed incorrect use of "pfn" instead of "start" ] Signed-off-by: Petr Tesarik Link: http://lkml.kernel.org/r/20140201133019.32e56f86@hananiah.suse.cz Acked-by: David Rientjes Signed-off-by: H. Peter Anvin --- arch/x86/mm/numa_32.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 0342d27ca798..47b6436e41c2 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -52,6 +52,8 @@ void memory_present(int nid, unsigned long start, unsigned long end) nid, start, end); printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid); printk(KERN_DEBUG " "); + start = round_down(start, PAGES_PER_SECTION); + end = round_up(end, PAGES_PER_SECTION); for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) { physnode_map[pfn / PAGES_PER_SECTION] = nid; printk(KERN_CONT "%lx ", pfn); -- cgit v1.2.3 From e85fc9805591a17ca8af50023ee8e2b61d9a123b Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Mon, 3 Feb 2014 06:43:59 -0500 Subject: Revert "xen/grant-table: Avoid m2p_override during mapping" This reverts commit 08ece5bb2312b4510b161a6ef6682f37f4eac8a1. As it breaks ARM builds and needs more attention on the ARM side. Acked-by: David Vrabel Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/include/asm/xen/page.h | 5 +-- arch/x86/xen/p2m.c | 17 ++++++- drivers/block/xen-blkback/blkback.c | 15 ++++--- drivers/xen/gntdev.c | 13 +++--- drivers/xen/grant-table.c | 89 ++++++------------------------------- include/xen/grant_table.h | 8 +--- 6 files changed, 46 insertions(+), 101 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 787e1bb5aafc..3e276eb23d1b 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -52,8 +52,7 @@ extern unsigned long set_phys_range_identity(unsigned long pfn_s, extern int m2p_add_override(unsigned long mfn, struct page *page, struct gnttab_map_grant_ref *kmap_op); extern int m2p_remove_override(struct page *page, - struct gnttab_map_grant_ref *kmap_op, - unsigned long mfn); + struct gnttab_map_grant_ref *kmap_op); extern struct page *m2p_find_override(unsigned long mfn); extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn); @@ -122,7 +121,7 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn) pfn = m2p_find_override_pfn(mfn, ~0); } - /* + /* * pfn is ~0 if there are no entries in the m2p for mfn or if the * entry doesn't map back to the mfn and m2p_override doesn't have a * valid entry for it. diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 8009acbe41e4..696c694986d0 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -899,6 +899,13 @@ int m2p_add_override(unsigned long mfn, struct page *page, "m2p_add_override: pfn %lx not mapped", pfn)) return -EINVAL; } + WARN_ON(PagePrivate(page)); + SetPagePrivate(page); + set_page_private(page, mfn); + page->index = pfn_to_mfn(pfn); + + if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) + return -ENOMEM; if (kmap_op != NULL) { if (!PageHighMem(page)) { @@ -937,16 +944,19 @@ int m2p_add_override(unsigned long mfn, struct page *page, } EXPORT_SYMBOL_GPL(m2p_add_override); int m2p_remove_override(struct page *page, - struct gnttab_map_grant_ref *kmap_op, - unsigned long mfn) + struct gnttab_map_grant_ref *kmap_op) { unsigned long flags; + unsigned long mfn; unsigned long pfn; unsigned long uninitialized_var(address); unsigned level; pte_t *ptep = NULL; pfn = page_to_pfn(page); + mfn = get_phys_to_machine(pfn); + if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT)) + return -EINVAL; if (!PageHighMem(page)) { address = (unsigned long)__va(pfn << PAGE_SHIFT); @@ -960,7 +970,10 @@ int m2p_remove_override(struct page *page, spin_lock_irqsave(&m2p_override_lock, flags); list_del(&page->lru); spin_unlock_irqrestore(&m2p_override_lock, flags); + WARN_ON(!PagePrivate(page)); + ClearPagePrivate(page); + set_phys_to_machine(pfn, page->index); if (kmap_op != NULL) { if (!PageHighMem(page)) { struct multicall_space mcs; diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 875025f299b6..6620b73d0490 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -285,7 +285,8 @@ static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root, if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST || !rb_next(&persistent_gnt->node)) { - ret = gnttab_unmap_refs(unmap, pages, segs_to_unmap); + ret = gnttab_unmap_refs(unmap, NULL, pages, + segs_to_unmap); BUG_ON(ret); put_free_pages(blkif, pages, segs_to_unmap); segs_to_unmap = 0; @@ -320,7 +321,8 @@ static void unmap_purged_grants(struct work_struct *work) pages[segs_to_unmap] = persistent_gnt->page; if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST) { - ret = gnttab_unmap_refs(unmap, pages, segs_to_unmap); + ret = gnttab_unmap_refs(unmap, NULL, pages, + segs_to_unmap); BUG_ON(ret); put_free_pages(blkif, pages, segs_to_unmap); segs_to_unmap = 0; @@ -328,7 +330,7 @@ static void unmap_purged_grants(struct work_struct *work) kfree(persistent_gnt); } if (segs_to_unmap > 0) { - ret = gnttab_unmap_refs(unmap, pages, segs_to_unmap); + ret = gnttab_unmap_refs(unmap, NULL, pages, segs_to_unmap); BUG_ON(ret); put_free_pages(blkif, pages, segs_to_unmap); } @@ -668,14 +670,15 @@ static void xen_blkbk_unmap(struct xen_blkif *blkif, GNTMAP_host_map, pages[i]->handle); pages[i]->handle = BLKBACK_INVALID_HANDLE; if (++invcount == BLKIF_MAX_SEGMENTS_PER_REQUEST) { - ret = gnttab_unmap_refs(unmap, unmap_pages, invcount); + ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, + invcount); BUG_ON(ret); put_free_pages(blkif, unmap_pages, invcount); invcount = 0; } } if (invcount) { - ret = gnttab_unmap_refs(unmap, unmap_pages, invcount); + ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, invcount); BUG_ON(ret); put_free_pages(blkif, unmap_pages, invcount); } @@ -737,7 +740,7 @@ again: } if (segs_to_map) { - ret = gnttab_map_refs(map, pages_to_gnt, segs_to_map); + ret = gnttab_map_refs(map, NULL, pages_to_gnt, segs_to_map); BUG_ON(ret); } diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 34a2704fbc88..073b4a19a8b0 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -284,10 +284,8 @@ static int map_grant_pages(struct grant_map *map) } pr_debug("map %d+%d\n", map->index, map->count); - err = gnttab_map_refs_userspace(map->map_ops, - use_ptemod ? map->kmap_ops : NULL, - map->pages, - map->count); + err = gnttab_map_refs(map->map_ops, use_ptemod ? map->kmap_ops : NULL, + map->pages, map->count); if (err) return err; @@ -317,10 +315,9 @@ static int __unmap_grant_pages(struct grant_map *map, int offset, int pages) } } - err = gnttab_unmap_refs_userspace(map->unmap_ops + offset, - use_ptemod ? map->kmap_ops + offset : NULL, - map->pages + offset, - pages); + err = gnttab_unmap_refs(map->unmap_ops + offset, + use_ptemod ? map->kmap_ops + offset : NULL, map->pages + offset, + pages); if (err) return err; diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 8ee13e2e45e2..b84e3ab839aa 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -928,17 +928,15 @@ void gnttab_batch_copy(struct gnttab_copy *batch, unsigned count) } EXPORT_SYMBOL_GPL(gnttab_batch_copy); -int __gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, +int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, struct gnttab_map_grant_ref *kmap_ops, - struct page **pages, unsigned int count, - bool m2p_override) + struct page **pages, unsigned int count) { int i, ret; bool lazy = false; pte_t *pte; - unsigned long mfn, pfn; + unsigned long mfn; - BUG_ON(kmap_ops && !m2p_override); ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map_ops, count); if (ret) return ret; @@ -957,12 +955,10 @@ int __gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, set_phys_to_machine(map_ops[i].host_addr >> PAGE_SHIFT, map_ops[i].dev_bus_addr >> PAGE_SHIFT); } - return 0; + return ret; } - if (m2p_override && - !in_interrupt() && - paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) { + if (!in_interrupt() && paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) { arch_enter_lazy_mmu_mode(); lazy = true; } @@ -979,20 +975,8 @@ int __gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, } else { mfn = PFN_DOWN(map_ops[i].dev_bus_addr); } - pfn = page_to_pfn(pages[i]); - - WARN_ON(PagePrivate(pages[i])); - SetPagePrivate(pages[i]); - set_page_private(pages[i], mfn); - - pages[i]->index = pfn_to_mfn(pfn); - if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) { - ret = -ENOMEM; - goto out; - } - if (m2p_override) - ret = m2p_add_override(mfn, pages[i], kmap_ops ? - &kmap_ops[i] : NULL); + ret = m2p_add_override(mfn, pages[i], kmap_ops ? + &kmap_ops[i] : NULL); if (ret) goto out; } @@ -1003,32 +987,15 @@ int __gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, return ret; } - -int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, - struct page **pages, unsigned int count) -{ - return __gnttab_map_refs(map_ops, NULL, pages, count, false); -} EXPORT_SYMBOL_GPL(gnttab_map_refs); -int gnttab_map_refs_userspace(struct gnttab_map_grant_ref *map_ops, - struct gnttab_map_grant_ref *kmap_ops, - struct page **pages, unsigned int count) -{ - return __gnttab_map_refs(map_ops, kmap_ops, pages, count, true); -} -EXPORT_SYMBOL_GPL(gnttab_map_refs_userspace); - -int __gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops, +int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops, struct gnttab_map_grant_ref *kmap_ops, - struct page **pages, unsigned int count, - bool m2p_override) + struct page **pages, unsigned int count) { int i, ret; bool lazy = false; - unsigned long pfn, mfn; - BUG_ON(kmap_ops && !m2p_override); ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, unmap_ops, count); if (ret) return ret; @@ -1039,33 +1006,17 @@ int __gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops, set_phys_to_machine(unmap_ops[i].host_addr >> PAGE_SHIFT, INVALID_P2M_ENTRY); } - return 0; + return ret; } - if (m2p_override && - !in_interrupt() && - paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) { + if (!in_interrupt() && paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) { arch_enter_lazy_mmu_mode(); lazy = true; } for (i = 0; i < count; i++) { - pfn = page_to_pfn(pages[i]); - mfn = get_phys_to_machine(pfn); - if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT)) { - ret = -EINVAL; - goto out; - } - - set_page_private(pages[i], INVALID_P2M_ENTRY); - WARN_ON(!PagePrivate(pages[i])); - ClearPagePrivate(pages[i]); - set_phys_to_machine(pfn, pages[i]->index); - if (m2p_override) - ret = m2p_remove_override(pages[i], - kmap_ops ? - &kmap_ops[i] : NULL, - mfn); + ret = m2p_remove_override(pages[i], kmap_ops ? + &kmap_ops[i] : NULL); if (ret) goto out; } @@ -1076,22 +1027,8 @@ int __gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops, return ret; } - -int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *map_ops, - struct page **pages, unsigned int count) -{ - return __gnttab_unmap_refs(map_ops, NULL, pages, count, false); -} EXPORT_SYMBOL_GPL(gnttab_unmap_refs); -int gnttab_unmap_refs_userspace(struct gnttab_unmap_grant_ref *map_ops, - struct gnttab_map_grant_ref *kmap_ops, - struct page **pages, unsigned int count) -{ - return __gnttab_unmap_refs(map_ops, kmap_ops, pages, count, true); -} -EXPORT_SYMBOL_GPL(gnttab_unmap_refs_userspace); - static unsigned nr_status_frames(unsigned nr_grant_frames) { BUG_ON(grefs_per_grant_frame == 0); diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h index 7ad033dbc845..a5af2a26d94f 100644 --- a/include/xen/grant_table.h +++ b/include/xen/grant_table.h @@ -191,15 +191,11 @@ void gnttab_free_auto_xlat_frames(void); #define gnttab_map_vaddr(map) ((void *)(map.host_virt_addr)) int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, + struct gnttab_map_grant_ref *kmap_ops, struct page **pages, unsigned int count); -int gnttab_map_refs_userspace(struct gnttab_map_grant_ref *map_ops, - struct gnttab_map_grant_ref *kmap_ops, - struct page **pages, unsigned int count); int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops, + struct gnttab_map_grant_ref *kunmap_ops, struct page **pages, unsigned int count); -int gnttab_unmap_refs_userspace(struct gnttab_unmap_grant_ref *unmap_ops, - struct gnttab_map_grant_ref *kunmap_ops, - struct page **pages, unsigned int count); /* Perform a batch of grant map/copy operations. Retry every batch slot * for which the hypervisor returns GNTST_eagain. This is typically due -- cgit v1.2.3 From afca50132cfa7bfc5646676d8c67dc18454f38f8 Mon Sep 17 00:00:00 2001 From: Mukesh Rathor Date: Wed, 29 Jan 2014 16:15:18 -0800 Subject: xen/pvh: set CR4 flags for APs During bootup in the 'probe_page_size_mask' these CR4 flags are set in there. But for AP processors they are not set as we do not use 'secondary_startup_64' which the baremetal kernels uses. Instead do it in this function which we use in Xen PVH during our startup for AP processors. As such fix it up to make sure we have that flag set. Signed-off-by: Mukesh Rathor Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index a4d7b647867f..201d09a7c46b 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1473,6 +1473,18 @@ static void xen_pvh_set_cr_flags(int cpu) * X86_CR0_TS, X86_CR0_PE, X86_CR0_ET are set by Xen for HVM guests * (which PVH shared codepaths), while X86_CR0_PG is for PVH. */ write_cr0(read_cr0() | X86_CR0_MP | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM); + + if (!cpu) + return; + /* + * For BSP, PSE PGE are set in probe_page_size_mask(), for APs + * set them here. For all, OSFXSR OSXMMEXCPT are set in fpu_init. + */ + if (cpu_has_pse) + set_in_cr4(X86_CR4_PSE); + + if (cpu_has_pge) + set_in_cr4(X86_CR4_PGE); } /* -- cgit v1.2.3 From f8f202348208fa8a2d817b42f250e145fa885620 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 5 Feb 2014 06:51:37 +0100 Subject: x86: Disable CONFIG_X86_DECODER_SELFTEST in allmod/allyesconfigs It can take some time to validate the image, make sure {allyes|allmod}config doesn't enable it. I'd say randconfig will cover it often enough, and the failure is also borderline build coverage related: you cannot really make the decoder test fail via source level changes, only with changes in the build environment, so I agree with Andi that we can disable this one too. Signed-off-by: Ingo Molnar Acked-by: Paul Gortmaker paul.gortmaker@windriver.com> Suggested-and-acked-by: Andi Kleen andi@firstfloor.org> Signed-off-by: Linus Torvalds --- arch/x86/Kconfig.debug | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 0f3621ed1db6..321a52ccf63a 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -184,6 +184,7 @@ config HAVE_MMIOTRACE_SUPPORT config X86_DECODER_SELFTEST bool "x86 instruction decoder selftest" depends on DEBUG_KERNEL && KPROBES + depends on !COMPILE_TEST ---help--- Perform x86 instruction decoder selftests at build time. This option is useful for checking the sanity of x86 instruction -- cgit v1.2.3 From 081cd62a010f97b5bc1d2b0cd123c5abc692b68a Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Tue, 14 Jan 2014 12:40:09 +0000 Subject: x86/efi: Allow mapping BGRT on x86-32 CONFIG_X86_32 doesn't map the boot services regions into the EFI memory map (see commit 700870119f49 ("x86, efi: Don't map Boot Services on i386")), and so efi_lookup_mapped_addr() will fail to return a valid address. Executing the ioremap() path in efi_bgrt_init() causes the following warning on x86-32 because we're trying to ioremap() RAM, WARNING: CPU: 0 PID: 0 at arch/x86/mm/ioremap.c:102 __ioremap_caller+0x2ad/0x2c0() Modules linked in: CPU: 0 PID: 0 Comm: swapper/0 Not tainted 3.13.0-0.rc5.git0.1.2.fc21.i686 #1 Hardware name: DellInc. Venue 8 Pro 5830/09RP78, BIOS A02 10/17/2013 00000000 00000000 c0c0df08 c09a5196 00000000 c0c0df38 c0448c1e c0b41310 00000000 00000000 c0b37bc1 00000066 c043bbfd c043bbfd 00e7dfe0 00073eff 00073eff c0c0df48 c0448ce2 00000009 00000000 c0c0df9c c043bbfd 00078d88 Call Trace: [] dump_stack+0x41/0x52 [] warn_slowpath_common+0x7e/0xa0 [] ? __ioremap_caller+0x2ad/0x2c0 [] ? __ioremap_caller+0x2ad/0x2c0 [] warn_slowpath_null+0x22/0x30 [] __ioremap_caller+0x2ad/0x2c0 [] ? acpi_tb_verify_table+0x1c/0x43 [] ? acpi_get_table_with_size+0x63/0xb5 [] ? efi_lookup_mapped_addr+0xe/0xf0 [] ioremap_nocache+0x1b/0x20 [] ? efi_bgrt_init+0x83/0x10c [] efi_bgrt_init+0x83/0x10c [] efi_late_init+0x8/0xa [] start_kernel+0x3ae/0x3c3 [] ? repair_env_string+0x51/0x51 [] i386_start_kernel+0x12e/0x131 Switch to using early_memremap(), which won't trigger this warning, and has the added benefit of more accurately conveying what we're trying to do - map a chunk of memory. This patch addresses the following bug report, https://bugzilla.kernel.org/show_bug.cgi?id=67911 Reported-by: Adam Williamson Cc: Josh Triplett Cc: Matthew Garrett Cc: Rafael J. Wysocki Signed-off-by: Matt Fleming --- arch/x86/platform/efi/efi-bgrt.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/efi-bgrt.c b/arch/x86/platform/efi/efi-bgrt.c index 7145ec63c520..4df9591eadad 100644 --- a/arch/x86/platform/efi/efi-bgrt.c +++ b/arch/x86/platform/efi/efi-bgrt.c @@ -49,7 +49,8 @@ void __init efi_bgrt_init(void) image = efi_lookup_mapped_addr(bgrt_tab->image_address); if (!image) { - image = ioremap(bgrt_tab->image_address, sizeof(bmp_header)); + image = early_memremap(bgrt_tab->image_address, + sizeof(bmp_header)); ioremapped = true; if (!image) return; @@ -57,7 +58,7 @@ void __init efi_bgrt_init(void) memcpy_fromio(&bmp_header, image, sizeof(bmp_header)); if (ioremapped) - iounmap(image); + early_iounmap(image, sizeof(bmp_header)); bgrt_image_size = bmp_header.size; bgrt_image = kmalloc(bgrt_image_size, GFP_KERNEL); @@ -65,7 +66,8 @@ void __init efi_bgrt_init(void) return; if (ioremapped) { - image = ioremap(bgrt_tab->image_address, bmp_header.size); + image = early_memremap(bgrt_tab->image_address, + bmp_header.size); if (!image) { kfree(bgrt_image); bgrt_image = NULL; @@ -75,5 +77,5 @@ void __init efi_bgrt_init(void) memcpy_fromio(bgrt_image, image, bgrt_image_size); if (ioremapped) - iounmap(image); + early_iounmap(image, bmp_header.size); } -- cgit v1.2.3 From 75a1ba5b2c529db60ca49626bcaf0bddf4548438 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 3 Feb 2014 21:41:44 +0100 Subject: x86, microcode, AMD: Unify valid container checks For additional coverage, BorisO and friends unknowlingly did swap AMD microcode with Intel microcode blobs in order to see what happens. What did happen on 32-bit was [ 5.722656] BUG: unable to handle kernel paging request at be3a6008 [ 5.722693] IP: [] load_microcode_amd+0x24/0x3f0 [ 5.722716] *pdpt = 0000000000000000 *pde = 0000000000000000 because there was a valid initrd there but without valid microcode in it and the container check happened *after* the relocated ramdisk handling on 32-bit, which was clearly wrong. While at it, take care of the ramdisk relocation on both 32- and 64-bit as it is done on both. Also, comment what we're doing because this code is a bit tricky. Reported-and-tested-by: Boris Ostrovsky Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1391460104-7261-1-git-send-email-bp@alien8.de Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/microcode/amd_early.c | 43 +++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/amd_early.c b/arch/x86/kernel/cpu/microcode/amd_early.c index 8384c0fa206f..617a9e284245 100644 --- a/arch/x86/kernel/cpu/microcode/amd_early.c +++ b/arch/x86/kernel/cpu/microcode/amd_early.c @@ -285,6 +285,15 @@ static void __init collect_cpu_sig_on_bsp(void *arg) uci->cpu_sig.sig = cpuid_eax(0x00000001); } + +static void __init get_bsp_sig(void) +{ + unsigned int bsp = boot_cpu_data.cpu_index; + struct ucode_cpu_info *uci = ucode_cpu_info + bsp; + + if (!uci->cpu_sig.sig) + smp_call_function_single(bsp, collect_cpu_sig_on_bsp, NULL, 1); +} #else void load_ucode_amd_ap(void) { @@ -337,31 +346,37 @@ void load_ucode_amd_ap(void) int __init save_microcode_in_initrd_amd(void) { + unsigned long cont; enum ucode_state ret; u32 eax; -#ifdef CONFIG_X86_32 - unsigned int bsp = boot_cpu_data.cpu_index; - struct ucode_cpu_info *uci = ucode_cpu_info + bsp; - - if (!uci->cpu_sig.sig) - smp_call_function_single(bsp, collect_cpu_sig_on_bsp, NULL, 1); + if (!container) + return -EINVAL; +#ifdef CONFIG_X86_32 + get_bsp_sig(); + cont = (unsigned long)container; +#else /* - * Take into account the fact that the ramdisk might get relocated - * and therefore we need to recompute the container's position in - * virtual memory space. + * We need the physical address of the container for both bitness since + * boot_params.hdr.ramdisk_image is a physical address. */ - container = (u8 *)(__va((u32)relocated_ramdisk) + - ((u32)container - boot_params.hdr.ramdisk_image)); + cont = __pa(container); #endif + + /* + * Take into account the fact that the ramdisk might get relocated and + * therefore we need to recompute the container's position in virtual + * memory space. + */ + if (relocated_ramdisk) + container = (u8 *)(__va(relocated_ramdisk) + + (cont - boot_params.hdr.ramdisk_image)); + if (ucode_new_rev) pr_info("microcode: updated early to new patch_level=0x%08x\n", ucode_new_rev); - if (!container) - return -EINVAL; - eax = cpuid_eax(0x00000001); eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); -- cgit v1.2.3 From 017c217a26e9bf6948482f751b30d0507e30a7d0 Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Thu, 6 Feb 2014 12:04:25 -0800 Subject: arch/x86/mm/numa.c: initialize numa_kernel_nodes in numa_clear_kernel_node_hotplug() On-stack variable numa_kernel_nodes in numa_clear_kernel_node_hotplug() was not initialized. So we need to initialize it. [akpm@linux-foundation.org: use NODE_MASK_NONE, per David] Signed-off-by: Tang Chen Tested-by: Gu Zheng Reported-by: Dave Jones Reported-by: David Rientjes Tested-by: Dave Jones Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/numa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 81b2750f3666..45ec9d72b6dd 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -565,7 +565,7 @@ static void __init numa_init_array(void) static void __init numa_clear_kernel_node_hotplug(void) { int i, nid; - nodemask_t numa_kernel_nodes; + nodemask_t numa_kernel_nodes = NODE_MASK_NONE; unsigned long start, end; struct memblock_type *type = &memblock.reserved; -- cgit v1.2.3 From 7bc35fdde6724549a0239b71e08b9f33d8bf2bfb Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Thu, 6 Feb 2014 12:04:27 -0800 Subject: arch/x86/mm/numa.c: fix array index overflow when synchronizing nid to memblock.reserved. The following path will cause array out of bound. memblock_add_region() will always set nid in memblock.reserved to MAX_NUMNODES. In numa_register_memblks(), after we set all nid to correct valus in memblock.reserved, we called setup_node_data(), and used memblock_alloc_nid() to allocate memory, with nid set to MAX_NUMNODES. The nodemask_t type can be seen as a bit array. And the index is 0 ~ MAX_NUMNODES-1. After that, when we call node_set() in numa_clear_kernel_node_hotplug(), the nodemask_t got an index of value MAX_NUMNODES, which is out of [0 ~ MAX_NUMNODES-1]. See below: numa_init() |---> numa_register_memblks() | |---> memblock_set_node(memory) set correct nid in memblock.memory | |---> memblock_set_node(reserved) set correct nid in memblock.reserved | |...... | |---> setup_node_data() | |---> memblock_alloc_nid() here, nid is set to MAX_NUMNODES (1024) |...... |---> numa_clear_kernel_node_hotplug() |---> node_set() here, we have an index 1024, and overflowed This patch moves nid setting to numa_clear_kernel_node_hotplug() to fix this problem. Reported-by: Dave Jones Signed-off-by: Tang Chen Tested-by: Gu Zheng Reported-by: Dave Jones Cc: David Rientjes Tested-by: Dave Jones Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/numa.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 45ec9d72b6dd..27aa0455fab3 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -493,14 +493,6 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) struct numa_memblk *mb = &mi->blk[i]; memblock_set_node(mb->start, mb->end - mb->start, &memblock.memory, mb->nid); - - /* - * At this time, all memory regions reserved by memblock are - * used by the kernel. Set the nid in memblock.reserved will - * mark out all the nodes the kernel resides in. - */ - memblock_set_node(mb->start, mb->end - mb->start, - &memblock.reserved, mb->nid); } /* @@ -569,6 +561,17 @@ static void __init numa_clear_kernel_node_hotplug(void) unsigned long start, end; struct memblock_type *type = &memblock.reserved; + /* + * At this time, all memory regions reserved by memblock are + * used by the kernel. Set the nid in memblock.reserved will + * mark out all the nodes the kernel resides in. + */ + for (i = 0; i < numa_meminfo.nr_blks; i++) { + struct numa_memblk *mb = &numa_meminfo.blk[i]; + memblock_set_node(mb->start, mb->end - mb->start, + &memblock.reserved, mb->nid); + } + /* Mark all kernel nodes. */ for (i = 0; i < type->cnt; i++) node_set(type->regions[i].nid, numa_kernel_nodes); -- cgit v1.2.3 From e97df76377b8b3b1f7dfd5d6f8a1d5a31438b140 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 5 Feb 2014 20:48:51 +0100 Subject: perf/x86/intel/p6: Add userspace RDPMC quirk for PPro PPro machines can die hard when PCE gets enabled due to a CPU erratum. The safe way it so disable it by default and keep it disabled. See erratum 26 in: http://download.intel.com/design/archives/processors/pro/docs/24268935.pdf Reported-and-Tested-by: Mark Davies Cc: Alan Cox Cc: Stephane Eranian Cc: Vince Weaver Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140206170815.GW2936@laptop.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 6 ++++- arch/x86/kernel/cpu/perf_event.h | 1 + arch/x86/kernel/cpu/perf_event_p6.c | 48 +++++++++++++++++++++++++------------ 3 files changed, 39 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index b88645191fe5..1246b853c4e0 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1521,6 +1521,8 @@ static int __init init_hw_perf_events(void) pr_cont("%s PMU driver.\n", x86_pmu.name); + x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ + for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next) quirk->func(); @@ -1534,7 +1536,6 @@ static int __init init_hw_perf_events(void) __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1, 0, x86_pmu.num_counters, 0, 0); - x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ x86_pmu_format_group.attrs = x86_pmu.format_attrs; if (x86_pmu.event_attrs) @@ -1820,6 +1821,9 @@ static ssize_t set_attr_rdpmc(struct device *cdev, if (ret) return ret; + if (x86_pmu.attr_rdpmc_broken) + return -ENOTSUPP; + if (!!val != !!x86_pmu.attr_rdpmc) { x86_pmu.attr_rdpmc = !!val; smp_call_function(change_rdpmc, (void *)val, 1); diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index c1a861829d81..4972c244d0bc 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -409,6 +409,7 @@ struct x86_pmu { /* * sysfs attrs */ + int attr_rdpmc_broken; int attr_rdpmc; struct attribute **format_attrs; struct attribute **event_attrs; diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c index b1e2fe115323..7c1a0c07b607 100644 --- a/arch/x86/kernel/cpu/perf_event_p6.c +++ b/arch/x86/kernel/cpu/perf_event_p6.c @@ -231,31 +231,49 @@ static __initconst const struct x86_pmu p6_pmu = { }; +static __init void p6_pmu_rdpmc_quirk(void) +{ + if (boot_cpu_data.x86_mask < 9) { + /* + * PPro erratum 26; fixed in stepping 9 and above. + */ + pr_warn("Userspace RDPMC support disabled due to a CPU erratum\n"); + x86_pmu.attr_rdpmc_broken = 1; + x86_pmu.attr_rdpmc = 0; + } +} + __init int p6_pmu_init(void) { + x86_pmu = p6_pmu; + switch (boot_cpu_data.x86_model) { - case 1: - case 3: /* Pentium Pro */ - case 5: - case 6: /* Pentium II */ - case 7: - case 8: - case 11: /* Pentium III */ - case 9: - case 13: - /* Pentium M */ + case 1: /* Pentium Pro */ + x86_add_quirk(p6_pmu_rdpmc_quirk); + break; + + case 3: /* Pentium II - Klamath */ + case 5: /* Pentium II - Deschutes */ + case 6: /* Pentium II - Mendocino */ break; + + case 7: /* Pentium III - Katmai */ + case 8: /* Pentium III - Coppermine */ + case 10: /* Pentium III Xeon */ + case 11: /* Pentium III - Tualatin */ + break; + + case 9: /* Pentium M - Banias */ + case 13: /* Pentium M - Dothan */ + break; + default: - pr_cont("unsupported p6 CPU model %d ", - boot_cpu_data.x86_model); + pr_cont("unsupported p6 CPU model %d ", boot_cpu_data.x86_model); return -ENODEV; } - x86_pmu = p6_pmu; - memcpy(hw_cache_event_ids, p6_hw_cache_event_ids, sizeof(hw_cache_event_ids)); - return 0; } -- cgit v1.2.3 From 0e9f2204cfa6d79abe3e525ddf7c4ab5792cc751 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 5 Feb 2014 11:19:56 +0100 Subject: perf/x86: Fix Userspace RDPMC switch The current code forgets to change the CR4 state on the current CPU. Use on_each_cpu() instead of smp_call_function(). Reported-by: Mark Davies Suggested-by: Mark Davies Signed-off-by: Peter Zijlstra Cc: fweisbec@gmail.com Link: http://lkml.kernel.org/n/tip-69efsat90ibhnd577zy3z9gh@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 1246b853c4e0..895604f2e916 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1826,7 +1826,7 @@ static ssize_t set_attr_rdpmc(struct device *cdev, if (!!val != !!x86_pmu.attr_rdpmc) { x86_pmu.attr_rdpmc = !!val; - smp_call_function(change_rdpmc, (void *)val, 1); + on_each_cpu(change_rdpmc, (void *)val, 1); } return count; -- cgit v1.2.3 From 569d6557ab957d6ae7e97a46ae669174be4189e6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 4 Feb 2014 14:13:15 -0500 Subject: x86: Use preempt_disable_notrace() in cycles_2_ns() When debug preempt is enabled, preempt_disable() can be traced by function and function graph tracing. There's a place in the function graph tracer that calls trace_clock() which eventually calls cycles_2_ns() outside of the recursion protection. When cycles_2_ns() calls preempt_disable() it gets traced and the graph tracer will go into a recursive loop causing a crash or worse, a triple fault. Simple fix is to use preempt_disable_notrace() in cycles_2_ns, which makes sense because the preempt_disable() tracing may use that code too, and it tracing it, even with recursion protection is rather pointless. Signed-off-by: Steven Rostedt Acked-by: Thomas Gleixner Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140204141315.2a968a72@gandalf.local.home Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 19e5adb49a27..acb3b606613e 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -209,7 +209,7 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc) * dance when its actually needed. */ - preempt_disable(); + preempt_disable_notrace(); data = this_cpu_read(cyc2ns.head); tail = this_cpu_read(cyc2ns.tail); @@ -229,7 +229,7 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc) if (!--data->__count) this_cpu_write(cyc2ns.tail, data); } - preempt_enable(); + preempt_enable_notrace(); return ns; } -- cgit v1.2.3 From a9c8e4beeeb64c22b84c803747487857fe424b68 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 10 Feb 2014 14:25:40 -0800 Subject: xen: properly account for _PAGE_NUMA during xen pte translations Steven Noonan forwarded a users report where they had a problem starting vsftpd on a Xen paravirtualized guest, with this in dmesg: BUG: Bad page map in process vsftpd pte:8000000493b88165 pmd:e9cc01067 page:ffffea00124ee200 count:0 mapcount:-1 mapping: (null) index:0x0 page flags: 0x2ffc0000000014(referenced|dirty) addr:00007f97eea74000 vm_flags:00100071 anon_vma:ffff880e98f80380 mapping: (null) index:7f97eea74 CPU: 4 PID: 587 Comm: vsftpd Not tainted 3.12.7-1-ec2 #1 Call Trace: dump_stack+0x45/0x56 print_bad_pte+0x22e/0x250 unmap_single_vma+0x583/0x890 unmap_vmas+0x65/0x90 exit_mmap+0xc5/0x170 mmput+0x65/0x100 do_exit+0x393/0x9e0 do_group_exit+0xcc/0x140 SyS_exit_group+0x14/0x20 system_call_fastpath+0x1a/0x1f Disabling lock debugging due to kernel taint BUG: Bad rss-counter state mm:ffff880e9ca60580 idx:0 val:-1 BUG: Bad rss-counter state mm:ffff880e9ca60580 idx:1 val:1 The issue could not be reproduced under an HVM instance with the same kernel, so it appears to be exclusive to paravirtual Xen guests. He bisected the problem to commit 1667918b6483 ("mm: numa: clear numa hinting information on mprotect") that was also included in 3.12-stable. The problem was related to how xen translates ptes because it was not accounting for the _PAGE_NUMA bit. This patch splits pte_present to add a pteval_present helper for use by xen so both bare metal and xen use the same code when checking if a PTE is present. [mgorman@suse.de: wrote changelog, proposed minor modifications] [akpm@linux-foundation.org: fix typo in comment] Reported-by: Steven Noonan Tested-by: Steven Noonan Signed-off-by: Elena Ufimtseva Signed-off-by: Mel Gorman Reviewed-by: David Vrabel Acked-by: Konrad Rzeszutek Wilk Cc: [3.12+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable.h | 14 ++++++++++++-- arch/x86/xen/mmu.c | 4 ++-- 2 files changed, 14 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index bbc8b12fa443..5ad38ad07890 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -445,10 +445,20 @@ static inline int pte_same(pte_t a, pte_t b) return a.pte == b.pte; } +static inline int pteval_present(pteval_t pteval) +{ + /* + * Yes Linus, _PAGE_PROTNONE == _PAGE_NUMA. Expressing it this + * way clearly states that the intent is that protnone and numa + * hinting ptes are considered present for the purposes of + * pagetable operations like zapping, protection changes, gup etc. + */ + return pteval & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_NUMA); +} + static inline int pte_present(pte_t a) { - return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE | - _PAGE_NUMA); + return pteval_present(pte_flags(a)); } #define pte_accessible pte_accessible diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 2423ef04ffea..256282e7888b 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -365,7 +365,7 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, /* Assume pteval_t is equivalent to all the other *val_t types. */ static pteval_t pte_mfn_to_pfn(pteval_t val) { - if (val & _PAGE_PRESENT) { + if (pteval_present(val)) { unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; unsigned long pfn = mfn_to_pfn(mfn); @@ -381,7 +381,7 @@ static pteval_t pte_mfn_to_pfn(pteval_t val) static pteval_t pte_pfn_to_mfn(pteval_t val) { - if (val & _PAGE_PRESENT) { + if (pteval_present(val)) { unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; pteval_t flags = val & PTE_FLAGS_MASK; unsigned long mfn; -- cgit v1.2.3 From c091c71ad2218fc50a07b3d1dab85783f3b77efd Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Fri, 24 Jan 2014 14:49:58 +0100 Subject: x86: dma-mapping: fix GFP_ATOMIC macro usage GFP_ATOMIC is not a single gfp flag, but a macro which expands to the other flags, where meaningful is the LACK of __GFP_WAIT flag. To check if caller wants to perform an atomic allocation, the code must test for a lack of the __GFP_WAIT flag. This patch fixes the issue introduced in v3.5-rc1. CC: stable@vger.kernel.org Signed-off-by: Marek Szyprowski --- arch/x86/kernel/pci-dma.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 872079a67e4d..f7d0672481fd 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -100,8 +100,10 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size, flag |= __GFP_ZERO; again: page = NULL; - if (!(flag & GFP_ATOMIC)) + /* CMA can be used only in the context which permits sleeping */ + if (flag & __GFP_WAIT) page = dma_alloc_from_contiguous(dev, count, get_order(size)); + /* fallback */ if (!page) page = alloc_pages_node(dev_to_node(dev), flag, get_order(size)); if (!page) -- cgit v1.2.3 From 87fbb2ac6073a7039303517546a76074feb14c84 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 11 Feb 2014 20:19:44 -0500 Subject: ftrace/x86: Use breakpoints for converting function graph caller When the conversion was made to remove stop machine and use the breakpoint logic instead, the modification of the function graph caller is still done directly as though it was being done under stop machine. As it is not converted via stop machine anymore, there is a possibility that the code could be layed across cache lines and if another CPU is accessing that function graph call when it is being updated, it could cause a General Protection Fault. Convert the update of the function graph caller to use the breakpoint method as well. Cc: H. Peter Anvin Cc: stable@vger.kernel.org # 3.5+ Fixes: 08d636b6d4fb "ftrace/x86: Have arch x86_64 use breakpoints instead of stop machine" Signed-off-by: Steven Rostedt --- arch/x86/kernel/ftrace.c | 83 +++++++++++++++++++++++++++--------------------- 1 file changed, 47 insertions(+), 36 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index d4bdd253fea7..e6253195a301 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -77,8 +77,7 @@ within(unsigned long addr, unsigned long start, unsigned long end) return addr >= start && addr < end; } -static int -do_ftrace_mod_code(unsigned long ip, const void *new_code) +static unsigned long text_ip_addr(unsigned long ip) { /* * On x86_64, kernel text mappings are mapped read-only with @@ -91,7 +90,7 @@ do_ftrace_mod_code(unsigned long ip, const void *new_code) if (within(ip, (unsigned long)_text, (unsigned long)_etext)) ip = (unsigned long)__va(__pa_symbol(ip)); - return probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE); + return ip; } static const unsigned char *ftrace_nop_replace(void) @@ -123,8 +122,10 @@ ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code, if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0) return -EINVAL; + ip = text_ip_addr(ip); + /* replace the text with the new text */ - if (do_ftrace_mod_code(ip, new_code)) + if (probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE)) return -EPERM; sync_core(); @@ -221,37 +222,51 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, return -EINVAL; } -int ftrace_update_ftrace_func(ftrace_func_t func) +static unsigned long ftrace_update_func; + +static int update_ftrace_func(unsigned long ip, void *new) { - unsigned long ip = (unsigned long)(&ftrace_call); - unsigned char old[MCOUNT_INSN_SIZE], *new; + unsigned char old[MCOUNT_INSN_SIZE]; int ret; - memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE); - new = ftrace_call_replace(ip, (unsigned long)func); + memcpy(old, (void *)ip, MCOUNT_INSN_SIZE); + + ftrace_update_func = ip; + /* Make sure the breakpoints see the ftrace_update_func update */ + smp_wmb(); /* See comment above by declaration of modifying_ftrace_code */ atomic_inc(&modifying_ftrace_code); ret = ftrace_modify_code(ip, old, new); + atomic_dec(&modifying_ftrace_code); + + return ret; +} + +int ftrace_update_ftrace_func(ftrace_func_t func) +{ + unsigned long ip = (unsigned long)(&ftrace_call); + unsigned char *new; + int ret; + + new = ftrace_call_replace(ip, (unsigned long)func); + ret = update_ftrace_func(ip, new); + /* Also update the regs callback function */ if (!ret) { ip = (unsigned long)(&ftrace_regs_call); - memcpy(old, &ftrace_regs_call, MCOUNT_INSN_SIZE); new = ftrace_call_replace(ip, (unsigned long)func); - ret = ftrace_modify_code(ip, old, new); + ret = update_ftrace_func(ip, new); } - atomic_dec(&modifying_ftrace_code); - return ret; } static int is_ftrace_caller(unsigned long ip) { - if (ip == (unsigned long)(&ftrace_call) || - ip == (unsigned long)(&ftrace_regs_call)) + if (ip == ftrace_update_func) return 1; return 0; @@ -677,45 +692,41 @@ int __init ftrace_dyn_arch_init(void *data) #ifdef CONFIG_DYNAMIC_FTRACE extern void ftrace_graph_call(void); -static int ftrace_mod_jmp(unsigned long ip, - int old_offset, int new_offset) +static unsigned char *ftrace_jmp_replace(unsigned long ip, unsigned long addr) { - unsigned char code[MCOUNT_INSN_SIZE]; + static union ftrace_code_union calc; - if (probe_kernel_read(code, (void *)ip, MCOUNT_INSN_SIZE)) - return -EFAULT; + /* Jmp not a call (ignore the .e8) */ + calc.e8 = 0xe9; + calc.offset = ftrace_calc_offset(ip + MCOUNT_INSN_SIZE, addr); - if (code[0] != 0xe9 || old_offset != *(int *)(&code[1])) - return -EINVAL; + /* + * ftrace external locks synchronize the access to the static variable. + */ + return calc.code; +} - *(int *)(&code[1]) = new_offset; +static int ftrace_mod_jmp(unsigned long ip, void *func) +{ + unsigned char *new; - if (do_ftrace_mod_code(ip, &code)) - return -EPERM; + new = ftrace_jmp_replace(ip, (unsigned long)func); - return 0; + return update_ftrace_func(ip, new); } int ftrace_enable_ftrace_graph_caller(void) { unsigned long ip = (unsigned long)(&ftrace_graph_call); - int old_offset, new_offset; - old_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE); - new_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE); - - return ftrace_mod_jmp(ip, old_offset, new_offset); + return ftrace_mod_jmp(ip, &ftrace_graph_caller); } int ftrace_disable_ftrace_graph_caller(void) { unsigned long ip = (unsigned long)(&ftrace_graph_call); - int old_offset, new_offset; - - old_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE); - new_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE); - return ftrace_mod_jmp(ip, old_offset, new_offset); + return ftrace_mod_jmp(ip, &ftrace_stub); } #endif /* !CONFIG_DYNAMIC_FTRACE */ -- cgit v1.2.3 From 03bbd596ac04fef47ce93a730b8f086d797c3021 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 13 Feb 2014 07:34:30 -0800 Subject: x86, smap: Don't enable SMAP if CONFIG_X86_SMAP is disabled If SMAP support is not compiled into the kernel, don't enable SMAP in CR4 -- in fact, we should clear it, because the kernel doesn't contain the proper STAC/CLAC instructions for SMAP support. Found by Fengguang Wu's test system. Reported-by: Fengguang Wu Link: http://lkml.kernel.org/r/20140213124550.GA30497@localhost Signed-off-by: H. Peter Anvin Cc: # v3.7+ --- arch/x86/kernel/cpu/common.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 24b6fd10625a..8e28bf2fc3ef 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -284,8 +284,13 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c) raw_local_save_flags(eflags); BUG_ON(eflags & X86_EFLAGS_AC); - if (cpu_has(c, X86_FEATURE_SMAP)) + if (cpu_has(c, X86_FEATURE_SMAP)) { +#ifdef CONFIG_X86_SMAP set_in_cr4(X86_CR4_SMAP); +#else + clear_in_cr4(X86_CR4_SMAP); +#endif + } } /* -- cgit v1.2.3 From 4640c7ee9b8953237d05a61ea3ea93981d1bc961 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 13 Feb 2014 07:46:04 -0800 Subject: x86, smap: smap_violation() is bogus if CONFIG_X86_SMAP is off If CONFIG_X86_SMAP is disabled, smap_violation() tests for conditions which are incorrect (as the AC flag doesn't matter), causing spurious faults. The dynamic disabling of SMAP (nosmap on the command line) is fine because it disables X86_FEATURE_SMAP, therefore causing the static_cpu_has() to return false. Found by Fengguang Wu's test system. [ v3: move all predicates into smap_violation() ] [ v2: use IS_ENABLED() instead of #ifdef ] Reported-by: Fengguang Wu Link: http://lkml.kernel.org/r/20140213124550.GA30497@localhost Signed-off-by: H. Peter Anvin Cc: # v3.7+ --- arch/x86/mm/fault.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 9d591c895803..6dea040cc3a1 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1001,6 +1001,12 @@ static int fault_in_kernel_space(unsigned long address) static inline bool smap_violation(int error_code, struct pt_regs *regs) { + if (!IS_ENABLED(CONFIG_X86_SMAP)) + return false; + + if (!static_cpu_has(X86_FEATURE_SMAP)) + return false; + if (error_code & PF_USER) return false; @@ -1087,11 +1093,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code) if (unlikely(error_code & PF_RSVD)) pgtable_bad(regs, error_code, address); - if (static_cpu_has(X86_FEATURE_SMAP)) { - if (unlikely(smap_violation(error_code, regs))) { - bad_area_nosemaphore(regs, error_code, address); - return; - } + if (unlikely(smap_violation(error_code, regs))) { + bad_area_nosemaphore(regs, error_code, address); + return; } /* -- cgit v1.2.3 From c55d016f7a930dd1c995336017123b469a8c8f5a Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 14 Feb 2014 08:24:24 +0100 Subject: x86/efi: Fix 32-bit fallout We do not enable the new efi memmap on 32-bit and thus we need to run runtime_code_page_mkexec() unconditionally there. Fix that. Reported-and-tested-by: Lejun Zhu Signed-off-by: Borislav Petkov Signed-off-by: Matt Fleming --- arch/x86/include/asm/efi.h | 2 ++ arch/x86/platform/efi/efi.c | 5 ++--- arch/x86/platform/efi/efi_32.c | 6 ++++++ arch/x86/platform/efi/efi_64.c | 9 +++++++++ 4 files changed, 19 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 3b978c472d08..3d6b9f81cc68 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -132,6 +132,8 @@ extern void __init efi_map_region_fixed(efi_memory_desc_t *md); extern void efi_sync_low_kernel_mappings(void); extern void efi_setup_page_tables(void); extern void __init old_map_region(efi_memory_desc_t *md); +extern void __init runtime_code_page_mkexec(void); +extern void __init efi_runtime_mkexec(void); struct efi_setup_data { u64 fw_vendor; diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index d62ec87a2b26..1a201ac7cef8 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -792,7 +792,7 @@ void __init efi_set_executable(efi_memory_desc_t *md, bool executable) set_memory_nx(addr, npages); } -static void __init runtime_code_page_mkexec(void) +void __init runtime_code_page_mkexec(void) { efi_memory_desc_t *md; void *p; @@ -1069,8 +1069,7 @@ void __init efi_enter_virtual_mode(void) efi.update_capsule = virt_efi_update_capsule; efi.query_capsule_caps = virt_efi_query_capsule_caps; - if (efi_enabled(EFI_OLD_MEMMAP) && (__supported_pte_mask & _PAGE_NX)) - runtime_code_page_mkexec(); + efi_runtime_mkexec(); kfree(new_memmap); diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c index 249b183cf417..0b74cdf7f816 100644 --- a/arch/x86/platform/efi/efi_32.c +++ b/arch/x86/platform/efi/efi_32.c @@ -77,3 +77,9 @@ void efi_call_phys_epilog(void) local_irq_restore(efi_rt_eflags); } + +void __init efi_runtime_mkexec(void) +{ + if (__supported_pte_mask & _PAGE_NX) + runtime_code_page_mkexec(); +} diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 6284f158a47d..0c2a234fef1e 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -233,3 +233,12 @@ void __init parse_efi_setup(u64 phys_addr, u32 data_len) { efi_setup = phys_addr + sizeof(struct setup_data); } + +void __init efi_runtime_mkexec(void) +{ + if (!efi_enabled(EFI_OLD_MEMMAP)) + return; + + if (__supported_pte_mask & _PAGE_NX) + runtime_code_page_mkexec(); +} -- cgit v1.2.3 From 09503379dc99535b1bbfa51aa1aeef340f5d82ec Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Thu, 13 Feb 2014 17:17:54 +0000 Subject: x86/efi: Check status field to validate BGRT header Madper reported seeing the following crash, BUG: unable to handle kernel paging request at ffffffffff340003 IP: [] efi_bgrt_init+0x9d/0x133 Call Trace: [] efi_late_init+0x9/0xb [] start_kernel+0x436/0x450 [] ? repair_env_string+0x5c/0x5c [] ? early_idt_handlers+0x120/0x120 [] x86_64_start_reservations+0x2a/0x2c [] x86_64_start_kernel+0x13e/0x14d This is caused because the layout of the ACPI BGRT header on this system doesn't match the definition from the ACPI spec, and so we get a bogus physical address when dereferencing ->image_address in efi_bgrt_init(). Luckily the status field in the BGRT header clearly marks it as invalid, so we can check that field and skip BGRT initialisation. Reported-by: Madper Xie Suggested-by: Toshi Kani Cc: "Rafael J. Wysocki" Cc: Matthew Garrett Cc: Josh Triplett Signed-off-by: Matt Fleming --- arch/x86/platform/efi/efi-bgrt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/efi-bgrt.c b/arch/x86/platform/efi/efi-bgrt.c index 4df9591eadad..f15103dff4b4 100644 --- a/arch/x86/platform/efi/efi-bgrt.c +++ b/arch/x86/platform/efi/efi-bgrt.c @@ -42,7 +42,7 @@ void __init efi_bgrt_init(void) if (bgrt_tab->header.length < sizeof(*bgrt_tab)) return; - if (bgrt_tab->version != 1) + if (bgrt_tab->version != 1 || bgrt_tab->status != 1) return; if (bgrt_tab->image_type != 0 || !bgrt_tab->image_address) return; -- cgit v1.2.3 From 5f0e030930d715920be4de638084aaf8653867e8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Feb 2014 13:52:29 +0200 Subject: x86, tsc: Fallback to normal calibration if fast MSR calibration fails If we cannot calibrate TSC via MSR based calibration try_msr_calibrate_tsc() stores zero to fast_calibrate and returns that to the caller. This value gets then propagated further to clockevents code resulting division by zero oops like the one below: divide error: 0000 [#1] PREEMPT SMP Modules linked in: CPU: 0 PID: 1 Comm: swapper/0 Tainted: G W 3.13.0+ #47 task: ffff880075508000 ti: ffff880075506000 task.ti: ffff880075506000 RIP: 0010:[] [] clockevents_config.part.3+0x24/0xa0 RSP: 0000:ffff880075507e58 EFLAGS: 00010246 RAX: ffffffffffffffff RBX: ffff880079c0cd80 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffffffffffff RBP: ffff880075507e70 R08: 0000000000000001 R09: 00000000000000be R10: 00000000000000bd R11: 0000000000000003 R12: 000000000000b008 R13: 0000000000000008 R14: 000000000000b010 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff880079c00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: ffff880079fff000 CR3: 0000000001c0b000 CR4: 00000000001006f0 Stack: ffff880079c0cd80 000000000000b008 0000000000000008 ffff880075507e88 ffffffff810aecb0 ffff880079c0cd80 ffff880075507e98 ffffffff81030168 ffff880075507ed8 ffffffff81d1104f 00000000000000c3 0000000000000000 Call Trace: [] clockevents_config_and_register+0x20/0x30 [] setup_APIC_timer+0xc8/0xd0 [] setup_boot_APIC_clock+0x4cc/0x4d8 [] native_smp_prepare_cpus+0x3dd/0x3f0 [] kernel_init_freeable+0xc3/0x205 [] ? rest_init+0x90/0x90 [] kernel_init+0xe/0x120 [] ret_from_fork+0x7c/0xb0 [] ? rest_init+0x90/0x90 Prevent this from happening by: 1) Modifying try_msr_calibrate_tsc() to return calibration value or zero if it fails. 2) Check this return value in native_calibrate_tsc() and in case of zero fallback to use normal non-MSR based calibration. [mw: Added subject and changelog] Reported-and-tested-by: Mika Westerberg Signed-off-by: Thomas Gleixner Cc: Bin Gao Cc: One Thousand Gnomes Cc: Ingo Molnar Cc: H. Peter Anvin Link: http://lkml.kernel.org/r/1392810750-18660-1-git-send-email-mika.westerberg@linux.intel.com Signed-off-by: Mika Westerberg Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/tsc.h | 2 +- arch/x86/kernel/tsc.c | 7 ++----- arch/x86/kernel/tsc_msr.c | 28 ++++++++++++++-------------- 3 files changed, 17 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 57ae63cd6ee2..94605c0e9cee 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -66,6 +66,6 @@ extern void tsc_save_sched_clock_state(void); extern void tsc_restore_sched_clock_state(void); /* MSR based TSC calibration for Intel Atom SoC platforms */ -int try_msr_calibrate_tsc(unsigned long *fast_calibrate); +unsigned long try_msr_calibrate_tsc(void); #endif /* _ASM_X86_TSC_H */ diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index acb3b606613e..cfbe99f88830 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -653,13 +653,10 @@ unsigned long native_calibrate_tsc(void) /* Calibrate TSC using MSR for Intel Atom SoCs */ local_irq_save(flags); - i = try_msr_calibrate_tsc(&fast_calibrate); + fast_calibrate = try_msr_calibrate_tsc(); local_irq_restore(flags); - if (i >= 0) { - if (i == 0) - pr_warn("Fast TSC calibration using MSR failed\n"); + if (fast_calibrate) return fast_calibrate; - } local_irq_save(flags); fast_calibrate = quick_pit_calibrate(); diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c index 8b5434f4389f..5dfff5809e74 100644 --- a/arch/x86/kernel/tsc_msr.c +++ b/arch/x86/kernel/tsc_msr.c @@ -77,21 +77,18 @@ static int match_cpu(u8 family, u8 model) /* * Do MSR calibration only for known/supported CPUs. - * Return values: - * -1: CPU is unknown/unsupported for MSR based calibration - * 0: CPU is known/supported, but calibration failed - * 1: CPU is known/supported, and calibration succeeded + * + * Returns the calibration value or 0 if MSR calibration failed. */ -int try_msr_calibrate_tsc(unsigned long *fast_calibrate) +unsigned long try_msr_calibrate_tsc(void) { - int cpu_index; u32 lo, hi, ratio, freq_id, freq; + unsigned long res; + int cpu_index; cpu_index = match_cpu(boot_cpu_data.x86, boot_cpu_data.x86_model); if (cpu_index < 0) - return -1; - - *fast_calibrate = 0; + return 0; if (freq_desc_tables[cpu_index].msr_plat) { rdmsr(MSR_PLATFORM_INFO, lo, hi); @@ -103,7 +100,7 @@ int try_msr_calibrate_tsc(unsigned long *fast_calibrate) pr_info("Maximum core-clock to bus-clock ratio: 0x%x\n", ratio); if (!ratio) - return 0; + goto fail; /* Get FSB FREQ ID */ rdmsr(MSR_FSB_FREQ, lo, hi); @@ -112,16 +109,19 @@ int try_msr_calibrate_tsc(unsigned long *fast_calibrate) pr_info("Resolved frequency ID: %u, frequency: %u KHz\n", freq_id, freq); if (!freq) - return 0; + goto fail; /* TSC frequency = maximum resolved freq * maximum resolved bus ratio */ - *fast_calibrate = freq * ratio; - pr_info("TSC runs at %lu KHz\n", *fast_calibrate); + res = freq * ratio; + pr_info("TSC runs at %lu KHz\n", res); #ifdef CONFIG_X86_LOCAL_APIC lapic_timer_frequency = (freq * 1000) / HZ; pr_info("lapic_timer_frequency = %d\n", lapic_timer_frequency); #endif + return res; - return 1; +fail: + pr_warn("Fast TSC calibration using MSR failed\n"); + return 0; } -- cgit v1.2.3 From 3e11e818bfd7bd4a8e1214970337bab73ffed32d Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Wed, 19 Feb 2014 13:52:30 +0200 Subject: x86: tsc: Add missing Baytrail frequency to the table Intel Baytrail is based on Silvermont core so MSR_FSB_FREQ[2:0] == 0 means that the CPU reference clock runs at 83.3MHz. Add this missing frequency to the table. Signed-off-by: Mika Westerberg Cc: Bin Gao Cc: One Thousand Gnomes Cc: Ingo Molnar Cc: H. Peter Anvin Link: http://lkml.kernel.org/r/1392810750-18660-2-git-send-email-mika.westerberg@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/tsc_msr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c index 5dfff5809e74..92ae6acac8a7 100644 --- a/arch/x86/kernel/tsc_msr.c +++ b/arch/x86/kernel/tsc_msr.c @@ -53,7 +53,7 @@ static struct freq_desc freq_desc_tables[] = { /* TNG */ { 6, 0x4a, 1, { 0, FREQ_100, FREQ_133, 0, 0, 0, 0, 0 } }, /* VLV2 */ - { 6, 0x37, 1, { 0, FREQ_100, FREQ_133, FREQ_166, 0, 0, 0, 0 } }, + { 6, 0x37, 1, { FREQ_83, FREQ_100, FREQ_133, FREQ_166, 0, 0, 0, 0 } }, /* ANN */ { 6, 0x5a, 1, { FREQ_83, FREQ_100, FREQ_133, FREQ_100, 0, 0, 0, 0 } }, }; -- cgit v1.2.3 From a3ef2229c94ff70998724cb64b9cb4c77db9e950 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 14 Feb 2014 16:44:08 -0800 Subject: perf, nmi: Fix unknown NMI warning When using BTS on Core i7-4*, I get the below kernel warning. $ perf record -c 1 -e branches:u ls Message from syslogd@labpc1501 at Nov 11 15:49:25 ... kernel:[ 438.317893] Uhhuh. NMI received for unknown reason 31 on CPU 2. Message from syslogd@labpc1501 at Nov 11 15:49:25 ... kernel:[ 438.317920] Do you have a strange power saving mode enabled? Message from syslogd@labpc1501 at Nov 11 15:49:25 ... kernel:[ 438.317945] Dazed and confused, but trying to continue Make intel_pmu_handle_irq() take the full exit path when returning early. Cc: eranian@google.com Cc: peterz@infradead.org Cc: mingo@kernel.org Signed-off-by: Markus Metzger Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1392425048-5309-1-git-send-email-andi@firstfloor.org Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/perf_event_intel.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 0fa4f242f050..698ae77d6f18 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1361,10 +1361,8 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) intel_pmu_disable_all(); handled = intel_pmu_drain_bts_buffer(); status = intel_pmu_get_status(); - if (!status) { - intel_pmu_enable_all(0); - return handled; - } + if (!status) + goto done; loops = 0; again: -- cgit v1.2.3 From c9b08884c9c98929ec2d8abafd78e89062d01ee7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 3 Feb 2014 14:29:03 +0100 Subject: perf/x86: Correctly use FEATURE_PDCM The current code simply assumes Intel Arch PerfMon v2+ to have the IA32_PERF_CAPABILITIES MSR; the SDM specifies that we should check CPUID[1].ECX[15] (aka, FEATURE_PDCM) instead. This was found by KVM which implements v2+ but didn't provide the capabilities MSR. Change the code to DTRT; KVM will also implement the MSR and return 0. Cc: pbonzini@redhat.com Reported-by: "Michael S. Tsirkin" Suggested-by: Eduardo Habkost Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140203132903.GI8874@twins.programming.kicks-ass.net Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/perf_event_intel.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 698ae77d6f18..aa333d966886 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -2308,10 +2308,7 @@ __init int intel_pmu_init(void) if (version > 1) x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); - /* - * v2 and above have a perf capabilities MSR - */ - if (version > 1) { + if (boot_cpu_has(X86_FEATURE_PDCM)) { u64 capabilities; rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities); -- cgit v1.2.3 From 337397f3afc911d94d1d71371a36a53ce218b41f Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Wed, 19 Feb 2014 14:10:18 +0100 Subject: perf/x86/uncore: Fix IVT/SNB-EP uncore CBOX NID filter table This patch updates the CBOX PMU filters mapping tables for SNB-EP and IVT (model 45 and 62 respectively). The NID umask always comes in addition to another umask. When set, the NID filter is applied. The current mapping tables were missing some code/umask combinations to account for the NID umask. This patch fixes that. Cc: mingo@elte.hu Cc: ak@linux.intel.com Reviewed-by: Yan, Zheng Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140219131018.GA24475@quad Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 29c248799ced..c88f7f4b03ee 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -501,8 +501,11 @@ static struct extra_reg snbep_uncore_cbox_extra_regs[] = { SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN, SNBEP_CBO_PMON_CTL_TID_EN, 0x1), SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0x6), SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0x6), SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0x6), SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x6), SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x8), SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x8), @@ -1178,10 +1181,15 @@ static struct extra_reg ivt_uncore_cbox_extra_regs[] = { SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN, SNBEP_CBO_PMON_CTL_TID_EN, 0x1), SNBEP_CBO_EVENT_EXTRA_REG(0x1031, 0x10ff, 0x2), + SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0xc), + SNBEP_CBO_EVENT_EXTRA_REG(0x5134, 0xffff, 0xc), SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0xc), SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0xc), SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0xc), + SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0xc), SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x10), SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x10), SNBEP_CBO_EVENT_EXTRA_REG(0x2135, 0xffff, 0x10), -- cgit v1.2.3 From b6085a865762236bb84934161273cdac6dd11c2d Mon Sep 17 00:00:00 2001 From: Eugene Surovegin Date: Thu, 23 Jan 2014 09:31:20 -0800 Subject: x86, kaslr: export offset in VMCOREINFO ELF notes Include kASLR offset in VMCOREINFO ELF notes to assist in debugging. [ hpa: pushing this for v3.14 to avoid having a kernel version with kASLR where we can't debug output. ] Signed-off-by: Eugene Surovegin Link: http://lkml.kernel.org/r/20140123173120.GA25474@www.outflux.net Signed-off-by: Kees Cook Signed-off-by: H. Peter Anvin --- arch/x86/kernel/machine_kexec_64.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 4eabc160696f..679cef0791cd 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -279,5 +279,7 @@ void arch_crash_save_vmcoreinfo(void) VMCOREINFO_SYMBOL(node_data); VMCOREINFO_LENGTH(node_data, MAX_NUMNODES); #endif + vmcoreinfo_append_str("KERNELOFFSET=%lx\n", + (unsigned long)&_text - __START_KERNEL); } -- cgit v1.2.3 From e290e8c59dbc2a15088d868170d799f763202fef Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sun, 9 Feb 2014 13:56:44 -0800 Subject: x86, kaslr: add missed "static" declarations This silences build warnings about unexported variables and functions. Signed-off-by: Kees Cook Link: http://lkml.kernel.org/r/20140209215644.GA30339@www.outflux.net Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/aslr.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c index 90a21f430117..4dbf967da50d 100644 --- a/arch/x86/boot/compressed/aslr.c +++ b/arch/x86/boot/compressed/aslr.c @@ -111,7 +111,7 @@ struct mem_vector { }; #define MEM_AVOID_MAX 5 -struct mem_vector mem_avoid[MEM_AVOID_MAX]; +static struct mem_vector mem_avoid[MEM_AVOID_MAX]; static bool mem_contains(struct mem_vector *region, struct mem_vector *item) { @@ -180,7 +180,7 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size, } /* Does this memory vector overlap a known avoided area? */ -bool mem_avoid_overlap(struct mem_vector *img) +static bool mem_avoid_overlap(struct mem_vector *img) { int i; @@ -192,8 +192,9 @@ bool mem_avoid_overlap(struct mem_vector *img) return false; } -unsigned long slots[CONFIG_RANDOMIZE_BASE_MAX_OFFSET / CONFIG_PHYSICAL_ALIGN]; -unsigned long slot_max = 0; +static unsigned long slots[CONFIG_RANDOMIZE_BASE_MAX_OFFSET / + CONFIG_PHYSICAL_ALIGN]; +static unsigned long slot_max; static void slots_append(unsigned long addr) { -- cgit v1.2.3 From 404381c5839d67aa0c275ad1da96ef3d3928ca2c Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Mon, 24 Feb 2014 13:59:32 -0300 Subject: KVM: MMU: drop read-only large sptes when creating lower level sptes Read-only large sptes can be created due to read-only faults as follows: - QEMU pagetable entry that maps guest memory is read-only due to COW. - Guest read faults such memory, COW is not broken, because it is a read-only fault. - Enable dirty logging, large spte not nuked because it is read-only. - Write-fault on such memory causes guest to loop endlessly (which must go down to level 1 because dirty logging is enabled). Fix by dropping large spte when necessary. Signed-off-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e50425d0f5f7..9b531351a587 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2672,6 +2672,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, break; } + drop_large_spte(vcpu, iterator.sptep); if (!is_shadow_present_pte(*iterator.sptep)) { u64 base_addr = iterator.addr; -- cgit v1.2.3 From 26e61e8939b1fe8729572dabe9a9e97d930dd4f6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 21 Feb 2014 16:03:12 +0100 Subject: perf/x86: Fix event scheduling Vince "Super Tester" Weaver reported a new round of syscall fuzzing (Trinity) failures, with perf WARN_ON()s triggering. He also provided traces of the failures. This is I think the relevant bit: > pec_1076_warn-2804 [000] d... 147.926153: x86_pmu_disable: x86_pmu_disable > pec_1076_warn-2804 [000] d... 147.926153: x86_pmu_state: Events: { > pec_1076_warn-2804 [000] d... 147.926156: x86_pmu_state: 0: state: .R config: ffffffffffffffff ( (null)) > pec_1076_warn-2804 [000] d... 147.926158: x86_pmu_state: 33: state: AR config: 0 (ffff88011ac99800) > pec_1076_warn-2804 [000] d... 147.926159: x86_pmu_state: } > pec_1076_warn-2804 [000] d... 147.926160: x86_pmu_state: n_events: 1, n_added: 0, n_txn: 1 > pec_1076_warn-2804 [000] d... 147.926161: x86_pmu_state: Assignment: { > pec_1076_warn-2804 [000] d... 147.926162: x86_pmu_state: 0->33 tag: 1 config: 0 (ffff88011ac99800) > pec_1076_warn-2804 [000] d... 147.926163: x86_pmu_state: } > pec_1076_warn-2804 [000] d... 147.926166: collect_events: Adding event: 1 (ffff880119ec8800) So we add the insn:p event (fd[23]). At this point we should have: n_events = 2, n_added = 1, n_txn = 1 > pec_1076_warn-2804 [000] d... 147.926170: collect_events: Adding event: 0 (ffff8800c9e01800) > pec_1076_warn-2804 [000] d... 147.926172: collect_events: Adding event: 4 (ffff8800cbab2c00) We try and add the {BP,cycles,br_insn} group (fd[3], fd[4], fd[15]). These events are 0:cycles and 4:br_insn, the BP event isn't x86_pmu so that's not visible. group_sched_in() pmu->start_txn() /* nop - BP pmu */ event_sched_in() event->pmu->add() So here we should end up with: 0: n_events = 3, n_added = 2, n_txn = 2 4: n_events = 4, n_added = 3, n_txn = 3 But seeing the below state on x86_pmu_enable(), the must have failed, because the 0 and 4 events aren't there anymore. Looking at group_sched_in(), since the BP is the leader, its event_sched_in() must have succeeded, for otherwise we would not have seen the sibling adds. But since neither 0 or 4 are in the below state; their event_sched_in() must have failed; but I don't see why, the complete state: 0,0,1:p,4 fits perfectly fine on a core2. However, since we try and schedule 4 it means the 0 event must have succeeded! Therefore the 4 event must have failed, its failure will have put group_sched_in() into the fail path, which will call: event_sched_out() event->pmu->del() on 0 and the BP event. Now x86_pmu_del() will reduce n_events; but it will not reduce n_added; giving what we see below: n_event = 2, n_added = 2, n_txn = 2 > pec_1076_warn-2804 [000] d... 147.926177: x86_pmu_enable: x86_pmu_enable > pec_1076_warn-2804 [000] d... 147.926177: x86_pmu_state: Events: { > pec_1076_warn-2804 [000] d... 147.926179: x86_pmu_state: 0: state: .R config: ffffffffffffffff ( (null)) > pec_1076_warn-2804 [000] d... 147.926181: x86_pmu_state: 33: state: AR config: 0 (ffff88011ac99800) > pec_1076_warn-2804 [000] d... 147.926182: x86_pmu_state: } > pec_1076_warn-2804 [000] d... 147.926184: x86_pmu_state: n_events: 2, n_added: 2, n_txn: 2 > pec_1076_warn-2804 [000] d... 147.926184: x86_pmu_state: Assignment: { > pec_1076_warn-2804 [000] d... 147.926186: x86_pmu_state: 0->33 tag: 1 config: 0 (ffff88011ac99800) > pec_1076_warn-2804 [000] d... 147.926188: x86_pmu_state: 1->0 tag: 1 config: 1 (ffff880119ec8800) > pec_1076_warn-2804 [000] d... 147.926188: x86_pmu_state: } > pec_1076_warn-2804 [000] d... 147.926190: x86_pmu_enable: S0: hwc->idx: 33, hwc->last_cpu: 0, hwc->last_tag: 1 hwc->state: 0 So the problem is that x86_pmu_del(), when called from a group_sched_in() that fails (for whatever reason), and without x86_pmu TXN support (because the leader is !x86_pmu), will corrupt the n_added state. Reported-and-Tested-by: Vince Weaver Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt Cc: Stephane Eranian Cc: Dave Jones Cc: Link: http://lkml.kernel.org/r/20140221150312.GF3104@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 895604f2e916..79f9f848bee4 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1192,6 +1192,9 @@ static void x86_pmu_del(struct perf_event *event, int flags) for (i = 0; i < cpuc->n_events; i++) { if (event == cpuc->event_list[i]) { + if (i >= cpuc->n_events - cpuc->n_added) + --cpuc->n_added; + if (x86_pmu.put_event_constraints) x86_pmu.put_event_constraints(cpuc, event); -- cgit v1.2.3 From a08d3b3b99efd509133946056531cdf8f3a0c09b Mon Sep 17 00:00:00 2001 From: Andrew Honig Date: Thu, 27 Feb 2014 19:35:14 +0100 Subject: kvm: x86: fix emulator buffer overflow (CVE-2014-0049) The problem occurs when the guest performs a pusha with the stack address pointing to an mmio address (or an invalid guest physical address) to start with, but then extending into an ordinary guest physical address. When doing repeated emulated pushes emulator_read_write sets mmio_needed to 1 on the first one. On a later push when the stack points to regular memory, mmio_nr_fragments is set to 0, but mmio_is_needed is not set to 0. As a result, KVM exits to userspace, and then returns to complete_emulated_mmio. In complete_emulated_mmio vcpu->mmio_cur_fragment is incremented. The termination condition of vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments is never achieved. The code bounces back and fourth to userspace incrementing mmio_cur_fragment past it's buffer. If the guest does nothing else it eventually leads to a a crash on a memcpy from invalid memory address. However if a guest code can cause the vm to be destroyed in another vcpu with excellent timing, then kvm_clear_async_pf_completion_queue can be used by the guest to control the data that's pointed to by the call to cancel_work_item, which can be used to gain execution. Fixes: f78146b0f9230765c6315b2e14f56112513389ad Signed-off-by: Andrew Honig Cc: stable@vger.kernel.org (3.5+) Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 39c28f09dfd5..2b8578432d5b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6186,7 +6186,7 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) frag->len -= len; } - if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) { + if (vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) { vcpu->mmio_needed = 0; /* FIXME: return into emulator if single-stepping. */ -- cgit v1.2.3 From 1b385cbdd74aa803e966e01e5fe49490d6044e30 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 27 Feb 2014 22:54:11 +0100 Subject: kvm, vmx: Really fix lazy FPU on nested guest Commit e504c9098ed6 (kvm, vmx: Fix lazy FPU on nested guest, 2013-11-13) highlighted a real problem, but the fix was subtly wrong. nested_read_cr0 is the CR0 as read by L2, but here we want to look at the CR0 value reflecting L1's setup. In other words, L2 might think that TS=0 (so nested_read_cr0 has the bit clear); but if L1 is actually running it with TS=1, we should inject the fault into L1. The effective value of CR0 in L2 is contained in vmcs12->guest_cr0, use it. Fixes: e504c9098ed6acd9e1079c5e10e4910724ad429f Reported-by: Kashyap Chamarty Reported-by: Stefan Bader Tested-by: Kashyap Chamarty Tested-by: Anthoine Bourgeois Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a06f101ef64b..392752834751 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6688,7 +6688,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) else if (is_page_fault(intr_info)) return enable_ept; else if (is_no_device(intr_info) && - !(nested_read_cr0(vmcs12) & X86_CR0_TS)) + !(vmcs12->guest_cr0 & X86_CR0_TS)) return 0; return vmcs12->exception_bitmap & (1u << (intr_info & INTR_INFO_VECTOR_MASK)); -- cgit v1.2.3