From fe4a330885aee20f233de36085fb15c38094e635 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 20 Nov 2011 20:44:06 +0100 Subject: perf, x86: Implement user-space RDPMC support, to allow fast, user-space access to self-monitoring counters Implement a correct pmu::event_idx for the x86 counter index rules and set CR4.PCE on CPU_STARTING. Signed-off-by: Peter Zijlstra Cc: Stephane Eranian Cc: Arun Sharma Cc: Thomas Gleixner Cc: "H. Peter Anvin" Link: http://lkml.kernel.org/n/tip-mwxab34dibqgzk5zywutfnha@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 5adce1040b11..53b569910175 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1210,6 +1210,7 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) break; case CPU_STARTING: + set_in_cr4(X86_CR4_PCE); if (x86_pmu.cpu_starting) x86_pmu.cpu_starting(cpu); break; @@ -1542,6 +1543,18 @@ static int x86_pmu_event_init(struct perf_event *event) return err; } +static int x86_pmu_event_idx(struct perf_event *event) +{ + int idx = event->hw.idx; + + if (x86_pmu.num_counters_fixed && idx >= X86_PMC_IDX_FIXED) { + idx -= X86_PMC_IDX_FIXED; + idx |= 1 << 30; + } + + return idx + 1; +} + static struct pmu pmu = { .pmu_enable = x86_pmu_enable, .pmu_disable = x86_pmu_disable, @@ -1557,6 +1570,8 @@ static struct pmu pmu = { .start_txn = x86_pmu_start_txn, .cancel_txn = x86_pmu_cancel_txn, .commit_txn = x86_pmu_commit_txn, + + .event_idx = x86_pmu_event_idx, }; /* -- cgit v1.2.3 From 0c9d42ed4cee2aa1dfc3a260b741baae8615744f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 20 Nov 2011 23:30:47 +0100 Subject: perf, x86: Provide means for disabling userspace RDPMC Allow the disabling of RDPMC via a pmu specific attribute: echo 0 > /sys/bus/event_source/devices/cpu/rdpmc Signed-off-by: Peter Zijlstra Cc: Stephane Eranian Cc: Arun Sharma Link: http://lkml.kernel.org/n/tip-pqeog465zo5hsimtkfz73f27@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 55 +++++++++++++++++++++++++++++++++++++++- arch/x86/kernel/cpu/perf_event.h | 8 ++++++ include/linux/perf_event.h | 1 + kernel/events/core.c | 1 + 4 files changed, 64 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 53b569910175..116b040a73a8 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -1210,7 +1211,8 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) break; case CPU_STARTING: - set_in_cr4(X86_CR4_PCE); + if (x86_pmu.attr_rdpmc) + set_in_cr4(X86_CR4_PCE); if (x86_pmu.cpu_starting) x86_pmu.cpu_starting(cpu); break; @@ -1320,6 +1322,8 @@ static int __init init_hw_perf_events(void) } } + x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ + pr_info("... version: %d\n", x86_pmu.version); pr_info("... bit width: %d\n", x86_pmu.cntval_bits); pr_info("... generic registers: %d\n", x86_pmu.num_counters); @@ -1555,10 +1559,59 @@ static int x86_pmu_event_idx(struct perf_event *event) return idx + 1; } +static ssize_t get_attr_rdpmc(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc); +} + +static void change_rdpmc(void *info) +{ + bool enable = !!(unsigned long)info; + + if (enable) + set_in_cr4(X86_CR4_PCE); + else + clear_in_cr4(X86_CR4_PCE); +} + +static ssize_t set_attr_rdpmc(struct device *cdev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + unsigned long val = simple_strtoul(buf, NULL, 0); + + if (!!val != !!x86_pmu.attr_rdpmc) { + x86_pmu.attr_rdpmc = !!val; + smp_call_function(change_rdpmc, (void *)val, 1); + } + + return count; +} + +static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc); + +static struct attribute *x86_pmu_attrs[] = { + &dev_attr_rdpmc.attr, + NULL, +}; + +static struct attribute_group x86_pmu_attr_group = { + .attrs = x86_pmu_attrs, +}; + +static const struct attribute_group *x86_pmu_attr_groups[] = { + &x86_pmu_attr_group, + NULL, +}; + static struct pmu pmu = { .pmu_enable = x86_pmu_enable, .pmu_disable = x86_pmu_disable, + .attr_groups = x86_pmu_attr_groups, + .event_init = x86_pmu_event_init, .add = x86_pmu_add, diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 8944062f46e2..513d617b93c4 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -307,6 +307,14 @@ struct x86_pmu { struct x86_pmu_quirk *quirks; int perfctr_second_write; + /* + * sysfs attrs + */ + int attr_rdpmc; + + /* + * CPU Hotplug hooks + */ int (*cpu_prepare)(int cpu); void (*cpu_starting)(int cpu); void (*cpu_dying)(int cpu); diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 02545e6df95b..5311b79fe62c 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -615,6 +615,7 @@ struct pmu { struct list_head entry; struct device *dev; + const struct attribute_group **attr_groups; char *name; int type; diff --git a/kernel/events/core.c b/kernel/events/core.c index 05affc3878ff..dcd4049e92fc 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5505,6 +5505,7 @@ static int pmu_dev_alloc(struct pmu *pmu) if (!pmu->dev) goto out; + pmu->dev->groups = pmu->attr_groups; device_initialize(pmu->dev); ret = dev_set_name(pmu->dev, "%s", pmu->name); if (ret) -- cgit v1.2.3 From e3f3541c19c89a4daae39300defba68943301949 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 21 Nov 2011 11:43:53 +0100 Subject: perf: Extend the mmap control page with time (TSC) fields Extend the mmap control page with fields so that userspace can compute time deltas relative to the provided time fields. Currently only implemented for x86 with constant and nonstop TSC. Signed-off-by: Peter Zijlstra Cc: Stephane Eranian Cc: Arun Sharma Link: http://lkml.kernel.org/n/tip-3u1jucza77j3wuvs0x2bic0f@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 14 ++++++++++++++ include/linux/perf_event.h | 4 +++- kernel/events/core.c | 21 ++++++++++++++------- 3 files changed, 31 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 116b040a73a8..f8bddb5b0600 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -32,6 +32,7 @@ #include #include #include +#include #include "perf_event.h" @@ -1627,6 +1628,19 @@ static struct pmu pmu = { .event_idx = x86_pmu_event_idx, }; +void perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now) +{ + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + return; + + if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) + return; + + userpg->time_mult = this_cpu_read(cyc2ns); + userpg->time_shift = CYC2NS_SCALE_FACTOR; + userpg->time_offset = this_cpu_read(cyc2ns_offset) - now; +} + /* * callchain support */ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 5311b79fe62c..0b91db2522cc 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -291,12 +291,14 @@ struct perf_event_mmap_page { __s64 offset; /* add to hardware event value */ __u64 time_enabled; /* time event active */ __u64 time_running; /* time event on cpu */ + __u32 time_mult, time_shift; + __u64 time_offset; /* * Hole for extension of the self monitor capabilities */ - __u64 __reserved[123]; /* align to 1k */ + __u64 __reserved[121]; /* align to 1k */ /* * Control data for the mmap() data buffer. diff --git a/kernel/events/core.c b/kernel/events/core.c index dcd4049e92fc..3a9c7d81afbf 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3220,17 +3220,22 @@ static int perf_event_index(struct perf_event *event) } static void calc_timer_values(struct perf_event *event, + u64 *now, u64 *enabled, u64 *running) { - u64 now, ctx_time; + u64 ctx_time; - now = perf_clock(); - ctx_time = event->shadow_ctx_time + now; + *now = perf_clock(); + ctx_time = event->shadow_ctx_time + *now; *enabled = ctx_time - event->tstamp_enabled; *running = ctx_time - event->tstamp_running; } +void __weak perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now) +{ +} + /* * Callers need to ensure there can be no nesting of this function, otherwise * the seqlock logic goes bad. We can not serialize this because the arch @@ -3240,7 +3245,7 @@ void perf_event_update_userpage(struct perf_event *event) { struct perf_event_mmap_page *userpg; struct ring_buffer *rb; - u64 enabled, running; + u64 enabled, running, now; rcu_read_lock(); /* @@ -3252,7 +3257,7 @@ void perf_event_update_userpage(struct perf_event *event) * because of locking issue as we can be called in * NMI context */ - calc_timer_values(event, &enabled, &running); + calc_timer_values(event, &now, &enabled, &running); rb = rcu_dereference(event->rb); if (!rb) goto unlock; @@ -3277,6 +3282,8 @@ void perf_event_update_userpage(struct perf_event *event) userpg->time_running = running + atomic64_read(&event->child_total_time_running); + perf_update_user_clock(userpg, now); + barrier(); ++userpg->lock; preempt_enable(); @@ -3763,7 +3770,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, static void perf_output_read(struct perf_output_handle *handle, struct perf_event *event) { - u64 enabled = 0, running = 0; + u64 enabled = 0, running = 0, now; u64 read_format = event->attr.read_format; /* @@ -3776,7 +3783,7 @@ static void perf_output_read(struct perf_output_handle *handle, * NMI context */ if (read_format & PERF_FORMAT_TOTAL_TIMES) - calc_timer_values(event, &enabled, &running); + calc_timer_values(event, &now, &enabled, &running); if (event->attr.read_format & PERF_FORMAT_GROUP) perf_output_read_group(handle, event, enabled, running); -- cgit v1.2.3 From cd42f4a3b2b1c4cbd997363dc57821953d73fd87 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Thu, 15 Dec 2011 10:48:12 -0800 Subject: HWPOISON: Clean up memory_failure() vs. __memory_failure() There is only one caller of memory_failure(), all other users call __memory_failure() and pass in the flags argument explicitly. The lone user of memory_failure() will soon need to pass flags too. Add flags argument to the callsite in mce.c. Delete the old memory_failure() function, and then rename __memory_failure() without the leading "__". Provide clearer message when action optional memory errors are ignored. Acked-by: Borislav Petkov Signed-off-by: Tony Luck --- arch/x86/kernel/cpu/mcheck/mce.c | 12 +++++++---- drivers/base/memory.c | 2 +- include/linux/mm.h | 3 +-- mm/hwpoison-inject.c | 4 ++-- mm/madvise.c | 2 +- mm/memory-failure.c | 46 ++++++++++++++++++---------------------- 6 files changed, 34 insertions(+), 35 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 2af127d4c3d1..1a08ce5f345f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1046,11 +1046,15 @@ out: } EXPORT_SYMBOL_GPL(do_machine_check); -/* dummy to break dependency. actual code is in mm/memory-failure.c */ -void __attribute__((weak)) memory_failure(unsigned long pfn, int vector) +#ifndef CONFIG_MEMORY_FAILURE +int memory_failure(unsigned long pfn, int vector, int flags) { - printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn); + printk(KERN_ERR "Uncorrected memory error in page 0x%lx ignored\n" + "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", pfn); + + return 0; } +#endif /* * Called after mce notification in process context. This code @@ -1068,7 +1072,7 @@ void mce_notify_process(void) unsigned long pfn; mce_notify_irq(); while (mce_ring_get(&pfn)) - memory_failure(pfn, MCE_VECTOR); + memory_failure(pfn, MCE_VECTOR, 0); } static void mce_process_work(struct work_struct *dummy) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 8272d92d22c0..9a924440053f 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -474,7 +474,7 @@ store_hard_offline_page(struct class *class, if (strict_strtoull(buf, 0, &pfn) < 0) return -EINVAL; pfn >>= PAGE_SHIFT; - ret = __memory_failure(pfn, 0, 0); + ret = memory_failure(pfn, 0, 0); return ret ? ret : count; } diff --git a/include/linux/mm.h b/include/linux/mm.h index 4baadd18f4ad..bcc523474724 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1607,8 +1607,7 @@ void vmemmap_populate_print_last(void); enum mf_flags { MF_COUNT_INCREASED = 1 << 0, }; -extern void memory_failure(unsigned long pfn, int trapno); -extern int __memory_failure(unsigned long pfn, int trapno, int flags); +extern int memory_failure(unsigned long pfn, int trapno, int flags); extern void memory_failure_queue(unsigned long pfn, int trapno, int flags); extern int unpoison_memory(unsigned long pfn); extern int sysctl_memory_failure_early_kill; diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index c7fc7fd00e32..cc448bb983ba 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -45,7 +45,7 @@ static int hwpoison_inject(void *data, u64 val) * do a racy check with elevated page count, to make sure PG_hwpoison * will only be set for the targeted owner (or on a free page). * We temporarily take page lock for try_get_mem_cgroup_from_page(). - * __memory_failure() will redo the check reliably inside page lock. + * memory_failure() will redo the check reliably inside page lock. */ lock_page(hpage); err = hwpoison_filter(hpage); @@ -55,7 +55,7 @@ static int hwpoison_inject(void *data, u64 val) inject: printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn); - return __memory_failure(pfn, 18, MF_COUNT_INCREASED); + return memory_failure(pfn, 18, MF_COUNT_INCREASED); } static int hwpoison_unpoison(void *data, u64 val) diff --git a/mm/madvise.c b/mm/madvise.c index 74bf193eff04..f5ab745672b7 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -251,7 +251,7 @@ static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end) printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", page_to_pfn(p), start); /* Ignore return value for now */ - __memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED); + memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED); } return ret; } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 06d3479513aa..ab259bb0adc5 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -984,7 +984,25 @@ static void clear_page_hwpoison_huge_page(struct page *hpage) ClearPageHWPoison(hpage + i); } -int __memory_failure(unsigned long pfn, int trapno, int flags) +/** + * memory_failure - Handle memory failure of a page. + * @pfn: Page Number of the corrupted page + * @trapno: Trap number reported in the signal to user space. + * @flags: fine tune action taken + * + * This function is called by the low level machine check code + * of an architecture when it detects hardware memory corruption + * of a page. It tries its best to recover, which includes + * dropping pages, killing processes etc. + * + * The function is primarily of use for corruptions that + * happen outside the current execution context (e.g. when + * detected by a background scrubber) + * + * Must run in process context (e.g. a work queue) with interrupts + * enabled and no spinlocks hold. + */ +int memory_failure(unsigned long pfn, int trapno, int flags) { struct page_state *ps; struct page *p; @@ -1156,29 +1174,7 @@ out: unlock_page(hpage); return res; } -EXPORT_SYMBOL_GPL(__memory_failure); - -/** - * memory_failure - Handle memory failure of a page. - * @pfn: Page Number of the corrupted page - * @trapno: Trap number reported in the signal to user space. - * - * This function is called by the low level machine check code - * of an architecture when it detects hardware memory corruption - * of a page. It tries its best to recover, which includes - * dropping pages, killing processes etc. - * - * The function is primarily of use for corruptions that - * happen outside the current execution context (e.g. when - * detected by a background scrubber) - * - * Must run in process context (e.g. a work queue) with interrupts - * enabled and no spinlocks hold. - */ -void memory_failure(unsigned long pfn, int trapno) -{ - __memory_failure(pfn, trapno, 0); -} +EXPORT_SYMBOL_GPL(memory_failure); #define MEMORY_FAILURE_FIFO_ORDER 4 #define MEMORY_FAILURE_FIFO_SIZE (1 << MEMORY_FAILURE_FIFO_ORDER) @@ -1251,7 +1247,7 @@ static void memory_failure_work_func(struct work_struct *work) spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); if (!gotten) break; - __memory_failure(entry.pfn, entry.trapno, entry.flags); + memory_failure(entry.pfn, entry.trapno, entry.flags); } } -- cgit v1.2.3 From 85f92694affa7dba7f1978666a69552b5dfc628e Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 13 Dec 2011 09:48:13 -0800 Subject: x86/mce: Create helper function to save addr/misc when needed The MCI_STATUS_MISCV and MCI_STATUS_ADDRV bits in the bank status registers define whether the MISC and ADDR registers respectively contain valid data - provide a helper function to check these bits and read the registers when needed. In addition, processors that support software error recovery (as indicated by the MCG_SER_P bit in the MCG_CAP register) may include some undefined bits in the ADDR register - mask these out. Acked-by: Borislav Petkov Signed-off-by: Tony Luck --- arch/x86/kernel/cpu/mcheck/mce.c | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 1a08ce5f345f..2f1c200f05e6 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -492,6 +492,27 @@ static void mce_report_event(struct pt_regs *regs) irq_work_queue(&__get_cpu_var(mce_irq_work)); } +/* + * Read ADDR and MISC registers. + */ +static void mce_read_aux(struct mce *m, int i) +{ + if (m->status & MCI_STATUS_MISCV) + m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); + if (m->status & MCI_STATUS_ADDRV) { + m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); + + /* + * Mask the reported address by the reported granularity. + */ + if (mce_ser && (m->status & MCI_STATUS_MISCV)) { + u8 shift = MCI_MISC_ADDR_LSB(m->misc); + m->addr >>= shift; + m->addr <<= shift; + } + } +} + DEFINE_PER_CPU(unsigned, mce_poll_count); /* @@ -542,10 +563,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC))) continue; - if (m.status & MCI_STATUS_MISCV) - m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); - if (m.status & MCI_STATUS_ADDRV) - m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); + mce_read_aux(&m, i); if (!(flags & MCP_TIMESTAMP)) m.tsc = 0; @@ -981,10 +999,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) if (severity == MCE_AR_SEVERITY) kill_it = 1; - if (m.status & MCI_STATUS_MISCV) - m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); - if (m.status & MCI_STATUS_ADDRV) - m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); + mce_read_aux(&m, i); /* * Action optional error. Queue address for later processing. -- cgit v1.2.3 From af104e394e17e328df85c25a9e21448539725b67 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 14 Dec 2011 15:55:20 -0800 Subject: x86/mce: Add mechanism to safely save information in MCE handler Machine checks on Intel cpus interrupt execution on all cpus, regardless of interrupt masking. We have a need to save some data about the cause of the machine check (physical address) in the machine check handler that can be retrieved later to attempt recovery in a more flexible execution state. Signed-off-by: Tony Luck --- arch/x86/kernel/cpu/mcheck/mce.c | 43 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 2f1c200f05e6..e1579c5a71da 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -886,6 +886,49 @@ static void mce_clear_state(unsigned long *toclear) } } +/* + * Need to save faulting physical address associated with a process + * in the machine check handler some place where we can grab it back + * later in mce_notify_process() + */ +#define MCE_INFO_MAX 16 + +struct mce_info { + atomic_t inuse; + struct task_struct *t; + __u64 paddr; +} mce_info[MCE_INFO_MAX]; + +static void mce_save_info(__u64 addr) +{ + struct mce_info *mi; + + for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) { + if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) { + mi->t = current; + mi->paddr = addr; + return; + } + } + + mce_panic("Too many concurrent recoverable errors", NULL, NULL); +} + +static struct mce_info *mce_find_info(void) +{ + struct mce_info *mi; + + for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) + if (atomic_read(&mi->inuse) && mi->t == current) + return mi; + return NULL; +} + +static void mce_clear_info(struct mce_info *mi) +{ + atomic_set(&mi->inuse, 0); +} + /* * The actual machine check handler. This only handles real * exceptions when something got corrupted coming in through int 18. -- cgit v1.2.3 From a8c321fbf9aeced45519248e5901af8cbc240510 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 3 Jan 2012 11:45:45 -0800 Subject: x86/mce: Handle "action required" errors All non-urgent actions (reporting low severity errors and handling "action-optional" errors) are now handled by a work queue. This means that TIF_MCE_NOTIFY can be used to block execution for a thread experiencing an "action-required" fault until we get all cpus out of the machine check handler (and the thread that hit the fault into mce_notify_process(). We use the new mce_{save,find,clear}_info() API to get information from do_machine_check() to mce_notify_process(), and then use the newly improved memory_failure(..., MF_ACTION_REQUIRED) to handle the error (possibly signalling the process). Update some comments to make the new code flows clearer. Signed-off-by: Tony Luck --- arch/x86/kernel/cpu/mcheck/mce.c | 95 ++++++++++++++++++++++------------------ 1 file changed, 53 insertions(+), 42 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index e1579c5a71da..56e4e79387c3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -982,7 +982,9 @@ void do_machine_check(struct pt_regs *regs, long error_code) barrier(); /* - * When no restart IP must always kill or panic. + * When no restart IP might need to kill or panic. + * Assume the worst for now, but if we find the + * severity is MCE_AR_SEVERITY we have other options. */ if (!(m.mcgstatus & MCG_STATUS_RIPV)) kill_it = 1; @@ -1036,12 +1038,6 @@ void do_machine_check(struct pt_regs *regs, long error_code) continue; } - /* - * Kill on action required. - */ - if (severity == MCE_AR_SEVERITY) - kill_it = 1; - mce_read_aux(&m, i); /* @@ -1062,6 +1058,9 @@ void do_machine_check(struct pt_regs *regs, long error_code) } } + /* mce_clear_state will clear *final, save locally for use later */ + m = *final; + if (!no_way_out) mce_clear_state(toclear); @@ -1073,27 +1072,22 @@ void do_machine_check(struct pt_regs *regs, long error_code) no_way_out = worst >= MCE_PANIC_SEVERITY; /* - * If we have decided that we just CAN'T continue, and the user - * has not set tolerant to an insane level, give up and die. - * - * This is mainly used in the case when the system doesn't - * support MCE broadcasting or it has been disabled. + * At insane "tolerant" levels we take no action. Otherwise + * we only die if we have no other choice. For less serious + * issues we try to recover, or limit damage to the current + * process. */ - if (no_way_out && tolerant < 3) - mce_panic("Fatal machine check on current CPU", final, msg); - - /* - * If the error seems to be unrecoverable, something should be - * done. Try to kill as little as possible. If we can kill just - * one task, do that. If the user has set the tolerance very - * high, don't try to do anything at all. - */ - - if (kill_it && tolerant < 3) - force_sig(SIGBUS, current); - - /* notify userspace ASAP */ - set_thread_flag(TIF_MCE_NOTIFY); + if (tolerant < 3) { + if (no_way_out) + mce_panic("Fatal machine check on current CPU", &m, msg); + if (worst == MCE_AR_SEVERITY) { + /* schedule action before return to userland */ + mce_save_info(m.addr); + set_thread_flag(TIF_MCE_NOTIFY); + } else if (kill_it) { + force_sig(SIGBUS, current); + } + } if (worst > 0) mce_report_event(regs); @@ -1107,6 +1101,8 @@ EXPORT_SYMBOL_GPL(do_machine_check); #ifndef CONFIG_MEMORY_FAILURE int memory_failure(unsigned long pfn, int vector, int flags) { + /* mce_severity() should not hand us an ACTION_REQUIRED error */ + BUG_ON(flags & MF_ACTION_REQUIRED); printk(KERN_ERR "Uncorrected memory error in page 0x%lx ignored\n" "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", pfn); @@ -1115,27 +1111,44 @@ int memory_failure(unsigned long pfn, int vector, int flags) #endif /* - * Called after mce notification in process context. This code - * is allowed to sleep. Call the high level VM handler to process - * any corrupted pages. - * Assume that the work queue code only calls this one at a time - * per CPU. - * Note we don't disable preemption, so this code might run on the wrong - * CPU. In this case the event is picked up by the scheduled work queue. - * This is merely a fast path to expedite processing in some common - * cases. + * Called in process context that interrupted by MCE and marked with + * TIF_MCE_NOTIFY, just before returning to erroneous userland. + * This code is allowed to sleep. + * Attempt possible recovery such as calling the high level VM handler to + * process any corrupted pages, and kill/signal current process if required. + * Action required errors are handled here. */ void mce_notify_process(void) { unsigned long pfn; - mce_notify_irq(); - while (mce_ring_get(&pfn)) - memory_failure(pfn, MCE_VECTOR, 0); + struct mce_info *mi = mce_find_info(); + + if (!mi) + mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL); + pfn = mi->paddr >> PAGE_SHIFT; + + clear_thread_flag(TIF_MCE_NOTIFY); + + pr_err("Uncorrected hardware memory error in user-access at %llx", + mi->paddr); + if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0) { + pr_err("Memory error not recovered"); + force_sig(SIGBUS, current); + } + mce_clear_info(mi); } +/* + * Action optional processing happens here (picking up + * from the list of faulting pages that do_machine_check() + * placed into the "ring"). + */ static void mce_process_work(struct work_struct *dummy) { - mce_notify_process(); + unsigned long pfn; + + while (mce_ring_get(&pfn)) + memory_failure(pfn, MCE_VECTOR, 0); } #ifdef CONFIG_X86_MCE_INTEL @@ -1225,8 +1238,6 @@ int mce_notify_irq(void) /* Not more than two messages every minute */ static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); - clear_thread_flag(TIF_MCE_NOTIFY); - if (test_and_clear_bit(0, &mce_need_notify)) { /* wake processes polling /dev/mcelog */ wake_up_interruptible(&mce_chrdev_wait); -- cgit v1.2.3 From 5f7b88d51e89771f64c15903b96b5933dd0bc6d8 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 3 Jan 2012 11:48:04 -0800 Subject: x86/mce: Recognise machine check bank signature for data path error Action required data path signature is defined in table 15-19 of SDM: +-----------------------------------------------------------------------------+ | SRAR Error | Valid | OVER | UC | EN | MISCV | ADDRV | PCC | S | AR | MCACOD | | Data Load | 1 | 0 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 0x134 | +-----------------------------------------------------------------------------+ Recognise this, and pass MCE_AR_SEVERITY code back to do_machine_check() if we have the action handler configured (CONFIG_MEMORY_FAILURE=y) Acked-by: Borislav Petkov Signed-off-by: Tony Luck --- arch/x86/kernel/cpu/mcheck/mce-severity.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 7395d5f4272d..f6c92f99efa0 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -54,6 +54,7 @@ static struct severity { #define MASK(x, y) .mask = x, .result = y #define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S) #define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR) +#define MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV) #define MCACOD 0xffff MCESEV( @@ -102,11 +103,24 @@ static struct severity { SER, BITCLR(MCI_STATUS_S) ), - /* AR add known MCACODs here */ MCESEV( PANIC, "Action required with lost events", SER, BITSET(MCI_STATUS_OVER|MCI_UC_SAR) ), + + /* known AR MCACODs: */ +#ifdef CONFIG_MEMORY_FAILURE + MCESEV( + KEEP, "HT thread notices Action required: data load error", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|0x0134), + MCGMASK(MCG_STATUS_EIPV, 0) + ), + MCESEV( + AR, "Action required: data load error", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|0x0134), + USER + ), +#endif MCESEV( PANIC, "Action required: unknown MCACOD", SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR) -- cgit v1.2.3 From b3eea29c189a0e3e2ac921e85fabfa4989ee58d7 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Thu, 26 Jan 2012 17:32:04 +0000 Subject: x86/ioapic: Use legacy_pic to set correct gsi-irq mapping Using compile time NR_LEGACY_IRQS causes the wrong gsi-irq mapping on non-PC platforms, such as Moorestown. This patch uses legacy_pic abstraction to set the correct number of legacy interrupts at runtime. For Moorestown, nr_legacy_irqs = 0. We have 1:1 mapping for gsi-irq even within the legacy irq range. Signed-off-by: Jacob Pan Signed-off-by: Dirk Brandewie Link: http://lkml.kernel.org/n/tip-kzvj4xp9tmicuoqoh2w05iay@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index fb072754bc1d..9e753663f0d1 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1010,7 +1010,7 @@ static int pin_2_irq(int idx, int apic, int pin) } else { u32 gsi = gsi_cfg->gsi_base + pin; - if (gsi >= NR_IRQS_LEGACY) + if (gsi >= legacy_pic->nr_legacy_irqs) irq = gsi; else irq = gsi_top + gsi; @@ -3610,7 +3610,7 @@ static void __init probe_nr_irqs_gsi(void) { int nr; - nr = gsi_top + NR_IRQS_LEGACY; + nr = gsi_top + legacy_pic->nr_legacy_irqs; if (nr > nr_irqs_gsi) nr_irqs_gsi = nr; -- cgit v1.2.3 From d450c088fb00d5a744b1fe8648a488035a10a03c Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Thu, 26 Jan 2012 17:32:33 +0000 Subject: x86/mrst: Set ISA bus type for fake MP IRQs We use MP IRQs for SFI presented timer interrupts, we should also set mp_bus_not_pci for MP_ISA_BUS so that pin_2_irq mapping is correct. Signed-off-by: Jacob Pan Signed-off-by: Dirk Brandewie Link: http://lkml.kernel.org/n/tip-8h3rc1igpp8ir94aas69qmhk@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 9e753663f0d1..fb072754bc1d 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1010,7 +1010,7 @@ static int pin_2_irq(int idx, int apic, int pin) } else { u32 gsi = gsi_cfg->gsi_base + pin; - if (gsi >= legacy_pic->nr_legacy_irqs) + if (gsi >= NR_IRQS_LEGACY) irq = gsi; else irq = gsi_top + gsi; @@ -3610,7 +3610,7 @@ static void __init probe_nr_irqs_gsi(void) { int nr; - nr = gsi_top + legacy_pic->nr_legacy_irqs; + nr = gsi_top + NR_IRQS_LEGACY; if (nr > nr_irqs_gsi) nr_irqs_gsi = nr; -- cgit v1.2.3 From 08dda402d60a721ac94e79efd7646b332be3e3b2 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Thu, 26 Jan 2012 16:02:22 -0800 Subject: x86/mce: Replace hard coded hex constants with symbolic defines Magic constants like 0x0134 in code just invite questions on where they come from, what they mean, can they be changed. Provide #defines for the architecturally defined MCACOD values with a reference to the Intel Software Developers manual which describes them. Signed-off-by: Tony Luck --- arch/x86/kernel/cpu/mcheck/mce-severity.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index f6c92f99efa0..0c82091b1652 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -56,6 +56,12 @@ static struct severity { #define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR) #define MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV) #define MCACOD 0xffff +/* Architecturally defined codes from SDM Vol. 3B Chapter 15 */ +#define MCACOD_SCRUB 0x00C0 /* 0xC0-0xCF Memory Scrubbing */ +#define MCACOD_SCRUBMSK 0xfff0 +#define MCACOD_L3WB 0x017A /* L3 Explicit Writeback */ +#define MCACOD_DATA 0x0134 /* Data Load */ +#define MCACOD_INSTR 0x0150 /* Instruction Fetch */ MCESEV( NO, "Invalid", @@ -112,12 +118,12 @@ static struct severity { #ifdef CONFIG_MEMORY_FAILURE MCESEV( KEEP, "HT thread notices Action required: data load error", - SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|0x0134), + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), MCGMASK(MCG_STATUS_EIPV, 0) ), MCESEV( AR, "Action required: data load error", - SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|0x0134), + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), USER ), #endif @@ -129,11 +135,11 @@ static struct severity { /* known AO MCACODs: */ MCESEV( AO, "Action optional: memory scrubbing error", - SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|0xfff0, MCI_UC_S|0x00c0) + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD_SCRUBMSK, MCI_UC_S|MCACOD_SCRUB) ), MCESEV( AO, "Action optional: last level cache writeback error", - SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|0x017a) + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|MCACOD_L3WB) ), MCESEV( SOME, "Action optional: unknown MCACOD", -- cgit v1.2.3 From 644e9cbbe3fc032cc92d0936057e166a994dc246 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 26 Jan 2012 00:09:05 +0100 Subject: Add driver auto probing for x86 features v4 There's a growing number of drivers that support a specific x86 feature or CPU. Currently loading these drivers currently on a generic distribution requires various driver specific hacks and it often doesn't work. This patch adds auto probing for drivers based on the x86 cpuid information, in particular based on vendor/family/model number and also based on CPUID feature bits. For example a common issue is not loading the SSE 4.2 accelerated CRC module: this can significantly lower the performance of BTRFS which relies on fast CRC. Another issue is loading the right CPUFREQ driver for the current CPU. Currently distributions often try all all possible driver until one sticks, which is not really a good way to do this. It works with existing udev without any changes. The code exports the x86 information as a generic string in sysfs that can be matched by udev's pattern matching. This scheme does not support numeric ranges, so if you want to handle e.g. ranges of model numbers they have to be encoded in ASCII or simply all models or families listed. Fixing that would require changing udev. Another issue is that udev will happily load all drivers that match, there is currently no nice way to stop a specific driver from being loaded if it's not needed (e.g. if you don't need fast CRC) But there are not that many cpu specific drivers around and they're all not that bloated, so this isn't a particularly serious issue. Originally this patch added the modalias to the normal cpu sysdevs. However sysdevs don't have all the infrastructure needed for udev, so it couldn't really autoload drivers. This patch instead adds the CPU modaliases to the cpuid devices, which are real devices with full support for udev. This implies that the cpuid driver has to be loaded to use this. This patch just adds infrastructure, some driver conversions in followups. Thanks to Kay for helping with some sysfs magic. v2: Constifcation, some updates v4: (trenn@suse.de): - Use kzalloc instead of kmalloc to terminate modalias buffer - Use uppercase hex values to match correctly against hex values containing letters Cc: Dave Jones Cc: Kay Sievers Cc: Jen Axboe Cc: Herbert Xu Cc: Huang Ying Cc: Len Brown Signed-off-by: Andi Kleen Signed-off-by: Thomas Renninger Acked-by: H. Peter Anvin Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/cpu_device_id.h | 13 ++++++++ arch/x86/kernel/cpu/Makefile | 1 + arch/x86/kernel/cpu/match.c | 48 +++++++++++++++++++++++++++++ arch/x86/kernel/cpuid.c | 59 +++++++++++++++++++++++++++++++++++- include/linux/mod_devicetable.h | 21 +++++++++++++ scripts/mod/file2alias.c | 24 +++++++++++++++ 6 files changed, 165 insertions(+), 1 deletion(-) create mode 100644 arch/x86/include/asm/cpu_device_id.h create mode 100644 arch/x86/kernel/cpu/match.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/cpu_device_id.h b/arch/x86/include/asm/cpu_device_id.h new file mode 100644 index 000000000000..ff501e511d91 --- /dev/null +++ b/arch/x86/include/asm/cpu_device_id.h @@ -0,0 +1,13 @@ +#ifndef _CPU_DEVICE_ID +#define _CPU_DEVICE_ID 1 + +/* + * Declare drivers belonging to specific x86 CPUs + * Similar in spirit to pci_device_id and related PCI functions + */ + +#include + +extern const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match); + +#endif diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 25f24dccdcfa..6ab6aa2fdfdd 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -16,6 +16,7 @@ obj-y := intel_cacheinfo.o scattered.o topology.o obj-y += proc.o capflags.o powerflags.o common.o obj-y += vmware.o hypervisor.o sched.o mshyperv.o obj-y += rdrand.o +obj-y += match.o obj-$(CONFIG_X86_32) += bugs.o obj-$(CONFIG_X86_64) += bugs_64.o diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c new file mode 100644 index 000000000000..7acc961422e7 --- /dev/null +++ b/arch/x86/kernel/cpu/match.c @@ -0,0 +1,48 @@ +#include +#include +#include +#include + +/** + * x86_match_cpu - match current CPU again an array of x86_cpu_ids + * @match: Pointer to array of x86_cpu_ids. Last entry terminated with + * {}. + * + * Return the entry if the current CPU matches the entries in the + * passed x86_cpu_id match table. Otherwise NULL. The match table + * contains vendor (X86_VENDOR_*), family, model and feature bits or + * respective wildcard entries. + * + * A typical table entry would be to match a specific CPU + * { X86_VENDOR_INTEL, 6, 0x12 } + * or to match a specific CPU feature + * { X86_FEATURE_MATCH(X86_FEATURE_FOOBAR) } + * + * Fields can be wildcarded with %X86_VENDOR_ANY, %X86_FAMILY_ANY, + * %X86_MODEL_ANY, %X86_FEATURE_ANY or 0 (except for vendor) + * + * Arrays used to match for this should also be declared using + * MODULE_DEVICE_TABLE(x86_cpu, ...) + * + * This always matches against the boot cpu, assuming models and features are + * consistent over all CPUs. + */ +const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match) +{ + const struct x86_cpu_id *m; + struct cpuinfo_x86 *c = &boot_cpu_data; + + for (m = match; m->vendor | m->family | m->model | m->feature; m++) { + if (m->vendor != X86_VENDOR_ANY && c->x86_vendor != m->vendor) + continue; + if (m->family != X86_FAMILY_ANY && c->x86 != m->family) + continue; + if (m->model != X86_MODEL_ANY && c->x86_model != m->model) + continue; + if (m->feature != X86_FEATURE_ANY && !cpu_has(c, m->feature)) + continue; + return m; + } + return NULL; +} +EXPORT_SYMBOL(x86_match_cpu); diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index a524353d93f2..7c89880eefd0 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -138,13 +139,57 @@ static const struct file_operations cpuid_fops = { .open = cpuid_open, }; +static ssize_t print_cpu_modalias(struct device *dev, + struct device_attribute *attr, + char *bufptr) +{ + int size = PAGE_SIZE; + int i, n; + char *buf = bufptr; + + n = snprintf(buf, size, "x86cpu:vendor:%04X:family:" + "%04X:model:%04X:feature:", + boot_cpu_data.x86_vendor, + boot_cpu_data.x86, + boot_cpu_data.x86_model); + size -= n; + buf += n; + size -= 2; + for (i = 0; i < NCAPINTS*32; i++) { + if (boot_cpu_has(i)) { + n = snprintf(buf, size, ",%04X", i); + if (n < 0) { + WARN(1, "x86 features overflow page\n"); + break; + } + size -= n; + buf += n; + } + } + *buf++ = ','; + *buf++ = '\n'; + return buf - bufptr; +} + +static DEVICE_ATTR(modalias, 0444, print_cpu_modalias, NULL); + static __cpuinit int cpuid_device_create(int cpu) { struct device *dev; + int err; dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), NULL, "cpu%d", cpu); - return IS_ERR(dev) ? PTR_ERR(dev) : 0; + if (IS_ERR(dev)) + return PTR_ERR(dev); + + err = device_create_file(dev, &dev_attr_modalias); + if (err) { + /* keep device around on error. attribute is optional. */ + err = 0; + } + + return 0; } static void cpuid_device_destroy(int cpu) @@ -182,6 +227,17 @@ static char *cpuid_devnode(struct device *dev, umode_t *mode) return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt)); } +static int cpuid_dev_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + char *buf = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (buf) { + print_cpu_modalias(NULL, NULL, buf); + add_uevent_var(env, "MODALIAS=%s", buf); + kfree(buf); + } + return 0; +} + static int __init cpuid_init(void) { int i, err = 0; @@ -200,6 +256,7 @@ static int __init cpuid_init(void) goto out_chrdev; } cpuid_class->devnode = cpuid_devnode; + cpuid_class->dev_uevent = cpuid_dev_uevent; for_each_online_cpu(i) { err = cpuid_device_create(i); if (err != 0) diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index b29e7f6f8fa5..cff2cc08f45a 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -571,4 +571,25 @@ struct amba_id { #endif }; +/* + * Match x86 CPUs for CPU specific drivers. + * See documentation of "x86_match_cpu" for details. + */ + +struct x86_cpu_id { + __u16 vendor; + __u16 family; + __u16 model; + __u16 feature; /* bit index */ + kernel_ulong_t driver_data; +}; + +#define X86_FEATURE_MATCH(x) \ + { X86_VENDOR_ANY, X86_FAMILY_ANY, X86_MODEL_ANY, x } + +#define X86_VENDOR_ANY 0xffff +#define X86_FAMILY_ANY 0 +#define X86_MODEL_ANY 0 +#define X86_FEATURE_ANY 0 /* Same as FPU, you can't test for that */ + #endif /* LINUX_MOD_DEVICETABLE_H */ diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c index c0e14b3f2306..026ba38759ca 100644 --- a/scripts/mod/file2alias.c +++ b/scripts/mod/file2alias.c @@ -1013,6 +1013,30 @@ static int do_amba_entry(const char *filename, } ADD_TO_DEVTABLE("amba", struct amba_id, do_amba_entry); +/* LOOKS like x86cpu:vendor:VVVV:family:FFFF:model:MMMM:feature:*,FEAT,* + * All fields are numbers. It would be nicer to use strings for vendor + * and feature, but getting those out of the build system here is too + * complicated. + */ + +static int do_x86cpu_entry(const char *filename, struct x86_cpu_id *id, + char *alias) +{ + id->feature = TO_NATIVE(id->feature); + id->family = TO_NATIVE(id->family); + id->model = TO_NATIVE(id->model); + id->vendor = TO_NATIVE(id->vendor); + + strcpy(alias, "x86cpu:"); + ADD(alias, "vendor:", id->vendor != X86_VENDOR_ANY, id->vendor); + ADD(alias, ":family:", id->family != X86_FAMILY_ANY, id->family); + ADD(alias, ":model:", id->model != X86_MODEL_ANY, id->model); + ADD(alias, ":feature:*,", id->feature != X86_FEATURE_ANY, id->feature); + strcat(alias, ",*"); + return 1; +} +ADD_TO_DEVTABLE("x86cpu", struct x86_cpu_id, do_x86cpu_entry); + /* Does namelen bytes of name exactly match the symbol? */ static bool sym_is(const char *name, unsigned namelen, const char *symbol) { -- cgit v1.2.3 From 2f1e097e24defe64a86535b53768f5c8ab0368d1 Mon Sep 17 00:00:00 2001 From: Thomas Renninger Date: Thu, 26 Jan 2012 00:09:11 +0100 Subject: X86: Introduce HW-Pstate scattered cpuid feature It is rather similar to CPB (boot capability) feature and exists since fam10h (can be looked up in AMD's BKDG). The feature is needed for powernow-k8 to cleanup init functions and to provide proper autoloading matching with the new x86cpu modalias feature. Cc: Kay Sievers Cc: Dave Jones Cc: Borislav Petkov Signed-off-by: Thomas Renninger Acked-by: H. Peter Anvin Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/cpufeature.h | 1 + arch/x86/kernel/cpu/scattered.c | 1 + 2 files changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 17c5d4bdee5e..67b0910ebbb8 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -176,6 +176,7 @@ #define X86_FEATURE_PLN (7*32+ 5) /* Intel Power Limit Notification */ #define X86_FEATURE_PTS (7*32+ 6) /* Intel Package Thermal Status */ #define X86_FEATURE_DTS (7*32+ 7) /* Digital Thermal Sensor */ +#define X86_FEATURE_HW_PSTATE (7*32+ 8) /* AMD HW-PState */ /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */ diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index c7f64e6f537a..addf9e82a7f2 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -40,6 +40,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 }, { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 }, + { X86_FEATURE_HW_PSTATE, CR_EDX, 7, 0x80000007, 0 }, { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 }, { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 }, { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 }, -- cgit v1.2.3 From 78ff123b05fb15beb1ad670372eea0d299d0b8af Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 26 Jan 2012 00:09:13 +0100 Subject: x86: autoload microcode driver on Intel and AMD systems v2 Don't try to describe the actual models for now. v2: Fix typo: X86_VENDOR_ANY -> X86_FAMILY_ANY (trenn) Signed-off-by: Andi Kleen Signed-off-by: Thomas Renninger Acked-by: H. Peter Anvin Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/microcode_core.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index fda91c307104..87a0f8688301 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -86,6 +86,7 @@ #include #include +#include MODULE_DESCRIPTION("Microcode Update Driver"); MODULE_AUTHOR("Tigran Aivazian "); @@ -504,6 +505,20 @@ static struct notifier_block __refdata mc_cpu_notifier = { .notifier_call = mc_cpu_callback, }; +#ifdef MODULE +/* Autoload on Intel and AMD systems */ +static const struct x86_cpu_id microcode_id[] = { +#ifdef CONFIG_MICROCODE_INTEL + { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, }, +#endif +#ifdef CONFIG_MICROCODE_AMD + { X86_VENDOR_AMD, X86_FAMILY_ANY, X86_MODEL_ANY, }, +#endif + {} +}; +MODULE_DEVICE_TABLE(x86cpu, microcode_id); +#endif + static int __init microcode_init(void) { struct cpuinfo_x86 *c = &cpu_data(0); -- cgit v1.2.3 From fad12ac8c8c2591c7f4e61d19b6a9d76cd49fafa Mon Sep 17 00:00:00 2001 From: Thomas Renninger Date: Thu, 26 Jan 2012 00:09:14 +0100 Subject: CPU: Introduce ARCH_HAS_CPU_AUTOPROBE and X86 parts This patch is based on Andi Kleen's work: Implement autoprobing/loading of modules serving CPU specific features (x86cpu autoloading). And Kay Siever's work to get rid of sysdev cpu structures and making use of struct device instead. Before, the cpuid driver had to be loaded to get the x86cpu autoloading feature. With this patch autoloading works through the /sys/devices/system/cpu object Cc: Kay Sievers Cc: Dave Jones Cc: Jens Axboe Cc: Herbert Xu Cc: Huang Ying Cc: Len Brown Acked-by: Andi Kleen Signed-off-by: Thomas Renninger Acked-by: H. Peter Anvin Signed-off-by: Greg Kroah-Hartman --- arch/x86/Kconfig | 3 +++ arch/x86/kernel/cpu/match.c | 44 +++++++++++++++++++++++++++++++++ arch/x86/kernel/cpuid.c | 59 +-------------------------------------------- drivers/base/cpu.c | 11 +++++++++ include/linux/cpu.h | 7 ++++++ 5 files changed, 66 insertions(+), 58 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 864cc6e6ac8e..6baa1e66e1bc 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -179,6 +179,9 @@ config ARCH_HAS_DEFAULT_IDLE config ARCH_HAS_CACHE_LINE_SIZE def_bool y +config ARCH_HAS_CPU_AUTOPROBE + def_bool y + config HAVE_SETUP_PER_CPU_AREA def_bool y diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c index 7acc961422e7..940e2d483076 100644 --- a/arch/x86/kernel/cpu/match.c +++ b/arch/x86/kernel/cpu/match.c @@ -2,6 +2,7 @@ #include #include #include +#include /** * x86_match_cpu - match current CPU again an array of x86_cpu_ids @@ -46,3 +47,46 @@ const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match) return NULL; } EXPORT_SYMBOL(x86_match_cpu); + +ssize_t arch_print_cpu_modalias(struct device *dev, + struct device_attribute *attr, + char *bufptr) +{ + int size = PAGE_SIZE; + int i, n; + char *buf = bufptr; + + n = snprintf(buf, size, "x86cpu:vendor:%04X:family:%04X:" + "model:%04X:feature:", + boot_cpu_data.x86_vendor, + boot_cpu_data.x86, + boot_cpu_data.x86_model); + size -= n; + buf += n; + size -= 2; + for (i = 0; i < NCAPINTS*32; i++) { + if (boot_cpu_has(i)) { + n = snprintf(buf, size, ",%04X", i); + if (n < 0) { + WARN(1, "x86 features overflow page\n"); + break; + } + size -= n; + buf += n; + } + } + *buf++ = ','; + *buf++ = '\n'; + return buf - bufptr; +} + +int arch_cpu_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + char *buf = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (buf) { + arch_print_cpu_modalias(NULL, NULL, buf); + add_uevent_var(env, "MODALIAS=%s", buf); + kfree(buf); + } + return 0; +} diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 7c89880eefd0..a524353d93f2 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -40,7 +40,6 @@ #include #include #include -#include #include #include @@ -139,57 +138,13 @@ static const struct file_operations cpuid_fops = { .open = cpuid_open, }; -static ssize_t print_cpu_modalias(struct device *dev, - struct device_attribute *attr, - char *bufptr) -{ - int size = PAGE_SIZE; - int i, n; - char *buf = bufptr; - - n = snprintf(buf, size, "x86cpu:vendor:%04X:family:" - "%04X:model:%04X:feature:", - boot_cpu_data.x86_vendor, - boot_cpu_data.x86, - boot_cpu_data.x86_model); - size -= n; - buf += n; - size -= 2; - for (i = 0; i < NCAPINTS*32; i++) { - if (boot_cpu_has(i)) { - n = snprintf(buf, size, ",%04X", i); - if (n < 0) { - WARN(1, "x86 features overflow page\n"); - break; - } - size -= n; - buf += n; - } - } - *buf++ = ','; - *buf++ = '\n'; - return buf - bufptr; -} - -static DEVICE_ATTR(modalias, 0444, print_cpu_modalias, NULL); - static __cpuinit int cpuid_device_create(int cpu) { struct device *dev; - int err; dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), NULL, "cpu%d", cpu); - if (IS_ERR(dev)) - return PTR_ERR(dev); - - err = device_create_file(dev, &dev_attr_modalias); - if (err) { - /* keep device around on error. attribute is optional. */ - err = 0; - } - - return 0; + return IS_ERR(dev) ? PTR_ERR(dev) : 0; } static void cpuid_device_destroy(int cpu) @@ -227,17 +182,6 @@ static char *cpuid_devnode(struct device *dev, umode_t *mode) return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt)); } -static int cpuid_dev_uevent(struct device *dev, struct kobj_uevent_env *env) -{ - char *buf = kzalloc(PAGE_SIZE, GFP_KERNEL); - if (buf) { - print_cpu_modalias(NULL, NULL, buf); - add_uevent_var(env, "MODALIAS=%s", buf); - kfree(buf); - } - return 0; -} - static int __init cpuid_init(void) { int i, err = 0; @@ -256,7 +200,6 @@ static int __init cpuid_init(void) goto out_chrdev; } cpuid_class->devnode = cpuid_devnode; - cpuid_class->dev_uevent = cpuid_dev_uevent; for_each_online_cpu(i) { err = cpuid_device_create(i); if (err != 0) diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index db87e78d7459..2a0c670c281d 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include "base.h" @@ -223,6 +224,9 @@ int __cpuinit register_cpu(struct cpu *cpu, int num) cpu->node_id = cpu_to_node(num); cpu->dev.id = num; cpu->dev.bus = &cpu_subsys; +#ifdef CONFIG_ARCH_HAS_CPU_AUTOPROBE + cpu->dev.bus->uevent = arch_cpu_uevent; +#endif error = device_register(&cpu->dev); if (!error && cpu->hotpluggable) register_cpu_control(cpu); @@ -247,6 +251,10 @@ struct device *get_cpu_device(unsigned cpu) } EXPORT_SYMBOL_GPL(get_cpu_device); +#ifdef CONFIG_ARCH_HAS_CPU_AUTOPROBE +static DEVICE_ATTR(modalias, 0444, arch_print_cpu_modalias, NULL); +#endif + static struct attribute *cpu_root_attrs[] = { #ifdef CONFIG_ARCH_CPU_PROBE_RELEASE &dev_attr_probe.attr, @@ -257,6 +265,9 @@ static struct attribute *cpu_root_attrs[] = { &cpu_attrs[2].attr.attr, &dev_attr_kernel_max.attr, &dev_attr_offline.attr, +#ifdef CONFIG_ARCH_HAS_CPU_AUTOPROBE + &dev_attr_modalias.attr, +#endif NULL }; diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 1f6587590a1a..6e53b4823d7f 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -44,6 +44,13 @@ extern ssize_t arch_cpu_release(const char *, size_t); #endif struct notifier_block; +#ifdef CONFIG_ARCH_HAS_CPU_AUTOPROBE +extern int arch_cpu_uevent(struct device *dev, struct kobj_uevent_env *env); +extern ssize_t arch_print_cpu_modalias(struct device *dev, + struct device_attribute *attr, + char *bufptr); +#endif + /* * CPU notifier priorities. */ -- cgit v1.2.3 From cf579dfb82550e34de7ccf3ef090d8b834ccd3a9 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 29 Jan 2012 20:38:29 +0100 Subject: PM / Sleep: Introduce "late suspend" and "early resume" of devices The current device suspend/resume phases during system-wide power transitions appear to be insufficient for some platforms that want to use the same callback routines for saving device states and related operations during runtime suspend/resume as well as during system suspend/resume. In principle, they could point their .suspend_noirq() and .resume_noirq() to the same callback routines as their .runtime_suspend() and .runtime_resume(), respectively, but at least some of them require device interrupts to be enabled while the code in those routines is running. It also makes sense to have device suspend-resume callbacks that will be executed with runtime PM disabled and with device interrupts enabled in case someone needs to run some special code in that context during system-wide power transitions. Apart from this, .suspend_noirq() and .resume_noirq() were introduced as a workaround for drivers using shared interrupts and failing to prevent their interrupt handlers from accessing suspended hardware. It appears to be better not to use them for other porposes, or we may have to deal with some serious confusion (which seems to be happening already). For the above reasons, introduce new device suspend/resume phases, "late suspend" and "early resume" (and analogously for hibernation) whose callback will be executed with runtime PM disabled and with device interrupts enabled and whose callback pointers generally may point to runtime suspend/resume routines. Signed-off-by: Rafael J. Wysocki Reviewed-by: Mark Brown Reviewed-by: Kevin Hilman --- Documentation/power/devices.txt | 93 +++++++++------ arch/x86/kernel/apm_32.c | 11 +- drivers/base/power/main.c | 247 ++++++++++++++++++++++++++++++++++++---- drivers/xen/manage.c | 6 +- include/linux/pm.h | 43 +++++-- include/linux/suspend.h | 4 + kernel/kexec.c | 8 +- kernel/power/hibernate.c | 24 ++-- kernel/power/main.c | 8 +- kernel/power/suspend.c | 4 +- 10 files changed, 357 insertions(+), 91 deletions(-) (limited to 'arch/x86/kernel') diff --git a/Documentation/power/devices.txt b/Documentation/power/devices.txt index 20af7def23c8..872815cd41d3 100644 --- a/Documentation/power/devices.txt +++ b/Documentation/power/devices.txt @@ -96,6 +96,12 @@ struct dev_pm_ops { int (*thaw)(struct device *dev); int (*poweroff)(struct device *dev); int (*restore)(struct device *dev); + int (*suspend_late)(struct device *dev); + int (*resume_early)(struct device *dev); + int (*freeze_late)(struct device *dev); + int (*thaw_early)(struct device *dev); + int (*poweroff_late)(struct device *dev); + int (*restore_early)(struct device *dev); int (*suspend_noirq)(struct device *dev); int (*resume_noirq)(struct device *dev); int (*freeze_noirq)(struct device *dev); @@ -305,7 +311,7 @@ Entering System Suspend ----------------------- When the system goes into the standby or memory sleep state, the phases are: - prepare, suspend, suspend_noirq. + prepare, suspend, suspend_late, suspend_noirq. 1. The prepare phase is meant to prevent races by preventing new devices from being registered; the PM core would never know that all the @@ -324,7 +330,12 @@ When the system goes into the standby or memory sleep state, the phases are: appropriate low-power state, depending on the bus type the device is on, and they may enable wakeup events. - 3. The suspend_noirq phase occurs after IRQ handlers have been disabled, + 3 For a number of devices it is convenient to split suspend into the + "quiesce device" and "save device state" phases, in which cases + suspend_late is meant to do the latter. It is always executed after + runtime power management has been disabled for all devices. + + 4. The suspend_noirq phase occurs after IRQ handlers have been disabled, which means that the driver's interrupt handler will not be called while the callback method is running. The methods should save the values of the device's registers that weren't saved previously and finally put the @@ -359,7 +370,7 @@ Leaving System Suspend ---------------------- When resuming from standby or memory sleep, the phases are: - resume_noirq, resume, complete. + resume_noirq, resume_early, resume, complete. 1. The resume_noirq callback methods should perform any actions needed before the driver's interrupt handlers are invoked. This generally @@ -375,14 +386,18 @@ When resuming from standby or memory sleep, the phases are: device driver's ->pm.resume_noirq() method to perform device-specific actions. - 2. The resume methods should bring the the device back to its operating + 2. The resume_early methods should prepare devices for the execution of + the resume methods. This generally involves undoing the actions of the + preceding suspend_late phase. + + 3 The resume methods should bring the the device back to its operating state, so that it can perform normal I/O. This generally involves undoing the actions of the suspend phase. - 3. The complete phase uses only a bus callback. The method should undo the - actions of the prepare phase. Note, however, that new children may be - registered below the device as soon as the resume callbacks occur; it's - not necessary to wait until the complete phase. + 4. The complete phase should undo the actions of the prepare phase. Note, + however, that new children may be registered below the device as soon as + the resume callbacks occur; it's not necessary to wait until the + complete phase. At the end of these phases, drivers should be as functional as they were before suspending: I/O can be performed using DMA and IRQs, and the relevant clocks are @@ -429,8 +444,8 @@ an image of the system memory while everything is stable, reactivate all devices (thaw), write the image to permanent storage, and finally shut down the system (poweroff). The phases used to accomplish this are: - prepare, freeze, freeze_noirq, thaw_noirq, thaw, complete, - prepare, poweroff, poweroff_noirq + prepare, freeze, freeze_late, freeze_noirq, thaw_noirq, thaw_early, + thaw, complete, prepare, poweroff, poweroff_late, poweroff_noirq 1. The prepare phase is discussed in the "Entering System Suspend" section above. @@ -441,7 +456,11 @@ system (poweroff). The phases used to accomplish this are: save time it's best not to do so. Also, the device should not be prepared to generate wakeup events. - 3. The freeze_noirq phase is analogous to the suspend_noirq phase discussed + 3. The freeze_late phase is analogous to the suspend_late phase described + above, except that the device should not be put in a low-power state and + should not be allowed to generate wakeup events by it. + + 4. The freeze_noirq phase is analogous to the suspend_noirq phase discussed above, except again that the device should not be put in a low-power state and should not be allowed to generate wakeup events. @@ -449,15 +468,19 @@ At this point the system image is created. All devices should be inactive and the contents of memory should remain undisturbed while this happens, so that the image forms an atomic snapshot of the system state. - 4. The thaw_noirq phase is analogous to the resume_noirq phase discussed + 5. The thaw_noirq phase is analogous to the resume_noirq phase discussed above. The main difference is that its methods can assume the device is in the same state as at the end of the freeze_noirq phase. - 5. The thaw phase is analogous to the resume phase discussed above. Its + 6. The thaw_early phase is analogous to the resume_early phase described + above. Its methods should undo the actions of the preceding + freeze_late, if necessary. + + 7. The thaw phase is analogous to the resume phase discussed above. Its methods should bring the device back to an operating state, so that it can be used for saving the image if necessary. - 6. The complete phase is discussed in the "Leaving System Suspend" section + 8. The complete phase is discussed in the "Leaving System Suspend" section above. At this point the system image is saved, and the devices then need to be @@ -465,16 +488,19 @@ prepared for the upcoming system shutdown. This is much like suspending them before putting the system into the standby or memory sleep state, and the phases are similar. - 7. The prepare phase is discussed above. + 9. The prepare phase is discussed above. + + 10. The poweroff phase is analogous to the suspend phase. - 8. The poweroff phase is analogous to the suspend phase. + 11. The poweroff_late phase is analogous to the suspend_late phase. - 9. The poweroff_noirq phase is analogous to the suspend_noirq phase. + 12. The poweroff_noirq phase is analogous to the suspend_noirq phase. -The poweroff and poweroff_noirq callbacks should do essentially the same things -as the suspend and suspend_noirq callbacks. The only notable difference is that -they need not store the device register values, because the registers should -already have been stored during the freeze or freeze_noirq phases. +The poweroff, poweroff_late and poweroff_noirq callbacks should do essentially +the same things as the suspend, suspend_late and suspend_noirq callbacks, +respectively. The only notable difference is that they need not store the +device register values, because the registers should already have been stored +during the freeze, freeze_late or freeze_noirq phases. Leaving Hibernation @@ -518,22 +544,25 @@ To achieve this, the image kernel must restore the devices' pre-hibernation functionality. The operation is much like waking up from the memory sleep state, although it involves different phases: - restore_noirq, restore, complete + restore_noirq, restore_early, restore, complete 1. The restore_noirq phase is analogous to the resume_noirq phase. - 2. The restore phase is analogous to the resume phase. + 2. The restore_early phase is analogous to the resume_early phase. + + 3. The restore phase is analogous to the resume phase. - 3. The complete phase is discussed above. + 4. The complete phase is discussed above. -The main difference from resume[_noirq] is that restore[_noirq] must assume the -device has been accessed and reconfigured by the boot loader or the boot kernel. -Consequently the state of the device may be different from the state remembered -from the freeze and freeze_noirq phases. The device may even need to be reset -and completely re-initialized. In many cases this difference doesn't matter, so -the resume[_noirq] and restore[_norq] method pointers can be set to the same -routines. Nevertheless, different callback pointers are used in case there is a -situation where it actually matters. +The main difference from resume[_early|_noirq] is that restore[_early|_noirq] +must assume the device has been accessed and reconfigured by the boot loader or +the boot kernel. Consequently the state of the device may be different from the +state remembered from the freeze, freeze_late and freeze_noirq phases. The +device may even need to be reset and completely re-initialized. In many cases +this difference doesn't matter, so the resume[_early|_noirq] and +restore[_early|_norq] method pointers can be set to the same routines. +Nevertheless, different callback pointers are used in case there is a situation +where it actually does matter. Device Power Management Domains diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index f76623cbe263..5d56931a15b3 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -1234,8 +1234,7 @@ static int suspend(int vetoable) struct apm_user *as; dpm_suspend_start(PMSG_SUSPEND); - - dpm_suspend_noirq(PMSG_SUSPEND); + dpm_suspend_end(PMSG_SUSPEND); local_irq_disable(); syscore_suspend(); @@ -1259,9 +1258,9 @@ static int suspend(int vetoable) syscore_resume(); local_irq_enable(); - dpm_resume_noirq(PMSG_RESUME); - + dpm_resume_start(PMSG_RESUME); dpm_resume_end(PMSG_RESUME); + queue_event(APM_NORMAL_RESUME, NULL); spin_lock(&user_list_lock); for (as = user_list; as != NULL; as = as->next) { @@ -1277,7 +1276,7 @@ static void standby(void) { int err; - dpm_suspend_noirq(PMSG_SUSPEND); + dpm_suspend_end(PMSG_SUSPEND); local_irq_disable(); syscore_suspend(); @@ -1291,7 +1290,7 @@ static void standby(void) syscore_resume(); local_irq_enable(); - dpm_resume_noirq(PMSG_RESUME); + dpm_resume_start(PMSG_RESUME); } static apm_event_t get_event(void) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index e2cc3d2e0ecc..b462c0e341cb 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -47,6 +47,7 @@ typedef int (*pm_callback_t)(struct device *); LIST_HEAD(dpm_list); LIST_HEAD(dpm_prepared_list); LIST_HEAD(dpm_suspended_list); +LIST_HEAD(dpm_late_early_list); LIST_HEAD(dpm_noirq_list); struct suspend_stats suspend_stats; @@ -245,6 +246,40 @@ static pm_callback_t pm_op(const struct dev_pm_ops *ops, pm_message_t state) return NULL; } +/** + * pm_late_early_op - Return the PM operation appropriate for given PM event. + * @ops: PM operations to choose from. + * @state: PM transition of the system being carried out. + * + * Runtime PM is disabled for @dev while this function is being executed. + */ +static pm_callback_t pm_late_early_op(const struct dev_pm_ops *ops, + pm_message_t state) +{ + switch (state.event) { +#ifdef CONFIG_SUSPEND + case PM_EVENT_SUSPEND: + return ops->suspend_late; + case PM_EVENT_RESUME: + return ops->resume_early; +#endif /* CONFIG_SUSPEND */ +#ifdef CONFIG_HIBERNATE_CALLBACKS + case PM_EVENT_FREEZE: + case PM_EVENT_QUIESCE: + return ops->freeze_late; + case PM_EVENT_HIBERNATE: + return ops->poweroff_late; + case PM_EVENT_THAW: + case PM_EVENT_RECOVER: + return ops->thaw_early; + case PM_EVENT_RESTORE: + return ops->restore_early; +#endif /* CONFIG_HIBERNATE_CALLBACKS */ + } + + return NULL; +} + /** * pm_noirq_op - Return the PM operation appropriate for given PM event. * @ops: PM operations to choose from. @@ -374,21 +409,21 @@ static int device_resume_noirq(struct device *dev, pm_message_t state) TRACE_RESUME(0); if (dev->pm_domain) { - info = "EARLY power domain "; + info = "noirq power domain "; callback = pm_noirq_op(&dev->pm_domain->ops, state); } else if (dev->type && dev->type->pm) { - info = "EARLY type "; + info = "noirq type "; callback = pm_noirq_op(dev->type->pm, state); } else if (dev->class && dev->class->pm) { - info = "EARLY class "; + info = "noirq class "; callback = pm_noirq_op(dev->class->pm, state); } else if (dev->bus && dev->bus->pm) { - info = "EARLY bus "; + info = "noirq bus "; callback = pm_noirq_op(dev->bus->pm, state); } if (!callback && dev->driver && dev->driver->pm) { - info = "EARLY driver "; + info = "noirq driver "; callback = pm_noirq_op(dev->driver->pm, state); } @@ -399,13 +434,13 @@ static int device_resume_noirq(struct device *dev, pm_message_t state) } /** - * dpm_resume_noirq - Execute "early resume" callbacks for non-sysdev devices. + * dpm_resume_noirq - Execute "noirq resume" callbacks for all devices. * @state: PM transition of the system being carried out. * - * Call the "noirq" resume handlers for all devices marked as DPM_OFF_IRQ and + * Call the "noirq" resume handlers for all devices in dpm_noirq_list and * enable device drivers to receive interrupts. */ -void dpm_resume_noirq(pm_message_t state) +static void dpm_resume_noirq(pm_message_t state) { ktime_t starttime = ktime_get(); @@ -415,7 +450,7 @@ void dpm_resume_noirq(pm_message_t state) int error; get_device(dev); - list_move_tail(&dev->power.entry, &dpm_suspended_list); + list_move_tail(&dev->power.entry, &dpm_late_early_list); mutex_unlock(&dpm_list_mtx); error = device_resume_noirq(dev, state); @@ -423,6 +458,80 @@ void dpm_resume_noirq(pm_message_t state) suspend_stats.failed_resume_noirq++; dpm_save_failed_step(SUSPEND_RESUME_NOIRQ); dpm_save_failed_dev(dev_name(dev)); + pm_dev_err(dev, state, " noirq", error); + } + + mutex_lock(&dpm_list_mtx); + put_device(dev); + } + mutex_unlock(&dpm_list_mtx); + dpm_show_time(starttime, state, "noirq"); + resume_device_irqs(); +} + +/** + * device_resume_early - Execute an "early resume" callback for given device. + * @dev: Device to handle. + * @state: PM transition of the system being carried out. + * + * Runtime PM is disabled for @dev while this function is being executed. + */ +static int device_resume_early(struct device *dev, pm_message_t state) +{ + pm_callback_t callback = NULL; + char *info = NULL; + int error = 0; + + TRACE_DEVICE(dev); + TRACE_RESUME(0); + + if (dev->pm_domain) { + info = "early power domain "; + callback = pm_late_early_op(&dev->pm_domain->ops, state); + } else if (dev->type && dev->type->pm) { + info = "early type "; + callback = pm_late_early_op(dev->type->pm, state); + } else if (dev->class && dev->class->pm) { + info = "early class "; + callback = pm_late_early_op(dev->class->pm, state); + } else if (dev->bus && dev->bus->pm) { + info = "early bus "; + callback = pm_late_early_op(dev->bus->pm, state); + } + + if (!callback && dev->driver && dev->driver->pm) { + info = "early driver "; + callback = pm_late_early_op(dev->driver->pm, state); + } + + error = dpm_run_callback(callback, dev, state, info); + + TRACE_RESUME(error); + return error; +} + +/** + * dpm_resume_early - Execute "early resume" callbacks for all devices. + * @state: PM transition of the system being carried out. + */ +static void dpm_resume_early(pm_message_t state) +{ + ktime_t starttime = ktime_get(); + + mutex_lock(&dpm_list_mtx); + while (!list_empty(&dpm_late_early_list)) { + struct device *dev = to_device(dpm_late_early_list.next); + int error; + + get_device(dev); + list_move_tail(&dev->power.entry, &dpm_suspended_list); + mutex_unlock(&dpm_list_mtx); + + error = device_resume_early(dev, state); + if (error) { + suspend_stats.failed_resume_early++; + dpm_save_failed_step(SUSPEND_RESUME_EARLY); + dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, " early", error); } @@ -431,9 +540,18 @@ void dpm_resume_noirq(pm_message_t state) } mutex_unlock(&dpm_list_mtx); dpm_show_time(starttime, state, "early"); - resume_device_irqs(); } -EXPORT_SYMBOL_GPL(dpm_resume_noirq); + +/** + * dpm_resume_start - Execute "noirq" and "early" device callbacks. + * @state: PM transition of the system being carried out. + */ +void dpm_resume_start(pm_message_t state) +{ + dpm_resume_noirq(state); + dpm_resume_early(state); +} +EXPORT_SYMBOL_GPL(dpm_resume_start); /** * device_resume - Execute "resume" callbacks for given device. @@ -716,21 +834,21 @@ static int device_suspend_noirq(struct device *dev, pm_message_t state) char *info = NULL; if (dev->pm_domain) { - info = "LATE power domain "; + info = "noirq power domain "; callback = pm_noirq_op(&dev->pm_domain->ops, state); } else if (dev->type && dev->type->pm) { - info = "LATE type "; + info = "noirq type "; callback = pm_noirq_op(dev->type->pm, state); } else if (dev->class && dev->class->pm) { - info = "LATE class "; + info = "noirq class "; callback = pm_noirq_op(dev->class->pm, state); } else if (dev->bus && dev->bus->pm) { - info = "LATE bus "; + info = "noirq bus "; callback = pm_noirq_op(dev->bus->pm, state); } if (!callback && dev->driver && dev->driver->pm) { - info = "LATE driver "; + info = "noirq driver "; callback = pm_noirq_op(dev->driver->pm, state); } @@ -738,21 +856,21 @@ static int device_suspend_noirq(struct device *dev, pm_message_t state) } /** - * dpm_suspend_noirq - Execute "late suspend" callbacks for non-sysdev devices. + * dpm_suspend_noirq - Execute "noirq suspend" callbacks for all devices. * @state: PM transition of the system being carried out. * * Prevent device drivers from receiving interrupts and call the "noirq" suspend * handlers for all non-sysdev devices. */ -int dpm_suspend_noirq(pm_message_t state) +static int dpm_suspend_noirq(pm_message_t state) { ktime_t starttime = ktime_get(); int error = 0; suspend_device_irqs(); mutex_lock(&dpm_list_mtx); - while (!list_empty(&dpm_suspended_list)) { - struct device *dev = to_device(dpm_suspended_list.prev); + while (!list_empty(&dpm_late_early_list)) { + struct device *dev = to_device(dpm_late_early_list.prev); get_device(dev); mutex_unlock(&dpm_list_mtx); @@ -761,7 +879,7 @@ int dpm_suspend_noirq(pm_message_t state) mutex_lock(&dpm_list_mtx); if (error) { - pm_dev_err(dev, state, " late", error); + pm_dev_err(dev, state, " noirq", error); suspend_stats.failed_suspend_noirq++; dpm_save_failed_step(SUSPEND_SUSPEND_NOIRQ); dpm_save_failed_dev(dev_name(dev)); @@ -775,11 +893,96 @@ int dpm_suspend_noirq(pm_message_t state) mutex_unlock(&dpm_list_mtx); if (error) dpm_resume_noirq(resume_event(state)); + else + dpm_show_time(starttime, state, "noirq"); + return error; +} + +/** + * device_suspend_late - Execute a "late suspend" callback for given device. + * @dev: Device to handle. + * @state: PM transition of the system being carried out. + * + * Runtime PM is disabled for @dev while this function is being executed. + */ +static int device_suspend_late(struct device *dev, pm_message_t state) +{ + pm_callback_t callback = NULL; + char *info = NULL; + + if (dev->pm_domain) { + info = "late power domain "; + callback = pm_late_early_op(&dev->pm_domain->ops, state); + } else if (dev->type && dev->type->pm) { + info = "late type "; + callback = pm_late_early_op(dev->type->pm, state); + } else if (dev->class && dev->class->pm) { + info = "late class "; + callback = pm_late_early_op(dev->class->pm, state); + } else if (dev->bus && dev->bus->pm) { + info = "late bus "; + callback = pm_late_early_op(dev->bus->pm, state); + } + + if (!callback && dev->driver && dev->driver->pm) { + info = "late driver "; + callback = pm_late_early_op(dev->driver->pm, state); + } + + return dpm_run_callback(callback, dev, state, info); +} + +/** + * dpm_suspend_late - Execute "late suspend" callbacks for all devices. + * @state: PM transition of the system being carried out. + */ +static int dpm_suspend_late(pm_message_t state) +{ + ktime_t starttime = ktime_get(); + int error = 0; + + mutex_lock(&dpm_list_mtx); + while (!list_empty(&dpm_suspended_list)) { + struct device *dev = to_device(dpm_suspended_list.prev); + + get_device(dev); + mutex_unlock(&dpm_list_mtx); + + error = device_suspend_late(dev, state); + + mutex_lock(&dpm_list_mtx); + if (error) { + pm_dev_err(dev, state, " late", error); + suspend_stats.failed_suspend_late++; + dpm_save_failed_step(SUSPEND_SUSPEND_LATE); + dpm_save_failed_dev(dev_name(dev)); + put_device(dev); + break; + } + if (!list_empty(&dev->power.entry)) + list_move(&dev->power.entry, &dpm_late_early_list); + put_device(dev); + } + mutex_unlock(&dpm_list_mtx); + if (error) + dpm_resume_early(resume_event(state)); else dpm_show_time(starttime, state, "late"); + return error; } -EXPORT_SYMBOL_GPL(dpm_suspend_noirq); + +/** + * dpm_suspend_end - Execute "late" and "noirq" device suspend callbacks. + * @state: PM transition of the system being carried out. + */ +int dpm_suspend_end(pm_message_t state) +{ + int error = dpm_suspend_late(state); + + return error ? : dpm_suspend_noirq(state); +} +EXPORT_SYMBOL_GPL(dpm_suspend_end); /** * legacy_suspend - Execute a legacy (bus or class) suspend callback for device. diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index ce4fa0831860..9e14ae6cd49c 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -129,9 +129,9 @@ static void do_suspend(void) printk(KERN_DEBUG "suspending xenstore...\n"); xs_suspend(); - err = dpm_suspend_noirq(PMSG_FREEZE); + err = dpm_suspend_end(PMSG_FREEZE); if (err) { - printk(KERN_ERR "dpm_suspend_noirq failed: %d\n", err); + printk(KERN_ERR "dpm_suspend_end failed: %d\n", err); goto out_resume; } @@ -149,7 +149,7 @@ static void do_suspend(void) err = stop_machine(xen_suspend, &si, cpumask_of(0)); - dpm_resume_noirq(si.cancelled ? PMSG_THAW : PMSG_RESTORE); + dpm_resume_start(si.cancelled ? PMSG_THAW : PMSG_RESTORE); if (err) { printk(KERN_ERR "failed to start xen_suspend: %d\n", err); diff --git a/include/linux/pm.h b/include/linux/pm.h index e4982ac3fbbc..c68e1f22ac95 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -110,6 +110,10 @@ typedef struct pm_message { * Subsystem-level @suspend() is executed for all devices after invoking * subsystem-level @prepare() for all of them. * + * @suspend_late: Continue operations started by @suspend(). For a number of + * devices @suspend_late() may point to the same callback routine as the + * runtime suspend callback. + * * @resume: Executed after waking the system up from a sleep state in which the * contents of main memory were preserved. The exact action to perform * depends on the device's subsystem, but generally the driver is expected @@ -122,6 +126,10 @@ typedef struct pm_message { * Subsystem-level @resume() is executed for all devices after invoking * subsystem-level @resume_noirq() for all of them. * + * @resume_early: Prepare to execute @resume(). For a number of devices + * @resume_early() may point to the same callback routine as the runtime + * resume callback. + * * @freeze: Hibernation-specific, executed before creating a hibernation image. * Analogous to @suspend(), but it should not enable the device to signal * wakeup events or change its power state. The majority of subsystems @@ -131,6 +139,10 @@ typedef struct pm_message { * Subsystem-level @freeze() is executed for all devices after invoking * subsystem-level @prepare() for all of them. * + * @freeze_late: Continue operations started by @freeze(). Analogous to + * @suspend_late(), but it should not enable the device to signal wakeup + * events or change its power state. + * * @thaw: Hibernation-specific, executed after creating a hibernation image OR * if the creation of an image has failed. Also executed after a failing * attempt to restore the contents of main memory from such an image. @@ -140,15 +152,23 @@ typedef struct pm_message { * subsystem-level @thaw_noirq() for all of them. It also may be executed * directly after @freeze() in case of a transition error. * + * @thaw_early: Prepare to execute @thaw(). Undo the changes made by the + * preceding @freeze_late(). + * * @poweroff: Hibernation-specific, executed after saving a hibernation image. * Analogous to @suspend(), but it need not save the device's settings in * memory. * Subsystem-level @poweroff() is executed for all devices after invoking * subsystem-level @prepare() for all of them. * + * @poweroff_late: Continue operations started by @poweroff(). Analogous to + * @suspend_late(), but it need not save the device's settings in memory. + * * @restore: Hibernation-specific, executed after restoring the contents of main * memory from a hibernation image, analogous to @resume(). * + * @restore_early: Prepare to execute @restore(), analogous to @resume_early(). + * * @suspend_noirq: Complete the actions started by @suspend(). Carry out any * additional operations required for suspending the device that might be * racing with its driver's interrupt handler, which is guaranteed not to @@ -158,9 +178,10 @@ typedef struct pm_message { * @suspend_noirq() has returned successfully. If the device can generate * system wakeup signals and is enabled to wake up the system, it should be * configured to do so at that time. However, depending on the platform - * and device's subsystem, @suspend() may be allowed to put the device into - * the low-power state and configure it to generate wakeup signals, in - * which case it generally is not necessary to define @suspend_noirq(). + * and device's subsystem, @suspend() or @suspend_late() may be allowed to + * put the device into the low-power state and configure it to generate + * wakeup signals, in which case it generally is not necessary to define + * @suspend_noirq(). * * @resume_noirq: Prepare for the execution of @resume() by carrying out any * operations required for resuming the device that might be racing with @@ -171,9 +192,9 @@ typedef struct pm_message { * additional operations required for freezing the device that might be * racing with its driver's interrupt handler, which is guaranteed not to * run while @freeze_noirq() is being executed. - * The power state of the device should not be changed by either @freeze() - * or @freeze_noirq() and it should not be configured to signal system - * wakeup by any of these callbacks. + * The power state of the device should not be changed by either @freeze(), + * or @freeze_late(), or @freeze_noirq() and it should not be configured to + * signal system wakeup by any of these callbacks. * * @thaw_noirq: Prepare for the execution of @thaw() by carrying out any * operations required for thawing the device that might be racing with its @@ -249,6 +270,12 @@ struct dev_pm_ops { int (*thaw)(struct device *dev); int (*poweroff)(struct device *dev); int (*restore)(struct device *dev); + int (*suspend_late)(struct device *dev); + int (*resume_early)(struct device *dev); + int (*freeze_late)(struct device *dev); + int (*thaw_early)(struct device *dev); + int (*poweroff_late)(struct device *dev); + int (*restore_early)(struct device *dev); int (*suspend_noirq)(struct device *dev); int (*resume_noirq)(struct device *dev); int (*freeze_noirq)(struct device *dev); @@ -584,13 +611,13 @@ struct dev_pm_domain { #ifdef CONFIG_PM_SLEEP extern void device_pm_lock(void); -extern void dpm_resume_noirq(pm_message_t state); +extern void dpm_resume_start(pm_message_t state); extern void dpm_resume_end(pm_message_t state); extern void dpm_resume(pm_message_t state); extern void dpm_complete(pm_message_t state); extern void device_pm_unlock(void); -extern int dpm_suspend_noirq(pm_message_t state); +extern int dpm_suspend_end(pm_message_t state); extern int dpm_suspend_start(pm_message_t state); extern int dpm_suspend(pm_message_t state); extern int dpm_prepare(pm_message_t state); diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 91784a4f8608..ac1c114c499d 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -42,8 +42,10 @@ enum suspend_stat_step { SUSPEND_FREEZE = 1, SUSPEND_PREPARE, SUSPEND_SUSPEND, + SUSPEND_SUSPEND_LATE, SUSPEND_SUSPEND_NOIRQ, SUSPEND_RESUME_NOIRQ, + SUSPEND_RESUME_EARLY, SUSPEND_RESUME }; @@ -53,8 +55,10 @@ struct suspend_stats { int failed_freeze; int failed_prepare; int failed_suspend; + int failed_suspend_late; int failed_suspend_noirq; int failed_resume; + int failed_resume_early; int failed_resume_noirq; #define REC_FAILED_NUM 2 int last_failed_dev; diff --git a/kernel/kexec.c b/kernel/kexec.c index 7b0886786701..a6a675cb9818 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1546,13 +1546,13 @@ int kernel_kexec(void) if (error) goto Resume_console; /* At this point, dpm_suspend_start() has been called, - * but *not* dpm_suspend_noirq(). We *must* call - * dpm_suspend_noirq() now. Otherwise, drivers for + * but *not* dpm_suspend_end(). We *must* call + * dpm_suspend_end() now. Otherwise, drivers for * some devices (e.g. interrupt controllers) become * desynchronized with the actual state of the * hardware at resume time, and evil weirdness ensues. */ - error = dpm_suspend_noirq(PMSG_FREEZE); + error = dpm_suspend_end(PMSG_FREEZE); if (error) goto Resume_devices; error = disable_nonboot_cpus(); @@ -1579,7 +1579,7 @@ int kernel_kexec(void) local_irq_enable(); Enable_cpus: enable_nonboot_cpus(); - dpm_resume_noirq(PMSG_RESTORE); + dpm_resume_start(PMSG_RESTORE); Resume_devices: dpm_resume_end(PMSG_RESTORE); Resume_console: diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 6d6d28870335..a5d4cf0aa03e 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -245,8 +245,8 @@ void swsusp_show_speed(struct timeval *start, struct timeval *stop, * create_image - Create a hibernation image. * @platform_mode: Whether or not to use the platform driver. * - * Execute device drivers' .freeze_noirq() callbacks, create a hibernation image - * and execute the drivers' .thaw_noirq() callbacks. + * Execute device drivers' "late" and "noirq" freeze callbacks, create a + * hibernation image and run the drivers' "noirq" and "early" thaw callbacks. * * Control reappears in this routine after the subsequent restore. */ @@ -254,7 +254,7 @@ static int create_image(int platform_mode) { int error; - error = dpm_suspend_noirq(PMSG_FREEZE); + error = dpm_suspend_end(PMSG_FREEZE); if (error) { printk(KERN_ERR "PM: Some devices failed to power down, " "aborting hibernation\n"); @@ -306,7 +306,7 @@ static int create_image(int platform_mode) Platform_finish: platform_finish(platform_mode); - dpm_resume_noirq(in_suspend ? + dpm_resume_start(in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); return error; @@ -394,16 +394,16 @@ int hibernation_snapshot(int platform_mode) * resume_target_kernel - Restore system state from a hibernation image. * @platform_mode: Whether or not to use the platform driver. * - * Execute device drivers' .freeze_noirq() callbacks, restore the contents of - * highmem that have not been restored yet from the image and run the low-level - * code that will restore the remaining contents of memory and switch to the - * just restored target kernel. + * Execute device drivers' "noirq" and "late" freeze callbacks, restore the + * contents of highmem that have not been restored yet from the image and run + * the low-level code that will restore the remaining contents of memory and + * switch to the just restored target kernel. */ static int resume_target_kernel(bool platform_mode) { int error; - error = dpm_suspend_noirq(PMSG_QUIESCE); + error = dpm_suspend_end(PMSG_QUIESCE); if (error) { printk(KERN_ERR "PM: Some devices failed to power down, " "aborting resume\n"); @@ -460,7 +460,7 @@ static int resume_target_kernel(bool platform_mode) Cleanup: platform_restore_cleanup(platform_mode); - dpm_resume_noirq(PMSG_RECOVER); + dpm_resume_start(PMSG_RECOVER); return error; } @@ -518,7 +518,7 @@ int hibernation_platform_enter(void) goto Resume_devices; } - error = dpm_suspend_noirq(PMSG_HIBERNATE); + error = dpm_suspend_end(PMSG_HIBERNATE); if (error) goto Resume_devices; @@ -549,7 +549,7 @@ int hibernation_platform_enter(void) Platform_finish: hibernation_ops->finish(); - dpm_resume_noirq(PMSG_RESTORE); + dpm_resume_start(PMSG_RESTORE); Resume_devices: entering_platform_hibernation = false; diff --git a/kernel/power/main.c b/kernel/power/main.c index 9824b41e5a18..8c5014a4e052 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -165,16 +165,20 @@ static int suspend_stats_show(struct seq_file *s, void *unused) last_errno %= REC_FAILED_NUM; last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1; last_step %= REC_FAILED_NUM; - seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n" - "%s: %d\n%s: %d\n%s: %d\n%s: %d\n", + seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n" + "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n", "success", suspend_stats.success, "fail", suspend_stats.fail, "failed_freeze", suspend_stats.failed_freeze, "failed_prepare", suspend_stats.failed_prepare, "failed_suspend", suspend_stats.failed_suspend, + "failed_suspend_late", + suspend_stats.failed_suspend_late, "failed_suspend_noirq", suspend_stats.failed_suspend_noirq, "failed_resume", suspend_stats.failed_resume, + "failed_resume_early", + suspend_stats.failed_resume_early, "failed_resume_noirq", suspend_stats.failed_resume_noirq); seq_printf(s, "failures:\n last_failed_dev:\t%-s\n", diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 4fd51beed879..560a639614a1 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -147,7 +147,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) goto Platform_finish; } - error = dpm_suspend_noirq(PMSG_SUSPEND); + error = dpm_suspend_end(PMSG_SUSPEND); if (error) { printk(KERN_ERR "PM: Some devices failed to power down\n"); goto Platform_finish; @@ -189,7 +189,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) if (suspend_ops->wake) suspend_ops->wake(); - dpm_resume_noirq(PMSG_RESUME); + dpm_resume_start(PMSG_RESUME); Platform_finish: if (suspend_ops->finish) -- cgit v1.2.3 From c98fdeaa92731308ed80386261fa2589addefa47 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 7 Feb 2012 13:08:52 +0100 Subject: x86/sched/perf/AMD: Set sched_clock_stable Stephane Eranian reported that doing a scheduler latency measurements with perf on AMD doesn't work out as expected due to the fact that the sched_clock() granularity is too coarse, i.e. done in jiffies due to the sched_clock_stable not set, which, if set, would mean that we get to use the TSC as sample source which would give us much higher precision. However, there's no reason not to set sched_clock_stable on AMD because all families from F10h and upwards do have an invariant TSC and have the CPUID flag to prove (CPUID_8000_0007_EDX[8]). Make it so, #1. Signed-off-by: Borislav Petkov Cc: Borislav Petkov Cc: Venki Pallipadi Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Arnaldo Carvalho de Melo Cc: Robert Richter Cc: Eric Dumazet Cc: Andreas Herrmann Link: http://lkml.kernel.org/r/20120206132546.GA30854@quad [ Should any non-standard system break the TSC, we should mark them so explicitly, in their platform init handler, or in a DMI quirk. ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index f4773f4aae35..0a44b90602b0 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -456,6 +457,8 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) if (c->x86_power & (1 << 8)) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); + if (!check_tsc_unstable()) + sched_clock_stable = 1; } #ifdef CONFIG_X86_64 -- cgit v1.2.3 From 21c3fcf3e39353d4f21d50e257cc74f3204b1988 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 12 Feb 2012 09:53:57 -0800 Subject: x86/debug: Fix/improve the show_msr= debug print out Found out that show_msr= is broken, when I asked a user to use it to capture debug info about broken MTRR's whose MTRR settings are probably different between CPUs. Only the first CPUs MSRs are printed, but that is not enough to track down the suspected bug. For years we called print_cpu_msr from print_cpu_info(), but this commit: | commit 2eaad1fddd7450a48ad464229775f97fbfe8af36 | Author: Mike Travis | Date: Thu Dec 10 17:19:36 2009 -0800 | | x86: Limit the number of processor bootup messages removed the print_cpu_info() call from all APs. Put it back - it will only print MSRs when the user specifically requests them via show_msr=. Signed-off-by: Yinghai Lu Cc: Mike Travis Link: http://lkml.kernel.org/r/1329069237-11483-1-git-send-email-yinghai@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 1 + arch/x86/kernel/cpu/common.c | 14 +++++++------- arch/x86/kernel/smpboot.c | 5 +++-- 3 files changed, 11 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index aa9088c26931..8bb062bbcbec 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -162,6 +162,7 @@ extern void early_cpu_init(void); extern void identify_boot_cpu(void); extern void identify_secondary_cpu(struct cpuinfo_x86 *); extern void print_cpu_info(struct cpuinfo_x86 *); +void print_cpu_msr(struct cpuinfo_x86 *); extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c); extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); extern unsigned short num_cache_leaves; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index d43cad74f166..8b6a3bb57d8e 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -933,7 +933,7 @@ static const struct msr_range msr_range_array[] __cpuinitconst = { { 0xc0011000, 0xc001103b}, }; -static void __cpuinit print_cpu_msr(void) +static void __cpuinit __print_cpu_msr(void) { unsigned index_min, index_max; unsigned index; @@ -997,13 +997,13 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) else printk(KERN_CONT "\n"); -#ifdef CONFIG_SMP + __print_cpu_msr(); +} + +void __cpuinit print_cpu_msr(struct cpuinfo_x86 *c) +{ if (c->cpu_index < show_msr) - print_cpu_msr(); -#else - if (show_msr) - print_cpu_msr(); -#endif + __print_cpu_msr(); } static __init int setup_disablecpuid(char *arg) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 66d250c00d11..257049d7c657 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -791,9 +791,10 @@ do_rest: schedule(); } - if (cpumask_test_cpu(cpu, cpu_callin_mask)) + if (cpumask_test_cpu(cpu, cpu_callin_mask)) { + print_cpu_msr(&cpu_data(cpu)); pr_debug("CPU%d: has booted.\n", cpu); - else { + } else { boot_error = 1; if (*(volatile u32 *)TRAMPOLINE_SYM(trampoline_status) == 0xA5A5A5A5) -- cgit v1.2.3 From 484546509ce5d49d43ec0a6eb2141c6bf3362bfc Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 7 Feb 2012 09:40:30 -0500 Subject: x86/tracing: Denote the power and cpuidle tracepoints as _rcuidle() The power and cpuidle tracepoints are called within a rcu_idle_exit() section, and must be denoted with the _rcuidle() version of the tracepoint. Acked-by: Paul E. McKenney Reviewed-by: Josh Triplett Signed-off-by: Steven Rostedt --- arch/x86/kernel/process.c | 24 ++++++++++++------------ include/trace/events/power.h | 2 ++ 2 files changed, 14 insertions(+), 12 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 15763af7bfe3..44eefde92109 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -377,8 +377,8 @@ static inline int hlt_use_halt(void) void default_idle(void) { if (hlt_use_halt()) { - trace_power_start(POWER_CSTATE, 1, smp_processor_id()); - trace_cpu_idle(1, smp_processor_id()); + trace_power_start_rcuidle(POWER_CSTATE, 1, smp_processor_id()); + trace_cpu_idle_rcuidle(1, smp_processor_id()); current_thread_info()->status &= ~TS_POLLING; /* * TS_POLLING-cleared state must be visible before we @@ -391,8 +391,8 @@ void default_idle(void) else local_irq_enable(); current_thread_info()->status |= TS_POLLING; - trace_power_end(smp_processor_id()); - trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); + trace_power_end_rcuidle(smp_processor_id()); + trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); } else { local_irq_enable(); /* loop is done by the caller */ @@ -450,8 +450,8 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); static void mwait_idle(void) { if (!need_resched()) { - trace_power_start(POWER_CSTATE, 1, smp_processor_id()); - trace_cpu_idle(1, smp_processor_id()); + trace_power_start_rcuidle(POWER_CSTATE, 1, smp_processor_id()); + trace_cpu_idle_rcuidle(1, smp_processor_id()); if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)¤t_thread_info()->flags); @@ -461,8 +461,8 @@ static void mwait_idle(void) __sti_mwait(0, 0); else local_irq_enable(); - trace_power_end(smp_processor_id()); - trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); + trace_power_end_rcuidle(smp_processor_id()); + trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); } else local_irq_enable(); } @@ -474,13 +474,13 @@ static void mwait_idle(void) */ static void poll_idle(void) { - trace_power_start(POWER_CSTATE, 0, smp_processor_id()); - trace_cpu_idle(0, smp_processor_id()); + trace_power_start_rcuidle(POWER_CSTATE, 0, smp_processor_id()); + trace_cpu_idle_rcuidle(0, smp_processor_id()); local_irq_enable(); while (!need_resched()) cpu_relax(); - trace_power_end(smp_processor_id()); - trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); + trace_power_end_rcuidle(smp_processor_id()); + trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); } /* diff --git a/include/trace/events/power.h b/include/trace/events/power.h index 1bcc2a8c00e2..14b38940062b 100644 --- a/include/trace/events/power.h +++ b/include/trace/events/power.h @@ -151,6 +151,8 @@ enum { events get removed */ static inline void trace_power_start(u64 type, u64 state, u64 cpuid) {}; static inline void trace_power_end(u64 cpuid) {}; +static inline void trace_power_start_rcuidle(u64 type, u64 state, u64 cpuid) {}; +static inline void trace_power_end_rcuidle(u64 cpuid) {}; static inline void trace_power_frequency(u64 type, u64 state, u64 cpuid) {}; #endif /* _PWR_EVENT_AVOID_DOUBLE_DEFINING_DEPRECATED */ -- cgit v1.2.3 From 70142a9dd154f54f7409871ead86f7d77f2c6576 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Sat, 11 Feb 2012 22:56:59 +0000 Subject: x86/cpu: Fix overrun check in arch_print_cpu_modalias() snprintf() does not return a negative value when truncating. Signed-off-by: Ben Hutchings Acked-by: Thomas Renninger Acked-by: H. Peter Anvin Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/match.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c index 940e2d483076..2dfa52bcdfe2 100644 --- a/arch/x86/kernel/cpu/match.c +++ b/arch/x86/kernel/cpu/match.c @@ -67,7 +67,7 @@ ssize_t arch_print_cpu_modalias(struct device *dev, for (i = 0; i < NCAPINTS*32; i++) { if (boot_cpu_has(i)) { n = snprintf(buf, size, ",%04X", i); - if (n < 0) { + if (n >= size) { WARN(1, "x86 features overflow page\n"); break; } -- cgit v1.2.3 From 5467bdda4a326513c2f14b712a22d59115b7ae94 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Sat, 11 Feb 2012 22:57:19 +0000 Subject: x86/cpu: Clean up modalias feature matching We currently include commas on both sides of the feature ID in a modalias, but this prevents the lowest numbered feature of a CPU from being matched. Since all feature IDs have the same length, we do not need to worry about substring matches, so omit commas from the modalias entirely. Avoid generating multiple adjacent wildcards when there is no feature ID to match. Signed-off-by: Ben Hutchings Acked-by: Thomas Renninger Acked-by: H. Peter Anvin Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/match.c | 3 +-- scripts/mod/file2alias.c | 5 +++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c index 2dfa52bcdfe2..5502b289341b 100644 --- a/arch/x86/kernel/cpu/match.c +++ b/arch/x86/kernel/cpu/match.c @@ -63,7 +63,7 @@ ssize_t arch_print_cpu_modalias(struct device *dev, boot_cpu_data.x86_model); size -= n; buf += n; - size -= 2; + size -= 1; for (i = 0; i < NCAPINTS*32; i++) { if (boot_cpu_has(i)) { n = snprintf(buf, size, ",%04X", i); @@ -75,7 +75,6 @@ ssize_t arch_print_cpu_modalias(struct device *dev, buf += n; } } - *buf++ = ','; *buf++ = '\n'; return buf - bufptr; } diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c index a468af059834..78fd81fb9732 100644 --- a/scripts/mod/file2alias.c +++ b/scripts/mod/file2alias.c @@ -1021,8 +1021,9 @@ static int do_x86cpu_entry(const char *filename, struct x86_cpu_id *id, ADD(alias, "vendor:", id->vendor != X86_VENDOR_ANY, id->vendor); ADD(alias, ":family:", id->family != X86_FAMILY_ANY, id->family); ADD(alias, ":model:", id->model != X86_MODEL_ANY, id->model); - ADD(alias, ":feature:*,", id->feature != X86_FEATURE_ANY, id->feature); - strcat(alias, ",*"); + strcat(alias, ":feature:*"); + if (id->feature != X86_FEATURE_ANY) + sprintf(alias + strlen(alias), "%04X*", id->feature); return 1; } ADD_TO_DEVTABLE("x86cpu", struct x86_cpu_id, do_x86cpu_entry); -- cgit v1.2.3 From 986cb48c5a4de0085db94d343b4e7dcf54355ec1 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 19 Feb 2012 11:46:36 -0800 Subject: x86-32/irq: Don't switch to irq stack for a user-mode irq If the irq happens in user mode, our kernel stack is empty (apart from the pt_regs themselves, of course), so there's no need or advantage to switch. And it really doesn't save any stack space, quite the reverse: it means that a nested interrupt cannot switch irq stacks. So instead of saving kernel stack space, it actually causes the potential for *more* stack usage. Also simplify the preemption count copy when we do switch stacks: just copy the whole preemption count, rather than just the softirq parts of it. There is no advantage to the partial copy: it is more effort to get a less correct result. Signed-off-by: Linus Torvalds Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1202191139260.10000@i5.linux-foundation.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq_32.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 40fc86161d92..58b7f27cb3e9 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -100,13 +100,8 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) irqctx->tinfo.task = curctx->tinfo.task; irqctx->tinfo.previous_esp = current_stack_pointer; - /* - * Copy the softirq bits in preempt_count so that the - * softirq checks work in the hardirq context. - */ - irqctx->tinfo.preempt_count = - (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) | - (curctx->tinfo.preempt_count & SOFTIRQ_MASK); + /* Copy the preempt_count so that the [soft]irq checks work. */ + irqctx->tinfo.preempt_count = curctx->tinfo.preempt_count; if (unlikely(overflow)) call_on_stack(print_stack_overflow, isp); @@ -196,7 +191,7 @@ bool handle_irq(unsigned irq, struct pt_regs *regs) if (unlikely(!desc)) return false; - if (!execute_on_irq_stack(overflow, desc, irq)) { + if (user_mode_vm(regs) || !execute_on_irq_stack(overflow, desc, irq)) { if (unlikely(overflow)) print_stack_overflow(); desc->handle_irq(irq, desc); -- cgit v1.2.3 From 6bd330083e0e97b7ddc053459190bf3d5768ca83 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 6 Feb 2012 13:03:09 -0800 Subject: x86: Factor out TIF_IA32 from 32-bit address space Factor out IA32 (compatibility instruction set) from 32-bit address space in the thread_info flags; this is a precondition patch for x32 support. Originally-by: H. J. Lu Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/n/tip-4pr1xnnksprt7t0h3w5fw4rv@git.kernel.org --- arch/x86/include/asm/elf.h | 4 ++-- arch/x86/include/asm/processor.h | 4 ++-- arch/x86/include/asm/thread_info.h | 4 +++- arch/x86/kernel/process_64.c | 2 ++ arch/x86/kernel/sys_x86_64.c | 6 +++--- arch/x86/oprofile/backtrace.c | 2 +- 6 files changed, 13 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 5f962df30d0f..410fa6a219f6 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -287,7 +287,7 @@ do { \ #define VDSO_HIGH_BASE 0xffffe000U /* CONFIG_COMPAT_VDSO address */ /* 1GB for 64bit, 8MB for 32bit */ -#define STACK_RND_MASK (test_thread_flag(TIF_IA32) ? 0x7ff : 0x3fffff) +#define STACK_RND_MASK (test_thread_flag(TIF_ADDR32) ? 0x7ff : 0x3fffff) #define ARCH_DLINFO \ do { \ @@ -330,7 +330,7 @@ static inline int mmap_is_ia32(void) return 1; #endif #ifdef CONFIG_IA32_EMULATION - if (test_thread_flag(TIF_IA32)) + if (test_thread_flag(TIF_ADDR32)) return 1; #endif return 0; diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index aa9088c26931..9f748b5fb701 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -924,9 +924,9 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); #define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \ 0xc0000000 : 0xFFFFe000) -#define TASK_SIZE (test_thread_flag(TIF_IA32) ? \ +#define TASK_SIZE (test_thread_flag(TIF_ADDR32) ? \ IA32_PAGE_OFFSET : TASK_SIZE_MAX) -#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? \ +#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_ADDR32)) ? \ IA32_PAGE_OFFSET : TASK_SIZE_MAX) #define STACK_TOP TASK_SIZE diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index bc817cd8b443..d1803a495b35 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -86,7 +86,7 @@ struct thread_info { #define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */ #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ #define TIF_NOTSC 16 /* TSC is not accessible in userland */ -#define TIF_IA32 17 /* 32bit process */ +#define TIF_IA32 17 /* IA32 compatibility process */ #define TIF_FORK 18 /* ret_from_fork */ #define TIF_MEMDIE 20 /* is terminating due to OOM killer */ #define TIF_DEBUG 21 /* uses debug registers */ @@ -95,6 +95,7 @@ struct thread_info { #define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */ #define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */ #define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */ +#define TIF_ADDR32 29 /* 32-bit address space on 64 bits */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) @@ -116,6 +117,7 @@ struct thread_info { #define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP) #define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES) #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) +#define _TIF_ADDR32 (1 << TIF_ADDR32) /* work to do in syscall_trace_enter() */ #define _TIF_WORK_SYSCALL_ENTRY \ diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 9b9fe4a85c87..0e900d09e232 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -508,6 +508,7 @@ void set_personality_64bit(void) /* Make sure to be in 64bit mode */ clear_thread_flag(TIF_IA32); + clear_thread_flag(TIF_ADDR32); /* Ensure the corresponding mm is not marked. */ if (current->mm) @@ -526,6 +527,7 @@ void set_personality_ia32(void) /* Make sure to be in 32bit mode */ set_thread_flag(TIF_IA32); + set_thread_flag(TIF_ADDR32); current->personality |= force_personality32; /* Mark the associated mm as containing 32-bit tasks. */ diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 051489082d59..f921df8c2099 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -98,7 +98,7 @@ out: static void find_start_end(unsigned long flags, unsigned long *begin, unsigned long *end) { - if (!test_thread_flag(TIF_IA32) && (flags & MAP_32BIT)) { + if (!test_thread_flag(TIF_ADDR32) && (flags & MAP_32BIT)) { unsigned long new_begin; /* This is usually used needed to map code in small model, so it needs to be in the first 31bit. Limit @@ -144,7 +144,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, (!vma || addr + len <= vma->vm_start)) return addr; } - if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32)) + if (((flags & MAP_32BIT) || test_thread_flag(TIF_ADDR32)) && len <= mm->cached_hole_size) { mm->cached_hole_size = 0; mm->free_area_cache = begin; @@ -205,7 +205,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, return addr; /* for MAP_32BIT mappings we force the legact mmap base */ - if (!test_thread_flag(TIF_IA32) && (flags & MAP_32BIT)) + if (!test_thread_flag(TIF_ADDR32) && (flags & MAP_32BIT)) goto bottomup; /* requesting a specific address */ diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c index bff89dfe3619..d6aa6e8315d1 100644 --- a/arch/x86/oprofile/backtrace.c +++ b/arch/x86/oprofile/backtrace.c @@ -67,7 +67,7 @@ x86_backtrace_32(struct pt_regs * const regs, unsigned int depth) { struct stack_frame_ia32 *head; - /* User process is 32-bit */ + /* User process is IA32 */ if (!current || !test_thread_flag(TIF_IA32)) return 0; -- cgit v1.2.3 From bb2127240c5595ae4ef7115494f51e973692f64e Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 14 Feb 2012 13:56:49 -0800 Subject: x32: Add a thread flag for x32 processes An x32 process is *almost* the same thing as a 64-bit process with a 32-bit address limit, but there are a few minor differences -- in particular core dumps are 32 bits and signal handling is different. Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/thread_info.h | 2 ++ arch/x86/kernel/process_64.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index d1803a495b35..912e93511466 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -96,6 +96,7 @@ struct thread_info { #define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */ #define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */ #define TIF_ADDR32 29 /* 32-bit address space on 64 bits */ +#define TIF_X32 30 /* 32-bit native x86-64 binary */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) @@ -118,6 +119,7 @@ struct thread_info { #define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES) #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) #define _TIF_ADDR32 (1 << TIF_ADDR32) +#define _TIF_X32 (1 << TIF_X32) /* work to do in syscall_trace_enter() */ #define _TIF_WORK_SYSCALL_ENTRY \ diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 0e900d09e232..5fe2fbaa56ba 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -509,6 +509,7 @@ void set_personality_64bit(void) /* Make sure to be in 64bit mode */ clear_thread_flag(TIF_IA32); clear_thread_flag(TIF_ADDR32); + clear_thread_flag(TIF_X32); /* Ensure the corresponding mm is not marked. */ if (current->mm) @@ -528,6 +529,7 @@ void set_personality_ia32(void) /* Make sure to be in 32bit mode */ set_thread_flag(TIF_IA32); set_thread_flag(TIF_ADDR32); + clear_thread_flag(TIF_X32); current->personality |= force_personality32; /* Mark the associated mm as containing 32-bit tasks. */ -- cgit v1.2.3 From 6630f11ba54414b9870d87dfef2bee467bfa842a Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 14 Feb 2012 14:18:50 -0800 Subject: x32: Add x32 system calls to syscall/syscall_64.tbl Split the 64-bit system calls into "64" (64-bit only) and "common" (64-bit or x32) and add the x32 system call numbers. Signed-off-by: H. Peter Anvin --- arch/x86/kernel/asm-offsets_64.c | 2 + arch/x86/kernel/syscall_64.c | 3 + arch/x86/syscalls/Makefile | 2 +- arch/x86/syscalls/syscall_64.tbl | 579 +++++++++++++++++++++------------------ arch/x86/um/sys_call_table_64.c | 3 + arch/x86/um/user-offsets.c | 2 + 6 files changed, 317 insertions(+), 274 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 834e897b1e25..c3354f7b0a06 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -1,6 +1,8 @@ #include #define __SYSCALL_64(nr, sym, compat) [nr] = 1, +#define __SYSCALL_COMMON(nr, sym, compat) [nr] = 1, +#define __SYSCALL_X32(nr, sym, compat) /* Not yet */ static char syscalls_64[] = { #include }; diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c index 7ac7943be02c..26c4ca1f20e8 100644 --- a/arch/x86/kernel/syscall_64.c +++ b/arch/x86/kernel/syscall_64.c @@ -5,6 +5,9 @@ #include #include +#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) +#define __SYSCALL_X32(nr, sym, compat) /* Not yet */ + #define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ; #include #undef __SYSCALL_64 diff --git a/arch/x86/syscalls/Makefile b/arch/x86/syscalls/Makefile index 564b2476fede..89dd9581c5a2 100644 --- a/arch/x86/syscalls/Makefile +++ b/arch/x86/syscalls/Makefile @@ -24,7 +24,7 @@ syshdr_pfx_unistd_32_ia32 := ia32_ $(out)/unistd_32_ia32.h: $(syscall32) $(syshdr) $(call if_changed,syshdr) -syshdr_abi_unistd_64 := 64 +syshdr_abi_unistd_64 := common,64 $(out)/unistd_64.h: $(syscall64) $(syshdr) $(call if_changed,syshdr) diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl index b440a8f7eefa..4aecc7e31166 100644 --- a/arch/x86/syscalls/syscall_64.tbl +++ b/arch/x86/syscalls/syscall_64.tbl @@ -4,317 +4,350 @@ # The format is: # # -# The abi is always "64" for this file (for now.) +# The abi is "common", "64" or "x32" for this file. # -0 64 read sys_read -1 64 write sys_write -2 64 open sys_open -3 64 close sys_close -4 64 stat sys_newstat -5 64 fstat sys_newfstat -6 64 lstat sys_newlstat -7 64 poll sys_poll -8 64 lseek sys_lseek -9 64 mmap sys_mmap -10 64 mprotect sys_mprotect -11 64 munmap sys_munmap -12 64 brk sys_brk +0 common read sys_read +1 common write sys_write +2 common open sys_open +3 common close sys_close +4 common stat sys_newstat +5 common fstat sys_newfstat +6 common lstat sys_newlstat +7 common poll sys_poll +8 common lseek sys_lseek +9 common mmap sys_mmap +10 common mprotect sys_mprotect +11 common munmap sys_munmap +12 common brk sys_brk 13 64 rt_sigaction sys_rt_sigaction -14 64 rt_sigprocmask sys_rt_sigprocmask +14 common rt_sigprocmask sys_rt_sigprocmask 15 64 rt_sigreturn stub_rt_sigreturn 16 64 ioctl sys_ioctl -17 64 pread64 sys_pread64 -18 64 pwrite64 sys_pwrite64 +17 common pread64 sys_pread64 +18 common pwrite64 sys_pwrite64 19 64 readv sys_readv 20 64 writev sys_writev -21 64 access sys_access -22 64 pipe sys_pipe -23 64 select sys_select -24 64 sched_yield sys_sched_yield -25 64 mremap sys_mremap -26 64 msync sys_msync -27 64 mincore sys_mincore -28 64 madvise sys_madvise -29 64 shmget sys_shmget -30 64 shmat sys_shmat -31 64 shmctl sys_shmctl -32 64 dup sys_dup -33 64 dup2 sys_dup2 -34 64 pause sys_pause -35 64 nanosleep sys_nanosleep -36 64 getitimer sys_getitimer -37 64 alarm sys_alarm -38 64 setitimer sys_setitimer -39 64 getpid sys_getpid -40 64 sendfile sys_sendfile64 -41 64 socket sys_socket -42 64 connect sys_connect -43 64 accept sys_accept -44 64 sendto sys_sendto +21 common access sys_access +22 common pipe sys_pipe +23 common select sys_select +24 common sched_yield sys_sched_yield +25 common mremap sys_mremap +26 common msync sys_msync +27 common mincore sys_mincore +28 common madvise sys_madvise +29 common shmget sys_shmget +30 common shmat sys_shmat +31 common shmctl sys_shmctl +32 common dup sys_dup +33 common dup2 sys_dup2 +34 common pause sys_pause +35 common nanosleep sys_nanosleep +36 common getitimer sys_getitimer +37 common alarm sys_alarm +38 common setitimer sys_setitimer +39 common getpid sys_getpid +40 common sendfile sys_sendfile64 +41 common socket sys_socket +42 common connect sys_connect +43 common accept sys_accept +44 common sendto sys_sendto 45 64 recvfrom sys_recvfrom 46 64 sendmsg sys_sendmsg 47 64 recvmsg sys_recvmsg -48 64 shutdown sys_shutdown -49 64 bind sys_bind -50 64 listen sys_listen -51 64 getsockname sys_getsockname -52 64 getpeername sys_getpeername -53 64 socketpair sys_socketpair -54 64 setsockopt sys_setsockopt -55 64 getsockopt sys_getsockopt -56 64 clone stub_clone -57 64 fork stub_fork -58 64 vfork stub_vfork +48 common shutdown sys_shutdown +49 common bind sys_bind +50 common listen sys_listen +51 common getsockname sys_getsockname +52 common getpeername sys_getpeername +53 common socketpair sys_socketpair +54 common setsockopt sys_setsockopt +55 common getsockopt sys_getsockopt +56 common clone stub_clone +57 common fork stub_fork +58 common vfork stub_vfork 59 64 execve stub_execve -60 64 exit sys_exit -61 64 wait4 sys_wait4 -62 64 kill sys_kill -63 64 uname sys_newuname -64 64 semget sys_semget -65 64 semop sys_semop -66 64 semctl sys_semctl -67 64 shmdt sys_shmdt -68 64 msgget sys_msgget -69 64 msgsnd sys_msgsnd -70 64 msgrcv sys_msgrcv -71 64 msgctl sys_msgctl -72 64 fcntl sys_fcntl -73 64 flock sys_flock -74 64 fsync sys_fsync -75 64 fdatasync sys_fdatasync -76 64 truncate sys_truncate -77 64 ftruncate sys_ftruncate -78 64 getdents sys_getdents -79 64 getcwd sys_getcwd -80 64 chdir sys_chdir -81 64 fchdir sys_fchdir -82 64 rename sys_rename -83 64 mkdir sys_mkdir -84 64 rmdir sys_rmdir -85 64 creat sys_creat -86 64 link sys_link -87 64 unlink sys_unlink -88 64 symlink sys_symlink -89 64 readlink sys_readlink -90 64 chmod sys_chmod -91 64 fchmod sys_fchmod -92 64 chown sys_chown -93 64 fchown sys_fchown -94 64 lchown sys_lchown -95 64 umask sys_umask -96 64 gettimeofday sys_gettimeofday -97 64 getrlimit sys_getrlimit -98 64 getrusage sys_getrusage -99 64 sysinfo sys_sysinfo +60 common exit sys_exit +61 common wait4 sys_wait4 +62 common kill sys_kill +63 common uname sys_newuname +64 common semget sys_semget +65 common semop sys_semop +66 common semctl sys_semctl +67 common shmdt sys_shmdt +68 common msgget sys_msgget +69 common msgsnd sys_msgsnd +70 common msgrcv sys_msgrcv +71 common msgctl sys_msgctl +72 common fcntl sys_fcntl +73 common flock sys_flock +74 common fsync sys_fsync +75 common fdatasync sys_fdatasync +76 common truncate sys_truncate +77 common ftruncate sys_ftruncate +78 common getdents sys_getdents +79 common getcwd sys_getcwd +80 common chdir sys_chdir +81 common fchdir sys_fchdir +82 common rename sys_rename +83 common mkdir sys_mkdir +84 common rmdir sys_rmdir +85 common creat sys_creat +86 common link sys_link +87 common unlink sys_unlink +88 common symlink sys_symlink +89 common readlink sys_readlink +90 common chmod sys_chmod +91 common fchmod sys_fchmod +92 common chown sys_chown +93 common fchown sys_fchown +94 common lchown sys_lchown +95 common umask sys_umask +96 common gettimeofday sys_gettimeofday +97 common getrlimit sys_getrlimit +98 common getrusage sys_getrusage +99 common sysinfo sys_sysinfo 100 64 times sys_times -101 64 ptrace sys_ptrace -102 64 getuid sys_getuid -103 64 syslog sys_syslog -104 64 getgid sys_getgid -105 64 setuid sys_setuid -106 64 setgid sys_setgid -107 64 geteuid sys_geteuid -108 64 getegid sys_getegid -109 64 setpgid sys_setpgid -110 64 getppid sys_getppid -111 64 getpgrp sys_getpgrp -112 64 setsid sys_setsid -113 64 setreuid sys_setreuid -114 64 setregid sys_setregid -115 64 getgroups sys_getgroups -116 64 setgroups sys_setgroups -117 64 setresuid sys_setresuid -118 64 getresuid sys_getresuid -119 64 setresgid sys_setresgid -120 64 getresgid sys_getresgid -121 64 getpgid sys_getpgid -122 64 setfsuid sys_setfsuid -123 64 setfsgid sys_setfsgid -124 64 getsid sys_getsid -125 64 capget sys_capget -126 64 capset sys_capset +101 common ptrace sys_ptrace +102 common getuid sys_getuid +103 common syslog sys_syslog +104 common getgid sys_getgid +105 common setuid sys_setuid +106 common setgid sys_setgid +107 common geteuid sys_geteuid +108 common getegid sys_getegid +109 common setpgid sys_setpgid +110 common getppid sys_getppid +111 common getpgrp sys_getpgrp +112 common setsid sys_setsid +113 common setreuid sys_setreuid +114 common setregid sys_setregid +115 common getgroups sys_getgroups +116 common setgroups sys_setgroups +117 common setresuid sys_setresuid +118 common getresuid sys_getresuid +119 common setresgid sys_setresgid +120 common getresgid sys_getresgid +121 common getpgid sys_getpgid +122 common setfsuid sys_setfsuid +123 common setfsgid sys_setfsgid +124 common getsid sys_getsid +125 common capget sys_capget +126 common capset sys_capset 127 64 rt_sigpending sys_rt_sigpending 128 64 rt_sigtimedwait sys_rt_sigtimedwait 129 64 rt_sigqueueinfo sys_rt_sigqueueinfo -130 64 rt_sigsuspend sys_rt_sigsuspend +130 common rt_sigsuspend sys_rt_sigsuspend 131 64 sigaltstack stub_sigaltstack -132 64 utime sys_utime -133 64 mknod sys_mknod +132 common utime sys_utime +133 common mknod sys_mknod 134 64 uselib -135 64 personality sys_personality -136 64 ustat sys_ustat -137 64 statfs sys_statfs -138 64 fstatfs sys_fstatfs -139 64 sysfs sys_sysfs -140 64 getpriority sys_getpriority -141 64 setpriority sys_setpriority -142 64 sched_setparam sys_sched_setparam -143 64 sched_getparam sys_sched_getparam -144 64 sched_setscheduler sys_sched_setscheduler -145 64 sched_getscheduler sys_sched_getscheduler -146 64 sched_get_priority_max sys_sched_get_priority_max -147 64 sched_get_priority_min sys_sched_get_priority_min -148 64 sched_rr_get_interval sys_sched_rr_get_interval -149 64 mlock sys_mlock -150 64 munlock sys_munlock -151 64 mlockall sys_mlockall -152 64 munlockall sys_munlockall -153 64 vhangup sys_vhangup -154 64 modify_ldt sys_modify_ldt -155 64 pivot_root sys_pivot_root +135 common personality sys_personality +136 common ustat sys_ustat +137 common statfs sys_statfs +138 common fstatfs sys_fstatfs +139 common sysfs sys_sysfs +140 common getpriority sys_getpriority +141 common setpriority sys_setpriority +142 common sched_setparam sys_sched_setparam +143 common sched_getparam sys_sched_getparam +144 common sched_setscheduler sys_sched_setscheduler +145 common sched_getscheduler sys_sched_getscheduler +146 common sched_get_priority_max sys_sched_get_priority_max +147 common sched_get_priority_min sys_sched_get_priority_min +148 common sched_rr_get_interval sys_sched_rr_get_interval +149 common mlock sys_mlock +150 common munlock sys_munlock +151 common mlockall sys_mlockall +152 common munlockall sys_munlockall +153 common vhangup sys_vhangup +154 common modify_ldt sys_modify_ldt +155 common pivot_root sys_pivot_root 156 64 _sysctl sys_sysctl -157 64 prctl sys_prctl -158 64 arch_prctl sys_arch_prctl -159 64 adjtimex sys_adjtimex -160 64 setrlimit sys_setrlimit -161 64 chroot sys_chroot -162 64 sync sys_sync -163 64 acct sys_acct -164 64 settimeofday sys_settimeofday -165 64 mount sys_mount -166 64 umount2 sys_umount -167 64 swapon sys_swapon -168 64 swapoff sys_swapoff -169 64 reboot sys_reboot -170 64 sethostname sys_sethostname -171 64 setdomainname sys_setdomainname -172 64 iopl stub_iopl -173 64 ioperm sys_ioperm +157 common prctl sys_prctl +158 common arch_prctl sys_arch_prctl +159 common adjtimex sys_adjtimex +160 common setrlimit sys_setrlimit +161 common chroot sys_chroot +162 common sync sys_sync +163 common acct sys_acct +164 common settimeofday sys_settimeofday +165 common mount sys_mount +166 common umount2 sys_umount +167 common swapon sys_swapon +168 common swapoff sys_swapoff +169 common reboot sys_reboot +170 common sethostname sys_sethostname +171 common setdomainname sys_setdomainname +172 common iopl stub_iopl +173 common ioperm sys_ioperm 174 64 create_module -175 64 init_module sys_init_module -176 64 delete_module sys_delete_module +175 common init_module sys_init_module +176 common delete_module sys_delete_module 177 64 get_kernel_syms 178 64 query_module -179 64 quotactl sys_quotactl +179 common quotactl sys_quotactl 180 64 nfsservctl -181 64 getpmsg -182 64 putpmsg -183 64 afs_syscall -184 64 tuxcall -185 64 security -186 64 gettid sys_gettid -187 64 readahead sys_readahead -188 64 setxattr sys_setxattr -189 64 lsetxattr sys_lsetxattr -190 64 fsetxattr sys_fsetxattr -191 64 getxattr sys_getxattr -192 64 lgetxattr sys_lgetxattr -193 64 fgetxattr sys_fgetxattr -194 64 listxattr sys_listxattr -195 64 llistxattr sys_llistxattr -196 64 flistxattr sys_flistxattr -197 64 removexattr sys_removexattr -198 64 lremovexattr sys_lremovexattr -199 64 fremovexattr sys_fremovexattr -200 64 tkill sys_tkill -201 64 time sys_time -202 64 futex sys_futex -203 64 sched_setaffinity sys_sched_setaffinity -204 64 sched_getaffinity sys_sched_getaffinity +181 common getpmsg +182 common putpmsg +183 common afs_syscall +184 common tuxcall +185 common security +186 common gettid sys_gettid +187 common readahead sys_readahead +188 common setxattr sys_setxattr +189 common lsetxattr sys_lsetxattr +190 common fsetxattr sys_fsetxattr +191 common getxattr sys_getxattr +192 common lgetxattr sys_lgetxattr +193 common fgetxattr sys_fgetxattr +194 common listxattr sys_listxattr +195 common llistxattr sys_llistxattr +196 common flistxattr sys_flistxattr +197 common removexattr sys_removexattr +198 common lremovexattr sys_lremovexattr +199 common fremovexattr sys_fremovexattr +200 common tkill sys_tkill +201 common time sys_time +202 common futex sys_futex +203 common sched_setaffinity sys_sched_setaffinity +204 common sched_getaffinity sys_sched_getaffinity 205 64 set_thread_area -206 64 io_setup sys_io_setup -207 64 io_destroy sys_io_destroy -208 64 io_getevents sys_io_getevents -209 64 io_submit sys_io_submit -210 64 io_cancel sys_io_cancel +206 common io_setup sys_io_setup +207 common io_destroy sys_io_destroy +208 common io_getevents sys_io_getevents +209 common io_submit sys_io_submit +210 common io_cancel sys_io_cancel 211 64 get_thread_area -212 64 lookup_dcookie sys_lookup_dcookie -213 64 epoll_create sys_epoll_create +212 common lookup_dcookie sys_lookup_dcookie +213 common epoll_create sys_epoll_create 214 64 epoll_ctl_old 215 64 epoll_wait_old -216 64 remap_file_pages sys_remap_file_pages -217 64 getdents64 sys_getdents64 -218 64 set_tid_address sys_set_tid_address -219 64 restart_syscall sys_restart_syscall -220 64 semtimedop sys_semtimedop -221 64 fadvise64 sys_fadvise64 +216 common remap_file_pages sys_remap_file_pages +217 common getdents64 sys_getdents64 +218 common set_tid_address sys_set_tid_address +219 common restart_syscall sys_restart_syscall +220 common semtimedop sys_semtimedop +221 common fadvise64 sys_fadvise64 222 64 timer_create sys_timer_create -223 64 timer_settime sys_timer_settime -224 64 timer_gettime sys_timer_gettime -225 64 timer_getoverrun sys_timer_getoverrun -226 64 timer_delete sys_timer_delete -227 64 clock_settime sys_clock_settime -228 64 clock_gettime sys_clock_gettime -229 64 clock_getres sys_clock_getres -230 64 clock_nanosleep sys_clock_nanosleep -231 64 exit_group sys_exit_group -232 64 epoll_wait sys_epoll_wait -233 64 epoll_ctl sys_epoll_ctl -234 64 tgkill sys_tgkill -235 64 utimes sys_utimes +223 common timer_settime sys_timer_settime +224 common timer_gettime sys_timer_gettime +225 common timer_getoverrun sys_timer_getoverrun +226 common timer_delete sys_timer_delete +227 common clock_settime sys_clock_settime +228 common clock_gettime sys_clock_gettime +229 common clock_getres sys_clock_getres +230 common clock_nanosleep sys_clock_nanosleep +231 common exit_group sys_exit_group +232 common epoll_wait sys_epoll_wait +233 common epoll_ctl sys_epoll_ctl +234 common tgkill sys_tgkill +235 common utimes sys_utimes 236 64 vserver -237 64 mbind sys_mbind -238 64 set_mempolicy sys_set_mempolicy -239 64 get_mempolicy sys_get_mempolicy -240 64 mq_open sys_mq_open -241 64 mq_unlink sys_mq_unlink -242 64 mq_timedsend sys_mq_timedsend -243 64 mq_timedreceive sys_mq_timedreceive +237 common mbind sys_mbind +238 common set_mempolicy sys_set_mempolicy +239 common get_mempolicy sys_get_mempolicy +240 common mq_open sys_mq_open +241 common mq_unlink sys_mq_unlink +242 common mq_timedsend sys_mq_timedsend +243 common mq_timedreceive sys_mq_timedreceive 244 64 mq_notify sys_mq_notify -245 64 mq_getsetattr sys_mq_getsetattr +245 common mq_getsetattr sys_mq_getsetattr 246 64 kexec_load sys_kexec_load 247 64 waitid sys_waitid -248 64 add_key sys_add_key -249 64 request_key sys_request_key -250 64 keyctl sys_keyctl -251 64 ioprio_set sys_ioprio_set -252 64 ioprio_get sys_ioprio_get -253 64 inotify_init sys_inotify_init -254 64 inotify_add_watch sys_inotify_add_watch -255 64 inotify_rm_watch sys_inotify_rm_watch -256 64 migrate_pages sys_migrate_pages -257 64 openat sys_openat -258 64 mkdirat sys_mkdirat -259 64 mknodat sys_mknodat -260 64 fchownat sys_fchownat -261 64 futimesat sys_futimesat -262 64 newfstatat sys_newfstatat -263 64 unlinkat sys_unlinkat -264 64 renameat sys_renameat -265 64 linkat sys_linkat -266 64 symlinkat sys_symlinkat -267 64 readlinkat sys_readlinkat -268 64 fchmodat sys_fchmodat -269 64 faccessat sys_faccessat -270 64 pselect6 sys_pselect6 -271 64 ppoll sys_ppoll -272 64 unshare sys_unshare +248 common add_key sys_add_key +249 common request_key sys_request_key +250 common keyctl sys_keyctl +251 common ioprio_set sys_ioprio_set +252 common ioprio_get sys_ioprio_get +253 common inotify_init sys_inotify_init +254 common inotify_add_watch sys_inotify_add_watch +255 common inotify_rm_watch sys_inotify_rm_watch +256 common migrate_pages sys_migrate_pages +257 common openat sys_openat +258 common mkdirat sys_mkdirat +259 common mknodat sys_mknodat +260 common fchownat sys_fchownat +261 common futimesat sys_futimesat +262 common newfstatat sys_newfstatat +263 common unlinkat sys_unlinkat +264 common renameat sys_renameat +265 common linkat sys_linkat +266 common symlinkat sys_symlinkat +267 common readlinkat sys_readlinkat +268 common fchmodat sys_fchmodat +269 common faccessat sys_faccessat +270 common pselect6 sys_pselect6 +271 common ppoll sys_ppoll +272 common unshare sys_unshare 273 64 set_robust_list sys_set_robust_list 274 64 get_robust_list sys_get_robust_list -275 64 splice sys_splice -276 64 tee sys_tee -277 64 sync_file_range sys_sync_file_range +275 common splice sys_splice +276 common tee sys_tee +277 common sync_file_range sys_sync_file_range 278 64 vmsplice sys_vmsplice 279 64 move_pages sys_move_pages -280 64 utimensat sys_utimensat -281 64 epoll_pwait sys_epoll_pwait -282 64 signalfd sys_signalfd -283 64 timerfd_create sys_timerfd_create -284 64 eventfd sys_eventfd -285 64 fallocate sys_fallocate -286 64 timerfd_settime sys_timerfd_settime -287 64 timerfd_gettime sys_timerfd_gettime -288 64 accept4 sys_accept4 -289 64 signalfd4 sys_signalfd4 -290 64 eventfd2 sys_eventfd2 -291 64 epoll_create1 sys_epoll_create1 -292 64 dup3 sys_dup3 -293 64 pipe2 sys_pipe2 -294 64 inotify_init1 sys_inotify_init1 +280 common utimensat sys_utimensat +281 common epoll_pwait sys_epoll_pwait +282 common signalfd sys_signalfd +283 common timerfd_create sys_timerfd_create +284 common eventfd sys_eventfd +285 common fallocate sys_fallocate +286 common timerfd_settime sys_timerfd_settime +287 common timerfd_gettime sys_timerfd_gettime +288 common accept4 sys_accept4 +289 common signalfd4 sys_signalfd4 +290 common eventfd2 sys_eventfd2 +291 common epoll_create1 sys_epoll_create1 +292 common dup3 sys_dup3 +293 common pipe2 sys_pipe2 +294 common inotify_init1 sys_inotify_init1 295 64 preadv sys_preadv 296 64 pwritev sys_pwritev 297 64 rt_tgsigqueueinfo sys_rt_tgsigqueueinfo -298 64 perf_event_open sys_perf_event_open +298 common perf_event_open sys_perf_event_open 299 64 recvmmsg sys_recvmmsg -300 64 fanotify_init sys_fanotify_init -301 64 fanotify_mark sys_fanotify_mark -302 64 prlimit64 sys_prlimit64 -303 64 name_to_handle_at sys_name_to_handle_at -304 64 open_by_handle_at sys_open_by_handle_at -305 64 clock_adjtime sys_clock_adjtime -306 64 syncfs sys_syncfs +300 common fanotify_init sys_fanotify_init +301 common fanotify_mark sys_fanotify_mark +302 common prlimit64 sys_prlimit64 +303 common name_to_handle_at sys_name_to_handle_at +304 common open_by_handle_at sys_open_by_handle_at +305 common clock_adjtime sys_clock_adjtime +306 common syncfs sys_syncfs 307 64 sendmmsg sys_sendmmsg -308 64 setns sys_setns -309 64 getcpu sys_getcpu +308 common setns sys_setns +309 common getcpu sys_getcpu 310 64 process_vm_readv sys_process_vm_readv 311 64 process_vm_writev sys_process_vm_writev +# +# x32-specific system call numbers start at 512 to avoid cache impact +# for native 64-bit operation. +# +512 x32 rt_sigaction sys32_rt_sigaction +513 x32 rt_sigreturn stub_x32_rt_sigreturn +514 x32 ioctl compat_sys_ioctl +515 x32 readv compat_sys_readv +516 x32 writev compat_sys_writev +517 x32 recvfrom compat_sys_recvfrom +518 x32 sendmsg compat_sys_sendmsg +519 x32 recvmsg compat_sys_recvmsg +520 x32 execve stub_x32_execve +521 x32 times compat_sys_times +522 x32 rt_sigpending sys32_rt_sigpending +523 x32 rt_sigtimedwait compat_sys_rt_sigtimedwait +524 x32 rt_sigqueueinfo sys32_rt_sigqueueinfo +525 x32 sigaltstack stub_x32_sigaltstack +526 x32 timer_create compat_sys_timer_create +527 x32 mq_notify compat_sys_mq_notify +528 x32 kexec_load compat_sys_kexec_load +529 x32 waitid compat_sys_waitid +530 x32 set_robust_list compat_sys_set_robust_list +531 x32 get_robust_list compat_sys_get_robust_list +532 x32 vmsplice compat_sys_vmsplice +533 x32 move_pages compat_sys_move_pages +534 x32 preadv compat_sys_preadv64 +535 x32 pwritev compat_sys_pwritev64 +536 x32 rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo +537 x32 recvmmsg compat_sys_recvmmsg +538 x32 sendmmsg compat_sys_sendmmsg +539 x32 process_vm_readv compat_sys_process_vm_readv +540 x32 process_vm_writev compat_sys_process_vm_writev diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c index fe626c3ba01b..9924776f4265 100644 --- a/arch/x86/um/sys_call_table_64.c +++ b/arch/x86/um/sys_call_table_64.c @@ -35,6 +35,9 @@ #define stub_sigaltstack sys_sigaltstack #define stub_rt_sigreturn sys_rt_sigreturn +#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) +#define __SYSCALL_X32(nr, sym, compat) /* Not supported */ + #define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ; #include diff --git a/arch/x86/um/user-offsets.c b/arch/x86/um/user-offsets.c index 5edf4f4bbf53..ce7e3607a870 100644 --- a/arch/x86/um/user-offsets.c +++ b/arch/x86/um/user-offsets.c @@ -15,6 +15,8 @@ static char syscalls[] = { }; #else #define __SYSCALL_64(nr, sym, compat) [nr] = 1, +#define __SYSCALL_COMMON(nr, sym, compat) [nr] = 1, +#define __SYSCALL_X32(nr, sym, compat) /* Not supported */ static char syscalls[] = { #include }; -- cgit v1.2.3 From f28f0c23576662fb293defe9b1884d5a6e1bd85c Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sun, 19 Feb 2012 07:38:43 -0800 Subject: x86: Move some signal-handling definitions to a common header There are some definitions which are duplicated between kernel/signal.c and ia32/ia32_signal.c; move them to a common header file. Rather than adding stuff to existing header files which contain data structures, create a new header file; hence the slightly odd name ("all the good ones were taken.") Note: nothing relied on signal_fault() being defined in . Signed-off-by: H. Peter Anvin --- arch/x86/ia32/ia32_signal.c | 12 ++---------- arch/x86/include/asm/ptrace.h | 1 - arch/x86/include/asm/sighandling.h | 19 +++++++++++++++++++ arch/x86/kernel/signal.c | 10 +--------- 4 files changed, 22 insertions(+), 20 deletions(-) create mode 100644 arch/x86/include/asm/sighandling.h (limited to 'arch/x86/kernel') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 65577698cab2..25d80f3faf2e 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -12,10 +12,8 @@ #include #include #include -#include #include #include -#include #include #include #include @@ -31,16 +29,10 @@ #include #include #include +#include #include -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - -#define FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | \ - X86_EFLAGS_DF | X86_EFLAGS_TF | X86_EFLAGS_SF | \ - X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \ - X86_EFLAGS_CF) - -void signal_fault(struct pt_regs *regs, void __user *frame, char *where); +#define FIX_EFLAGS __FIX_EFLAGS int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from) { diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 35664547125b..dcfde52979c3 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -145,7 +145,6 @@ extern unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs); extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code, int si_code); -void signal_fault(struct pt_regs *regs, void __user *frame, char *where); extern long syscall_trace_enter(struct pt_regs *); extern void syscall_trace_leave(struct pt_regs *); diff --git a/arch/x86/include/asm/sighandling.h b/arch/x86/include/asm/sighandling.h new file mode 100644 index 000000000000..843e299e120e --- /dev/null +++ b/arch/x86/include/asm/sighandling.h @@ -0,0 +1,19 @@ +#ifndef _ASM_X86_SIGHANDLING_H +#define _ASM_X86_SIGHANDLING_H + +#include +#include +#include + +#include + +#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) + +#define __FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | \ + X86_EFLAGS_DF | X86_EFLAGS_TF | X86_EFLAGS_SF | \ + X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \ + X86_EFLAGS_CF) + +void signal_fault(struct pt_regs *regs, void __user *frame, char *where); + +#endif /* _ASM_X86_SIGHANDLING_H */ diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 46a01bdc27e2..c432dc0e65f0 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -10,10 +10,8 @@ #include #include #include -#include #include #include -#include #include #include #include @@ -26,6 +24,7 @@ #include #include #include +#include #ifdef CONFIG_X86_64 #include @@ -37,13 +36,6 @@ #include -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - -#define __FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | \ - X86_EFLAGS_DF | X86_EFLAGS_TF | X86_EFLAGS_SF | \ - X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \ - X86_EFLAGS_CF) - #ifdef CONFIG_X86_32 # define FIX_EFLAGS (__FIX_EFLAGS | X86_EFLAGS_RF) #else -- cgit v1.2.3 From 851394229e79c11b0b5b74c509817848e9a80564 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sun, 19 Feb 2012 07:43:09 -0800 Subject: x32: Export setup/restore_sigcontext from signal.c Export setup_sigcontext() and restore_sigcontext() from signal.c, so we can use the 64-bit versions verbatim for x32. Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/sighandling.h | 5 +++++ arch/x86/kernel/signal.c | 10 ++++------ 2 files changed, 9 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/sighandling.h b/arch/x86/include/asm/sighandling.h index 843e299e120e..ada93b3b8c66 100644 --- a/arch/x86/include/asm/sighandling.h +++ b/arch/x86/include/asm/sighandling.h @@ -16,4 +16,9 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where); +int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, + unsigned long *pax); +int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, + struct pt_regs *regs, unsigned long mask); + #endif /* _ASM_X86_SIGHANDLING_H */ diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index c432dc0e65f0..450fb255f877 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -60,9 +60,8 @@ regs->seg = GET_SEG(seg) | 3; \ } while (0) -static int -restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, - unsigned long *pax) +int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, + unsigned long *pax) { void __user *buf; unsigned int tmpflags; @@ -117,9 +116,8 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, return err; } -static int -setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, - struct pt_regs *regs, unsigned long mask) +int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, + struct pt_regs *regs, unsigned long mask) { int err = 0; -- cgit v1.2.3 From fca460f95e928bae373daa8295877b6905bc62b8 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sun, 19 Feb 2012 07:56:26 -0800 Subject: x32: Handle the x32 system call flag x32 shares most system calls with x86-64, but unfortunately some subsystem (the input subsystem is the chief offender) which require is_compat() when operating with a 32-bit userspace. The input system actually has text files in sysfs whose meaning is dependent on sizeof(long) in userspace! We could solve this by having two completely disjoint system call tables; requiring that each system call be duplicated. This patch takes a different approach: we add a flag to the system call number; this flag doesn't affect the system call dispatch but requests compat treatment from affected subsystems for the duration of the system call. The change of cmpq to cmpl is safe since it immediately follows the and. Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/compat.h | 13 +++++++++++-- arch/x86/include/asm/syscall.h | 5 +++-- arch/x86/include/asm/unistd.h | 7 +++++++ arch/x86/kernel/entry_64.S | 10 ++++++++++ 4 files changed, 31 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 30d737ef2a42..7938b84e4506 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -7,6 +7,7 @@ #include #include #include +#include #define COMPAT_USER_HZ 100 #define COMPAT_UTS_MACHINE "i686\0\0" @@ -212,9 +213,17 @@ static inline void __user *arch_compat_alloc_user_space(long len) return (void __user *)regs->sp - len; } -static inline int is_compat_task(void) +static inline bool is_compat_task(void) { - return current_thread_info()->status & TS_COMPAT; +#ifdef CONFIG_IA32_EMULATION + if (current_thread_info()->status & TS_COMPAT) + return true; +#endif +#ifdef CONFIG_X86_X32_ABI + if (task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT) + return true; +#endif + return false; } #endif /* _ASM_X86_COMPAT_H */ diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index d962e5652a73..386b78686c4d 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -16,6 +16,7 @@ #include #include #include /* For NR_syscalls */ +#include extern const unsigned long sys_call_table[]; @@ -26,13 +27,13 @@ extern const unsigned long sys_call_table[]; */ static inline int syscall_get_nr(struct task_struct *task, struct pt_regs *regs) { - return regs->orig_ax; + return regs->orig_ax & __SYSCALL_MASK; } static inline void syscall_rollback(struct task_struct *task, struct pt_regs *regs) { - regs->ax = regs->orig_ax; + regs->ax = regs->orig_ax & __SYSCALL_MASK; } static inline long syscall_get_error(struct task_struct *task, diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h index 7a48a5557470..37cdc9d99bb1 100644 --- a/arch/x86/include/asm/unistd.h +++ b/arch/x86/include/asm/unistd.h @@ -5,6 +5,13 @@ #define __X32_SYSCALL_BIT 0x40000000 #ifdef __KERNEL__ + +# ifdef CONFIG_X86_X32_ABI +# define __SYSCALL_MASK (~(__X32_SYSCALL_BIT)) +# else +# define __SYSCALL_MASK (~0) +# endif + # ifdef CONFIG_X86_32 # include diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 3fe8239fd8fb..a17b34216971 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -482,7 +482,12 @@ GLOBAL(system_call_after_swapgs) testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) jnz tracesys system_call_fastpath: +#if __SYSCALL_MASK == ~0 cmpq $__NR_syscall_max,%rax +#else + andl $__SYSCALL_MASK,%eax + cmpl $__NR_syscall_max,%eax +#endif ja badsys movq %r10,%rcx call *sys_call_table(,%rax,8) # XXX: rip relative @@ -596,7 +601,12 @@ tracesys: */ LOAD_ARGS ARGOFFSET, 1 RESTORE_REST +#if __SYSCALL_MASK == ~0 cmpq $__NR_syscall_max,%rax +#else + andl $__SYSCALL_MASK,%eax + cmpl $__NR_syscall_max,%eax +#endif ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */ movq %r10,%rcx /* fixup for C */ call *sys_call_table(,%rax,8) -- cgit v1.2.3 From c5a373942bbc41698724fc948c74f959f73407e5 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sun, 19 Feb 2012 09:41:09 -0800 Subject: x32: Signal-related system calls x32 uses the 64-bit signal frame format, obviously, but there are some structures which mixes that with pointers or sizeof(long) types, as such we have to create a handful of system calls specific to x32. By and large these are a mixture of the 64-bit and the compat system calls. Originally-by: H. J. Lu Signed-off-by: H. Peter Anvin --- arch/x86/kernel/entry_64.S | 19 ++++++++ arch/x86/kernel/signal.c | 118 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 136 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index a17b34216971..53dc821f0a62 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -746,6 +746,25 @@ ENTRY(stub_rt_sigreturn) CFI_ENDPROC END(stub_rt_sigreturn) +#ifdef CONFIG_X86_X32_ABI + PTREGSCALL stub_x32_sigaltstack, sys32_sigaltstack, %rdx + +ENTRY(stub_x32_rt_sigreturn) + CFI_STARTPROC + addq $8, %rsp + PARTIAL_FRAME 0 + SAVE_REST + movq %rsp,%rdi + FIXUP_TOP_OF_STACK %r11 + call sys32_x32_rt_sigreturn + movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer + RESTORE_REST + jmp int_ret_from_sys_call + CFI_ENDPROC +END(stub_x32_rt_sigreturn) + +#endif + /* * Build the entry stubs and pointer table with some assembler magic. * We pack 7 stubs into a single 32-byte chunk, which will fit in a diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 450fb255f877..c3846b6fb726 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -29,6 +29,7 @@ #ifdef CONFIG_X86_64 #include #include +#include #endif /* CONFIG_X86_64 */ #include @@ -632,6 +633,16 @@ static int signr_convert(int sig) #define is_ia32 0 #endif /* CONFIG_IA32_EMULATION */ +#ifdef CONFIG_X86_X32_ABI +#define is_x32 test_thread_flag(TIF_X32) + +static int x32_setup_rt_frame(int sig, struct k_sigaction *ka, + siginfo_t *info, compat_sigset_t *set, + struct pt_regs *regs); +#else /* !CONFIG_X86_X32_ABI */ +#define is_x32 0 +#endif /* CONFIG_X86_X32_ABI */ + int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *set, struct pt_regs *regs); int ia32_setup_frame(int sig, struct k_sigaction *ka, @@ -656,8 +667,14 @@ setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, ret = ia32_setup_rt_frame(usig, ka, info, set, regs); else ret = ia32_setup_frame(usig, ka, set, regs); - } else +#ifdef CONFIG_X86_X32_ABI + } else if (is_x32) { + ret = x32_setup_rt_frame(usig, ka, info, + (compat_sigset_t *)set, regs); +#endif + } else { ret = __setup_rt_frame(sig, ka, info, set, regs); + } if (ret) { force_sigsegv(sig, current); @@ -840,3 +857,102 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where) force_sig(SIGSEGV, me); } + +#ifdef CONFIG_X86_X32_ABI +static int x32_setup_rt_frame(int sig, struct k_sigaction *ka, + siginfo_t *info, compat_sigset_t *set, + struct pt_regs *regs) +{ + struct rt_sigframe_x32 __user *frame; + void __user *restorer; + int err = 0; + void __user *fpstate = NULL; + + frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); + + if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + return -EFAULT; + + if (ka->sa.sa_flags & SA_SIGINFO) { + if (copy_siginfo_to_user32(&frame->info, info)) + return -EFAULT; + } + + put_user_try { + /* Create the ucontext. */ + if (cpu_has_xsave) + put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags); + else + put_user_ex(0, &frame->uc.uc_flags); + put_user_ex(0, &frame->uc.uc_link); + put_user_ex(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); + put_user_ex(sas_ss_flags(regs->sp), + &frame->uc.uc_stack.ss_flags); + put_user_ex(current->sas_ss_size, &frame->uc.uc_stack.ss_size); + put_user_ex(0, &frame->uc.uc__pad0); + err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate, + regs, set->sig[0]); + err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); + + if (ka->sa.sa_flags & SA_RESTORER) { + restorer = ka->sa.sa_restorer; + } else { + /* could use a vstub here */ + restorer = NULL; + err |= -EFAULT; + } + put_user_ex(restorer, &frame->pretcode); + } put_user_catch(err); + + if (err) + return -EFAULT; + + /* Set up registers for signal handler */ + regs->sp = (unsigned long) frame; + regs->ip = (unsigned long) ka->sa.sa_handler; + + /* We use the x32 calling convention here... */ + regs->di = sig; + regs->si = (unsigned long) &frame->info; + regs->dx = (unsigned long) &frame->uc; + + loadsegment(ds, __USER_DS); + loadsegment(es, __USER_DS); + + regs->cs = __USER_CS; + regs->ss = __USER_DS; + + return 0; +} + +asmlinkage long sys32_x32_rt_sigreturn(struct pt_regs *regs) +{ + struct rt_sigframe_x32 __user *frame; + sigset_t set; + unsigned long ax; + struct pt_regs tregs; + + frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8); + + if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) + goto badframe; + if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) + goto badframe; + + sigdelsetmask(&set, ~_BLOCKABLE); + set_current_blocked(&set); + + if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) + goto badframe; + + tregs = *regs; + if (sys32_sigaltstack(&frame->uc.uc_stack, NULL, &tregs) == -EFAULT) + goto badframe; + + return ax; + +badframe: + signal_fault(regs, frame, "x32 rt_sigreturn"); + return 0; +} +#endif -- cgit v1.2.3 From d1a797f388d6d30fa502915d1b9937ed758b7137 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sun, 19 Feb 2012 10:06:34 -0800 Subject: x32: Handle process creation Allow an x32 process to be started. Originally-by: H. J. Lu Signed-off-by: H. Peter Anvin Cc: Peter Zijlstra --- arch/x86/include/asm/compat.h | 26 ++++++++++++++++++++++++-- arch/x86/include/asm/elf.h | 25 +++++++++++++++++++++---- arch/x86/kernel/cpu/perf_event.c | 4 +++- arch/x86/kernel/entry_64.S | 15 +++++++++++++++ arch/x86/kernel/process_64.c | 23 ++++++++++++++++------- 5 files changed, 79 insertions(+), 14 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 7938b84e4506..e7f68b49c01a 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -6,6 +6,7 @@ */ #include #include +#include #include #include @@ -187,7 +188,20 @@ struct compat_shmid64_ds { /* * The type of struct elf_prstatus.pr_reg in compatible core dumps. */ +#ifdef CONFIG_X86_X32_ABI +typedef struct user_regs_struct compat_elf_gregset_t; + +#define PR_REG_SIZE(S) (test_thread_flag(TIF_IA32) ? 68 : 216) +#define PRSTATUS_SIZE(S) (test_thread_flag(TIF_IA32) ? 144 : 296) +#define SET_PR_FPVALID(S,V) \ + do { *(int *) (((void *) &((S)->pr_reg)) + PR_REG_SIZE(0)) = (V); } \ + while (0) + +#define COMPAT_USE_64BIT_TIME \ + (!!(task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT)) +#else typedef struct user_regs_struct32 compat_elf_gregset_t; +#endif /* * A pointer passed in from user mode. This should not @@ -209,8 +223,16 @@ static inline compat_uptr_t ptr_to_compat(void __user *uptr) static inline void __user *arch_compat_alloc_user_space(long len) { - struct pt_regs *regs = task_pt_regs(current); - return (void __user *)regs->sp - len; + compat_uptr_t sp; + + if (test_thread_flag(TIF_IA32)) { + sp = task_pt_regs(current)->sp; + } else { + /* -128 for the x32 ABI redzone */ + sp = percpu_read(old_rsp) - 128; + } + + return (void __user *)round_down(sp - len, 16); } static inline bool is_compat_task(void) diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 410fa6a219f6..83aabea95dd7 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -156,7 +156,12 @@ do { \ #define elf_check_arch(x) \ ((x)->e_machine == EM_X86_64) -#define compat_elf_check_arch(x) elf_check_arch_ia32(x) +#define compat_elf_check_arch(x) \ + (elf_check_arch_ia32(x) || (x)->e_machine == EM_X86_64) + +#if __USER32_DS != __USER_DS +# error "The following code assumes __USER32_DS == __USER_DS" +#endif static inline void elf_common_init(struct thread_struct *t, struct pt_regs *regs, const u16 ds) @@ -179,8 +184,9 @@ static inline void elf_common_init(struct thread_struct *t, void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp); #define compat_start_thread start_thread_ia32 -void set_personality_ia32(void); -#define COMPAT_SET_PERSONALITY(ex) set_personality_ia32() +void set_personality_ia32(bool); +#define COMPAT_SET_PERSONALITY(ex) \ + set_personality_ia32((ex).e_machine == EM_X86_64) #define COMPAT_ELF_PLATFORM ("i686") @@ -296,9 +302,20 @@ do { \ (unsigned long)current->mm->context.vdso); \ } while (0) +#define ARCH_DLINFO_X32 \ +do { \ + if (vdso_enabled) \ + NEW_AUX_ENT(AT_SYSINFO_EHDR, \ + (unsigned long)current->mm->context.vdso); \ +} while (0) + #define AT_SYSINFO 32 -#define COMPAT_ARCH_DLINFO ARCH_DLINFO_IA32(sysctl_vsyscall32) +#define COMPAT_ARCH_DLINFO \ +if (test_thread_flag(TIF_X32)) \ + ARCH_DLINFO_X32; \ +else \ + ARCH_DLINFO_IA32(sysctl_vsyscall32) #define COMPAT_ELF_ET_DYN_BASE (TASK_UNMAPPED_BASE + 0x1000000) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 5adce1040b11..63c0e058a405 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include @@ -1595,6 +1594,9 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) } #ifdef CONFIG_COMPAT + +#include + static inline int perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) { diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 53dc821f0a62..9e036f0ce5e0 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -763,6 +763,21 @@ ENTRY(stub_x32_rt_sigreturn) CFI_ENDPROC END(stub_x32_rt_sigreturn) +ENTRY(stub_x32_execve) + CFI_STARTPROC + addq $8, %rsp + PARTIAL_FRAME 0 + SAVE_REST + FIXUP_TOP_OF_STACK %r11 + movq %rsp, %rcx + call sys32_execve + RESTORE_TOP_OF_STACK %r11 + movq %rax,RAX(%rsp) + RESTORE_REST + jmp int_ret_from_sys_call + CFI_ENDPROC +END(stub_x32_execve) + #endif /* diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 5fe2fbaa56ba..a0701da2bd18 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -364,7 +364,9 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp) { start_thread_common(regs, new_ip, new_sp, - __USER32_CS, __USER32_DS, __USER32_DS); + test_thread_flag(TIF_X32) + ? __USER_CS : __USER32_CS, + __USER_DS, __USER_DS); } #endif @@ -508,6 +510,7 @@ void set_personality_64bit(void) /* Make sure to be in 64bit mode */ clear_thread_flag(TIF_IA32); + clear_thread_flag(TIF_X32); clear_thread_flag(TIF_ADDR32); clear_thread_flag(TIF_X32); @@ -522,22 +525,28 @@ void set_personality_64bit(void) current->personality &= ~READ_IMPLIES_EXEC; } -void set_personality_ia32(void) +void set_personality_ia32(bool x32) { /* inherit personality from parent */ /* Make sure to be in 32bit mode */ - set_thread_flag(TIF_IA32); set_thread_flag(TIF_ADDR32); - clear_thread_flag(TIF_X32); - current->personality |= force_personality32; /* Mark the associated mm as containing 32-bit tasks. */ if (current->mm) current->mm->context.ia32_compat = 1; - /* Prepare the first "return" to user space */ - current_thread_info()->status |= TS_COMPAT; + if (x32) { + clear_thread_flag(TIF_IA32); + set_thread_flag(TIF_X32); + current->personality &= ~READ_IMPLIES_EXEC; + } else { + set_thread_flag(TIF_IA32); + clear_thread_flag(TIF_X32); + current->personality |= force_personality32; + /* Prepare the first "return" to user space */ + current_thread_info()->status |= TS_COMPAT; + } } unsigned long get_wchan(struct task_struct *p) -- cgit v1.2.3 From a06c9bc0647f66df0534fb887ddf6cddd35f426c Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sun, 19 Feb 2012 11:08:37 -0800 Subject: x32: If configured, add x32 system calls to system call tables If CONFIG_X86_X32_ABI is defined, add the x32 system calls to the system call tables. Signed-off-by: H. Peter Anvin --- arch/x86/kernel/asm-offsets_64.c | 6 +++++- arch/x86/kernel/syscall_64.c | 7 ++++++- 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index c3354f7b0a06..1b4754f82ba7 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -2,7 +2,11 @@ #define __SYSCALL_64(nr, sym, compat) [nr] = 1, #define __SYSCALL_COMMON(nr, sym, compat) [nr] = 1, -#define __SYSCALL_X32(nr, sym, compat) /* Not yet */ +#ifdef CONFIG_X86_X32_ABI +# define __SYSCALL_X32(nr, sym, compat) [nr] = 1, +#else +# define __SYSCALL_X32(nr, sym, compat) /* nothing */ +#endif static char syscalls_64[] = { #include }; diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c index 26c4ca1f20e8..5c7f8c20da74 100644 --- a/arch/x86/kernel/syscall_64.c +++ b/arch/x86/kernel/syscall_64.c @@ -6,7 +6,12 @@ #include #define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) -#define __SYSCALL_X32(nr, sym, compat) /* Not yet */ + +#ifdef CONFIG_X86_X32_ABI +# define __SYSCALL_X32(nr, sym, compat) __SYSCALL_64(nr, sym, compat) +#else +# define __SYSCALL_X32(nr, sym, compat) /* nothing */ +#endif #define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ; #include -- cgit v1.2.3 From 8546c008924d5fd1724fa698eaa92b414bafd50d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 21 Feb 2012 10:25:45 -0800 Subject: i387: Uninline the generic FP helpers that we expose to kernel modules Instead of exporting the very low-level internals of the FPU state save/restore code (ie things like 'fpu_owner_task'), we should export the higher-level interfaces. Inlining these things is pointless anyway: sure, sometimes the end result is small, but while 'stts()' can result in just three x86 instructions, those are not cheap instructions (writing %cr0 is a serializing instruction and a very slow one at that). So the overhead of a function call is not noticeable, and we really don't want random modules mucking about with our internal state save logic anyway. So this unexports 'fpu_owner_task', and instead uninlines and exports the actual functions that modules can use: fpu_kernel_begin/end() and unlazy_fpu(). Signed-off-by: Linus Torvalds Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1202211339590.5354@i5.linux-foundation.org Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/i387.h | 78 +++--------------------------------------- arch/x86/kernel/cpu/common.c | 2 -- arch/x86/kernel/i387.c | 80 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 84 insertions(+), 76 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 247904945d3f..0c1031d354f2 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -419,70 +419,9 @@ static inline void __clear_fpu(struct task_struct *tsk) } } -/* - * Were we in an interrupt that interrupted kernel mode? - * - * We can do a kernel_fpu_begin/end() pair *ONLY* if that - * pair does nothing at all: the thread must not have fpu (so - * that we don't try to save the FPU state), and TS must - * be set (so that the clts/stts pair does nothing that is - * visible in the interrupted kernel thread). - */ -static inline bool interrupted_kernel_fpu_idle(void) -{ - return !__thread_has_fpu(current) && - (read_cr0() & X86_CR0_TS); -} - -/* - * Were we in user mode (or vm86 mode) when we were - * interrupted? - * - * Doing kernel_fpu_begin/end() is ok if we are running - * in an interrupt context from user mode - we'll just - * save the FPU state as required. - */ -static inline bool interrupted_user_mode(void) -{ - struct pt_regs *regs = get_irq_regs(); - return regs && user_mode_vm(regs); -} - -/* - * Can we use the FPU in kernel mode with the - * whole "kernel_fpu_begin/end()" sequence? - * - * It's always ok in process context (ie "not interrupt") - * but it is sometimes ok even from an irq. - */ -static inline bool irq_fpu_usable(void) -{ - return !in_interrupt() || - interrupted_user_mode() || - interrupted_kernel_fpu_idle(); -} - -static inline void kernel_fpu_begin(void) -{ - struct task_struct *me = current; - - WARN_ON_ONCE(!irq_fpu_usable()); - preempt_disable(); - if (__thread_has_fpu(me)) { - __save_init_fpu(me); - __thread_clear_has_fpu(me); - /* We do 'stts()' in kernel_fpu_end() */ - } else { - percpu_write(fpu_owner_task, NULL); - clts(); - } -} - -static inline void kernel_fpu_end(void) -{ - stts(); - preempt_enable(); -} +extern bool irq_fpu_usable(void); +extern void kernel_fpu_begin(void); +extern void kernel_fpu_end(void); /* * Some instructions like VIA's padlock instructions generate a spurious @@ -566,16 +505,7 @@ static inline void save_init_fpu(struct task_struct *tsk) preempt_enable(); } -static inline void unlazy_fpu(struct task_struct *tsk) -{ - preempt_disable(); - if (__thread_has_fpu(tsk)) { - __save_init_fpu(tsk); - __thread_fpu_end(tsk); - } else - tsk->fpu_counter = 0; - preempt_enable(); -} +extern void unlazy_fpu(struct task_struct *tsk); static inline void clear_fpu(struct task_struct *tsk) { diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c0f7d68d318f..cb71b01ab66e 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1045,7 +1045,6 @@ DEFINE_PER_CPU(char *, irq_stack_ptr) = DEFINE_PER_CPU(unsigned int, irq_count) = -1; DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); -EXPORT_PER_CPU_SYMBOL(fpu_owner_task); /* * Special IST stacks which the CPU switches to when it calls @@ -1115,7 +1114,6 @@ void debug_stack_reset(void) DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; EXPORT_PER_CPU_SYMBOL(current_task); DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); -EXPORT_PER_CPU_SYMBOL(fpu_owner_task); #ifdef CONFIG_CC_STACKPROTECTOR DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 739d8598f789..17b7549c4134 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -32,6 +32,86 @@ # define user32_fxsr_struct user_fxsr_struct #endif +/* + * Were we in an interrupt that interrupted kernel mode? + * + * We can do a kernel_fpu_begin/end() pair *ONLY* if that + * pair does nothing at all: the thread must not have fpu (so + * that we don't try to save the FPU state), and TS must + * be set (so that the clts/stts pair does nothing that is + * visible in the interrupted kernel thread). + */ +static inline bool interrupted_kernel_fpu_idle(void) +{ + return !__thread_has_fpu(current) && + (read_cr0() & X86_CR0_TS); +} + +/* + * Were we in user mode (or vm86 mode) when we were + * interrupted? + * + * Doing kernel_fpu_begin/end() is ok if we are running + * in an interrupt context from user mode - we'll just + * save the FPU state as required. + */ +static inline bool interrupted_user_mode(void) +{ + struct pt_regs *regs = get_irq_regs(); + return regs && user_mode_vm(regs); +} + +/* + * Can we use the FPU in kernel mode with the + * whole "kernel_fpu_begin/end()" sequence? + * + * It's always ok in process context (ie "not interrupt") + * but it is sometimes ok even from an irq. + */ +bool irq_fpu_usable(void) +{ + return !in_interrupt() || + interrupted_user_mode() || + interrupted_kernel_fpu_idle(); +} +EXPORT_SYMBOL(irq_fpu_usable); + +void kernel_fpu_begin(void) +{ + struct task_struct *me = current; + + WARN_ON_ONCE(!irq_fpu_usable()); + preempt_disable(); + if (__thread_has_fpu(me)) { + __save_init_fpu(me); + __thread_clear_has_fpu(me); + /* We do 'stts()' in kernel_fpu_end() */ + } else { + percpu_write(fpu_owner_task, NULL); + clts(); + } +} +EXPORT_SYMBOL(kernel_fpu_begin); + +void kernel_fpu_end(void) +{ + stts(); + preempt_enable(); +} +EXPORT_SYMBOL(kernel_fpu_end); + +void unlazy_fpu(struct task_struct *tsk) +{ + preempt_disable(); + if (__thread_has_fpu(tsk)) { + __save_init_fpu(tsk); + __thread_fpu_end(tsk); + } else + tsk->fpu_counter = 0; + preempt_enable(); +} +EXPORT_SYMBOL(unlazy_fpu); + #ifdef CONFIG_MATH_EMULATION # define HAVE_HWFP (boot_cpu_data.hard_math) #else -- cgit v1.2.3 From 1361b83a13d4d92e53fbb6c877528713e118b821 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 21 Feb 2012 13:19:22 -0800 Subject: i387: Split up into exported and internal interfaces While various modules include to get access to things we actually *intend* for them to use, most of that header file was really pretty low-level internal stuff that we really don't want to expose to others. So split the header file into two: the small exported interfaces remain in , while the internal definitions that are only used by core architecture code are now in . The guiding principle for this was to expose functions that we export to modules, and leave them in , while stuff that is used by task switching or was marked GPL-only is in . The fpu-internal.h file could be further split up too, especially since arch/x86/kvm/ uses some of the remaining stuff for its module. But that kvm usage should probably be abstracted out a bit, and at least now the internal FPU accessor functions are much more contained. Even if it isn't perhaps as contained as it _could_ be. Signed-off-by: Linus Torvalds Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1202211340330.5354@i5.linux-foundation.org Signed-off-by: H. Peter Anvin --- arch/x86/ia32/ia32_signal.c | 1 + arch/x86/include/asm/fpu-internal.h | 520 ++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/i387.h | 512 +---------------------------------- arch/x86/kernel/cpu/common.c | 1 + arch/x86/kernel/i387.c | 3 +- arch/x86/kernel/process.c | 1 + arch/x86/kernel/process_32.c | 1 + arch/x86/kernel/process_64.c | 1 + arch/x86/kernel/ptrace.c | 1 + arch/x86/kernel/signal.c | 1 + arch/x86/kernel/traps.c | 1 + arch/x86/kernel/xsave.c | 1 + arch/x86/kvm/vmx.c | 2 +- arch/x86/kvm/x86.c | 1 + arch/x86/power/cpu.c | 1 + 15 files changed, 540 insertions(+), 508 deletions(-) create mode 100644 arch/x86/include/asm/fpu-internal.h (limited to 'arch/x86/kernel') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 65577698cab2..5563ba1cf513 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h new file mode 100644 index 000000000000..4fa88154e4de --- /dev/null +++ b/arch/x86/include/asm/fpu-internal.h @@ -0,0 +1,520 @@ +/* + * Copyright (C) 1994 Linus Torvalds + * + * Pentium III FXSR, SSE support + * General FPU state handling cleanups + * Gareth Hughes , May 2000 + * x86-64 work by Andi Kleen 2002 + */ + +#ifndef _FPU_INTERNAL_H +#define _FPU_INTERNAL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern unsigned int sig_xstate_size; +extern void fpu_init(void); + +DECLARE_PER_CPU(struct task_struct *, fpu_owner_task); + +extern user_regset_active_fn fpregs_active, xfpregs_active; +extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get, + xstateregs_get; +extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set, + xstateregs_set; + + +/* + * xstateregs_active == fpregs_active. Please refer to the comment + * at the definition of fpregs_active. + */ +#define xstateregs_active fpregs_active + +extern struct _fpx_sw_bytes fx_sw_reserved; +#ifdef CONFIG_IA32_EMULATION +extern unsigned int sig_xstate_ia32_size; +extern struct _fpx_sw_bytes fx_sw_reserved_ia32; +struct _fpstate_ia32; +struct _xstate_ia32; +extern int save_i387_xstate_ia32(void __user *buf); +extern int restore_i387_xstate_ia32(void __user *buf); +#endif + +#ifdef CONFIG_MATH_EMULATION +extern void finit_soft_fpu(struct i387_soft_struct *soft); +#else +static inline void finit_soft_fpu(struct i387_soft_struct *soft) {} +#endif + +#define X87_FSW_ES (1 << 7) /* Exception Summary */ + +static __always_inline __pure bool use_xsaveopt(void) +{ + return static_cpu_has(X86_FEATURE_XSAVEOPT); +} + +static __always_inline __pure bool use_xsave(void) +{ + return static_cpu_has(X86_FEATURE_XSAVE); +} + +static __always_inline __pure bool use_fxsr(void) +{ + return static_cpu_has(X86_FEATURE_FXSR); +} + +extern void __sanitize_i387_state(struct task_struct *); + +static inline void sanitize_i387_state(struct task_struct *tsk) +{ + if (!use_xsaveopt()) + return; + __sanitize_i387_state(tsk); +} + +#ifdef CONFIG_X86_64 +static inline int fxrstor_checking(struct i387_fxsave_struct *fx) +{ + int err; + + /* See comment in fxsave() below. */ +#ifdef CONFIG_AS_FXSAVEQ + asm volatile("1: fxrstorq %[fx]\n\t" + "2:\n" + ".section .fixup,\"ax\"\n" + "3: movl $-1,%[err]\n" + " jmp 2b\n" + ".previous\n" + _ASM_EXTABLE(1b, 3b) + : [err] "=r" (err) + : [fx] "m" (*fx), "0" (0)); +#else + asm volatile("1: rex64/fxrstor (%[fx])\n\t" + "2:\n" + ".section .fixup,\"ax\"\n" + "3: movl $-1,%[err]\n" + " jmp 2b\n" + ".previous\n" + _ASM_EXTABLE(1b, 3b) + : [err] "=r" (err) + : [fx] "R" (fx), "m" (*fx), "0" (0)); +#endif + return err; +} + +static inline int fxsave_user(struct i387_fxsave_struct __user *fx) +{ + int err; + + /* + * Clear the bytes not touched by the fxsave and reserved + * for the SW usage. + */ + err = __clear_user(&fx->sw_reserved, + sizeof(struct _fpx_sw_bytes)); + if (unlikely(err)) + return -EFAULT; + + /* See comment in fxsave() below. */ +#ifdef CONFIG_AS_FXSAVEQ + asm volatile("1: fxsaveq %[fx]\n\t" + "2:\n" + ".section .fixup,\"ax\"\n" + "3: movl $-1,%[err]\n" + " jmp 2b\n" + ".previous\n" + _ASM_EXTABLE(1b, 3b) + : [err] "=r" (err), [fx] "=m" (*fx) + : "0" (0)); +#else + asm volatile("1: rex64/fxsave (%[fx])\n\t" + "2:\n" + ".section .fixup,\"ax\"\n" + "3: movl $-1,%[err]\n" + " jmp 2b\n" + ".previous\n" + _ASM_EXTABLE(1b, 3b) + : [err] "=r" (err), "=m" (*fx) + : [fx] "R" (fx), "0" (0)); +#endif + if (unlikely(err) && + __clear_user(fx, sizeof(struct i387_fxsave_struct))) + err = -EFAULT; + /* No need to clear here because the caller clears USED_MATH */ + return err; +} + +static inline void fpu_fxsave(struct fpu *fpu) +{ + /* Using "rex64; fxsave %0" is broken because, if the memory operand + uses any extended registers for addressing, a second REX prefix + will be generated (to the assembler, rex64 followed by semicolon + is a separate instruction), and hence the 64-bitness is lost. */ + +#ifdef CONFIG_AS_FXSAVEQ + /* Using "fxsaveq %0" would be the ideal choice, but is only supported + starting with gas 2.16. */ + __asm__ __volatile__("fxsaveq %0" + : "=m" (fpu->state->fxsave)); +#else + /* Using, as a workaround, the properly prefixed form below isn't + accepted by any binutils version so far released, complaining that + the same type of prefix is used twice if an extended register is + needed for addressing (fix submitted to mainline 2005-11-21). + asm volatile("rex64/fxsave %0" + : "=m" (fpu->state->fxsave)); + This, however, we can work around by forcing the compiler to select + an addressing mode that doesn't require extended registers. */ + asm volatile("rex64/fxsave (%[fx])" + : "=m" (fpu->state->fxsave) + : [fx] "R" (&fpu->state->fxsave)); +#endif +} + +#else /* CONFIG_X86_32 */ + +/* perform fxrstor iff the processor has extended states, otherwise frstor */ +static inline int fxrstor_checking(struct i387_fxsave_struct *fx) +{ + /* + * The "nop" is needed to make the instructions the same + * length. + */ + alternative_input( + "nop ; frstor %1", + "fxrstor %1", + X86_FEATURE_FXSR, + "m" (*fx)); + + return 0; +} + +static inline void fpu_fxsave(struct fpu *fpu) +{ + asm volatile("fxsave %[fx]" + : [fx] "=m" (fpu->state->fxsave)); +} + +#endif /* CONFIG_X86_64 */ + +/* + * These must be called with preempt disabled. Returns + * 'true' if the FPU state is still intact. + */ +static inline int fpu_save_init(struct fpu *fpu) +{ + if (use_xsave()) { + fpu_xsave(fpu); + + /* + * xsave header may indicate the init state of the FP. + */ + if (!(fpu->state->xsave.xsave_hdr.xstate_bv & XSTATE_FP)) + return 1; + } else if (use_fxsr()) { + fpu_fxsave(fpu); + } else { + asm volatile("fnsave %[fx]; fwait" + : [fx] "=m" (fpu->state->fsave)); + return 0; + } + + /* + * If exceptions are pending, we need to clear them so + * that we don't randomly get exceptions later. + * + * FIXME! Is this perhaps only true for the old-style + * irq13 case? Maybe we could leave the x87 state + * intact otherwise? + */ + if (unlikely(fpu->state->fxsave.swd & X87_FSW_ES)) { + asm volatile("fnclex"); + return 0; + } + return 1; +} + +static inline int __save_init_fpu(struct task_struct *tsk) +{ + return fpu_save_init(&tsk->thread.fpu); +} + +static inline int fpu_fxrstor_checking(struct fpu *fpu) +{ + return fxrstor_checking(&fpu->state->fxsave); +} + +static inline int fpu_restore_checking(struct fpu *fpu) +{ + if (use_xsave()) + return fpu_xrstor_checking(fpu); + else + return fpu_fxrstor_checking(fpu); +} + +static inline int restore_fpu_checking(struct task_struct *tsk) +{ + /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception + is pending. Clear the x87 state here by setting it to fixed + values. "m" is a random variable that should be in L1 */ + alternative_input( + ASM_NOP8 ASM_NOP2, + "emms\n\t" /* clear stack tags */ + "fildl %P[addr]", /* set F?P to defined value */ + X86_FEATURE_FXSAVE_LEAK, + [addr] "m" (tsk->thread.fpu.has_fpu)); + + return fpu_restore_checking(&tsk->thread.fpu); +} + +/* + * Software FPU state helpers. Careful: these need to + * be preemption protection *and* they need to be + * properly paired with the CR0.TS changes! + */ +static inline int __thread_has_fpu(struct task_struct *tsk) +{ + return tsk->thread.fpu.has_fpu; +} + +/* Must be paired with an 'stts' after! */ +static inline void __thread_clear_has_fpu(struct task_struct *tsk) +{ + tsk->thread.fpu.has_fpu = 0; + percpu_write(fpu_owner_task, NULL); +} + +/* Must be paired with a 'clts' before! */ +static inline void __thread_set_has_fpu(struct task_struct *tsk) +{ + tsk->thread.fpu.has_fpu = 1; + percpu_write(fpu_owner_task, tsk); +} + +/* + * Encapsulate the CR0.TS handling together with the + * software flag. + * + * These generally need preemption protection to work, + * do try to avoid using these on their own. + */ +static inline void __thread_fpu_end(struct task_struct *tsk) +{ + __thread_clear_has_fpu(tsk); + stts(); +} + +static inline void __thread_fpu_begin(struct task_struct *tsk) +{ + clts(); + __thread_set_has_fpu(tsk); +} + +/* + * FPU state switching for scheduling. + * + * This is a two-stage process: + * + * - switch_fpu_prepare() saves the old state and + * sets the new state of the CR0.TS bit. This is + * done within the context of the old process. + * + * - switch_fpu_finish() restores the new state as + * necessary. + */ +typedef struct { int preload; } fpu_switch_t; + +/* + * FIXME! We could do a totally lazy restore, but we need to + * add a per-cpu "this was the task that last touched the FPU + * on this CPU" variable, and the task needs to have a "I last + * touched the FPU on this CPU" and check them. + * + * We don't do that yet, so "fpu_lazy_restore()" always returns + * false, but some day.. + */ +static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu) +{ + return new == percpu_read_stable(fpu_owner_task) && + cpu == new->thread.fpu.last_cpu; +} + +static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new, int cpu) +{ + fpu_switch_t fpu; + + fpu.preload = tsk_used_math(new) && new->fpu_counter > 5; + if (__thread_has_fpu(old)) { + if (!__save_init_fpu(old)) + cpu = ~0; + old->thread.fpu.last_cpu = cpu; + old->thread.fpu.has_fpu = 0; /* But leave fpu_owner_task! */ + + /* Don't change CR0.TS if we just switch! */ + if (fpu.preload) { + new->fpu_counter++; + __thread_set_has_fpu(new); + prefetch(new->thread.fpu.state); + } else + stts(); + } else { + old->fpu_counter = 0; + old->thread.fpu.last_cpu = ~0; + if (fpu.preload) { + new->fpu_counter++; + if (fpu_lazy_restore(new, cpu)) + fpu.preload = 0; + else + prefetch(new->thread.fpu.state); + __thread_fpu_begin(new); + } + } + return fpu; +} + +/* + * By the time this gets called, we've already cleared CR0.TS and + * given the process the FPU if we are going to preload the FPU + * state - all we need to do is to conditionally restore the register + * state itself. + */ +static inline void switch_fpu_finish(struct task_struct *new, fpu_switch_t fpu) +{ + if (fpu.preload) { + if (unlikely(restore_fpu_checking(new))) + __thread_fpu_end(new); + } +} + +/* + * Signal frame handlers... + */ +extern int save_i387_xstate(void __user *buf); +extern int restore_i387_xstate(void __user *buf); + +static inline void __clear_fpu(struct task_struct *tsk) +{ + if (__thread_has_fpu(tsk)) { + /* Ignore delayed exceptions from user space */ + asm volatile("1: fwait\n" + "2:\n" + _ASM_EXTABLE(1b, 2b)); + __thread_fpu_end(tsk); + } +} + +/* + * The actual user_fpu_begin/end() functions + * need to be preemption-safe. + * + * NOTE! user_fpu_end() must be used only after you + * have saved the FP state, and user_fpu_begin() must + * be used only immediately before restoring it. + * These functions do not do any save/restore on + * their own. + */ +static inline void user_fpu_end(void) +{ + preempt_disable(); + __thread_fpu_end(current); + preempt_enable(); +} + +static inline void user_fpu_begin(void) +{ + preempt_disable(); + if (!user_has_fpu()) + __thread_fpu_begin(current); + preempt_enable(); +} + +/* + * These disable preemption on their own and are safe + */ +static inline void save_init_fpu(struct task_struct *tsk) +{ + WARN_ON_ONCE(!__thread_has_fpu(tsk)); + preempt_disable(); + __save_init_fpu(tsk); + __thread_fpu_end(tsk); + preempt_enable(); +} + +static inline void clear_fpu(struct task_struct *tsk) +{ + preempt_disable(); + __clear_fpu(tsk); + preempt_enable(); +} + +/* + * i387 state interaction + */ +static inline unsigned short get_fpu_cwd(struct task_struct *tsk) +{ + if (cpu_has_fxsr) { + return tsk->thread.fpu.state->fxsave.cwd; + } else { + return (unsigned short)tsk->thread.fpu.state->fsave.cwd; + } +} + +static inline unsigned short get_fpu_swd(struct task_struct *tsk) +{ + if (cpu_has_fxsr) { + return tsk->thread.fpu.state->fxsave.swd; + } else { + return (unsigned short)tsk->thread.fpu.state->fsave.swd; + } +} + +static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk) +{ + if (cpu_has_xmm) { + return tsk->thread.fpu.state->fxsave.mxcsr; + } else { + return MXCSR_DEFAULT; + } +} + +static bool fpu_allocated(struct fpu *fpu) +{ + return fpu->state != NULL; +} + +static inline int fpu_alloc(struct fpu *fpu) +{ + if (fpu_allocated(fpu)) + return 0; + fpu->state = kmem_cache_alloc(task_xstate_cachep, GFP_KERNEL); + if (!fpu->state) + return -ENOMEM; + WARN_ON((unsigned long)fpu->state & 15); + return 0; +} + +static inline void fpu_free(struct fpu *fpu) +{ + if (fpu->state) { + kmem_cache_free(task_xstate_cachep, fpu->state); + fpu->state = NULL; + } +} + +static inline void fpu_copy(struct fpu *dst, struct fpu *src) +{ + memcpy(dst->state, src->state, xstate_size); +} + +extern void fpu_finit(struct fpu *fpu); + +#endif diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 0c1031d354f2..7ce0798b1b26 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -13,411 +13,15 @@ #ifndef __ASSEMBLY__ #include -#include -#include #include -#include -#include -#include -#include -#include -#include -#include -#include +#include + +struct pt_regs; +struct user_i387_struct; -extern unsigned int sig_xstate_size; -extern void fpu_init(void); -extern void mxcsr_feature_mask_init(void); extern int init_fpu(struct task_struct *child); -extern void math_state_restore(void); extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); - -DECLARE_PER_CPU(struct task_struct *, fpu_owner_task); - -extern user_regset_active_fn fpregs_active, xfpregs_active; -extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get, - xstateregs_get; -extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set, - xstateregs_set; - -/* - * xstateregs_active == fpregs_active. Please refer to the comment - * at the definition of fpregs_active. - */ -#define xstateregs_active fpregs_active - -extern struct _fpx_sw_bytes fx_sw_reserved; -#ifdef CONFIG_IA32_EMULATION -extern unsigned int sig_xstate_ia32_size; -extern struct _fpx_sw_bytes fx_sw_reserved_ia32; -struct _fpstate_ia32; -struct _xstate_ia32; -extern int save_i387_xstate_ia32(void __user *buf); -extern int restore_i387_xstate_ia32(void __user *buf); -#endif - -#ifdef CONFIG_MATH_EMULATION -extern void finit_soft_fpu(struct i387_soft_struct *soft); -#else -static inline void finit_soft_fpu(struct i387_soft_struct *soft) {} -#endif - -#define X87_FSW_ES (1 << 7) /* Exception Summary */ - -static __always_inline __pure bool use_xsaveopt(void) -{ - return static_cpu_has(X86_FEATURE_XSAVEOPT); -} - -static __always_inline __pure bool use_xsave(void) -{ - return static_cpu_has(X86_FEATURE_XSAVE); -} - -static __always_inline __pure bool use_fxsr(void) -{ - return static_cpu_has(X86_FEATURE_FXSR); -} - -extern void __sanitize_i387_state(struct task_struct *); - -static inline void sanitize_i387_state(struct task_struct *tsk) -{ - if (!use_xsaveopt()) - return; - __sanitize_i387_state(tsk); -} - -#ifdef CONFIG_X86_64 -static inline int fxrstor_checking(struct i387_fxsave_struct *fx) -{ - int err; - - /* See comment in fxsave() below. */ -#ifdef CONFIG_AS_FXSAVEQ - asm volatile("1: fxrstorq %[fx]\n\t" - "2:\n" - ".section .fixup,\"ax\"\n" - "3: movl $-1,%[err]\n" - " jmp 2b\n" - ".previous\n" - _ASM_EXTABLE(1b, 3b) - : [err] "=r" (err) - : [fx] "m" (*fx), "0" (0)); -#else - asm volatile("1: rex64/fxrstor (%[fx])\n\t" - "2:\n" - ".section .fixup,\"ax\"\n" - "3: movl $-1,%[err]\n" - " jmp 2b\n" - ".previous\n" - _ASM_EXTABLE(1b, 3b) - : [err] "=r" (err) - : [fx] "R" (fx), "m" (*fx), "0" (0)); -#endif - return err; -} - -static inline int fxsave_user(struct i387_fxsave_struct __user *fx) -{ - int err; - - /* - * Clear the bytes not touched by the fxsave and reserved - * for the SW usage. - */ - err = __clear_user(&fx->sw_reserved, - sizeof(struct _fpx_sw_bytes)); - if (unlikely(err)) - return -EFAULT; - - /* See comment in fxsave() below. */ -#ifdef CONFIG_AS_FXSAVEQ - asm volatile("1: fxsaveq %[fx]\n\t" - "2:\n" - ".section .fixup,\"ax\"\n" - "3: movl $-1,%[err]\n" - " jmp 2b\n" - ".previous\n" - _ASM_EXTABLE(1b, 3b) - : [err] "=r" (err), [fx] "=m" (*fx) - : "0" (0)); -#else - asm volatile("1: rex64/fxsave (%[fx])\n\t" - "2:\n" - ".section .fixup,\"ax\"\n" - "3: movl $-1,%[err]\n" - " jmp 2b\n" - ".previous\n" - _ASM_EXTABLE(1b, 3b) - : [err] "=r" (err), "=m" (*fx) - : [fx] "R" (fx), "0" (0)); -#endif - if (unlikely(err) && - __clear_user(fx, sizeof(struct i387_fxsave_struct))) - err = -EFAULT; - /* No need to clear here because the caller clears USED_MATH */ - return err; -} - -static inline void fpu_fxsave(struct fpu *fpu) -{ - /* Using "rex64; fxsave %0" is broken because, if the memory operand - uses any extended registers for addressing, a second REX prefix - will be generated (to the assembler, rex64 followed by semicolon - is a separate instruction), and hence the 64-bitness is lost. */ - -#ifdef CONFIG_AS_FXSAVEQ - /* Using "fxsaveq %0" would be the ideal choice, but is only supported - starting with gas 2.16. */ - __asm__ __volatile__("fxsaveq %0" - : "=m" (fpu->state->fxsave)); -#else - /* Using, as a workaround, the properly prefixed form below isn't - accepted by any binutils version so far released, complaining that - the same type of prefix is used twice if an extended register is - needed for addressing (fix submitted to mainline 2005-11-21). - asm volatile("rex64/fxsave %0" - : "=m" (fpu->state->fxsave)); - This, however, we can work around by forcing the compiler to select - an addressing mode that doesn't require extended registers. */ - asm volatile("rex64/fxsave (%[fx])" - : "=m" (fpu->state->fxsave) - : [fx] "R" (&fpu->state->fxsave)); -#endif -} - -#else /* CONFIG_X86_32 */ - -/* perform fxrstor iff the processor has extended states, otherwise frstor */ -static inline int fxrstor_checking(struct i387_fxsave_struct *fx) -{ - /* - * The "nop" is needed to make the instructions the same - * length. - */ - alternative_input( - "nop ; frstor %1", - "fxrstor %1", - X86_FEATURE_FXSR, - "m" (*fx)); - - return 0; -} - -static inline void fpu_fxsave(struct fpu *fpu) -{ - asm volatile("fxsave %[fx]" - : [fx] "=m" (fpu->state->fxsave)); -} - -#endif /* CONFIG_X86_64 */ - -/* - * These must be called with preempt disabled. Returns - * 'true' if the FPU state is still intact. - */ -static inline int fpu_save_init(struct fpu *fpu) -{ - if (use_xsave()) { - fpu_xsave(fpu); - - /* - * xsave header may indicate the init state of the FP. - */ - if (!(fpu->state->xsave.xsave_hdr.xstate_bv & XSTATE_FP)) - return 1; - } else if (use_fxsr()) { - fpu_fxsave(fpu); - } else { - asm volatile("fnsave %[fx]; fwait" - : [fx] "=m" (fpu->state->fsave)); - return 0; - } - - /* - * If exceptions are pending, we need to clear them so - * that we don't randomly get exceptions later. - * - * FIXME! Is this perhaps only true for the old-style - * irq13 case? Maybe we could leave the x87 state - * intact otherwise? - */ - if (unlikely(fpu->state->fxsave.swd & X87_FSW_ES)) { - asm volatile("fnclex"); - return 0; - } - return 1; -} - -static inline int __save_init_fpu(struct task_struct *tsk) -{ - return fpu_save_init(&tsk->thread.fpu); -} - -static inline int fpu_fxrstor_checking(struct fpu *fpu) -{ - return fxrstor_checking(&fpu->state->fxsave); -} - -static inline int fpu_restore_checking(struct fpu *fpu) -{ - if (use_xsave()) - return fpu_xrstor_checking(fpu); - else - return fpu_fxrstor_checking(fpu); -} - -static inline int restore_fpu_checking(struct task_struct *tsk) -{ - /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception - is pending. Clear the x87 state here by setting it to fixed - values. "m" is a random variable that should be in L1 */ - alternative_input( - ASM_NOP8 ASM_NOP2, - "emms\n\t" /* clear stack tags */ - "fildl %P[addr]", /* set F?P to defined value */ - X86_FEATURE_FXSAVE_LEAK, - [addr] "m" (tsk->thread.fpu.has_fpu)); - - return fpu_restore_checking(&tsk->thread.fpu); -} - -/* - * Software FPU state helpers. Careful: these need to - * be preemption protection *and* they need to be - * properly paired with the CR0.TS changes! - */ -static inline int __thread_has_fpu(struct task_struct *tsk) -{ - return tsk->thread.fpu.has_fpu; -} - -/* Must be paired with an 'stts' after! */ -static inline void __thread_clear_has_fpu(struct task_struct *tsk) -{ - tsk->thread.fpu.has_fpu = 0; - percpu_write(fpu_owner_task, NULL); -} - -/* Must be paired with a 'clts' before! */ -static inline void __thread_set_has_fpu(struct task_struct *tsk) -{ - tsk->thread.fpu.has_fpu = 1; - percpu_write(fpu_owner_task, tsk); -} - -/* - * Encapsulate the CR0.TS handling together with the - * software flag. - * - * These generally need preemption protection to work, - * do try to avoid using these on their own. - */ -static inline void __thread_fpu_end(struct task_struct *tsk) -{ - __thread_clear_has_fpu(tsk); - stts(); -} - -static inline void __thread_fpu_begin(struct task_struct *tsk) -{ - clts(); - __thread_set_has_fpu(tsk); -} - -/* - * FPU state switching for scheduling. - * - * This is a two-stage process: - * - * - switch_fpu_prepare() saves the old state and - * sets the new state of the CR0.TS bit. This is - * done within the context of the old process. - * - * - switch_fpu_finish() restores the new state as - * necessary. - */ -typedef struct { int preload; } fpu_switch_t; - -/* - * FIXME! We could do a totally lazy restore, but we need to - * add a per-cpu "this was the task that last touched the FPU - * on this CPU" variable, and the task needs to have a "I last - * touched the FPU on this CPU" and check them. - * - * We don't do that yet, so "fpu_lazy_restore()" always returns - * false, but some day.. - */ -static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu) -{ - return new == percpu_read_stable(fpu_owner_task) && - cpu == new->thread.fpu.last_cpu; -} - -static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new, int cpu) -{ - fpu_switch_t fpu; - - fpu.preload = tsk_used_math(new) && new->fpu_counter > 5; - if (__thread_has_fpu(old)) { - if (!__save_init_fpu(old)) - cpu = ~0; - old->thread.fpu.last_cpu = cpu; - old->thread.fpu.has_fpu = 0; /* But leave fpu_owner_task! */ - - /* Don't change CR0.TS if we just switch! */ - if (fpu.preload) { - new->fpu_counter++; - __thread_set_has_fpu(new); - prefetch(new->thread.fpu.state); - } else - stts(); - } else { - old->fpu_counter = 0; - old->thread.fpu.last_cpu = ~0; - if (fpu.preload) { - new->fpu_counter++; - if (fpu_lazy_restore(new, cpu)) - fpu.preload = 0; - else - prefetch(new->thread.fpu.state); - __thread_fpu_begin(new); - } - } - return fpu; -} - -/* - * By the time this gets called, we've already cleared CR0.TS and - * given the process the FPU if we are going to preload the FPU - * state - all we need to do is to conditionally restore the register - * state itself. - */ -static inline void switch_fpu_finish(struct task_struct *new, fpu_switch_t fpu) -{ - if (fpu.preload) { - if (unlikely(restore_fpu_checking(new))) - __thread_fpu_end(new); - } -} - -/* - * Signal frame handlers... - */ -extern int save_i387_xstate(void __user *buf); -extern int restore_i387_xstate(void __user *buf); - -static inline void __clear_fpu(struct task_struct *tsk) -{ - if (__thread_has_fpu(tsk)) { - /* Ignore delayed exceptions from user space */ - asm volatile("1: fwait\n" - "2:\n" - _ASM_EXTABLE(1b, 2b)); - __thread_fpu_end(tsk); - } -} +extern void math_state_restore(void); extern bool irq_fpu_usable(void); extern void kernel_fpu_begin(void); @@ -463,118 +67,14 @@ static inline void irq_ts_restore(int TS_state) * we can just assume we have FPU access - typically * to save the FP state - we'll just take a #NM * fault and get the FPU access back. - * - * The actual user_fpu_begin/end() functions - * need to be preemption-safe, though. - * - * NOTE! user_fpu_end() must be used only after you - * have saved the FP state, and user_fpu_begin() must - * be used only immediately before restoring it. - * These functions do not do any save/restore on - * their own. */ static inline int user_has_fpu(void) { - return __thread_has_fpu(current); -} - -static inline void user_fpu_end(void) -{ - preempt_disable(); - __thread_fpu_end(current); - preempt_enable(); -} - -static inline void user_fpu_begin(void) -{ - preempt_disable(); - if (!user_has_fpu()) - __thread_fpu_begin(current); - preempt_enable(); -} - -/* - * These disable preemption on their own and are safe - */ -static inline void save_init_fpu(struct task_struct *tsk) -{ - WARN_ON_ONCE(!__thread_has_fpu(tsk)); - preempt_disable(); - __save_init_fpu(tsk); - __thread_fpu_end(tsk); - preempt_enable(); + return current->thread.fpu.has_fpu; } extern void unlazy_fpu(struct task_struct *tsk); -static inline void clear_fpu(struct task_struct *tsk) -{ - preempt_disable(); - __clear_fpu(tsk); - preempt_enable(); -} - -/* - * i387 state interaction - */ -static inline unsigned short get_fpu_cwd(struct task_struct *tsk) -{ - if (cpu_has_fxsr) { - return tsk->thread.fpu.state->fxsave.cwd; - } else { - return (unsigned short)tsk->thread.fpu.state->fsave.cwd; - } -} - -static inline unsigned short get_fpu_swd(struct task_struct *tsk) -{ - if (cpu_has_fxsr) { - return tsk->thread.fpu.state->fxsave.swd; - } else { - return (unsigned short)tsk->thread.fpu.state->fsave.swd; - } -} - -static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk) -{ - if (cpu_has_xmm) { - return tsk->thread.fpu.state->fxsave.mxcsr; - } else { - return MXCSR_DEFAULT; - } -} - -static bool fpu_allocated(struct fpu *fpu) -{ - return fpu->state != NULL; -} - -static inline int fpu_alloc(struct fpu *fpu) -{ - if (fpu_allocated(fpu)) - return 0; - fpu->state = kmem_cache_alloc(task_xstate_cachep, GFP_KERNEL); - if (!fpu->state) - return -ENOMEM; - WARN_ON((unsigned long)fpu->state & 15); - return 0; -} - -static inline void fpu_free(struct fpu *fpu) -{ - if (fpu->state) { - kmem_cache_free(task_xstate_cachep, fpu->state); - fpu->state = NULL; - } -} - -static inline void fpu_copy(struct fpu *dst, struct fpu *src) -{ - memcpy(dst->state, src->state, xstate_size); -} - -extern void fpu_finit(struct fpu *fpu); - #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_I387_H */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cb71b01ab66e..89620b1725d4 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 17b7549c4134..7734bcbb5a3a 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #ifdef CONFIG_X86_64 @@ -124,7 +125,7 @@ EXPORT_SYMBOL_GPL(xstate_size); unsigned int sig_xstate_ia32_size = sizeof(struct _fpstate_ia32); static struct i387_fxsave_struct fx_scratch __cpuinitdata; -void __cpuinit mxcsr_feature_mask_init(void) +static void __cpuinit mxcsr_feature_mask_init(void) { unsigned long mask = 0; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 15763af7bfe3..c38d84e01022 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -21,6 +21,7 @@ #include #include #include +#include #include struct kmem_cache *task_xstate_cachep; diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index c08d1ff12b7c..ee32dee7a0a3 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #ifdef CONFIG_MATH_EMULATION #include diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index cfa5c90c01db..5bad3c71e48f 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 50267386b766..78f05e438be5 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 46a01bdc27e2..25edcfc9ba5b 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 4bbe04d96744..ec61d4c1b93b 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -54,6 +54,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 711091114119..e62728e30b01 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -6,6 +6,7 @@ #include #include #include +#include #ifdef CONFIG_IA32_EMULATION #include #endif diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 3b4c8d8ad906..246490f643b6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1457,7 +1457,7 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) #ifdef CONFIG_X86_64 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); #endif - if (__thread_has_fpu(current)) + if (user_has_fpu()) clts(); load_gdt(&__get_cpu_var(host_gdt)); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9cbfc0698118..b937b6179d80 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -57,6 +57,7 @@ #include #include #include +#include /* Ugh! */ #include #include #include diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index f10c0afa1cb4..4889655ba784 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -20,6 +20,7 @@ #include #include #include +#include /* pcntxt_mask */ #ifdef CONFIG_X86_32 static struct saved_context saved_context; -- cgit v1.2.3 From b0e5c77903fd717cc5eb02b7b8f5de3c869efc49 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 6 Feb 2012 18:32:20 -0800 Subject: x86/tsc: Reduce the TSC sync check time for core-siblings For each logical CPU that is coming online, we spend 20msec for checking the TSC synchronization. And as this is done sequentially for each logical CPU boot, this time gets added up depending on the number of logical CPU's supported by the platform. Minimize this by using the socket topology information. If the target CPU coming online doesn't have any of its core-siblings online, a timeout of 20msec will be used for the TSC-warp measurement loop. Otherwise a smaller timeout of 2msec will be used, as we have some information about this socket already (and this information grows as we have more and more logical-siblings in that socket). Ideally we should be able to skip the TSC sync check on the other core-siblings, if the first logical CPU in a socket passed the sync test. But as the TSC is per-logical CPU and can potentially be modified wrongly by the bios before the OS boot, TSC sync test for smaller duration should be able to catch such errors. Also this will catch the condition where all the cores in the socket doesn't get reset at the same time. For example, with this modification, time spent in TSC sync checks on a 4 socket 10-core with HT system gets reduced from 1580msec to 212msec. Signed-off-by: Suresh Siddha Acked-by: Arjan van de Ven Acked-by: Peter Zijlstra Cc: Jack Steiner Cc: venki@google.com Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/1328581940.29790.20.camel@sbsiddha-desk.sc.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc_sync.c | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 9eba29b46cb7..fc25e60a5884 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -42,7 +42,7 @@ static __cpuinitdata int nr_warps; /* * TSC-warp measurement loop running on both CPUs: */ -static __cpuinit void check_tsc_warp(void) +static __cpuinit void check_tsc_warp(unsigned int timeout) { cycles_t start, now, prev, end; int i; @@ -51,9 +51,9 @@ static __cpuinit void check_tsc_warp(void) start = get_cycles(); rdtsc_barrier(); /* - * The measurement runs for 20 msecs: + * The measurement runs for 'timeout' msecs: */ - end = start + tsc_khz * 20ULL; + end = start + (cycles_t) tsc_khz * timeout; now = start; for (i = 0; ; i++) { @@ -98,6 +98,25 @@ static __cpuinit void check_tsc_warp(void) now-start, end-start); } +/* + * If the target CPU coming online doesn't have any of its core-siblings + * online, a timeout of 20msec will be used for the TSC-warp measurement + * loop. Otherwise a smaller timeout of 2msec will be used, as we have some + * information about this socket already (and this information grows as we + * have more and more logical-siblings in that socket). + * + * Ideally we should be able to skip the TSC sync check on the other + * core-siblings, if the first logical CPU in a socket passed the sync test. + * But as the TSC is per-logical CPU and can potentially be modified wrongly + * by the bios, TSC sync test for smaller duration should be able + * to catch such errors. Also this will catch the condition where all the + * cores in the socket doesn't get reset at the same time. + */ +static inline unsigned int loop_timeout(int cpu) +{ + return (cpumask_weight(cpu_core_mask(cpu)) > 1) ? 2 : 20; +} + /* * Source CPU calls into this - it waits for the freshly booted * target CPU to arrive and then starts the measurement: @@ -135,7 +154,7 @@ void __cpuinit check_tsc_sync_source(int cpu) */ atomic_inc(&start_count); - check_tsc_warp(); + check_tsc_warp(loop_timeout(cpu)); while (atomic_read(&stop_count) != cpus-1) cpu_relax(); @@ -183,7 +202,7 @@ void __cpuinit check_tsc_sync_target(void) while (atomic_read(&start_count) != cpus) cpu_relax(); - check_tsc_warp(); + check_tsc_warp(loop_timeout(smp_processor_id())); /* * Ok, we are done: -- cgit v1.2.3 From 140f190bc3a3b6f200548d204befd998eadd63fd Mon Sep 17 00:00:00 2001 From: "Luck, Tony" Date: Wed, 22 Feb 2012 10:06:44 -0800 Subject: x86: Remove some noise from boot log when starting cpus Printing the "start_ip" for every secondary cpu is very noisy on a large system - and doesn't add any value. Drop this message. Console log before: Booting Node 0, Processors #1 smpboot cpu 1: start_ip = 96000 #2 smpboot cpu 2: start_ip = 96000 #3 smpboot cpu 3: start_ip = 96000 #4 smpboot cpu 4: start_ip = 96000 ... #31 smpboot cpu 31: start_ip = 96000 Brought up 32 CPUs Console log after: Booting Node 0, Processors #1 #2 #3 #4 #5 #6 #7 Ok. Booting Node 1, Processors #8 #9 #10 #11 #12 #13 #14 #15 Ok. Booting Node 0, Processors #16 #17 #18 #19 #20 #21 #22 #23 Ok. Booting Node 1, Processors #24 #25 #26 #27 #28 #29 #30 #31 Brought up 32 CPUs Acked-by: Borislav Petkov Signed-off-by: Tony Luck Link: http://lkml.kernel.org/r/4f452eb42507460426@agluck-desktop.sc.intel.com Signed-off-by: H. Peter Anvin --- arch/x86/kernel/smpboot.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 66d250c00d11..683575250a65 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -740,8 +740,6 @@ do_rest: * the targeted processor. */ - printk(KERN_DEBUG "smpboot cpu %d: start_ip = %lx\n", cpu, start_ip); - atomic_set(&init_deasserted, 0); if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { -- cgit v1.2.3 From d6126ef5f31ca54980cb067af659a360dfcca037 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 26 Jan 2012 15:49:14 -0800 Subject: x86/mce: Convert static array of pointers to per-cpu variables When I previously fixed up the mce_device code, I used a static array of the pointers. It was (rightfully) pointed out to me that I should be using the per_cpu code instead. This patch converts the code over to that structure, moving the variable back into the per_cpu area, like it used to be for 3.2 and earlier. Signed-off-by: Greg Kroah-Hartman Reviewed-by: Srivatsa S. Bhat Link: https://lkml.org/lkml/2012/1/27/165 Signed-off-by: Tony Luck --- arch/x86/include/asm/mce.h | 2 +- arch/x86/kernel/cpu/mcheck/mce.c | 8 ++++---- arch/x86/kernel/cpu/mcheck/mce_amd.c | 9 +++++---- 3 files changed, 10 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 6aefb14cbbc5..441520e4174f 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -151,7 +151,7 @@ static inline void enable_p5_mce(void) {} void mce_setup(struct mce *m); void mce_log(struct mce *m); -extern struct device *mce_device[CONFIG_NR_CPUS]; +DECLARE_PER_CPU(struct device *, mce_device); /* * Maximum banks number. diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 5a11ae2e9e91..4979a5dfeba2 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1859,7 +1859,7 @@ static struct bus_type mce_subsys = { .dev_name = "machinecheck", }; -struct device *mce_device[CONFIG_NR_CPUS]; +DEFINE_PER_CPU(struct device *, mce_device); __cpuinitdata void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); @@ -2038,7 +2038,7 @@ static __cpuinit int mce_device_create(unsigned int cpu) goto error2; } cpumask_set_cpu(cpu, mce_device_initialized); - mce_device[cpu] = dev; + per_cpu(mce_device, cpu) = dev; return 0; error2: @@ -2055,7 +2055,7 @@ error: static __cpuinit void mce_device_remove(unsigned int cpu) { - struct device *dev = mce_device[cpu]; + struct device *dev = per_cpu(mce_device, cpu); int i; if (!cpumask_test_cpu(cpu, mce_device_initialized)) @@ -2069,7 +2069,7 @@ static __cpuinit void mce_device_remove(unsigned int cpu) device_unregister(dev); cpumask_clear_cpu(cpu, mce_device_initialized); - mce_device[cpu] = NULL; + per_cpu(mce_device, cpu) = NULL; } /* Make sure there are no machine checks on offlined CPUs. */ diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 786e76a86322..a4bf9d23cdba 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -523,7 +523,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) { int i, err = 0; struct threshold_bank *b = NULL; - struct device *dev = mce_device[cpu]; + struct device *dev = per_cpu(mce_device, cpu); char name[32]; sprintf(name, "threshold_bank%i", bank); @@ -585,7 +585,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) if (i == cpu) continue; - dev = mce_device[i]; + dev = per_cpu(mce_device, i); if (dev) err = sysfs_create_link(&dev->kobj,b->kobj, name); if (err) @@ -665,7 +665,8 @@ static void threshold_remove_bank(unsigned int cpu, int bank) #ifdef CONFIG_SMP /* sibling symlink */ if (shared_bank[bank] && b->blocks->cpu != cpu) { - sysfs_remove_link(&mce_device[cpu]->kobj, name); + dev = per_cpu(mce_device, cpu); + sysfs_remove_link(&dev->kobj, name); per_cpu(threshold_banks, cpu)[bank] = NULL; return; @@ -677,7 +678,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank) if (i == cpu) continue; - dev = mce_device[i]; + dev = per_cpu(mce_device, i); if (dev) sysfs_remove_link(&dev->kobj, name); per_cpu(threshold_banks, i)[bank] = NULL; -- cgit v1.2.3 From fadd85f16a8ec3fee8af599e79a209682dc52348 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Mon, 23 Jan 2012 15:54:52 -0500 Subject: x86/mce: Fix return value of mce_chrdev_read() when erst is disabled Current kernel MCE code reads ERST at the first reading of /dev/mcelog (maybe in starting mcelogd,) even if the system does not support ERST, which results in a fake "no such device" message (as described in [1].) This problem is not critical, but can confuse system admins. This patch fixes it by filtering the return value from lower (ACPI) layer. [1] http://thread.gmane.org/gmane.linux.kernel/1060250 Reported by: Jon Masters Signed-off-by: Naoya Horiguchi Cc: Andi Kleen Cc: Huang Ying Link: https://lkml.org/lkml/2012/1/23/299 Signed-off-by: Tony Luck --- arch/x86/kernel/cpu/mcheck/mce.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 4979a5dfeba2..87c56ba8080c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1541,6 +1541,12 @@ static int __mce_read_apei(char __user **ubuf, size_t usize) /* Error or no more MCE record */ if (rc <= 0) { mce_apei_read_done = 1; + /* + * When ERST is disabled, mce_chrdev_read() should return + * "no record" instead of "no device." + */ + if (rc == -ENODEV) + return 0; return rc; } rc = -EFAULT; -- cgit v1.2.3 From b4e518547da042fdc65bd4bdafd046fed13337d5 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Fri, 16 Dec 2011 15:50:17 -0700 Subject: irq_domain/x86: Convert x86 (embedded) to use common irq_domain This patch removes the x86-specific definition of irq_domain and replaces it with the common implementation. Signed-off-by: Grant Likely Acked-by: Sebastian Andrzej Siewior Cc: Rob Herring Cc: Thomas Gleixner --- arch/x86/Kconfig | 2 + arch/x86/include/asm/irq_controller.h | 12 ---- arch/x86/include/asm/prom.h | 10 ---- arch/x86/kernel/devicetree.c | 101 ++++++++++------------------------ drivers/net/phy/mdio-gpio.c | 4 +- 5 files changed, 34 insertions(+), 95 deletions(-) delete mode 100644 arch/x86/include/asm/irq_controller.h (limited to 'arch/x86/kernel') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5bed94e189fa..e0829a6a4660 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -398,6 +398,7 @@ config X86_INTEL_CE select X86_REBOOTFIXUPS select OF select OF_EARLY_FLATTREE + select IRQ_DOMAIN ---help--- Select for the Intel CE media processor (CE4100) SOC. This option compiles in support for the CE4100 SOC for settop @@ -2076,6 +2077,7 @@ config OLPC select GPIOLIB select OF select OF_PROMTREE + select IRQ_DOMAIN ---help--- Add support for detecting the unique features of the OLPC XO hardware. diff --git a/arch/x86/include/asm/irq_controller.h b/arch/x86/include/asm/irq_controller.h deleted file mode 100644 index 423bbbddf36d..000000000000 --- a/arch/x86/include/asm/irq_controller.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef __IRQ_CONTROLLER__ -#define __IRQ_CONTROLLER__ - -struct irq_domain { - int (*xlate)(struct irq_domain *h, const u32 *intspec, u32 intsize, - u32 *out_hwirq, u32 *out_type); - void *priv; - struct device_node *controller; - struct list_head l; -}; - -#endif diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h index 644dd885f05a..60bef663609a 100644 --- a/arch/x86/include/asm/prom.h +++ b/arch/x86/include/asm/prom.h @@ -21,7 +21,6 @@ #include #include #include -#include #ifdef CONFIG_OF extern int of_ioapic; @@ -43,15 +42,6 @@ extern char cmd_line[COMMAND_LINE_SIZE]; #define pci_address_to_pio pci_address_to_pio unsigned long pci_address_to_pio(phys_addr_t addr); -/** - * irq_dispose_mapping - Unmap an interrupt - * @virq: linux virq number of the interrupt to unmap - * - * FIXME: We really should implement proper virq handling like power, - * but that's going to be major surgery. - */ -static inline void irq_dispose_mapping(unsigned int virq) { } - #define HAVE_ARCH_DEVTREE_FIXUPS #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 52821799a702..3ae2ced4a874 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -17,64 +18,14 @@ #include #include -#include #include #include __initdata u64 initial_dtb; char __initdata cmd_line[COMMAND_LINE_SIZE]; -static LIST_HEAD(irq_domains); -static DEFINE_RAW_SPINLOCK(big_irq_lock); int __initdata of_ioapic; -#ifdef CONFIG_X86_IO_APIC -static void add_interrupt_host(struct irq_domain *ih) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&big_irq_lock, flags); - list_add(&ih->l, &irq_domains); - raw_spin_unlock_irqrestore(&big_irq_lock, flags); -} -#endif - -static struct irq_domain *get_ih_from_node(struct device_node *controller) -{ - struct irq_domain *ih, *found = NULL; - unsigned long flags; - - raw_spin_lock_irqsave(&big_irq_lock, flags); - list_for_each_entry(ih, &irq_domains, l) { - if (ih->controller == controller) { - found = ih; - break; - } - } - raw_spin_unlock_irqrestore(&big_irq_lock, flags); - return found; -} - -unsigned int irq_create_of_mapping(struct device_node *controller, - const u32 *intspec, unsigned int intsize) -{ - struct irq_domain *ih; - u32 virq, type; - int ret; - - ih = get_ih_from_node(controller); - if (!ih) - return 0; - ret = ih->xlate(ih, intspec, intsize, &virq, &type); - if (ret) - return 0; - if (type == IRQ_TYPE_NONE) - return virq; - irq_set_irq_type(virq, type); - return virq; -} -EXPORT_SYMBOL_GPL(irq_create_of_mapping); - unsigned long pci_address_to_pio(phys_addr_t address) { /* @@ -354,36 +305,43 @@ static struct of_ioapic_type of_ioapic_type[] = }, }; -static int ioapic_xlate(struct irq_domain *id, const u32 *intspec, u32 intsize, - u32 *out_hwirq, u32 *out_type) +static int ioapic_xlate(struct irq_domain *domain, + struct device_node *controller, + const u32 *intspec, u32 intsize, + irq_hw_number_t *out_hwirq, u32 *out_type) { - struct mp_ioapic_gsi *gsi_cfg; struct io_apic_irq_attr attr; struct of_ioapic_type *it; - u32 line, idx, type; + u32 line, idx; + int rc; - if (intsize < 2) + if (WARN_ON(intsize < 2)) return -EINVAL; - line = *intspec; - idx = (u32) id->priv; - gsi_cfg = mp_ioapic_gsi_routing(idx); - *out_hwirq = line + gsi_cfg->gsi_base; - - intspec++; - type = *intspec; + line = intspec[0]; - if (type >= ARRAY_SIZE(of_ioapic_type)) + if (intspec[1] >= ARRAY_SIZE(of_ioapic_type)) return -EINVAL; - it = of_ioapic_type + type; - *out_type = it->out_type; + it = &of_ioapic_type[intspec[1]]; + idx = (u32) domain->host_data; set_io_apic_irq_attr(&attr, idx, line, it->trigger, it->polarity); - return io_apic_setup_irq_pin_once(*out_hwirq, cpu_to_node(0), &attr); + rc = io_apic_setup_irq_pin_once(irq_find_mapping(domain, line), + cpu_to_node(0), &attr); + if (rc) + return rc; + + *out_hwirq = line; + *out_type = it->out_type; + return 0; } +const struct irq_domain_ops ioapic_irq_domain_ops = { + .xlate = ioapic_xlate, +}; + static void __init ioapic_add_ofnode(struct device_node *np) { struct resource r; @@ -399,13 +357,14 @@ static void __init ioapic_add_ofnode(struct device_node *np) for (i = 0; i < nr_ioapics; i++) { if (r.start == mpc_ioapic_addr(i)) { struct irq_domain *id; + struct mp_ioapic_gsi *gsi_cfg; + + gsi_cfg = mp_ioapic_gsi_routing(i); - id = kzalloc(sizeof(*id), GFP_KERNEL); + id = irq_domain_add_legacy(np, 32, gsi_cfg->gsi_base, 0, + &ioapic_irq_domain_ops, + (void*)i); BUG_ON(!id); - id->controller = np; - id->xlate = ioapic_xlate; - id->priv = (void *)i; - add_interrupt_host(id); return; } } diff --git a/drivers/net/phy/mdio-gpio.c b/drivers/net/phy/mdio-gpio.c index 50e8e5e74465..7189adf54bd1 100644 --- a/drivers/net/phy/mdio-gpio.c +++ b/drivers/net/phy/mdio-gpio.c @@ -255,13 +255,13 @@ static inline int __init mdio_ofgpio_init(void) return platform_driver_register(&mdio_ofgpio_driver); } -static inline void __exit mdio_ofgpio_exit(void) +static inline void mdio_ofgpio_exit(void) { platform_driver_unregister(&mdio_ofgpio_driver); } #else static inline int __init mdio_ofgpio_init(void) { return 0; } -static inline void __exit mdio_ofgpio_exit(void) { } +static inline void mdio_ofgpio_exit(void) { } #endif /* CONFIG_OF_GPIO */ static struct platform_driver mdio_gpio_driver = { -- cgit v1.2.3 From 1adbfa3511ee1c1118e16a9a0246870f12fef4e6 Mon Sep 17 00:00:00 2001 From: Olof Johansson Date: Sun, 12 Feb 2012 13:24:29 -0800 Subject: x86, efi: Allow basic init with mixed 32/64-bit efi/kernel Traditionally the kernel has refused to setup EFI at all if there's been a mismatch in 32/64-bit mode between EFI and the kernel. On some platforms that boot natively through EFI (Chrome OS being one), we still need to get at least some of the static data such as memory configuration out of EFI. Runtime services aren't as critical, and it's a significant amount of work to implement switching between the operating modes to call between kernel and firmware for thise cases. So I'm ignoring it for now. v5: * Fixed some printk strings based on feedback * Renamed 32/64-bit specific types to not have _ prefix * Fixed bug in printout of efi runtime disablement v4: * Some of the earlier cleanup was accidentally reverted by this patch, fixed. * Reworded some messages to not have to line wrap printk strings v3: * Reorganized to a series of patches to make it easier to review, and do some of the cleanups I had left out before. v2: * Added graceful error handling for 32-bit kernel that gets passed EFI data above 4GB. * Removed some warnings that were missed in first version. Signed-off-by: Olof Johansson Link: http://lkml.kernel.org/r/1329081869-20779-6-git-send-email-olof@lixom.net Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/efi.h | 2 +- arch/x86/kernel/setup.c | 10 ++- arch/x86/platform/efi/efi.c | 164 ++++++++++++++++++++++++++++++++++++++------ include/linux/efi.h | 45 ++++++++++++ 4 files changed, 196 insertions(+), 25 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 844f735fd63a..c9dcc181d4d1 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -95,7 +95,7 @@ extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size, extern int add_efi_memmap; extern void efi_set_executable(efi_memory_desc_t *md, bool executable); -extern void efi_memblock_x86_reserve_range(void); +extern int efi_memblock_x86_reserve_range(void); extern void efi_call_phys_prelog(void); extern void efi_call_phys_epilog(void); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d7d5099fe874..88638883176a 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -749,10 +749,16 @@ void __init setup_arch(char **cmdline_p) #endif #ifdef CONFIG_EFI if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, - EFI_LOADER_SIGNATURE, 4)) { + "EL32", 4)) { efi_enabled = 1; - efi_memblock_x86_reserve_range(); + efi_64bit = false; + } else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, + "EL64", 4)) { + efi_enabled = 1; + efi_64bit = true; } + if (efi_enabled && efi_memblock_x86_reserve_range()) + efi_enabled = 0; #endif x86_init.oem.arch_setup(); diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 5a053e7737b7..92660edaa1e7 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -68,6 +68,9 @@ EXPORT_SYMBOL(efi); struct efi_memory_map memmap; +bool efi_64bit; +static bool efi_native; + static struct efi efi_phys __initdata; static efi_system_table_t efi_systab __initdata; @@ -339,11 +342,16 @@ static void __init do_add_efi_memmap(void) sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); } -void __init efi_memblock_x86_reserve_range(void) +int __init efi_memblock_x86_reserve_range(void) { unsigned long pmap; #ifdef CONFIG_X86_32 + /* Can't handle data above 4GB at this time */ + if (boot_params.efi_info.efi_memmap_hi) { + pr_err("Memory map is above 4GB, disabling EFI.\n"); + return -EINVAL; + } pmap = boot_params.efi_info.efi_memmap; #else pmap = (boot_params.efi_info.efi_memmap | @@ -355,6 +363,8 @@ void __init efi_memblock_x86_reserve_range(void) memmap.desc_version = boot_params.efi_info.efi_memdesc_version; memmap.desc_size = boot_params.efi_info.efi_memdesc_size; memblock_reserve(pmap, memmap.nr_map * memmap.desc_size); + + return 0; } #if EFI_DEBUG @@ -432,14 +442,75 @@ static void __init efi_free_boot_services(void) static int __init efi_systab_init(void *phys) { - efi.systab = early_ioremap((unsigned long)efi_phys.systab, - sizeof(efi_system_table_t)); - if (efi.systab == NULL) { - pr_err("Couldn't map the system table!\n"); - return -ENOMEM; + if (efi_64bit) { + efi_system_table_64_t *systab64; + u64 tmp = 0; + + systab64 = early_ioremap((unsigned long)phys, + sizeof(*systab64)); + if (systab64 == NULL) { + pr_err("Couldn't map the system table!\n"); + return -ENOMEM; + } + + efi_systab.hdr = systab64->hdr; + efi_systab.fw_vendor = systab64->fw_vendor; + tmp |= systab64->fw_vendor; + efi_systab.fw_revision = systab64->fw_revision; + efi_systab.con_in_handle = systab64->con_in_handle; + tmp |= systab64->con_in_handle; + efi_systab.con_in = systab64->con_in; + tmp |= systab64->con_in; + efi_systab.con_out_handle = systab64->con_out_handle; + tmp |= systab64->con_out_handle; + efi_systab.con_out = systab64->con_out; + tmp |= systab64->con_out; + efi_systab.stderr_handle = systab64->stderr_handle; + tmp |= systab64->stderr_handle; + efi_systab.stderr = systab64->stderr; + tmp |= systab64->stderr; + efi_systab.runtime = (void *)(unsigned long)systab64->runtime; + tmp |= systab64->runtime; + efi_systab.boottime = (void *)(unsigned long)systab64->boottime; + tmp |= systab64->boottime; + efi_systab.nr_tables = systab64->nr_tables; + efi_systab.tables = systab64->tables; + tmp |= systab64->tables; + + early_iounmap(systab64, sizeof(*systab64)); +#ifdef CONFIG_X86_32 + if (tmp >> 32) { + pr_err("EFI data located above 4GB, disabling EFI.\n"); + return -EINVAL; + } +#endif + } else { + efi_system_table_32_t *systab32; + + systab32 = early_ioremap((unsigned long)phys, + sizeof(*systab32)); + if (systab32 == NULL) { + pr_err("Couldn't map the system table!\n"); + return -ENOMEM; + } + + efi_systab.hdr = systab32->hdr; + efi_systab.fw_vendor = systab32->fw_vendor; + efi_systab.fw_revision = systab32->fw_revision; + efi_systab.con_in_handle = systab32->con_in_handle; + efi_systab.con_in = systab32->con_in; + efi_systab.con_out_handle = systab32->con_out_handle; + efi_systab.con_out = systab32->con_out; + efi_systab.stderr_handle = systab32->stderr_handle; + efi_systab.stderr = systab32->stderr; + efi_systab.runtime = (void *)(unsigned long)systab32->runtime; + efi_systab.boottime = (void *)(unsigned long)systab32->boottime; + efi_systab.nr_tables = systab32->nr_tables; + efi_systab.tables = systab32->tables; + + early_iounmap(systab32, sizeof(*systab32)); } - memcpy(&efi_systab, efi.systab, sizeof(efi_system_table_t)); - early_iounmap(efi.systab, sizeof(efi_system_table_t)); + efi.systab = &efi_systab; /* @@ -460,24 +531,47 @@ static int __init efi_systab_init(void *phys) static int __init efi_config_init(u64 tables, int nr_tables) { - efi_config_table_t *config_tables; - int i, sz = sizeof(efi_config_table_t); + void *config_tables, *tablep; + int i, sz; + + if (efi_64bit) + sz = sizeof(efi_config_table_64_t); + else + sz = sizeof(efi_config_table_32_t); /* * Let's see what config tables the firmware passed to us. */ - config_tables = early_ioremap(efi.systab->tables, - efi.systab->nr_tables * sz); + config_tables = early_ioremap(tables, nr_tables * sz); if (config_tables == NULL) { pr_err("Could not map Configuration table!\n"); return -ENOMEM; } + tablep = config_tables; pr_info(""); for (i = 0; i < efi.systab->nr_tables; i++) { - efi_guid_t guid = config_tables[i].guid; - unsigned long table = config_tables[i].table; - + efi_guid_t guid; + unsigned long table; + + if (efi_64bit) { + u64 table64; + guid = ((efi_config_table_64_t *)tablep)->guid; + table64 = ((efi_config_table_64_t *)tablep)->table; + table = table64; +#ifdef CONFIG_X86_32 + if (table64 >> 32) { + pr_cont("\n"); + pr_err("Table located above 4GB, disabling EFI.\n"); + early_iounmap(config_tables, + efi.systab->nr_tables * sz); + return -EINVAL; + } +#endif + } else { + guid = ((efi_config_table_32_t *)tablep)->guid; + table = ((efi_config_table_32_t *)tablep)->table; + } if (!efi_guidcmp(guid, MPS_TABLE_GUID)) { efi.mps = table; pr_cont(" MPS=0x%lx ", table); @@ -502,10 +596,10 @@ static int __init efi_config_init(u64 tables, int nr_tables) efi.uga = table; pr_cont(" UGA=0x%lx ", table); } + tablep += sz; } pr_cont("\n"); early_iounmap(config_tables, efi.systab->nr_tables * sz); - return 0; } @@ -569,11 +663,19 @@ void __init efi_init(void) void *tmp; #ifdef CONFIG_X86_32 + if (boot_params.efi_info.efi_systab_hi || + boot_params.efi_info.efi_memmap_hi) { + pr_info("Table located above 4GB, disabling EFI.\n"); + efi_enabled = 0; + return; + } efi_phys.systab = (efi_system_table_t *)boot_params.efi_info.efi_systab; + efi_native = !efi_64bit; #else efi_phys.systab = (efi_system_table_t *) - (boot_params.efi_info.efi_systab | - ((__u64)boot_params.efi_info.efi_systab_hi<<32)); + (boot_params.efi_info.efi_systab | + ((__u64)boot_params.efi_info.efi_systab_hi<<32)); + efi_native = efi_64bit; #endif if (efi_systab_init(efi_phys.systab)) { @@ -602,7 +704,14 @@ void __init efi_init(void) return; } - if (efi_runtime_init()) { + /* + * Note: We currently don't support runtime services on an EFI + * that doesn't match the kernel 32/64-bit mode. + */ + + if (!efi_native) + pr_info("No EFI runtime due to 32/64-bit mismatch with kernel\n"); + else if (efi_runtime_init()) { efi_enabled = 0; return; } @@ -611,10 +720,11 @@ void __init efi_init(void) efi_enabled = 0; return; } - #ifdef CONFIG_X86_32 - x86_platform.get_wallclock = efi_get_time; - x86_platform.set_wallclock = efi_set_rtc_mmss; + if (efi_native) { + x86_platform.get_wallclock = efi_get_time; + x86_platform.set_wallclock = efi_set_rtc_mmss; + } #endif #if EFI_DEBUG @@ -672,6 +782,14 @@ void __init efi_enter_virtual_mode(void) efi.systab = NULL; + /* + * We don't do virtual mode, since we don't do runtime services, on + * non-native EFI + */ + + if (!efi_native) + goto out; + /* Merge contiguous regions of the same type and attribute */ for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { u64 prev_size; @@ -787,6 +905,8 @@ void __init efi_enter_virtual_mode(void) efi.query_capsule_caps = virt_efi_query_capsule_caps; if (__supported_pte_mask & _PAGE_NX) runtime_code_page_mkexec(); + +out: early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size); memmap.map = NULL; kfree(new_memmap); diff --git a/include/linux/efi.h b/include/linux/efi.h index 37c300712e02..47fbf6b3dc77 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -313,6 +313,16 @@ typedef efi_status_t efi_query_capsule_caps_t(efi_capsule_header_t **capsules, #define EFI_FILE_SYSTEM_GUID \ EFI_GUID( 0x964e5b22, 0x6459, 0x11d2, 0x8e, 0x39, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b ) +typedef struct { + efi_guid_t guid; + u64 table; +} efi_config_table_64_t; + +typedef struct { + efi_guid_t guid; + u32 table; +} efi_config_table_32_t; + typedef struct { efi_guid_t guid; unsigned long table; @@ -327,6 +337,40 @@ typedef struct { #define EFI_1_10_SYSTEM_TABLE_REVISION ((1 << 16) | (10)) #define EFI_1_02_SYSTEM_TABLE_REVISION ((1 << 16) | (02)) +typedef struct { + efi_table_hdr_t hdr; + u64 fw_vendor; /* physical addr of CHAR16 vendor string */ + u32 fw_revision; + u32 __pad1; + u64 con_in_handle; + u64 con_in; + u64 con_out_handle; + u64 con_out; + u64 stderr_handle; + u64 stderr; + u64 runtime; + u64 boottime; + u32 nr_tables; + u32 __pad2; + u64 tables; +} efi_system_table_64_t; + +typedef struct { + efi_table_hdr_t hdr; + u32 fw_vendor; /* physical addr of CHAR16 vendor string */ + u32 fw_revision; + u32 con_in_handle; + u32 con_in; + u32 con_out_handle; + u32 con_out; + u32 stderr_handle; + u32 stderr; + u32 runtime; + u32 boottime; + u32 nr_tables; + u32 tables; +} efi_system_table_32_t; + typedef struct { efi_table_hdr_t hdr; unsigned long fw_vendor; /* physical addr of CHAR16 vendor string */ @@ -497,6 +541,7 @@ extern int __init efi_setup_pcdp_console(char *); #ifdef CONFIG_EFI # ifdef CONFIG_X86 extern int efi_enabled; + extern bool efi_64bit; # else # define efi_enabled 1 # endif -- cgit v1.2.3 From c5905afb0ee6550b42c49213da1c22d67316c194 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 24 Feb 2012 08:31:31 +0100 Subject: static keys: Introduce 'struct static_key', static_key_true()/false() and static_key_slow_[inc|dec]() So here's a boot tested patch on top of Jason's series that does all the cleanups I talked about and turns jump labels into a more intuitive to use facility. It should also address the various misconceptions and confusions that surround jump labels. Typical usage scenarios: #include struct static_key key = STATIC_KEY_INIT_TRUE; if (static_key_false(&key)) do unlikely code else do likely code Or: if (static_key_true(&key)) do likely code else do unlikely code The static key is modified via: static_key_slow_inc(&key); ... static_key_slow_dec(&key); The 'slow' prefix makes it abundantly clear that this is an expensive operation. I've updated all in-kernel code to use this everywhere. Note that I (intentionally) have not pushed through the rename blindly through to the lowest levels: the actual jump-label patching arch facility should be named like that, so we want to decouple jump labels from the static-key facility a bit. On non-jump-label enabled architectures static keys default to likely()/unlikely() branches. Signed-off-by: Ingo Molnar Acked-by: Jason Baron Acked-by: Steven Rostedt Cc: a.p.zijlstra@chello.nl Cc: mathieu.desnoyers@efficios.com Cc: davem@davemloft.net Cc: ddaney.cavm@gmail.com Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20120222085809.GA26397@elte.hu Signed-off-by: Ingo Molnar --- arch/Kconfig | 29 ++++--- arch/ia64/include/asm/paravirt.h | 6 +- arch/ia64/kernel/paravirt.c | 4 +- arch/mips/include/asm/jump_label.h | 2 +- arch/powerpc/include/asm/jump_label.h | 2 +- arch/s390/include/asm/jump_label.h | 2 +- arch/sparc/include/asm/jump_label.h | 2 +- arch/x86/include/asm/jump_label.h | 6 +- arch/x86/include/asm/paravirt.h | 6 +- arch/x86/kernel/kvm.c | 4 +- arch/x86/kernel/paravirt.c | 4 +- arch/x86/kvm/mmu_audit.c | 8 +- include/linux/jump_label.h | 139 ++++++++++++++++++++++++---------- include/linux/netdevice.h | 4 +- include/linux/netfilter.h | 6 +- include/linux/perf_event.h | 12 +-- include/linux/static_key.h | 1 + include/linux/tracepoint.h | 8 +- include/net/sock.h | 6 +- kernel/events/core.c | 16 ++-- kernel/jump_label.c | 128 ++++++++++++++++++------------- kernel/sched/core.c | 18 ++--- kernel/sched/fair.c | 8 +- kernel/sched/sched.h | 14 ++-- kernel/tracepoint.c | 20 ++--- net/core/dev.c | 24 +++--- net/core/net-sysfs.c | 4 +- net/core/sock.c | 4 +- net/core/sysctl_net_core.c | 4 +- net/ipv4/tcp_memcontrol.c | 6 +- net/netfilter/core.c | 6 +- 31 files changed, 298 insertions(+), 205 deletions(-) create mode 100644 include/linux/static_key.h (limited to 'arch/x86/kernel') diff --git a/arch/Kconfig b/arch/Kconfig index 4f55c736be11..5b448a74d0f7 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -47,18 +47,29 @@ config KPROBES If in doubt, say "N". config JUMP_LABEL - bool "Optimize trace point call sites" + bool "Optimize very unlikely/likely branches" depends on HAVE_ARCH_JUMP_LABEL help + This option enables a transparent branch optimization that + makes certain almost-always-true or almost-always-false branch + conditions even cheaper to execute within the kernel. + + Certain performance-sensitive kernel code, such as trace points, + scheduler functionality, networking code and KVM have such + branches and include support for this optimization technique. + If it is detected that the compiler has support for "asm goto", - the kernel will compile trace point locations with just a - nop instruction. When trace points are enabled, the nop will - be converted to a jump to the trace function. This technique - lowers overhead and stress on the branch prediction of the - processor. - - On i386, options added to the compiler flags may increase - the size of the kernel slightly. + the kernel will compile such branches with just a nop + instruction. When the condition flag is toggled to true, the + nop will be converted to a jump instruction to execute the + conditional block of instructions. + + This technique lowers overhead and stress on the branch prediction + of the processor and generally makes the kernel faster. The update + of the condition is slower, but those are always very rare. + + ( On 32-bit x86, the necessary options added to the compiler + flags may increase the size of the kernel slightly. ) config OPTPROBES def_bool y diff --git a/arch/ia64/include/asm/paravirt.h b/arch/ia64/include/asm/paravirt.h index 32551d304cd7..b149b88ea795 100644 --- a/arch/ia64/include/asm/paravirt.h +++ b/arch/ia64/include/asm/paravirt.h @@ -281,9 +281,9 @@ paravirt_init_missing_ticks_accounting(int cpu) pv_time_ops.init_missing_ticks_accounting(cpu); } -struct jump_label_key; -extern struct jump_label_key paravirt_steal_enabled; -extern struct jump_label_key paravirt_steal_rq_enabled; +struct static_key; +extern struct static_key paravirt_steal_enabled; +extern struct static_key paravirt_steal_rq_enabled; static inline int paravirt_do_steal_accounting(unsigned long *new_itm) diff --git a/arch/ia64/kernel/paravirt.c b/arch/ia64/kernel/paravirt.c index 100868216c55..1b22f6de2932 100644 --- a/arch/ia64/kernel/paravirt.c +++ b/arch/ia64/kernel/paravirt.c @@ -634,8 +634,8 @@ struct pv_irq_ops pv_irq_ops = { * pv_time_ops * time operations */ -struct jump_label_key paravirt_steal_enabled; -struct jump_label_key paravirt_steal_rq_enabled; +struct static_key paravirt_steal_enabled; +struct static_key paravirt_steal_rq_enabled; static int ia64_native_do_steal_accounting(unsigned long *new_itm) diff --git a/arch/mips/include/asm/jump_label.h b/arch/mips/include/asm/jump_label.h index 1881b316ca45..4d6d77ed9b9d 100644 --- a/arch/mips/include/asm/jump_label.h +++ b/arch/mips/include/asm/jump_label.h @@ -20,7 +20,7 @@ #define WORD_INSN ".word" #endif -static __always_inline bool arch_static_branch(struct jump_label_key *key) +static __always_inline bool arch_static_branch(struct static_key *key) { asm goto("1:\tnop\n\t" "nop\n\t" diff --git a/arch/powerpc/include/asm/jump_label.h b/arch/powerpc/include/asm/jump_label.h index 938986e412f1..ae098c438f00 100644 --- a/arch/powerpc/include/asm/jump_label.h +++ b/arch/powerpc/include/asm/jump_label.h @@ -17,7 +17,7 @@ #define JUMP_ENTRY_TYPE stringify_in_c(FTR_ENTRY_LONG) #define JUMP_LABEL_NOP_SIZE 4 -static __always_inline bool arch_static_branch(struct jump_label_key *key) +static __always_inline bool arch_static_branch(struct static_key *key) { asm goto("1:\n\t" "nop\n\t" diff --git a/arch/s390/include/asm/jump_label.h b/arch/s390/include/asm/jump_label.h index 95a6cf2b5b67..6c32190dc73e 100644 --- a/arch/s390/include/asm/jump_label.h +++ b/arch/s390/include/asm/jump_label.h @@ -13,7 +13,7 @@ #define ASM_ALIGN ".balign 4" #endif -static __always_inline bool arch_static_branch(struct jump_label_key *key) +static __always_inline bool arch_static_branch(struct static_key *key) { asm goto("0: brcl 0,0\n" ".pushsection __jump_table, \"aw\"\n" diff --git a/arch/sparc/include/asm/jump_label.h b/arch/sparc/include/asm/jump_label.h index fc73a82366f8..5080d16a832f 100644 --- a/arch/sparc/include/asm/jump_label.h +++ b/arch/sparc/include/asm/jump_label.h @@ -7,7 +7,7 @@ #define JUMP_LABEL_NOP_SIZE 4 -static __always_inline bool arch_static_branch(struct jump_label_key *key) +static __always_inline bool arch_static_branch(struct static_key *key) { asm goto("1:\n\t" "nop\n\t" diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index a32b18ce6ead..3a16c1483b45 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -9,12 +9,12 @@ #define JUMP_LABEL_NOP_SIZE 5 -#define JUMP_LABEL_INITIAL_NOP ".byte 0xe9 \n\t .long 0\n\t" +#define STATIC_KEY_INITIAL_NOP ".byte 0xe9 \n\t .long 0\n\t" -static __always_inline bool arch_static_branch(struct jump_label_key *key) +static __always_inline bool arch_static_branch(struct static_key *key) { asm goto("1:" - JUMP_LABEL_INITIAL_NOP + STATIC_KEY_INITIAL_NOP ".pushsection __jump_table, \"aw\" \n\t" _ASM_ALIGN "\n\t" _ASM_PTR "1b, %l[l_yes], %c0 \n\t" diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index a7d2db9a74fb..c0180fd372d2 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -230,9 +230,9 @@ static inline unsigned long long paravirt_sched_clock(void) return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock); } -struct jump_label_key; -extern struct jump_label_key paravirt_steal_enabled; -extern struct jump_label_key paravirt_steal_rq_enabled; +struct static_key; +extern struct static_key paravirt_steal_enabled; +extern struct static_key paravirt_steal_rq_enabled; static inline u64 paravirt_steal_clock(int cpu) { diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index f0c6fd6f176b..694d801bf606 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -438,9 +438,9 @@ void __init kvm_guest_init(void) static __init int activate_jump_labels(void) { if (has_steal_clock) { - jump_label_inc(¶virt_steal_enabled); + static_key_slow_inc(¶virt_steal_enabled); if (steal_acc) - jump_label_inc(¶virt_steal_rq_enabled); + static_key_slow_inc(¶virt_steal_rq_enabled); } return 0; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index d90272e6bc40..ada2f99388dd 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -202,8 +202,8 @@ static void native_flush_tlb_single(unsigned long addr) __native_flush_tlb_single(addr); } -struct jump_label_key paravirt_steal_enabled; -struct jump_label_key paravirt_steal_rq_enabled; +struct static_key paravirt_steal_enabled; +struct static_key paravirt_steal_rq_enabled; static u64 native_steal_clock(int cpu) { diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index fe15dcc07a6b..ea7b4fd34676 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -234,7 +234,7 @@ static void audit_vcpu_spte(struct kvm_vcpu *vcpu) } static bool mmu_audit; -static struct jump_label_key mmu_audit_key; +static struct static_key mmu_audit_key; static void __kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { @@ -250,7 +250,7 @@ static void __kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) static inline void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { - if (static_branch((&mmu_audit_key))) + if (static_key_false((&mmu_audit_key))) __kvm_mmu_audit(vcpu, point); } @@ -259,7 +259,7 @@ static void mmu_audit_enable(void) if (mmu_audit) return; - jump_label_inc(&mmu_audit_key); + static_key_slow_inc(&mmu_audit_key); mmu_audit = true; } @@ -268,7 +268,7 @@ static void mmu_audit_disable(void) if (!mmu_audit) return; - jump_label_dec(&mmu_audit_key); + static_key_slow_dec(&mmu_audit_key); mmu_audit = false; } diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index f7c69580fea7..2172da2d9bb4 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -9,15 +9,15 @@ * * Jump labels provide an interface to generate dynamic branches using * self-modifying code. Assuming toolchain and architecture support the result - * of a "if (static_branch(&key))" statement is a unconditional branch (which + * of a "if (static_key_false(&key))" statement is a unconditional branch (which * defaults to false - and the true block is placed out of line). * - * However at runtime we can change the 'static' branch target using - * jump_label_{inc,dec}(). These function as a 'reference' count on the key + * However at runtime we can change the branch target using + * static_key_slow_{inc,dec}(). These function as a 'reference' count on the key * object and for as long as there are references all branches referring to * that particular key will point to the (out of line) true block. * - * Since this relies on modifying code the jump_label_{inc,dec}() functions + * Since this relies on modifying code the static_key_slow_{inc,dec}() functions * must be considered absolute slow paths (machine wide synchronization etc.). * OTOH, since the affected branches are unconditional their runtime overhead * will be absolutely minimal, esp. in the default (off) case where the total @@ -26,12 +26,26 @@ * * When the control is directly exposed to userspace it is prudent to delay the * decrement to avoid high frequency code modifications which can (and do) - * cause significant performance degradation. Struct jump_label_key_deferred and - * jump_label_dec_deferred() provide for this. + * cause significant performance degradation. Struct static_key_deferred and + * static_key_slow_dec_deferred() provide for this. * * Lacking toolchain and or architecture support, it falls back to a simple * conditional branch. - */ + * + * struct static_key my_key = STATIC_KEY_INIT_TRUE; + * + * if (static_key_true(&my_key)) { + * } + * + * will result in the true case being in-line and starts the key with a single + * reference. Mixing static_key_true() and static_key_false() on the same key is not + * allowed. + * + * Not initializing the key (static data is initialized to 0s anyway) is the + * same as using STATIC_KEY_INIT_FALSE and static_key_false() is + * equivalent with static_branch(). + * +*/ #include #include @@ -39,16 +53,17 @@ #if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_JUMP_LABEL) -struct jump_label_key { +struct static_key { atomic_t enabled; +/* Set lsb bit to 1 if branch is default true, 0 ot */ struct jump_entry *entries; #ifdef CONFIG_MODULES - struct jump_label_mod *next; + struct static_key_mod *next; #endif }; -struct jump_label_key_deferred { - struct jump_label_key key; +struct static_key_deferred { + struct static_key key; unsigned long timeout; struct delayed_work work; }; @@ -66,13 +81,34 @@ struct module; #ifdef HAVE_JUMP_LABEL -#ifdef CONFIG_MODULES -#define JUMP_LABEL_INIT {ATOMIC_INIT(0), NULL, NULL} -#else -#define JUMP_LABEL_INIT {ATOMIC_INIT(0), NULL} -#endif +#define JUMP_LABEL_TRUE_BRANCH 1UL + +static +inline struct jump_entry *jump_label_get_entries(struct static_key *key) +{ + return (struct jump_entry *)((unsigned long)key->entries + & ~JUMP_LABEL_TRUE_BRANCH); +} + +static inline bool jump_label_get_branch_default(struct static_key *key) +{ + if ((unsigned long)key->entries & JUMP_LABEL_TRUE_BRANCH) + return true; + return false; +} + +static __always_inline bool static_key_false(struct static_key *key) +{ + return arch_static_branch(key); +} -static __always_inline bool static_branch(struct jump_label_key *key) +static __always_inline bool static_key_true(struct static_key *key) +{ + return !static_key_false(key); +} + +/* Deprecated. Please use 'static_key_false() instead. */ +static __always_inline bool static_branch(struct static_key *key) { return arch_static_branch(key); } @@ -88,21 +124,24 @@ extern void arch_jump_label_transform(struct jump_entry *entry, extern void arch_jump_label_transform_static(struct jump_entry *entry, enum jump_label_type type); extern int jump_label_text_reserved(void *start, void *end); -extern void jump_label_inc(struct jump_label_key *key); -extern void jump_label_dec(struct jump_label_key *key); -extern void jump_label_dec_deferred(struct jump_label_key_deferred *key); -extern bool jump_label_enabled(struct jump_label_key *key); +extern void static_key_slow_inc(struct static_key *key); +extern void static_key_slow_dec(struct static_key *key); +extern void static_key_slow_dec_deferred(struct static_key_deferred *key); +extern bool static_key_enabled(struct static_key *key); extern void jump_label_apply_nops(struct module *mod); -extern void jump_label_rate_limit(struct jump_label_key_deferred *key, - unsigned long rl); +extern void +jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl); + +#define STATIC_KEY_INIT_TRUE ((struct static_key) \ + { .enabled = ATOMIC_INIT(1), .entries = (void *)1 }) +#define STATIC_KEY_INIT_FALSE ((struct static_key) \ + { .enabled = ATOMIC_INIT(0), .entries = (void *)0 }) #else /* !HAVE_JUMP_LABEL */ #include -#define JUMP_LABEL_INIT {ATOMIC_INIT(0)} - -struct jump_label_key { +struct static_key { atomic_t enabled; }; @@ -110,30 +149,45 @@ static __always_inline void jump_label_init(void) { } -struct jump_label_key_deferred { - struct jump_label_key key; +struct static_key_deferred { + struct static_key key; }; -static __always_inline bool static_branch(struct jump_label_key *key) +static __always_inline bool static_key_false(struct static_key *key) +{ + if (unlikely(atomic_read(&key->enabled)) > 0) + return true; + return false; +} + +static __always_inline bool static_key_true(struct static_key *key) { - if (unlikely(atomic_read(&key->enabled))) + if (likely(atomic_read(&key->enabled)) > 0) return true; return false; } -static inline void jump_label_inc(struct jump_label_key *key) +/* Deprecated. Please use 'static_key_false() instead. */ +static __always_inline bool static_branch(struct static_key *key) +{ + if (unlikely(atomic_read(&key->enabled)) > 0) + return true; + return false; +} + +static inline void static_key_slow_inc(struct static_key *key) { atomic_inc(&key->enabled); } -static inline void jump_label_dec(struct jump_label_key *key) +static inline void static_key_slow_dec(struct static_key *key) { atomic_dec(&key->enabled); } -static inline void jump_label_dec_deferred(struct jump_label_key_deferred *key) +static inline void static_key_slow_dec_deferred(struct static_key_deferred *key) { - jump_label_dec(&key->key); + static_key_slow_dec(&key->key); } static inline int jump_label_text_reserved(void *start, void *end) @@ -144,9 +198,9 @@ static inline int jump_label_text_reserved(void *start, void *end) static inline void jump_label_lock(void) {} static inline void jump_label_unlock(void) {} -static inline bool jump_label_enabled(struct jump_label_key *key) +static inline bool static_key_enabled(struct static_key *key) { - return !!atomic_read(&key->enabled); + return (atomic_read(&key->enabled) > 0); } static inline int jump_label_apply_nops(struct module *mod) @@ -154,13 +208,20 @@ static inline int jump_label_apply_nops(struct module *mod) return 0; } -static inline void jump_label_rate_limit(struct jump_label_key_deferred *key, +static inline void +jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl) { } + +#define STATIC_KEY_INIT_TRUE ((struct static_key) \ + { .enabled = ATOMIC_INIT(1) }) +#define STATIC_KEY_INIT_FALSE ((struct static_key) \ + { .enabled = ATOMIC_INIT(0) }) + #endif /* HAVE_JUMP_LABEL */ -#define jump_label_key_enabled ((struct jump_label_key){ .enabled = ATOMIC_INIT(1), }) -#define jump_label_key_disabled ((struct jump_label_key){ .enabled = ATOMIC_INIT(0), }) +#define STATIC_KEY_INIT STATIC_KEY_INIT_FALSE +#define jump_label_enabled static_key_enabled #endif /* _LINUX_JUMP_LABEL_H */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0eac07c95255..7dfaae7846ab 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -214,8 +214,8 @@ enum { #include #ifdef CONFIG_RPS -#include -extern struct jump_label_key rps_needed; +#include +extern struct static_key rps_needed; #endif struct neighbour; diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index b809265607d0..29734be334c1 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -163,13 +163,13 @@ extern struct ctl_path nf_net_ipv4_netfilter_sysctl_path[]; extern struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; #if defined(CONFIG_JUMP_LABEL) -#include -extern struct jump_label_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; +#include +extern struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; static inline bool nf_hooks_active(u_int8_t pf, unsigned int hook) { if (__builtin_constant_p(pf) && __builtin_constant_p(hook)) - return static_branch(&nf_hooks_needed[pf][hook]); + return static_key_false(&nf_hooks_needed[pf][hook]); return !list_empty(&nf_hooks[pf][hook]); } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 412b790f5da6..0d21e6f1cf53 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -514,7 +514,7 @@ struct perf_guest_info_callbacks { #include #include #include -#include +#include #include #include @@ -1038,7 +1038,7 @@ static inline int is_software_event(struct perf_event *event) return event->pmu->task_ctx_nr == perf_sw_context; } -extern struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; +extern struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; extern void __perf_sw_event(u32, u64, struct pt_regs *, u64); @@ -1066,7 +1066,7 @@ perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { struct pt_regs hot_regs; - if (static_branch(&perf_swevent_enabled[event_id])) { + if (static_key_false(&perf_swevent_enabled[event_id])) { if (!regs) { perf_fetch_caller_regs(&hot_regs); regs = &hot_regs; @@ -1075,12 +1075,12 @@ perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) } } -extern struct jump_label_key_deferred perf_sched_events; +extern struct static_key_deferred perf_sched_events; static inline void perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task) { - if (static_branch(&perf_sched_events.key)) + if (static_key_false(&perf_sched_events.key)) __perf_event_task_sched_in(prev, task); } @@ -1089,7 +1089,7 @@ static inline void perf_event_task_sched_out(struct task_struct *prev, { perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, NULL, 0); - if (static_branch(&perf_sched_events.key)) + if (static_key_false(&perf_sched_events.key)) __perf_event_task_sched_out(prev, next); } diff --git a/include/linux/static_key.h b/include/linux/static_key.h new file mode 100644 index 000000000000..27bd3f8a0857 --- /dev/null +++ b/include/linux/static_key.h @@ -0,0 +1 @@ +#include diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index fc36da97ff7e..bd96ecd0e05c 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -17,7 +17,7 @@ #include #include #include -#include +#include struct module; struct tracepoint; @@ -29,7 +29,7 @@ struct tracepoint_func { struct tracepoint { const char *name; /* Tracepoint name */ - struct jump_label_key key; + struct static_key key; void (*regfunc)(void); void (*unregfunc)(void); struct tracepoint_func __rcu *funcs; @@ -145,7 +145,7 @@ static inline void tracepoint_synchronize_unregister(void) extern struct tracepoint __tracepoint_##name; \ static inline void trace_##name(proto) \ { \ - if (static_branch(&__tracepoint_##name.key)) \ + if (static_key_false(&__tracepoint_##name.key)) \ __DO_TRACE(&__tracepoint_##name, \ TP_PROTO(data_proto), \ TP_ARGS(data_args), \ @@ -188,7 +188,7 @@ static inline void tracepoint_synchronize_unregister(void) __attribute__((section("__tracepoints_strings"))) = #name; \ struct tracepoint __tracepoint_##name \ __attribute__((section("__tracepoints"))) = \ - { __tpstrtab_##name, JUMP_LABEL_INIT, reg, unreg, NULL };\ + { __tpstrtab_##name, STATIC_KEY_INIT_FALSE, reg, unreg, NULL };\ static struct tracepoint * const __tracepoint_ptr_##name __used \ __attribute__((section("__tracepoints_ptrs"))) = \ &__tracepoint_##name; diff --git a/include/net/sock.h b/include/net/sock.h index 91c1c8baf020..dcde2d9268cd 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -55,7 +55,7 @@ #include #include #include -#include +#include #include #include @@ -924,13 +924,13 @@ inline void sk_refcnt_debug_release(const struct sock *sk) #endif /* SOCK_REFCNT_DEBUG */ #if defined(CONFIG_CGROUP_MEM_RES_CTLR_KMEM) && defined(CONFIG_NET) -extern struct jump_label_key memcg_socket_limit_enabled; +extern struct static_key memcg_socket_limit_enabled; static inline struct cg_proto *parent_cg_proto(struct proto *proto, struct cg_proto *cg_proto) { return proto->proto_cgroup(parent_mem_cgroup(cg_proto->memcg)); } -#define mem_cgroup_sockets_enabled static_branch(&memcg_socket_limit_enabled) +#define mem_cgroup_sockets_enabled static_key_false(&memcg_socket_limit_enabled) #else #define mem_cgroup_sockets_enabled 0 static inline struct cg_proto *parent_cg_proto(struct proto *proto, diff --git a/kernel/events/core.c b/kernel/events/core.c index 7c3b9de55f6b..5e0f8bb89b2b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -128,7 +128,7 @@ enum event_type_t { * perf_sched_events : >0 events exist * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu */ -struct jump_label_key_deferred perf_sched_events __read_mostly; +struct static_key_deferred perf_sched_events __read_mostly; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); static atomic_t nr_mmap_events __read_mostly; @@ -2769,7 +2769,7 @@ static void free_event(struct perf_event *event) if (!event->parent) { if (event->attach_state & PERF_ATTACH_TASK) - jump_label_dec_deferred(&perf_sched_events); + static_key_slow_dec_deferred(&perf_sched_events); if (event->attr.mmap || event->attr.mmap_data) atomic_dec(&nr_mmap_events); if (event->attr.comm) @@ -2780,7 +2780,7 @@ static void free_event(struct perf_event *event) put_callchain_buffers(); if (is_cgroup_event(event)) { atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); - jump_label_dec_deferred(&perf_sched_events); + static_key_slow_dec_deferred(&perf_sched_events); } } @@ -4982,7 +4982,7 @@ fail: return err; } -struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; +struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; static void sw_perf_event_destroy(struct perf_event *event) { @@ -4990,7 +4990,7 @@ static void sw_perf_event_destroy(struct perf_event *event) WARN_ON(event->parent); - jump_label_dec(&perf_swevent_enabled[event_id]); + static_key_slow_dec(&perf_swevent_enabled[event_id]); swevent_hlist_put(event); } @@ -5020,7 +5020,7 @@ static int perf_swevent_init(struct perf_event *event) if (err) return err; - jump_label_inc(&perf_swevent_enabled[event_id]); + static_key_slow_inc(&perf_swevent_enabled[event_id]); event->destroy = sw_perf_event_destroy; } @@ -5843,7 +5843,7 @@ done: if (!event->parent) { if (event->attach_state & PERF_ATTACH_TASK) - jump_label_inc(&perf_sched_events.key); + static_key_slow_inc(&perf_sched_events.key); if (event->attr.mmap || event->attr.mmap_data) atomic_inc(&nr_mmap_events); if (event->attr.comm) @@ -6081,7 +6081,7 @@ SYSCALL_DEFINE5(perf_event_open, * - that may need work on context switch */ atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); - jump_label_inc(&perf_sched_events.key); + static_key_slow_inc(&perf_sched_events.key); } /* diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 543782e7cdd2..bf9dcadbb53a 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #ifdef HAVE_JUMP_LABEL @@ -29,10 +29,11 @@ void jump_label_unlock(void) mutex_unlock(&jump_label_mutex); } -bool jump_label_enabled(struct jump_label_key *key) +bool static_key_enabled(struct static_key *key) { - return !!atomic_read(&key->enabled); + return (atomic_read(&key->enabled) > 0); } +EXPORT_SYMBOL_GPL(static_key_enabled); static int jump_label_cmp(const void *a, const void *b) { @@ -58,22 +59,26 @@ jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop) sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL); } -static void jump_label_update(struct jump_label_key *key, int enable); +static void jump_label_update(struct static_key *key, int enable); -void jump_label_inc(struct jump_label_key *key) +void static_key_slow_inc(struct static_key *key) { if (atomic_inc_not_zero(&key->enabled)) return; jump_label_lock(); - if (atomic_read(&key->enabled) == 0) - jump_label_update(key, JUMP_LABEL_ENABLE); + if (atomic_read(&key->enabled) == 0) { + if (!jump_label_get_branch_default(key)) + jump_label_update(key, JUMP_LABEL_ENABLE); + else + jump_label_update(key, JUMP_LABEL_DISABLE); + } atomic_inc(&key->enabled); jump_label_unlock(); } -EXPORT_SYMBOL_GPL(jump_label_inc); +EXPORT_SYMBOL_GPL(static_key_slow_inc); -static void __jump_label_dec(struct jump_label_key *key, +static void __static_key_slow_dec(struct static_key *key, unsigned long rate_limit, struct delayed_work *work) { if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) { @@ -85,32 +90,35 @@ static void __jump_label_dec(struct jump_label_key *key, if (rate_limit) { atomic_inc(&key->enabled); schedule_delayed_work(work, rate_limit); - } else - jump_label_update(key, JUMP_LABEL_DISABLE); - + } else { + if (!jump_label_get_branch_default(key)) + jump_label_update(key, JUMP_LABEL_DISABLE); + else + jump_label_update(key, JUMP_LABEL_ENABLE); + } jump_label_unlock(); } -EXPORT_SYMBOL_GPL(jump_label_dec); static void jump_label_update_timeout(struct work_struct *work) { - struct jump_label_key_deferred *key = - container_of(work, struct jump_label_key_deferred, work.work); - __jump_label_dec(&key->key, 0, NULL); + struct static_key_deferred *key = + container_of(work, struct static_key_deferred, work.work); + __static_key_slow_dec(&key->key, 0, NULL); } -void jump_label_dec(struct jump_label_key *key) +void static_key_slow_dec(struct static_key *key) { - __jump_label_dec(key, 0, NULL); + __static_key_slow_dec(key, 0, NULL); } +EXPORT_SYMBOL_GPL(static_key_slow_dec); -void jump_label_dec_deferred(struct jump_label_key_deferred *key) +void static_key_slow_dec_deferred(struct static_key_deferred *key) { - __jump_label_dec(&key->key, key->timeout, &key->work); + __static_key_slow_dec(&key->key, key->timeout, &key->work); } +EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred); - -void jump_label_rate_limit(struct jump_label_key_deferred *key, +void jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl) { key->timeout = rl; @@ -153,7 +161,7 @@ void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry arch_jump_label_transform(entry, type); } -static void __jump_label_update(struct jump_label_key *key, +static void __jump_label_update(struct static_key *key, struct jump_entry *entry, struct jump_entry *stop, int enable) { @@ -170,27 +178,40 @@ static void __jump_label_update(struct jump_label_key *key, } } +static enum jump_label_type jump_label_type(struct static_key *key) +{ + bool true_branch = jump_label_get_branch_default(key); + bool state = static_key_enabled(key); + + if ((!true_branch && state) || (true_branch && !state)) + return JUMP_LABEL_ENABLE; + + return JUMP_LABEL_DISABLE; +} + void __init jump_label_init(void) { struct jump_entry *iter_start = __start___jump_table; struct jump_entry *iter_stop = __stop___jump_table; - struct jump_label_key *key = NULL; + struct static_key *key = NULL; struct jump_entry *iter; jump_label_lock(); jump_label_sort_entries(iter_start, iter_stop); for (iter = iter_start; iter < iter_stop; iter++) { - struct jump_label_key *iterk; + struct static_key *iterk; - iterk = (struct jump_label_key *)(unsigned long)iter->key; - arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ? - JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE); + iterk = (struct static_key *)(unsigned long)iter->key; + arch_jump_label_transform_static(iter, jump_label_type(iterk)); if (iterk == key) continue; key = iterk; - key->entries = iter; + /* + * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH. + */ + *((unsigned long *)&key->entries) += (unsigned long)iter; #ifdef CONFIG_MODULES key->next = NULL; #endif @@ -200,8 +221,8 @@ void __init jump_label_init(void) #ifdef CONFIG_MODULES -struct jump_label_mod { - struct jump_label_mod *next; +struct static_key_mod { + struct static_key_mod *next; struct jump_entry *entries; struct module *mod; }; @@ -221,9 +242,9 @@ static int __jump_label_mod_text_reserved(void *start, void *end) start, end); } -static void __jump_label_mod_update(struct jump_label_key *key, int enable) +static void __jump_label_mod_update(struct static_key *key, int enable) { - struct jump_label_mod *mod = key->next; + struct static_key_mod *mod = key->next; while (mod) { struct module *m = mod->mod; @@ -254,11 +275,7 @@ void jump_label_apply_nops(struct module *mod) return; for (iter = iter_start; iter < iter_stop; iter++) { - struct jump_label_key *iterk; - - iterk = (struct jump_label_key *)(unsigned long)iter->key; - arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ? - JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE); + arch_jump_label_transform_static(iter, JUMP_LABEL_DISABLE); } } @@ -267,8 +284,8 @@ static int jump_label_add_module(struct module *mod) struct jump_entry *iter_start = mod->jump_entries; struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; struct jump_entry *iter; - struct jump_label_key *key = NULL; - struct jump_label_mod *jlm; + struct static_key *key = NULL; + struct static_key_mod *jlm; /* if the module doesn't have jump label entries, just return */ if (iter_start == iter_stop) @@ -277,28 +294,30 @@ static int jump_label_add_module(struct module *mod) jump_label_sort_entries(iter_start, iter_stop); for (iter = iter_start; iter < iter_stop; iter++) { - if (iter->key == (jump_label_t)(unsigned long)key) - continue; + struct static_key *iterk; - key = (struct jump_label_key *)(unsigned long)iter->key; + iterk = (struct static_key *)(unsigned long)iter->key; + if (iterk == key) + continue; + key = iterk; if (__module_address(iter->key) == mod) { - atomic_set(&key->enabled, 0); - key->entries = iter; + /* + * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH. + */ + *((unsigned long *)&key->entries) += (unsigned long)iter; key->next = NULL; continue; } - - jlm = kzalloc(sizeof(struct jump_label_mod), GFP_KERNEL); + jlm = kzalloc(sizeof(struct static_key_mod), GFP_KERNEL); if (!jlm) return -ENOMEM; - jlm->mod = mod; jlm->entries = iter; jlm->next = key->next; key->next = jlm; - if (jump_label_enabled(key)) + if (jump_label_type(key) == JUMP_LABEL_ENABLE) __jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE); } @@ -310,14 +329,14 @@ static void jump_label_del_module(struct module *mod) struct jump_entry *iter_start = mod->jump_entries; struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; struct jump_entry *iter; - struct jump_label_key *key = NULL; - struct jump_label_mod *jlm, **prev; + struct static_key *key = NULL; + struct static_key_mod *jlm, **prev; for (iter = iter_start; iter < iter_stop; iter++) { if (iter->key == (jump_label_t)(unsigned long)key) continue; - key = (struct jump_label_key *)(unsigned long)iter->key; + key = (struct static_key *)(unsigned long)iter->key; if (__module_address(iter->key) == mod) continue; @@ -419,9 +438,10 @@ int jump_label_text_reserved(void *start, void *end) return ret; } -static void jump_label_update(struct jump_label_key *key, int enable) +static void jump_label_update(struct static_key *key, int enable) { - struct jump_entry *entry = key->entries, *stop = __stop___jump_table; + struct jump_entry *stop = __stop___jump_table; + struct jump_entry *entry = jump_label_get_entries(key); #ifdef CONFIG_MODULES struct module *mod = __module_address((unsigned long)key); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5255c9d2e053..112c6824476b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -162,13 +162,13 @@ static int sched_feat_show(struct seq_file *m, void *v) #ifdef HAVE_JUMP_LABEL -#define jump_label_key__true jump_label_key_enabled -#define jump_label_key__false jump_label_key_disabled +#define jump_label_key__true STATIC_KEY_INIT_TRUE +#define jump_label_key__false STATIC_KEY_INIT_FALSE #define SCHED_FEAT(name, enabled) \ jump_label_key__##enabled , -struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = { +struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { #include "features.h" }; @@ -176,14 +176,14 @@ struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = { static void sched_feat_disable(int i) { - if (jump_label_enabled(&sched_feat_keys[i])) - jump_label_dec(&sched_feat_keys[i]); + if (static_key_enabled(&sched_feat_keys[i])) + static_key_slow_dec(&sched_feat_keys[i]); } static void sched_feat_enable(int i) { - if (!jump_label_enabled(&sched_feat_keys[i])) - jump_label_inc(&sched_feat_keys[i]); + if (!static_key_enabled(&sched_feat_keys[i])) + static_key_slow_inc(&sched_feat_keys[i]); } #else static void sched_feat_disable(int i) { }; @@ -894,7 +894,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) delta -= irq_delta; #endif #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING - if (static_branch((¶virt_steal_rq_enabled))) { + if (static_key_false((¶virt_steal_rq_enabled))) { u64 st; steal = paravirt_steal_clock(cpu_of(rq)); @@ -2756,7 +2756,7 @@ void account_idle_time(cputime_t cputime) static __always_inline bool steal_account_process_tick(void) { #ifdef CONFIG_PARAVIRT - if (static_branch(¶virt_steal_enabled)) { + if (static_key_false(¶virt_steal_enabled)) { u64 steal, st = 0; steal = paravirt_steal_clock(smp_processor_id()); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7c6414fc669d..423547ada38a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1399,20 +1399,20 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) #ifdef CONFIG_CFS_BANDWIDTH #ifdef HAVE_JUMP_LABEL -static struct jump_label_key __cfs_bandwidth_used; +static struct static_key __cfs_bandwidth_used; static inline bool cfs_bandwidth_used(void) { - return static_branch(&__cfs_bandwidth_used); + return static_key_false(&__cfs_bandwidth_used); } void account_cfs_bandwidth_used(int enabled, int was_enabled) { /* only need to count groups transitioning between enabled/!enabled */ if (enabled && !was_enabled) - jump_label_inc(&__cfs_bandwidth_used); + static_key_slow_inc(&__cfs_bandwidth_used); else if (!enabled && was_enabled) - jump_label_dec(&__cfs_bandwidth_used); + static_key_slow_dec(&__cfs_bandwidth_used); } #else /* HAVE_JUMP_LABEL */ static bool cfs_bandwidth_used(void) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 98c0c2623db8..b4cd6d8ea150 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -611,7 +611,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) * Tunables that become constants when CONFIG_SCHED_DEBUG is off: */ #ifdef CONFIG_SCHED_DEBUG -# include +# include # define const_debug __read_mostly #else # define const_debug const @@ -630,18 +630,18 @@ enum { #undef SCHED_FEAT #if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) -static __always_inline bool static_branch__true(struct jump_label_key *key) +static __always_inline bool static_branch__true(struct static_key *key) { - return likely(static_branch(key)); /* Not out of line branch. */ + return static_key_true(key); /* Not out of line branch. */ } -static __always_inline bool static_branch__false(struct jump_label_key *key) +static __always_inline bool static_branch__false(struct static_key *key) { - return unlikely(static_branch(key)); /* Out of line branch. */ + return static_key_false(key); /* Out of line branch. */ } #define SCHED_FEAT(name, enabled) \ -static __always_inline bool static_branch_##name(struct jump_label_key *key) \ +static __always_inline bool static_branch_##name(struct static_key *key) \ { \ return static_branch__##enabled(key); \ } @@ -650,7 +650,7 @@ static __always_inline bool static_branch_##name(struct jump_label_key *key) \ #undef SCHED_FEAT -extern struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR]; +extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) #else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */ #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index f1539decd99d..d96ba22dabfa 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include extern struct tracepoint * const __start___tracepoints_ptrs[]; extern struct tracepoint * const __stop___tracepoints_ptrs[]; @@ -256,9 +256,9 @@ static void set_tracepoint(struct tracepoint_entry **entry, { WARN_ON(strcmp((*entry)->name, elem->name) != 0); - if (elem->regfunc && !jump_label_enabled(&elem->key) && active) + if (elem->regfunc && !static_key_enabled(&elem->key) && active) elem->regfunc(); - else if (elem->unregfunc && jump_label_enabled(&elem->key) && !active) + else if (elem->unregfunc && static_key_enabled(&elem->key) && !active) elem->unregfunc(); /* @@ -269,10 +269,10 @@ static void set_tracepoint(struct tracepoint_entry **entry, * is used. */ rcu_assign_pointer(elem->funcs, (*entry)->funcs); - if (active && !jump_label_enabled(&elem->key)) - jump_label_inc(&elem->key); - else if (!active && jump_label_enabled(&elem->key)) - jump_label_dec(&elem->key); + if (active && !static_key_enabled(&elem->key)) + static_key_slow_inc(&elem->key); + else if (!active && static_key_enabled(&elem->key)) + static_key_slow_dec(&elem->key); } /* @@ -283,11 +283,11 @@ static void set_tracepoint(struct tracepoint_entry **entry, */ static void disable_tracepoint(struct tracepoint *elem) { - if (elem->unregfunc && jump_label_enabled(&elem->key)) + if (elem->unregfunc && static_key_enabled(&elem->key)) elem->unregfunc(); - if (jump_label_enabled(&elem->key)) - jump_label_dec(&elem->key); + if (static_key_enabled(&elem->key)) + static_key_slow_dec(&elem->key); rcu_assign_pointer(elem->funcs, NULL); } diff --git a/net/core/dev.c b/net/core/dev.c index 115dee1d985d..da7ce7f0e566 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -134,7 +134,7 @@ #include #include #include -#include +#include #include #include "net-sysfs.h" @@ -1441,11 +1441,11 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev) } EXPORT_SYMBOL(call_netdevice_notifiers); -static struct jump_label_key netstamp_needed __read_mostly; +static struct static_key netstamp_needed __read_mostly; #ifdef HAVE_JUMP_LABEL -/* We are not allowed to call jump_label_dec() from irq context +/* We are not allowed to call static_key_slow_dec() from irq context * If net_disable_timestamp() is called from irq context, defer the - * jump_label_dec() calls. + * static_key_slow_dec() calls. */ static atomic_t netstamp_needed_deferred; #endif @@ -1457,12 +1457,12 @@ void net_enable_timestamp(void) if (deferred) { while (--deferred) - jump_label_dec(&netstamp_needed); + static_key_slow_dec(&netstamp_needed); return; } #endif WARN_ON(in_interrupt()); - jump_label_inc(&netstamp_needed); + static_key_slow_inc(&netstamp_needed); } EXPORT_SYMBOL(net_enable_timestamp); @@ -1474,19 +1474,19 @@ void net_disable_timestamp(void) return; } #endif - jump_label_dec(&netstamp_needed); + static_key_slow_dec(&netstamp_needed); } EXPORT_SYMBOL(net_disable_timestamp); static inline void net_timestamp_set(struct sk_buff *skb) { skb->tstamp.tv64 = 0; - if (static_branch(&netstamp_needed)) + if (static_key_false(&netstamp_needed)) __net_timestamp(skb); } #define net_timestamp_check(COND, SKB) \ - if (static_branch(&netstamp_needed)) { \ + if (static_key_false(&netstamp_needed)) { \ if ((COND) && !(SKB)->tstamp.tv64) \ __net_timestamp(SKB); \ } \ @@ -2660,7 +2660,7 @@ EXPORT_SYMBOL(__skb_get_rxhash); struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; EXPORT_SYMBOL(rps_sock_flow_table); -struct jump_label_key rps_needed __read_mostly; +struct static_key rps_needed __read_mostly; static struct rps_dev_flow * set_rps_cpu(struct net_device *dev, struct sk_buff *skb, @@ -2945,7 +2945,7 @@ int netif_rx(struct sk_buff *skb) trace_netif_rx(skb); #ifdef CONFIG_RPS - if (static_branch(&rps_needed)) { + if (static_key_false(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu; @@ -3309,7 +3309,7 @@ int netif_receive_skb(struct sk_buff *skb) return NET_RX_SUCCESS; #ifdef CONFIG_RPS - if (static_branch(&rps_needed)) { + if (static_key_false(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu, ret; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index a1727cda03d7..495586232aa1 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -608,10 +608,10 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue, spin_unlock(&rps_map_lock); if (map) - jump_label_inc(&rps_needed); + static_key_slow_inc(&rps_needed); if (old_map) { kfree_rcu(old_map, rcu); - jump_label_dec(&rps_needed); + static_key_slow_dec(&rps_needed); } free_cpumask_var(mask); return len; diff --git a/net/core/sock.c b/net/core/sock.c index 3e81fd2e3c75..3a4e5817a2a7 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -111,7 +111,7 @@ #include #include #include -#include +#include #include #include @@ -184,7 +184,7 @@ void mem_cgroup_sockets_destroy(struct cgroup *cgrp, struct cgroup_subsys *ss) static struct lock_class_key af_family_keys[AF_MAX]; static struct lock_class_key af_family_slock_keys[AF_MAX]; -struct jump_label_key memcg_socket_limit_enabled; +struct static_key memcg_socket_limit_enabled; EXPORT_SYMBOL(memcg_socket_limit_enabled); /* diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index d05559d4d9cd..0c2850874254 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -69,9 +69,9 @@ static int rps_sock_flow_sysctl(ctl_table *table, int write, if (sock_table != orig_sock_table) { rcu_assign_pointer(rps_sock_flow_table, sock_table); if (sock_table) - jump_label_inc(&rps_needed); + static_key_slow_inc(&rps_needed); if (orig_sock_table) { - jump_label_dec(&rps_needed); + static_key_slow_dec(&rps_needed); synchronize_rcu(); vfree(orig_sock_table); } diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index 49978788a9dc..602fb305365f 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -111,7 +111,7 @@ void tcp_destroy_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss) val = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT); if (val != RESOURCE_MAX) - jump_label_dec(&memcg_socket_limit_enabled); + static_key_slow_dec(&memcg_socket_limit_enabled); } EXPORT_SYMBOL(tcp_destroy_cgroup); @@ -143,9 +143,9 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val) net->ipv4.sysctl_tcp_mem[i]); if (val == RESOURCE_MAX && old_lim != RESOURCE_MAX) - jump_label_dec(&memcg_socket_limit_enabled); + static_key_slow_dec(&memcg_socket_limit_enabled); else if (old_lim == RESOURCE_MAX && val != RESOURCE_MAX) - jump_label_inc(&memcg_socket_limit_enabled); + static_key_slow_inc(&memcg_socket_limit_enabled); return 0; } diff --git a/net/netfilter/core.c b/net/netfilter/core.c index b4e8ff05b301..e1b7e051332e 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -56,7 +56,7 @@ struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS] __read_mostly; EXPORT_SYMBOL(nf_hooks); #if defined(CONFIG_JUMP_LABEL) -struct jump_label_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; +struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; EXPORT_SYMBOL(nf_hooks_needed); #endif @@ -77,7 +77,7 @@ int nf_register_hook(struct nf_hook_ops *reg) list_add_rcu(®->list, elem->list.prev); mutex_unlock(&nf_hook_mutex); #if defined(CONFIG_JUMP_LABEL) - jump_label_inc(&nf_hooks_needed[reg->pf][reg->hooknum]); + static_key_slow_inc(&nf_hooks_needed[reg->pf][reg->hooknum]); #endif return 0; } @@ -89,7 +89,7 @@ void nf_unregister_hook(struct nf_hook_ops *reg) list_del_rcu(®->list); mutex_unlock(&nf_hook_mutex); #if defined(CONFIG_JUMP_LABEL) - jump_label_dec(&nf_hooks_needed[reg->pf][reg->hooknum]); + static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]); #endif synchronize_net(); } -- cgit v1.2.3 From 626109130267713cac020515504ec341e47c96f9 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 24 Feb 2012 14:54:37 +0000 Subject: x86-64: Fix CFI annotations for NMI nesting code The saving and restoring of %rdx wasn't annotated at all, and the jumping over sections where state gets partly restored wasn't handled either. Further, by folding the pushing of the previous frame in repeat_nmi into that which so far was immediately preceding restart_nmi (after moving the restore of %rdx ahead of that, since it doesn't get used anymore when pushing prior frames), annotations of the replicated frame creations can be made consistent too. v2: Fully fold repeat_nmi into the normal code flow (adding a single redundant instruction to the "normal" code path), thus retaining the special protection of all instructions between repeat_nmi and end_repeat_nmi. Link: http://lkml.kernel.org/r/4F478B630200007800074A31@nat28.tlf.novell.com Signed-off-by: Jan Beulich Signed-off-by: Steven Rostedt --- arch/x86/kernel/entry_64.S | 52 +++++++++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 24 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 1333d9851778..e0eca007dc0d 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1530,6 +1530,7 @@ ENTRY(nmi) /* Use %rdx as out temp variable throughout */ pushq_cfi %rdx + CFI_REL_OFFSET rdx, 0 /* * If %cs was not the kernel segment, then the NMI triggered in user @@ -1554,6 +1555,7 @@ ENTRY(nmi) */ lea 6*8(%rsp), %rdx test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi + CFI_REMEMBER_STATE nested_nmi: /* @@ -1585,10 +1587,12 @@ nested_nmi: nested_nmi_out: popq_cfi %rdx + CFI_RESTORE rdx /* No need to check faults here */ INTERRUPT_RETURN + CFI_RESTORE_STATE first_nmi: /* * Because nested NMIs will use the pushed location that we @@ -1624,6 +1628,10 @@ first_nmi: * NMI may zero out. The original stack frame and the temp storage * is also used by nested NMIs and can not be trusted on exit. */ + /* Do not pop rdx, nested NMIs will corrupt it */ + movq (%rsp), %rdx + CFI_RESTORE rdx + /* Set the NMI executing variable on the stack. */ pushq_cfi $1 @@ -1631,14 +1639,31 @@ first_nmi: .rept 5 pushq_cfi 6*8(%rsp) .endr + CFI_DEF_CFA_OFFSET SS+8-RIP + + /* + * If there was a nested NMI, the first NMI's iret will return + * here. But NMIs are still enabled and we can take another + * nested NMI. The nested NMI checks the interrupted RIP to see + * if it is between repeat_nmi and end_repeat_nmi, and if so + * it will just return, as we are about to repeat an NMI anyway. + * This makes it safe to copy to the stack frame that a nested + * NMI will update. + */ +repeat_nmi: + /* + * Update the stack variable to say we are still in NMI (the update + * is benign for the non-repeat case, where 1 was pushed just above + * to this very stack slot). + */ + movq $1, 5*8(%rsp) /* Make another copy, this one may be modified by nested NMIs */ .rept 5 pushq_cfi 4*8(%rsp) .endr - - /* Do not pop rdx, nested NMIs will corrupt it */ - movq 11*8(%rsp), %rdx + CFI_DEF_CFA_OFFSET SS+8-RIP +end_repeat_nmi: /* * Everything below this point can be preempted by a nested @@ -1646,7 +1671,6 @@ first_nmi: * caused by an exception and nested NMI will start here, and * can still be preempted by another NMI. */ -restart_nmi: pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ subq $ORIG_RAX-R15, %rsp CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 @@ -1675,26 +1699,6 @@ nmi_restore: CFI_ENDPROC END(nmi) - /* - * If an NMI hit an iret because of an exception or breakpoint, - * it can lose its NMI context, and a nested NMI may come in. - * In that case, the nested NMI will change the preempted NMI's - * stack to jump to here when it does the final iret. - */ -repeat_nmi: - INTR_FRAME - /* Update the stack variable to say we are still in NMI */ - movq $1, 5*8(%rsp) - - /* copy the saved stack back to copy stack */ - .rept 5 - pushq_cfi 4*8(%rsp) - .endr - - jmp restart_nmi - CFI_ENDPROC -end_repeat_nmi: - ENTRY(ignore_sysret) CFI_STARTPROC mov $-ENOSYS,%eax -- cgit v1.2.3 From 69466466ce889cd2cbc8cda9ff1c6083f48cc7f9 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 24 Feb 2012 11:55:01 +0000 Subject: x86-64: Improve insn scheduling in SAVE_ARGS_IRQ In one case, use an address register that was computed earlier (and with a simpler instruction), thus reducing the risk of a stall. In the second case, eliminate a branch by using a conditional move (as is already done in call_softirq and xen_do_hypervisor_callback). Signed-off-by: Jan Beulich Link: http://lkml.kernel.org/r/4F4788A50200007800074A26@nat28.tlf.novell.com Reviewed-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/entry_64.S | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index a20e1cb9dc87..211b2e1683f1 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -319,7 +319,7 @@ ENDPROC(native_usergs_sysret64) movq %rsp, %rsi leaq -RBP(%rsp),%rdi /* arg1 for handler */ - testl $3, CS(%rdi) + testl $3, CS-RBP(%rsi) je 1f SWAPGS /* @@ -329,11 +329,10 @@ ENDPROC(native_usergs_sysret64) * moving irq_enter into assembly, which would be too much work) */ 1: incl PER_CPU_VAR(irq_count) - jne 2f - mov PER_CPU_VAR(irq_stack_ptr),%rsp + cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp CFI_DEF_CFA_REGISTER rsi -2: /* Store previous stack value */ + /* Store previous stack value */ pushq %rsi CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \ 0x77 /* DW_OP_breg7 */, 0, \ -- cgit v1.2.3 From 79fb4ad63e8266ffac1f69bbb45a6f86570493e7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 24 Feb 2012 15:55:13 -0500 Subject: x86: Fix the NMI nesting comments Some of the comments for the nesting NMI algorithm were stale and had some references to some prototypes that were first tried. I also updated the comments to be a little easier to understand the flow of the code. It definitely needs the documentation. Signed-off-by: Steven Rostedt --- arch/x86/kernel/entry_64.S | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index e0eca007dc0d..2de3e457bd4b 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1624,11 +1624,12 @@ first_nmi: * | pt_regs | * +-------------------------+ * - * The saved RIP is used to fix up the copied RIP that a nested - * NMI may zero out. The original stack frame and the temp storage + * The saved stack frame is used to fix up the copied stack frame + * that a nested NMI may change to make the interrupted NMI iret jump + * to the repeat_nmi. The original stack frame and the temp storage * is also used by nested NMIs and can not be trusted on exit. */ - /* Do not pop rdx, nested NMIs will corrupt it */ + /* Do not pop rdx, nested NMIs will corrupt that part of the stack */ movq (%rsp), %rdx CFI_RESTORE rdx @@ -1641,6 +1642,8 @@ first_nmi: .endr CFI_DEF_CFA_OFFSET SS+8-RIP + /* Everything up to here is safe from nested NMIs */ + /* * If there was a nested NMI, the first NMI's iret will return * here. But NMIs are still enabled and we can take another @@ -1667,9 +1670,8 @@ end_repeat_nmi: /* * Everything below this point can be preempted by a nested - * NMI if the first NMI took an exception. Repeated NMIs - * caused by an exception and nested NMI will start here, and - * can still be preempted by another NMI. + * NMI if the first NMI took an exception and reset our iret stack + * so that we repeat another NMI. */ pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ subq $ORIG_RAX-R15, %rsp -- cgit v1.2.3 From c484b2418b0b5bb7b16f01343330650faee60df2 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 23 Feb 2012 23:46:50 -0800 Subject: PCI: Use class for quirk for via_no_dac Signed-off-by: Yinghai Lu Signed-off-by: Jesse Barnes --- arch/x86/kernel/pci-dma.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 1c4d769e21ea..28e5e06fcba4 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -262,10 +262,11 @@ rootfs_initcall(pci_iommu_init); static __devinit void via_no_dac(struct pci_dev *dev) { - if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) { + if (forbid_dac == 0) { dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n"); forbid_dac = 1; } } -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac); +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, + PCI_CLASS_BRIDGE_PCI, 8, via_no_dac); #endif -- cgit v1.2.3 From ce5f7a99df87918b5be4618a9386213a8e9a7146 Mon Sep 17 00:00:00 2001 From: Bobby Powers Date: Sat, 25 Feb 2012 23:25:38 -0500 Subject: x32: Make sure TS_COMPAT is cleared for x32 tasks If a process has a non-x32 ia32 personality and changes to x32, the process would keep its TS_COMPAT flag. x32 uses the presence of the x32 flag on a syscall to determine compat status, so make sure TS_COMPAT is cleared. Signed-off-by: Bobby Powers Link: http://lkml.kernel.org/r/1330230338-25077-1-git-send-email-bobbypowers@gmail.com Signed-off-by: H. Peter Anvin --- arch/x86/kernel/process_64.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index a0701da2bd18..32e04120b2cd 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -540,6 +540,9 @@ void set_personality_ia32(bool x32) clear_thread_flag(TIF_IA32); set_thread_flag(TIF_X32); current->personality &= ~READ_IMPLIES_EXEC; + /* is_compat_task() uses the presence of the x32 + syscall bit flag to determine compat status */ + current_thread_info()->status &= ~TS_COMPAT; } else { set_thread_flag(TIF_IA32); clear_thread_flag(TIF_X32); -- cgit v1.2.3 From 00194b2e845da29395ad00c13a884d9acb9306b5 Mon Sep 17 00:00:00 2001 From: Bobby Powers Date: Sat, 25 Feb 2012 22:59:34 -0500 Subject: x32: Only clear TIF_X32 flag once Commits bb212724 and d1a797f3 both added a call to clear_thread_flag(TIF_X32) under set_personality_64bit() - only one is needed. Signed-off-by: Bobby Powers Link: http://lkml.kernel.org/r/1330228774-24223-1-git-send-email-bobbypowers@gmail.com Signed-off-by: H. Peter Anvin --- arch/x86/kernel/process_64.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 32e04120b2cd..a4659739e202 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -510,7 +510,6 @@ void set_personality_64bit(void) /* Make sure to be in 64bit mode */ clear_thread_flag(TIF_IA32); - clear_thread_flag(TIF_X32); clear_thread_flag(TIF_ADDR32); clear_thread_flag(TIF_X32); -- cgit v1.2.3 From 42dfc43ee5999ac64284476ea0ac6c937587cf2b Mon Sep 17 00:00:00 2001 From: Siddhesh Poyarekar Date: Sun, 26 Feb 2012 21:47:55 +0530 Subject: x86_64: Record stack pointer before task execution begins task->thread.usersp is unusable immediately after a binary is exec()'d until it undergoes a context switch cycle. The start_thread() function called during execve() saves the stack pointer into pt_regs and into old_rsp, but fails to record it into task->thread.usersp. Because of this, KSTK_ESP(task) returns an incorrect value for a 64-bit program until the task is switched out and back in since switch_to swaps %rsp values in and out into task->thread.usersp. Signed-off-by: Siddhesh Poyarekar Link: http://lkml.kernel.org/r/1330273075-2949-1-git-send-email-siddhesh.poyarekar@gmail.com Signed-off-by: H. Peter Anvin --- arch/x86/kernel/process_64.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 1fd94bc4279d..eb54dd0fbed6 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -341,6 +341,7 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip, loadsegment(es, _ds); loadsegment(ds, _ds); load_gs_index(0); + current->thread.usersp = new_sp; regs->ip = new_ip; regs->sp = new_sp; percpu_write(old_rsp, new_sp); -- cgit v1.2.3 From f0ba662a6e06f2fb58201800eff33dcad9246f97 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 24 Feb 2012 12:09:21 +0000 Subject: x86: Properly _init-annotate NMI selftest code After all, this code is being run once at boot only (if configured in at all). Signed-off-by: Jan Beulich Acked-by: Don Zickus Link: http://lkml.kernel.org/r/4F478C010200007800074A3D@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/nmi_selftest.c | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c index 0d01a8ea4e11..2c39dcd510fa 100644 --- a/arch/x86/kernel/nmi_selftest.c +++ b/arch/x86/kernel/nmi_selftest.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -20,35 +21,35 @@ #define FAILURE 1 #define TIMEOUT 2 -static int nmi_fail; +static int __initdata nmi_fail; /* check to see if NMI IPIs work on this machine */ -static DECLARE_BITMAP(nmi_ipi_mask, NR_CPUS) __read_mostly; +static DECLARE_BITMAP(nmi_ipi_mask, NR_CPUS) __initdata; -static int testcase_total; -static int testcase_successes; -static int expected_testcase_failures; -static int unexpected_testcase_failures; -static int unexpected_testcase_unknowns; +static int __initdata testcase_total; +static int __initdata testcase_successes; +static int __initdata expected_testcase_failures; +static int __initdata unexpected_testcase_failures; +static int __initdata unexpected_testcase_unknowns; -static int nmi_unk_cb(unsigned int val, struct pt_regs *regs) +static int __init nmi_unk_cb(unsigned int val, struct pt_regs *regs) { unexpected_testcase_unknowns++; return NMI_HANDLED; } -static void init_nmi_testsuite(void) +static void __init init_nmi_testsuite(void) { /* trap all the unknown NMIs we may generate */ register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk"); } -static void cleanup_nmi_testsuite(void) +static void __init cleanup_nmi_testsuite(void) { unregister_nmi_handler(NMI_UNKNOWN, "nmi_selftest_unk"); } -static int test_nmi_ipi_callback(unsigned int val, struct pt_regs *regs) +static int __init test_nmi_ipi_callback(unsigned int val, struct pt_regs *regs) { int cpu = raw_smp_processor_id(); @@ -58,7 +59,7 @@ static int test_nmi_ipi_callback(unsigned int val, struct pt_regs *regs) return NMI_DONE; } -static void test_nmi_ipi(struct cpumask *mask) +static void __init test_nmi_ipi(struct cpumask *mask) { unsigned long timeout; @@ -86,7 +87,7 @@ static void test_nmi_ipi(struct cpumask *mask) return; } -static void remote_ipi(void) +static void __init remote_ipi(void) { cpumask_copy(to_cpumask(nmi_ipi_mask), cpu_online_mask); cpumask_clear_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask)); @@ -94,19 +95,19 @@ static void remote_ipi(void) test_nmi_ipi(to_cpumask(nmi_ipi_mask)); } -static void local_ipi(void) +static void __init local_ipi(void) { cpumask_clear(to_cpumask(nmi_ipi_mask)); cpumask_set_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask)); test_nmi_ipi(to_cpumask(nmi_ipi_mask)); } -static void reset_nmi(void) +static void __init reset_nmi(void) { nmi_fail = 0; } -static void dotest(void (*testcase_fn)(void), int expected) +static void __init dotest(void (*testcase_fn)(void), int expected) { testcase_fn(); /* @@ -131,12 +132,12 @@ static void dotest(void (*testcase_fn)(void), int expected) reset_nmi(); } -static inline void print_testname(const char *testname) +static inline void __init print_testname(const char *testname) { printk("%12s:", testname); } -void nmi_selftest(void) +void __init nmi_selftest(void) { init_nmi_testsuite(); -- cgit v1.2.3 From d93c4071b78f4676ef70ec8f2d4bae59b6cc5523 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 24 Feb 2012 11:50:27 +0000 Subject: x86/time: Eliminate unused irq0_irqs counter As of v2.6.38 this counter is being maintained without ever being read. Signed-off-by: Jan Beulich Link: http://lkml.kernel.org/r/4F4787930200007800074A10@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/hardirq.h | 1 - arch/x86/kernel/time.c | 3 --- 2 files changed, 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index da0b3ca815b7..382f75d735f3 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -7,7 +7,6 @@ typedef struct { unsigned int __softirq_pending; unsigned int __nmi_count; /* arch dependent */ - unsigned int irq0_irqs; #ifdef CONFIG_X86_LOCAL_APIC unsigned int apic_timer_irqs; /* arch dependent */ unsigned int irq_spurious_count; diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index dd5fbf4101fc..c6eba2b42673 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -57,9 +57,6 @@ EXPORT_SYMBOL(profile_pc); */ static irqreturn_t timer_interrupt(int irq, void *dev_id) { - /* Keep nmi watchdog up to date */ - inc_irq_stat(irq0_irqs); - global_clock_event->event_handler(global_clock_event); /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */ -- cgit v1.2.3 From 928282e432ee584129a39da831ffa72c38e189b7 Mon Sep 17 00:00:00 2001 From: Mark Wielaard Date: Fri, 24 Feb 2012 11:32:05 +0100 Subject: x86-64: Fix CFI data for common_interrupt() Commit eab9e6137f23 ("x86-64: Fix CFI data for interrupt frames") introduced a DW_CFA_def_cfa_expression in the SAVE_ARGS_IRQ macro. To later define the CFA using a simple register+offset rule both register and offset need to be supplied. Just using CFI_DEF_CFA_REGISTER leaves the offset undefined. So use CFI_DEF_CFA with reg+off explicitly at the end of common_interrupt. Signed-off-by: Mark Wielaard Acked-by: Jan Beulich Link: http://lkml.kernel.org/r/1330079527-30711-1-git-send-email-mjw@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 3fe8239fd8fb..54be36bf2620 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -813,7 +813,7 @@ ret_from_intr: /* Restore saved previous stack */ popq %rsi - CFI_DEF_CFA_REGISTER rsi + CFI_DEF_CFA rsi,SS+8-RBP /* reg/off reset after def_cfa_expr */ leaq ARGOFFSET-RBP(%rsi), %rsp CFI_DEF_CFA_REGISTER rsp CFI_ADJUST_CFA_OFFSET RBP-ARGOFFSET -- cgit v1.2.3 From f649e9388cd46ad1634164e56f96ae092ca59e4a Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Fri, 20 Jan 2012 16:24:09 -0500 Subject: x86: relocate get/set debugreg fcns to include/asm/debugreg. Since we already have a debugreg.h header file, move the assoc. get/set functions to it. In addition to it being the logical home for them, it has a secondary advantage. The functions that are moved use BUG(). So we really need to have linux/bug.h in scope. But asm/processor.h is used about 600 times, vs. only about 15 for debugreg.h -- so adding bug.h to the latter reduces the amount of time we'll be processing it during a compile. Signed-off-by: Paul Gortmaker Acked-by: Ingo Molnar CC: Thomas Gleixner CC: "H. Peter Anvin" --- arch/x86/include/asm/debugreg.h | 67 ++++++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/processor.h | 63 ------------------------------------- arch/x86/kernel/cpu/common.c | 1 + 3 files changed, 68 insertions(+), 63 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index b903d5ea3941..2d91580bf228 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -78,8 +78,75 @@ */ #ifdef __KERNEL__ +#include + DECLARE_PER_CPU(unsigned long, cpu_dr7); +#ifndef CONFIG_PARAVIRT +/* + * These special macros can be used to get or set a debugging register + */ +#define get_debugreg(var, register) \ + (var) = native_get_debugreg(register) +#define set_debugreg(value, register) \ + native_set_debugreg(register, value) +#endif + +static inline unsigned long native_get_debugreg(int regno) +{ + unsigned long val = 0; /* Damn you, gcc! */ + + switch (regno) { + case 0: + asm("mov %%db0, %0" :"=r" (val)); + break; + case 1: + asm("mov %%db1, %0" :"=r" (val)); + break; + case 2: + asm("mov %%db2, %0" :"=r" (val)); + break; + case 3: + asm("mov %%db3, %0" :"=r" (val)); + break; + case 6: + asm("mov %%db6, %0" :"=r" (val)); + break; + case 7: + asm("mov %%db7, %0" :"=r" (val)); + break; + default: + BUG(); + } + return val; +} + +static inline void native_set_debugreg(int regno, unsigned long value) +{ + switch (regno) { + case 0: + asm("mov %0, %%db0" ::"r" (value)); + break; + case 1: + asm("mov %0, %%db1" ::"r" (value)); + break; + case 2: + asm("mov %0, %%db2" ::"r" (value)); + break; + case 3: + asm("mov %0, %%db3" ::"r" (value)); + break; + case 6: + asm("mov %0, %%db6" ::"r" (value)); + break; + case 7: + asm("mov %0, %%db7" ::"r" (value)); + break; + default: + BUG(); + } +} + static inline void hw_breakpoint_disable(void) { /* Zero the control register for HW Breakpoint */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 58545c97d071..30aa6e95f814 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -474,61 +474,6 @@ struct thread_struct { unsigned io_bitmap_max; }; -static inline unsigned long native_get_debugreg(int regno) -{ - unsigned long val = 0; /* Damn you, gcc! */ - - switch (regno) { - case 0: - asm("mov %%db0, %0" :"=r" (val)); - break; - case 1: - asm("mov %%db1, %0" :"=r" (val)); - break; - case 2: - asm("mov %%db2, %0" :"=r" (val)); - break; - case 3: - asm("mov %%db3, %0" :"=r" (val)); - break; - case 6: - asm("mov %%db6, %0" :"=r" (val)); - break; - case 7: - asm("mov %%db7, %0" :"=r" (val)); - break; - default: - BUG(); - } - return val; -} - -static inline void native_set_debugreg(int regno, unsigned long value) -{ - switch (regno) { - case 0: - asm("mov %0, %%db0" ::"r" (value)); - break; - case 1: - asm("mov %0, %%db1" ::"r" (value)); - break; - case 2: - asm("mov %0, %%db2" ::"r" (value)); - break; - case 3: - asm("mov %0, %%db3" ::"r" (value)); - break; - case 6: - asm("mov %0, %%db6" ::"r" (value)); - break; - case 7: - asm("mov %0, %%db7" ::"r" (value)); - break; - default: - BUG(); - } -} - /* * Set IOPL bits in EFLAGS from given mask */ @@ -574,14 +519,6 @@ static inline void native_swapgs(void) #define __cpuid native_cpuid #define paravirt_enabled() 0 -/* - * These special macros can be used to get or set a debugging register - */ -#define get_debugreg(var, register) \ - (var) = native_get_debugreg(register) -#define set_debugreg(value, register) \ - native_set_debugreg(register, value) - static inline void load_sp0(struct tss_struct *tss, struct thread_struct *thread) { diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c0f7d68d318f..0d676dd923ac 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From 50af5ead3b44ccf8bd2b4d2a50c1b610f557c480 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Fri, 20 Jan 2012 18:35:53 -0500 Subject: bug.h: add include of it to various implicit C users With bug.h currently living right in linux/kernel.h there are files that use BUG_ON and friends but are not including the header explicitly. Fix them up so we can remove the presence in kernel.h file. Signed-off-by: Paul Gortmaker --- arch/arm/mach-imx/cpu_op-mx51.c | 1 + arch/arm/mach-ux500/board-mop500-pins.c | 1 + arch/mips/fw/arc/cmdline.c | 1 + arch/mips/fw/arc/identify.c | 1 + arch/parisc/math-emu/fpudispatch.c | 1 + arch/powerpc/kernel/pmc.c | 1 + arch/powerpc/xmon/ppc-opc.c | 1 + arch/powerpc/xmon/spu-opc.c | 1 + arch/x86/kernel/paravirt.c | 1 + arch/x86/mm/kmemcheck/selftest.c | 1 + drivers/gpu/drm/radeon/cayman_blit_shaders.c | 1 + drivers/gpu/drm/radeon/evergreen_blit_shaders.c | 1 + drivers/gpu/drm/radeon/r600_blit_shaders.c | 1 + drivers/staging/wlags49_h2/hcf.c | 1 + lib/atomic64_test.c | 1 + lib/bitmap.c | 1 + lib/iommu-helper.c | 1 + lib/list_debug.c | 1 + lib/plist.c | 1 + lib/string.c | 1 + lib/timerqueue.c | 1 + 21 files changed, 21 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/arm/mach-imx/cpu_op-mx51.c b/arch/arm/mach-imx/cpu_op-mx51.c index 9d34c3d4c024..7b92cd6da6d3 100644 --- a/arch/arm/mach-imx/cpu_op-mx51.c +++ b/arch/arm/mach-imx/cpu_op-mx51.c @@ -11,6 +11,7 @@ * http://www.gnu.org/copyleft/gpl.html */ +#include #include #include #include diff --git a/arch/arm/mach-ux500/board-mop500-pins.c b/arch/arm/mach-ux500/board-mop500-pins.c index 74bfcff2bdf3..f5413dca532c 100644 --- a/arch/arm/mach-ux500/board-mop500-pins.c +++ b/arch/arm/mach-ux500/board-mop500-pins.c @@ -6,6 +6,7 @@ #include #include +#include #include #include diff --git a/arch/mips/fw/arc/cmdline.c b/arch/mips/fw/arc/cmdline.c index 9fdf07e50f1b..c0122a1dc587 100644 --- a/arch/mips/fw/arc/cmdline.c +++ b/arch/mips/fw/arc/cmdline.c @@ -7,6 +7,7 @@ * * Copyright (C) 1996 David S. Miller (davem@davemloft.net) */ +#include #include #include #include diff --git a/arch/mips/fw/arc/identify.c b/arch/mips/fw/arc/identify.c index 788060a53dce..54a33c756f61 100644 --- a/arch/mips/fw/arc/identify.c +++ b/arch/mips/fw/arc/identify.c @@ -11,6 +11,7 @@ * * Copyright (C) 1996 David S. Miller (davem@davemloft.net) */ +#include #include #include #include diff --git a/arch/parisc/math-emu/fpudispatch.c b/arch/parisc/math-emu/fpudispatch.c index 6e28f9f4c620..673b73e8420d 100644 --- a/arch/parisc/math-emu/fpudispatch.c +++ b/arch/parisc/math-emu/fpudispatch.c @@ -50,6 +50,7 @@ #define FPUDEBUG 0 #include "float.h" +#include #include #include /* #include */ diff --git a/arch/powerpc/kernel/pmc.c b/arch/powerpc/kernel/pmc.c index a841a9d136a2..58eaa3ddf7b9 100644 --- a/arch/powerpc/kernel/pmc.c +++ b/arch/powerpc/kernel/pmc.c @@ -13,6 +13,7 @@ */ #include +#include #include #include diff --git a/arch/powerpc/xmon/ppc-opc.c b/arch/powerpc/xmon/ppc-opc.c index af3780e52e76..6845e91ba04a 100644 --- a/arch/powerpc/xmon/ppc-opc.c +++ b/arch/powerpc/xmon/ppc-opc.c @@ -22,6 +22,7 @@ #include #include +#include #include "nonstdio.h" #include "ppc.h" diff --git a/arch/powerpc/xmon/spu-opc.c b/arch/powerpc/xmon/spu-opc.c index 530df3d6d7b2..7d37597c4bcd 100644 --- a/arch/powerpc/xmon/spu-opc.c +++ b/arch/powerpc/xmon/spu-opc.c @@ -19,6 +19,7 @@ 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA. */ #include +#include #include "spu.h" /* This file holds the Spu opcode table */ diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index d90272e6bc40..83e7b81d2135 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -26,6 +26,7 @@ #include #include +#include #include #include #include diff --git a/arch/x86/mm/kmemcheck/selftest.c b/arch/x86/mm/kmemcheck/selftest.c index 036efbea8b28..aef7140c0063 100644 --- a/arch/x86/mm/kmemcheck/selftest.c +++ b/arch/x86/mm/kmemcheck/selftest.c @@ -1,3 +1,4 @@ +#include #include #include "opcode.h" diff --git a/drivers/gpu/drm/radeon/cayman_blit_shaders.c b/drivers/gpu/drm/radeon/cayman_blit_shaders.c index 7b4eeb7b4a8c..19a0114d2e3b 100644 --- a/drivers/gpu/drm/radeon/cayman_blit_shaders.c +++ b/drivers/gpu/drm/radeon/cayman_blit_shaders.c @@ -24,6 +24,7 @@ * Alex Deucher */ +#include #include #include diff --git a/drivers/gpu/drm/radeon/evergreen_blit_shaders.c b/drivers/gpu/drm/radeon/evergreen_blit_shaders.c index 3a10399e0066..f85c0af115b5 100644 --- a/drivers/gpu/drm/radeon/evergreen_blit_shaders.c +++ b/drivers/gpu/drm/radeon/evergreen_blit_shaders.c @@ -24,6 +24,7 @@ * Alex Deucher */ +#include #include #include diff --git a/drivers/gpu/drm/radeon/r600_blit_shaders.c b/drivers/gpu/drm/radeon/r600_blit_shaders.c index 2d1f6c5ee2a7..3af3c6426a6e 100644 --- a/drivers/gpu/drm/radeon/r600_blit_shaders.c +++ b/drivers/gpu/drm/radeon/r600_blit_shaders.c @@ -24,6 +24,7 @@ * Alex Deucher */ +#include #include #include diff --git a/drivers/staging/wlags49_h2/hcf.c b/drivers/staging/wlags49_h2/hcf.c index b008773323b3..5957c3a439ac 100644 --- a/drivers/staging/wlags49_h2/hcf.c +++ b/drivers/staging/wlags49_h2/hcf.c @@ -91,6 +91,7 @@ #include "hcf.h" // HCF and MSF common include file #include "hcfdef.h" // HCF specific include file #include "mmd.h" // MoreModularDriver common include file +#include #include #if ! defined offsetof diff --git a/lib/atomic64_test.c b/lib/atomic64_test.c index 0c33cde2a1e6..cb99b91c3a1d 100644 --- a/lib/atomic64_test.c +++ b/lib/atomic64_test.c @@ -9,6 +9,7 @@ * (at your option) any later version. */ #include +#include #include #include diff --git a/lib/bitmap.c b/lib/bitmap.c index 0d4a127dd9b3..90a683b34075 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -10,6 +10,7 @@ #include #include #include +#include #include /* diff --git a/lib/iommu-helper.c b/lib/iommu-helper.c index da053313ee5c..8b1ab6222562 100644 --- a/lib/iommu-helper.c +++ b/lib/iommu-helper.c @@ -4,6 +4,7 @@ #include #include +#include int iommu_is_span_boundary(unsigned int index, unsigned int nr, unsigned long shift, diff --git a/lib/list_debug.c b/lib/list_debug.c index 7204e619a4c1..1bf2fe36f813 100644 --- a/lib/list_debug.c +++ b/lib/list_debug.c @@ -8,6 +8,7 @@ #include #include +#include #include /* diff --git a/lib/plist.c b/lib/plist.c index a0a4da489c22..6ab0e521c48b 100644 --- a/lib/plist.c +++ b/lib/plist.c @@ -23,6 +23,7 @@ * information. */ +#include #include #include diff --git a/lib/string.c b/lib/string.c index dc4a86341f91..0573a20df9a6 100644 --- a/lib/string.c +++ b/lib/string.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #ifndef __HAVE_ARCH_STRNICMP diff --git a/lib/timerqueue.c b/lib/timerqueue.c index 191176a43e9a..14c640355eb1 100644 --- a/lib/timerqueue.c +++ b/lib/timerqueue.c @@ -22,6 +22,7 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#include #include #include #include -- cgit v1.2.3 From bd2f55361f18347e890d52ff9cfd8895455ec11b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 21 Mar 2011 12:33:18 +0100 Subject: sched/rt: Use schedule_preempt_disabled() Coccinelle based conversion. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-24swm5zut3h9c4a6s46x8rws@git.kernel.org Signed-off-by: Ingo Molnar --- arch/arm/kernel/process.c | 4 +--- arch/avr32/kernel/process.c | 4 +--- arch/blackfin/kernel/process.c | 4 +--- arch/cris/kernel/process.c | 4 +--- arch/frv/kernel/process.c | 4 +--- arch/h8300/kernel/process.c | 4 +--- arch/ia64/kernel/process.c | 4 +--- arch/m32r/kernel/process.c | 4 +--- arch/m68k/kernel/process_mm.c | 4 +--- arch/m68k/kernel/process_no.c | 4 +--- arch/microblaze/kernel/process.c | 4 +--- arch/mips/kernel/process.c | 4 +--- arch/mn10300/kernel/process.c | 4 +--- arch/parisc/kernel/process.c | 4 +--- arch/powerpc/kernel/idle.c | 8 ++++---- arch/powerpc/platforms/iseries/setup.c | 8 ++------ arch/s390/kernel/process.c | 4 +--- arch/score/kernel/process.c | 4 +--- arch/sh/kernel/idle.c | 4 +--- arch/sparc/kernel/process_32.c | 8 ++------ arch/sparc/kernel/process_64.c | 10 ++++------ arch/tile/kernel/process.c | 4 +--- arch/x86/kernel/process_32.c | 4 +--- arch/x86/kernel/process_64.c | 4 +--- arch/xtensa/kernel/process.c | 4 +--- init/main.c | 5 +---- kernel/mutex.c | 4 +--- kernel/softirq.c | 4 +--- 28 files changed, 36 insertions(+), 95 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 971d65c253a9..c2ae3cd331fe 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -239,9 +239,7 @@ void cpu_idle(void) leds_event(led_idle_end); rcu_idle_exit(); tick_nohz_idle_exit(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c index ea3395750324..92c5af98a6f7 100644 --- a/arch/avr32/kernel/process.c +++ b/arch/avr32/kernel/process.c @@ -40,9 +40,7 @@ void cpu_idle(void) cpu_idle_sleep(); rcu_idle_exit(); tick_nohz_idle_exit(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/blackfin/kernel/process.c b/arch/blackfin/kernel/process.c index 8dd0416673cb..a80a643f3691 100644 --- a/arch/blackfin/kernel/process.c +++ b/arch/blackfin/kernel/process.c @@ -94,9 +94,7 @@ void cpu_idle(void) idle(); rcu_idle_exit(); tick_nohz_idle_exit(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/cris/kernel/process.c b/arch/cris/kernel/process.c index aa585e4e979e..d8f50ff6fadd 100644 --- a/arch/cris/kernel/process.c +++ b/arch/cris/kernel/process.c @@ -115,9 +115,7 @@ void cpu_idle (void) idle = default_idle; idle(); } - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/frv/kernel/process.c b/arch/frv/kernel/process.c index 3901df1213c0..29cc49783787 100644 --- a/arch/frv/kernel/process.c +++ b/arch/frv/kernel/process.c @@ -92,9 +92,7 @@ void cpu_idle(void) idle(); } - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c index 933bd388efb2..1a173b35f475 100644 --- a/arch/h8300/kernel/process.c +++ b/arch/h8300/kernel/process.c @@ -81,9 +81,7 @@ void cpu_idle(void) while (1) { while (!need_resched()) idle(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index 6d33c5cc94f0..9dc52b63fc87 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -330,9 +330,7 @@ cpu_idle (void) normal_xtp(); #endif } - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); check_pgt_cache(); if (cpu_is_offline(cpu)) play_dead(); diff --git a/arch/m32r/kernel/process.c b/arch/m32r/kernel/process.c index 422bea9f1dbc..3a4a32b27208 100644 --- a/arch/m32r/kernel/process.c +++ b/arch/m32r/kernel/process.c @@ -90,9 +90,7 @@ void cpu_idle (void) idle(); } - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/m68k/kernel/process_mm.c b/arch/m68k/kernel/process_mm.c index 099283ee1a8f..fe4186b5fc32 100644 --- a/arch/m68k/kernel/process_mm.c +++ b/arch/m68k/kernel/process_mm.c @@ -78,9 +78,7 @@ void cpu_idle(void) while (1) { while (!need_resched()) idle(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/m68k/kernel/process_no.c b/arch/m68k/kernel/process_no.c index 5e1078cabe0e..f7fe6c348595 100644 --- a/arch/m68k/kernel/process_no.c +++ b/arch/m68k/kernel/process_no.c @@ -73,9 +73,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { idle(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/microblaze/kernel/process.c b/arch/microblaze/kernel/process.c index 7dcb5bfffb75..9155f7d92669 100644 --- a/arch/microblaze/kernel/process.c +++ b/arch/microblaze/kernel/process.c @@ -110,9 +110,7 @@ void cpu_idle(void) rcu_idle_exit(); tick_nohz_idle_exit(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); check_pgt_cache(); } } diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index 7955409051c4..61f1cb45a1d5 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -80,9 +80,7 @@ void __noreturn cpu_idle(void) #endif rcu_idle_exit(); tick_nohz_idle_exit(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/mn10300/kernel/process.c b/arch/mn10300/kernel/process.c index 28eec3102535..cac401d37f75 100644 --- a/arch/mn10300/kernel/process.c +++ b/arch/mn10300/kernel/process.c @@ -123,9 +123,7 @@ void cpu_idle(void) idle(); } - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index 62c60b87d039..d4b94b395c16 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -71,9 +71,7 @@ void cpu_idle(void) while (1) { while (!need_resched()) barrier(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); check_pgt_cache(); } } diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c index 0a48bf5db6c8..65035141552b 100644 --- a/arch/powerpc/kernel/idle.c +++ b/arch/powerpc/kernel/idle.c @@ -101,11 +101,11 @@ void cpu_idle(void) ppc64_runlatch_on(); rcu_idle_exit(); tick_nohz_idle_exit(); - preempt_enable_no_resched(); - if (cpu_should_die()) + if (cpu_should_die()) { + preempt_enable_no_resched(); cpu_die(); - schedule(); - preempt_disable(); + } + schedule_preempt_disabled(); } } diff --git a/arch/powerpc/platforms/iseries/setup.c b/arch/powerpc/platforms/iseries/setup.c index 8fc62586a973..a5fbf4cb6329 100644 --- a/arch/powerpc/platforms/iseries/setup.c +++ b/arch/powerpc/platforms/iseries/setup.c @@ -584,9 +584,7 @@ static void iseries_shared_idle(void) if (hvlpevent_is_pending()) process_iSeries_events(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } @@ -615,9 +613,7 @@ static void iseries_dedicated_idle(void) ppc64_runlatch_on(); rcu_idle_exit(); tick_nohz_idle_exit(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index e795933eb2cb..7618085b4164 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -97,9 +97,7 @@ void cpu_idle(void) tick_nohz_idle_exit(); if (test_thread_flag(TIF_MCCK_PENDING)) s390_handle_mcck(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/score/kernel/process.c b/arch/score/kernel/process.c index 25d08030a883..2707023c7563 100644 --- a/arch/score/kernel/process.c +++ b/arch/score/kernel/process.c @@ -53,9 +53,7 @@ void __noreturn cpu_idle(void) while (!need_resched()) barrier(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/sh/kernel/idle.c b/arch/sh/kernel/idle.c index 406508d4ce74..7e4892826563 100644 --- a/arch/sh/kernel/idle.c +++ b/arch/sh/kernel/idle.c @@ -114,9 +114,7 @@ void cpu_idle(void) rcu_idle_exit(); tick_nohz_idle_exit(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c index f793742eec2b..935fdbcd88c2 100644 --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -113,9 +113,7 @@ void cpu_idle(void) while (!need_resched()) cpu_relax(); } - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); check_pgt_cache(); } } @@ -138,9 +136,7 @@ void cpu_idle(void) while (!need_resched()) cpu_relax(); } - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); check_pgt_cache(); } } diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index 39d8b05201a2..ab9a29268213 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -104,15 +104,13 @@ void cpu_idle(void) rcu_idle_exit(); tick_nohz_idle_exit(); - preempt_enable_no_resched(); - #ifdef CONFIG_HOTPLUG_CPU - if (cpu_is_offline(cpu)) + if (cpu_is_offline(cpu)) { + preempt_enable_no_resched(); cpu_play_dead(); + } #endif - - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c index 4c1ac6e5347a..6ae495ef2b99 100644 --- a/arch/tile/kernel/process.c +++ b/arch/tile/kernel/process.c @@ -108,9 +108,7 @@ void cpu_idle(void) } rcu_idle_exit(); tick_nohz_idle_exit(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index c08d1ff12b7c..49888fefe794 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -119,9 +119,7 @@ void cpu_idle(void) } rcu_idle_exit(); tick_nohz_idle_exit(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index cfa5c90c01db..e34257c70c28 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -156,9 +156,7 @@ void cpu_idle(void) } tick_nohz_idle_exit(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c index 47041e7c088c..2c9004770c4e 100644 --- a/arch/xtensa/kernel/process.c +++ b/arch/xtensa/kernel/process.c @@ -113,9 +113,7 @@ void cpu_idle(void) while (1) { while (!need_resched()) platform_idle(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/init/main.c b/init/main.c index ff49a6dacfbb..4990f7ec776a 100644 --- a/init/main.c +++ b/init/main.c @@ -374,11 +374,8 @@ static noinline void __init_refok rest_init(void) * at least once to get things moving: */ init_idle_bootup_task(current); - preempt_enable_no_resched(); - schedule(); - + schedule_preempt_disabled(); /* Call into cpu_idle with preempt disabled */ - preempt_disable(); cpu_idle(); } diff --git a/kernel/mutex.c b/kernel/mutex.c index 89096dd8786f..a307cc9c9526 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -240,9 +240,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, /* didn't get the lock, go to sleep: */ spin_unlock_mutex(&lock->wait_lock, flags); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); spin_lock_mutex(&lock->wait_lock, flags); } diff --git a/kernel/softirq.c b/kernel/softirq.c index 4eb3a0fa351e..79b524767a24 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -744,9 +744,7 @@ static int run_ksoftirqd(void * __bind_cpu) while (!kthread_should_stop()) { preempt_disable(); if (!local_softirq_pending()) { - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } __set_current_state(TASK_RUNNING); -- cgit v1.2.3 From b263b31e8ad65cdbfa5a7f739460f350554a2dc1 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 27 Feb 2012 15:15:25 -0800 Subject: x86, mtrr: Use explicit sizing and padding for the 64-bit ioctls Specify the data structures for the 64-bit ioctls with explicit sizing and padding so that the x32 kernel will correctly use the 64-bit forms of these ioctls. Note that these ioctls are bogus in both forms on both 32 and 64 bits; even on 64 bits the maximum MTRR size is only 44 bits long. Note that nothing really is supposed to use these ioctls and that the preferred interface is text strings on /proc/mtrr, or better yet, nothing at all (use /sys/bus/pci/devices/*/resource*_wc for write combining; that uses PAT not MTRRs.) Signed-off-by: H. Peter Anvin Cc: H. J. Lu Tested-by: Nitin A. Kamble Link: http://lkml.kernel.org/n/tip-vwvnlu3hjmtkwvij4qxtm90l@git.kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mtrr.h | 28 ++++++++++++++++++---------- arch/x86/kernel/cpu/mtrr/if.c | 10 ++++++---- 2 files changed, 24 insertions(+), 14 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h index 4365ffdb461f..7e3f17f92c66 100644 --- a/arch/x86/include/asm/mtrr.h +++ b/arch/x86/include/asm/mtrr.h @@ -29,18 +29,18 @@ #define MTRR_IOCTL_BASE 'M' -struct mtrr_sentry { - unsigned long base; /* Base address */ - unsigned int size; /* Size of region */ - unsigned int type; /* Type of region */ -}; - /* Warning: this structure has a different order from i386 on x86-64. The 32bit emulation code takes care of that. But you need to use this for 64bit, otherwise your X server will break. */ #ifdef __i386__ +struct mtrr_sentry { + unsigned long base; /* Base address */ + unsigned int size; /* Size of region */ + unsigned int type; /* Type of region */ +}; + struct mtrr_gentry { unsigned int regnum; /* Register number */ unsigned long base; /* Base address */ @@ -50,12 +50,20 @@ struct mtrr_gentry { #else /* __i386__ */ +struct mtrr_sentry { + __u64 base; /* Base address */ + __u32 size; /* Size of region */ + __u32 type; /* Type of region */ +}; + struct mtrr_gentry { - unsigned long base; /* Base address */ - unsigned int size; /* Size of region */ - unsigned int regnum; /* Register number */ - unsigned int type; /* Type of region */ + __u64 base; /* Base address */ + __u32 size; /* Size of region */ + __u32 regnum; /* Register number */ + __u32 type; /* Type of region */ + __u32 _pad; /* Unused */ }; + #endif /* !__i386__ */ struct mtrr_var_range { diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c index 79289632cb27..a041e094b8b9 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -167,6 +167,7 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) { int err = 0; mtrr_type type; + unsigned long base; unsigned long size; struct mtrr_sentry sentry; struct mtrr_gentry gentry; @@ -267,14 +268,14 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) #endif if (gentry.regnum >= num_var_ranges) return -EINVAL; - mtrr_if->get(gentry.regnum, &gentry.base, &size, &type); + mtrr_if->get(gentry.regnum, &base, &size, &type); /* Hide entries that go above 4GB */ - if (gentry.base + size - 1 >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT)) + if (base + size - 1 >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT)) || size >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT))) gentry.base = gentry.size = gentry.type = 0; else { - gentry.base <<= PAGE_SHIFT; + gentry.base = base << PAGE_SHIFT; gentry.size = size << PAGE_SHIFT; gentry.type = type; } @@ -321,11 +322,12 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) #endif if (gentry.regnum >= num_var_ranges) return -EINVAL; - mtrr_if->get(gentry.regnum, &gentry.base, &size, &type); + mtrr_if->get(gentry.regnum, &base, &size, &type); /* Hide entries that would overflow */ if (size != (__typeof__(gentry.size))size) gentry.base = gentry.size = gentry.type = 0; else { + gentry.base = base; gentry.size = size; gentry.type = type; } -- cgit v1.2.3 From df156f90a0f90649dd38b7667901ef85478f3d2b Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Tue, 7 Feb 2012 15:52:44 +0100 Subject: x86: Introduce x86_cpuinit.early_percpu_clock_init hook When kvm guest uses kvmclock, it may hang on vcpu hot-plug. This is caused by an overflow in pvclock_get_nsec_offset, u64 delta = tsc - shadow->tsc_timestamp; which in turn is caused by an undefined values from percpu hv_clock that hasn't been initialized yet. Uninitialized clock on being booted cpu is accessed from start_secondary -> smp_callin -> smp_store_cpu_info -> identify_secondary_cpu -> mtrr_ap_init -> mtrr_restore -> stop_machine_from_inactive_cpu -> queue_stop_cpus_work ... -> sched_clock -> kvm_clock_read which is well before x86_cpuinit.setup_percpu_clockev call in start_secondary, where percpu clock is initialized. This patch introduces a hook that allows to setup/initialize per_cpu clock early and avoid overflow due to reading - undefined values - old values if cpu was offlined and then onlined again Another possible early user of this clock source is ftrace that accesses it to get timestamps for ring buffer entries. So if mtrr_ap_init is moved from identify_secondary_cpu to past x86_cpuinit.setup_percpu_clockev in start_secondary, ftrace may cause the same overflow/hang on cpu hot-plug anyway. More complete description of the problem: https://lkml.org/lkml/2012/2/2/101 Credits to Marcelo Tosatti for hook idea. Acked-by: Thomas Gleixner Signed-off-by: Igor Mammedov Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/x86_init.h | 2 ++ arch/x86/kernel/kvmclock.c | 4 +--- arch/x86/kernel/smpboot.c | 1 + arch/x86/kernel/x86_init.c | 1 + 4 files changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 517d4767ffdd..5d0afac2962c 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -145,9 +145,11 @@ struct x86_init_ops { /** * struct x86_cpuinit_ops - platform specific cpu hotplug setups * @setup_percpu_clockev: set up the per cpu clock event device + * @early_percpu_clock_init: early init of the per cpu clock event device */ struct x86_cpuinit_ops { void (*setup_percpu_clockev)(void); + void (*early_percpu_clock_init)(void); void (*fixup_cpu_id)(struct cpuinfo_x86 *c, int node); }; diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 44842d756b29..ca4e735adc54 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -144,8 +144,6 @@ static void __cpuinit kvm_setup_secondary_clock(void) * we shouldn't fail. */ WARN_ON(kvm_register_clock("secondary cpu clock")); - /* ok, done with our trickery, call native */ - setup_secondary_APIC_clock(); } #endif @@ -194,7 +192,7 @@ void __init kvmclock_init(void) x86_platform.get_wallclock = kvm_get_wallclock; x86_platform.set_wallclock = kvm_set_wallclock; #ifdef CONFIG_X86_LOCAL_APIC - x86_cpuinit.setup_percpu_clockev = + x86_cpuinit.early_percpu_clock_init = kvm_setup_secondary_clock; #endif machine_ops.shutdown = kvm_shutdown; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 66d250c00d11..a05d6fd5e06d 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -255,6 +255,7 @@ notrace static void __cpuinit start_secondary(void *unused) * most necessary things. */ cpu_init(); + x86_cpuinit.early_percpu_clock_init(); preempt_disable(); smp_callin(); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 947a06ccc673..6f2ec53deed0 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -91,6 +91,7 @@ struct x86_init_ops x86_init __initdata = { }; struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { + .early_percpu_clock_init = x86_init_noop, .setup_percpu_clockev = setup_secondary_APIC_clock, .fixup_cpu_id = x86_default_fixup_cpu_id, }; -- cgit v1.2.3 From bce38cd53e5ddba9cb6d708c4ef3d04a4016ec7e Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 9 Feb 2012 23:20:51 +0100 Subject: perf: Add generic taken branch sampling support This patch adds the ability to sample taken branches to the perf_event interface. The ability to capture taken branches is very useful for all sorts of analysis. For instance, basic block profiling, call counts, statistical call graph. This new capability requires hardware assist and as such may not be available on all HW platforms. On Intel x86 it is implemented on top of the Last Branch Record (LBR) facility. To enable taken branches sampling, the PERF_SAMPLE_BRANCH_STACK bit must be set in attr->sample_type. Sampled taken branches may be filtered by type and/or priv levels. The patch adds a new field, called branch_sample_type, to the perf_event_attr structure. It contains a bitmask of filters to apply to the sampled taken branches. Filters may be implemented in HW. If the HW filter does not exist or is not good enough, some arch may also implement a SW filter. The following generic filters are currently defined: - PERF_SAMPLE_USER only branches whose targets are at the user level - PERF_SAMPLE_KERNEL only branches whose targets are at the kernel level - PERF_SAMPLE_HV only branches whose targets are at the hypervisor level - PERF_SAMPLE_ANY any type of branches (subject to priv levels filters) - PERF_SAMPLE_ANY_CALL any call branches (may incl. syscall on some arch) - PERF_SAMPLE_ANY_RET any return branches (may incl. syscall returns on some arch) - PERF_SAMPLE_IND_CALL indirect call branches Obviously filter may be combined. The priv level bits are optional. If not provided, the priv level of the associated event are used. It is possible to collect branches at a priv level different from the associated event. Use of kernel, hv priv levels is subject to permissions and availability (hv). The number of taken branch records present in each sample may vary based on HW, the type of sampled branches, the executed code. Therefore each sample contains the number of taken branches it contains. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1328826068-11713-2-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_lbr.c | 21 +++++---- include/linux/perf_event.h | 71 ++++++++++++++++++++++++++++-- kernel/events/core.c | 68 ++++++++++++++++++++++++++++ 3 files changed, 148 insertions(+), 12 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 47a7e63bfe54..309d0cc69163 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -142,9 +142,11 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr); - cpuc->lbr_entries[i].from = msr_lastbranch.from; - cpuc->lbr_entries[i].to = msr_lastbranch.to; - cpuc->lbr_entries[i].flags = 0; + cpuc->lbr_entries[i].from = msr_lastbranch.from; + cpuc->lbr_entries[i].to = msr_lastbranch.to; + cpuc->lbr_entries[i].mispred = 0; + cpuc->lbr_entries[i].predicted = 0; + cpuc->lbr_entries[i].reserved = 0; } cpuc->lbr_stack.nr = i; } @@ -165,19 +167,22 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) for (i = 0; i < x86_pmu.lbr_nr; i++) { unsigned long lbr_idx = (tos - i) & mask; - u64 from, to, flags = 0; + u64 from, to, mis = 0, pred = 0; rdmsrl(x86_pmu.lbr_from + lbr_idx, from); rdmsrl(x86_pmu.lbr_to + lbr_idx, to); if (lbr_format == LBR_FORMAT_EIP_FLAGS) { - flags = !!(from & LBR_FROM_FLAG_MISPRED); + mis = !!(from & LBR_FROM_FLAG_MISPRED); + pred = !mis; from = (u64)((((s64)from) << 1) >> 1); } - cpuc->lbr_entries[i].from = from; - cpuc->lbr_entries[i].to = to; - cpuc->lbr_entries[i].flags = flags; + cpuc->lbr_entries[i].from = from; + cpuc->lbr_entries[i].to = to; + cpuc->lbr_entries[i].mispred = mis; + cpuc->lbr_entries[i].predicted = pred; + cpuc->lbr_entries[i].reserved = 0; } cpuc->lbr_stack.nr = i; } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 64426b71381f..5fc494f4a094 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -129,10 +129,39 @@ enum perf_event_sample_format { PERF_SAMPLE_PERIOD = 1U << 8, PERF_SAMPLE_STREAM_ID = 1U << 9, PERF_SAMPLE_RAW = 1U << 10, + PERF_SAMPLE_BRANCH_STACK = 1U << 11, - PERF_SAMPLE_MAX = 1U << 11, /* non-ABI */ + PERF_SAMPLE_MAX = 1U << 12, /* non-ABI */ }; +/* + * values to program into branch_sample_type when PERF_SAMPLE_BRANCH is set + * + * If the user does not pass priv level information via branch_sample_type, + * the kernel uses the event's priv level. Branch and event priv levels do + * not have to match. Branch priv level is checked for permissions. + * + * The branch types can be combined, however BRANCH_ANY covers all types + * of branches and therefore it supersedes all the other types. + */ +enum perf_branch_sample_type { + PERF_SAMPLE_BRANCH_USER = 1U << 0, /* user branches */ + PERF_SAMPLE_BRANCH_KERNEL = 1U << 1, /* kernel branches */ + PERF_SAMPLE_BRANCH_HV = 1U << 2, /* hypervisor branches */ + + PERF_SAMPLE_BRANCH_ANY = 1U << 3, /* any branch types */ + PERF_SAMPLE_BRANCH_ANY_CALL = 1U << 4, /* any call branch */ + PERF_SAMPLE_BRANCH_ANY_RETURN = 1U << 5, /* any return branch */ + PERF_SAMPLE_BRANCH_IND_CALL = 1U << 6, /* indirect calls */ + + PERF_SAMPLE_BRANCH_MAX = 1U << 7, /* non-ABI */ +}; + +#define PERF_SAMPLE_BRANCH_PLM_ALL \ + (PERF_SAMPLE_BRANCH_USER|\ + PERF_SAMPLE_BRANCH_KERNEL|\ + PERF_SAMPLE_BRANCH_HV) + /* * The format of the data returned by read() on a perf event fd, * as specified by attr.read_format: @@ -240,6 +269,7 @@ struct perf_event_attr { __u64 bp_len; __u64 config2; /* extension of config1 */ }; + __u64 branch_sample_type; /* enum branch_sample_type */ }; /* @@ -458,6 +488,8 @@ enum perf_event_type { * * { u32 size; * char data[size];}&& PERF_SAMPLE_RAW + * + * { u64 from, to, flags } lbr[nr];} && PERF_SAMPLE_BRANCH_STACK * }; */ PERF_RECORD_SAMPLE = 9, @@ -530,12 +562,34 @@ struct perf_raw_record { void *data; }; +/* + * single taken branch record layout: + * + * from: source instruction (may not always be a branch insn) + * to: branch target + * mispred: branch target was mispredicted + * predicted: branch target was predicted + * + * support for mispred, predicted is optional. In case it + * is not supported mispred = predicted = 0. + */ struct perf_branch_entry { - __u64 from; - __u64 to; - __u64 flags; + __u64 from; + __u64 to; + __u64 mispred:1, /* target mispredicted */ + predicted:1,/* target predicted */ + reserved:62; }; +/* + * branch stack layout: + * nr: number of taken branches stored in entries[] + * + * Note that nr can vary from sample to sample + * branches (to, from) are stored from most recent + * to least recent, i.e., entries[0] contains the most + * recent branch. + */ struct perf_branch_stack { __u64 nr; struct perf_branch_entry entries[0]; @@ -566,7 +620,9 @@ struct hw_perf_event { unsigned long event_base; int idx; int last_cpu; + struct hw_perf_event_extra extra_reg; + struct hw_perf_event_extra branch_reg; }; struct { /* software */ struct hrtimer hrtimer; @@ -1007,12 +1063,14 @@ struct perf_sample_data { u64 period; struct perf_callchain_entry *callchain; struct perf_raw_record *raw; + struct perf_branch_stack *br_stack; }; static inline void perf_sample_data_init(struct perf_sample_data *data, u64 addr) { data->addr = addr; data->raw = NULL; + data->br_stack = NULL; } extern void perf_output_sample(struct perf_output_handle *handle, @@ -1151,6 +1209,11 @@ extern void perf_bp_event(struct perf_event *event, void *data); # define perf_instruction_pointer(regs) instruction_pointer(regs) #endif +static inline bool has_branch_stack(struct perf_event *event) +{ + return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK; +} + extern int perf_output_begin(struct perf_output_handle *handle, struct perf_event *event, unsigned int size); extern void perf_output_end(struct perf_output_handle *handle); diff --git a/kernel/events/core.c b/kernel/events/core.c index e8b32ac75ce3..5820efdf47cd 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -118,6 +118,13 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info) PERF_FLAG_FD_OUTPUT |\ PERF_FLAG_PID_CGROUP) +/* + * branch priv levels that need permission checks + */ +#define PERF_SAMPLE_BRANCH_PERM_PLM \ + (PERF_SAMPLE_BRANCH_KERNEL |\ + PERF_SAMPLE_BRANCH_HV) + enum event_type_t { EVENT_FLEXIBLE = 0x1, EVENT_PINNED = 0x2, @@ -3907,6 +3914,24 @@ void perf_output_sample(struct perf_output_handle *handle, } } } + + if (sample_type & PERF_SAMPLE_BRANCH_STACK) { + if (data->br_stack) { + size_t size; + + size = data->br_stack->nr + * sizeof(struct perf_branch_entry); + + perf_output_put(handle, data->br_stack->nr); + perf_output_copy(handle, data->br_stack->entries, size); + } else { + /* + * we always store at least the value of nr + */ + u64 nr = 0; + perf_output_put(handle, nr); + } + } } void perf_prepare_sample(struct perf_event_header *header, @@ -3949,6 +3974,15 @@ void perf_prepare_sample(struct perf_event_header *header, WARN_ON_ONCE(size & (sizeof(u64)-1)); header->size += size; } + + if (sample_type & PERF_SAMPLE_BRANCH_STACK) { + int size = sizeof(u64); /* nr */ + if (data->br_stack) { + size += data->br_stack->nr + * sizeof(struct perf_branch_entry); + } + header->size += size; + } } static void perf_event_output(struct perf_event *event, @@ -5935,6 +5969,40 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, if (attr->read_format & ~(PERF_FORMAT_MAX-1)) return -EINVAL; + if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) { + u64 mask = attr->branch_sample_type; + + /* only using defined bits */ + if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1)) + return -EINVAL; + + /* at least one branch bit must be set */ + if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL)) + return -EINVAL; + + /* kernel level capture: check permissions */ + if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM) + && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) + return -EACCES; + + /* propagate priv level, when not set for branch */ + if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) { + + /* exclude_kernel checked on syscall entry */ + if (!attr->exclude_kernel) + mask |= PERF_SAMPLE_BRANCH_KERNEL; + + if (!attr->exclude_user) + mask |= PERF_SAMPLE_BRANCH_USER; + + if (!attr->exclude_hv) + mask |= PERF_SAMPLE_BRANCH_HV; + /* + * adjust user setting (for HW filter setup) + */ + attr->branch_sample_type = mask; + } + } out: return ret; -- cgit v1.2.3 From 225ce53910edc3c2322b1e4f2ed049a9196cd0b3 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 9 Feb 2012 23:20:52 +0100 Subject: perf/x86: Add Intel LBR MSR definitions This patch adds the LBR definitions for NHM/WSM/SNB and Core. It also adds the definitions for the architected LBR MSR: LBR_SELECT, LBRT_TOS. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1328826068-11713-3-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/msr-index.h | 7 +++++++ arch/x86/kernel/cpu/perf_event_intel_lbr.c | 18 +++++++++--------- 2 files changed, 16 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index a6962d9161a0..ccb805966f68 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -56,6 +56,13 @@ #define MSR_OFFCORE_RSP_0 0x000001a6 #define MSR_OFFCORE_RSP_1 0x000001a7 +#define MSR_LBR_SELECT 0x000001c8 +#define MSR_LBR_TOS 0x000001c9 +#define MSR_LBR_NHM_FROM 0x00000680 +#define MSR_LBR_NHM_TO 0x000006c0 +#define MSR_LBR_CORE_FROM 0x00000040 +#define MSR_LBR_CORE_TO 0x00000060 + #define MSR_IA32_PEBS_ENABLE 0x000003f1 #define MSR_IA32_DS_AREA 0x00000600 #define MSR_IA32_PERF_CAPABILITIES 0x00000345 diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 309d0cc69163..6710a5116ebd 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -203,23 +203,23 @@ void intel_pmu_lbr_read(void) void intel_pmu_lbr_init_core(void) { x86_pmu.lbr_nr = 4; - x86_pmu.lbr_tos = 0x01c9; - x86_pmu.lbr_from = 0x40; - x86_pmu.lbr_to = 0x60; + x86_pmu.lbr_tos = MSR_LBR_TOS; + x86_pmu.lbr_from = MSR_LBR_CORE_FROM; + x86_pmu.lbr_to = MSR_LBR_CORE_TO; } void intel_pmu_lbr_init_nhm(void) { x86_pmu.lbr_nr = 16; - x86_pmu.lbr_tos = 0x01c9; - x86_pmu.lbr_from = 0x680; - x86_pmu.lbr_to = 0x6c0; + x86_pmu.lbr_tos = MSR_LBR_TOS; + x86_pmu.lbr_from = MSR_LBR_NHM_FROM; + x86_pmu.lbr_to = MSR_LBR_NHM_TO; } void intel_pmu_lbr_init_atom(void) { x86_pmu.lbr_nr = 8; - x86_pmu.lbr_tos = 0x01c9; - x86_pmu.lbr_from = 0x40; - x86_pmu.lbr_to = 0x60; + x86_pmu.lbr_tos = MSR_LBR_TOS; + x86_pmu.lbr_from = MSR_LBR_CORE_FROM; + x86_pmu.lbr_to = MSR_LBR_CORE_TO; } -- cgit v1.2.3 From b36817e8863090f1f24e538106ca50fa1d9e4003 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 9 Feb 2012 23:20:53 +0100 Subject: perf/x86: Add Intel LBR sharing logic The Intel LBR on some recent processor is capable of filtering branches by type. The filter is configurable via the LBR_SELECT MSR register. There are limitation on how this register can be used. On Nehalem/Westmere, the LBR_SELECT is shared by the two HT threads when HT is on. It is private to each core when HT is off. On SandyBridge, the LBR_SELECT register is private to each thread when HT is on. It is private to each core when HT is off. The kernel must manage the sharing of LBR_SELECT. It allows multiple users on the same logical CPU to use LBR_SELECT as long as they program it with the same value. Across sibling CPUs (HT threads), the same restriction applies on NHM/WSM. This patch implements this sharing logic by leveraging the mechanism put in place for managing the offcore_response shared MSR. We modify __intel_shared_reg_get_constraints() to cause x86_get_event_constraint() to be called because LBR may be associated with events that may be counter constrained. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1328826068-11713-4-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 4 ++ arch/x86/kernel/cpu/perf_event.h | 4 ++ arch/x86/kernel/cpu/perf_event_intel.c | 70 +++++++++++++++++++++------------- 3 files changed, 52 insertions(+), 26 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index f8bddb5b0600..377931354ac7 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -426,6 +426,10 @@ static int __x86_pmu_event_init(struct perf_event *event) /* mark unused */ event->hw.extra_reg.idx = EXTRA_REG_NONE; + /* mark not used */ + event->hw.extra_reg.idx = EXTRA_REG_NONE; + event->hw.branch_reg.idx = EXTRA_REG_NONE; + return x86_pmu.hw_config(event); } diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 82db83b5c3bc..9b9c580a7ab8 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -33,6 +33,7 @@ enum extra_reg_type { EXTRA_REG_RSP_0 = 0, /* offcore_response_0 */ EXTRA_REG_RSP_1 = 1, /* offcore_response_1 */ + EXTRA_REG_LBR = 2, /* lbr_select */ EXTRA_REG_MAX /* number of entries needed */ }; @@ -130,6 +131,7 @@ struct cpu_hw_events { void *lbr_context; struct perf_branch_stack lbr_stack; struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; + struct er_account *lbr_sel; /* * Intel host/guest exclude bits @@ -342,6 +344,8 @@ struct x86_pmu { */ unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */ int lbr_nr; /* hardware stack size */ + u64 lbr_sel_mask; /* LBR_SELECT valid bits */ + const int *lbr_sel_map; /* lbr_select mappings */ /* * Extra registers for events diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 3bd37bdf1b8e..97f7bb587519 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1123,17 +1123,17 @@ static bool intel_try_alt_er(struct perf_event *event, int orig_idx) */ static struct event_constraint * __intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc, - struct perf_event *event) + struct perf_event *event, + struct hw_perf_event_extra *reg) { struct event_constraint *c = &emptyconstraint; - struct hw_perf_event_extra *reg = &event->hw.extra_reg; struct er_account *era; unsigned long flags; int orig_idx = reg->idx; /* already allocated shared msr */ if (reg->alloc) - return &unconstrained; + return NULL; /* call x86_get_event_constraint() */ again: era = &cpuc->shared_regs->regs[reg->idx]; @@ -1156,14 +1156,10 @@ again: reg->alloc = 1; /* - * All events using extra_reg are unconstrained. - * Avoids calling x86_get_event_constraints() - * - * Must revisit if extra_reg controlling events - * ever have constraints. Worst case we go through - * the regular event constraint table. + * need to call x86_get_event_constraint() + * to check if associated event has constraints */ - c = &unconstrained; + c = NULL; } else if (intel_try_alt_er(event, orig_idx)) { raw_spin_unlock_irqrestore(&era->lock, flags); goto again; @@ -1200,11 +1196,23 @@ static struct event_constraint * intel_shared_regs_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) { - struct event_constraint *c = NULL; - - if (event->hw.extra_reg.idx != EXTRA_REG_NONE) - c = __intel_shared_reg_get_constraints(cpuc, event); - + struct event_constraint *c = NULL, *d; + struct hw_perf_event_extra *xreg, *breg; + + xreg = &event->hw.extra_reg; + if (xreg->idx != EXTRA_REG_NONE) { + c = __intel_shared_reg_get_constraints(cpuc, event, xreg); + if (c == &emptyconstraint) + return c; + } + breg = &event->hw.branch_reg; + if (breg->idx != EXTRA_REG_NONE) { + d = __intel_shared_reg_get_constraints(cpuc, event, breg); + if (d == &emptyconstraint) { + __intel_shared_reg_put_constraints(cpuc, xreg); + c = d; + } + } return c; } @@ -1252,6 +1260,10 @@ intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc, reg = &event->hw.extra_reg; if (reg->idx != EXTRA_REG_NONE) __intel_shared_reg_put_constraints(cpuc, reg); + + reg = &event->hw.branch_reg; + if (reg->idx != EXTRA_REG_NONE) + __intel_shared_reg_put_constraints(cpuc, reg); } static void intel_put_event_constraints(struct cpu_hw_events *cpuc, @@ -1431,7 +1443,7 @@ static int intel_pmu_cpu_prepare(int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); - if (!x86_pmu.extra_regs) + if (!(x86_pmu.extra_regs || x86_pmu.lbr_sel_map)) return NOTIFY_OK; cpuc->shared_regs = allocate_shared_regs(cpu); @@ -1453,22 +1465,28 @@ static void intel_pmu_cpu_starting(int cpu) */ intel_pmu_lbr_reset(); - if (!cpuc->shared_regs || (x86_pmu.er_flags & ERF_NO_HT_SHARING)) + cpuc->lbr_sel = NULL; + + if (!cpuc->shared_regs) return; - for_each_cpu(i, topology_thread_cpumask(cpu)) { - struct intel_shared_regs *pc; + if (!(x86_pmu.er_flags & ERF_NO_HT_SHARING)) { + for_each_cpu(i, topology_thread_cpumask(cpu)) { + struct intel_shared_regs *pc; - pc = per_cpu(cpu_hw_events, i).shared_regs; - if (pc && pc->core_id == core_id) { - cpuc->kfree_on_online = cpuc->shared_regs; - cpuc->shared_regs = pc; - break; + pc = per_cpu(cpu_hw_events, i).shared_regs; + if (pc && pc->core_id == core_id) { + cpuc->kfree_on_online = cpuc->shared_regs; + cpuc->shared_regs = pc; + break; + } } + cpuc->shared_regs->core_id = core_id; + cpuc->shared_regs->refcnt++; } - cpuc->shared_regs->core_id = core_id; - cpuc->shared_regs->refcnt++; + if (x86_pmu.lbr_sel_map) + cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR]; } static void intel_pmu_cpu_dying(int cpu) -- cgit v1.2.3 From ff3fb511ba377e8a0a7f553cc352237f70d08121 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 9 Feb 2012 23:20:54 +0100 Subject: perf/x86: Sync branch stack sampling with precise_sampling If precise sampling is enabled on Intel x86 then perf_event uses PEBS. To correct for the off-by-one error of PEBS, perf_event uses LBR when precise_sample > 1. On Intel x86 PERF_SAMPLE_BRANCH_STACK is implemented using LBR, therefore both features must be coordinated as they may not configure LBR the same way. For PEBS, LBR needs to capture all branches at the priv level of the associated event. This patch checks that the branch type and priv level of BRANCH_STACK is compatible with that of the PEBS LBR requirement, thereby allowing: $ perf record -b any,u -e instructions:upp .... But: $ perf record -b any_call,u -e instructions:upp Is not possible. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1328826068-11713-5-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 60 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 377931354ac7..cea567483274 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -353,6 +353,36 @@ int x86_setup_perfctr(struct perf_event *event) return 0; } +/* + * check that branch_sample_type is compatible with + * settings needed for precise_ip > 1 which implies + * using the LBR to capture ALL taken branches at the + * priv levels of the measurement + */ +static inline int precise_br_compat(struct perf_event *event) +{ + u64 m = event->attr.branch_sample_type; + u64 b = 0; + + /* must capture all branches */ + if (!(m & PERF_SAMPLE_BRANCH_ANY)) + return 0; + + m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER; + + if (!event->attr.exclude_user) + b |= PERF_SAMPLE_BRANCH_USER; + + if (!event->attr.exclude_kernel) + b |= PERF_SAMPLE_BRANCH_KERNEL; + + /* + * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86 + */ + + return m == b; +} + int x86_pmu_hw_config(struct perf_event *event) { if (event->attr.precise_ip) { @@ -369,6 +399,36 @@ int x86_pmu_hw_config(struct perf_event *event) if (event->attr.precise_ip > precise) return -EOPNOTSUPP; + /* + * check that PEBS LBR correction does not conflict with + * whatever the user is asking with attr->branch_sample_type + */ + if (event->attr.precise_ip > 1) { + u64 *br_type = &event->attr.branch_sample_type; + + if (has_branch_stack(event)) { + if (!precise_br_compat(event)) + return -EOPNOTSUPP; + + /* branch_sample_type is compatible */ + + } else { + /* + * user did not specify branch_sample_type + * + * For PEBS fixups, we capture all + * the branches at the priv level of the + * event. + */ + *br_type = PERF_SAMPLE_BRANCH_ANY; + + if (!event->attr.exclude_user) + *br_type |= PERF_SAMPLE_BRANCH_USER; + + if (!event->attr.exclude_kernel) + *br_type |= PERF_SAMPLE_BRANCH_KERNEL; + } + } } /* -- cgit v1.2.3 From c5cc2cd906ea9fe73e3c93f9ad824996faa278cc Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 9 Feb 2012 23:20:55 +0100 Subject: perf/x86: Add Intel LBR mappings for PERF_SAMPLE_BRANCH filters This patch adds the mappings from the generic PERF_SAMPLE_BRANCH_* filters to the actual Intel x86LBR filters, whenever they exist. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1328826068-11713-6-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.h | 2 + arch/x86/kernel/cpu/perf_event_intel.c | 2 +- arch/x86/kernel/cpu/perf_event_intel_lbr.c | 103 ++++++++++++++++++++++++++++- 3 files changed, 104 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 9b9c580a7ab8..4e948976aefb 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -539,6 +539,8 @@ void intel_pmu_lbr_init_nhm(void); void intel_pmu_lbr_init_atom(void); +void intel_pmu_lbr_init_snb(void); + int p4_pmu_init(void); int p6_pmu_init(void); diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 97f7bb587519..b0db01692441 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1757,7 +1757,7 @@ __init int intel_pmu_init(void) memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids)); - intel_pmu_lbr_init_nhm(); + intel_pmu_lbr_init_snb(); x86_pmu.event_constraints = intel_snb_event_constraints; x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints; diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 6710a5116ebd..e54a063b2863 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -13,6 +13,49 @@ enum { LBR_FORMAT_EIP_FLAGS = 0x03, }; +/* + * Intel LBR_SELECT bits + * Intel Vol3a, April 2011, Section 16.7 Table 16-10 + * + * Hardware branch filter (not available on all CPUs) + */ +#define LBR_KERNEL_BIT 0 /* do not capture at ring0 */ +#define LBR_USER_BIT 1 /* do not capture at ring > 0 */ +#define LBR_JCC_BIT 2 /* do not capture conditional branches */ +#define LBR_REL_CALL_BIT 3 /* do not capture relative calls */ +#define LBR_IND_CALL_BIT 4 /* do not capture indirect calls */ +#define LBR_RETURN_BIT 5 /* do not capture near returns */ +#define LBR_IND_JMP_BIT 6 /* do not capture indirect jumps */ +#define LBR_REL_JMP_BIT 7 /* do not capture relative jumps */ +#define LBR_FAR_BIT 8 /* do not capture far branches */ + +#define LBR_KERNEL (1 << LBR_KERNEL_BIT) +#define LBR_USER (1 << LBR_USER_BIT) +#define LBR_JCC (1 << LBR_JCC_BIT) +#define LBR_REL_CALL (1 << LBR_REL_CALL_BIT) +#define LBR_IND_CALL (1 << LBR_IND_CALL_BIT) +#define LBR_RETURN (1 << LBR_RETURN_BIT) +#define LBR_REL_JMP (1 << LBR_REL_JMP_BIT) +#define LBR_IND_JMP (1 << LBR_IND_JMP_BIT) +#define LBR_FAR (1 << LBR_FAR_BIT) + +#define LBR_PLM (LBR_KERNEL | LBR_USER) + +#define LBR_SEL_MASK 0x1ff /* valid bits in LBR_SELECT */ +#define LBR_NOT_SUPP -1 /* LBR filter not supported */ +#define LBR_IGN 0 /* ignored */ + +#define LBR_ANY \ + (LBR_JCC |\ + LBR_REL_CALL |\ + LBR_IND_CALL |\ + LBR_RETURN |\ + LBR_REL_JMP |\ + LBR_IND_JMP |\ + LBR_FAR) + +#define LBR_FROM_FLAG_MISPRED (1ULL << 63) + /* * We only support LBR implementations that have FREEZE_LBRS_ON_PMI * otherwise it becomes near impossible to get a reliable stack. @@ -151,8 +194,6 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) cpuc->lbr_stack.nr = i; } -#define LBR_FROM_FLAG_MISPRED (1ULL << 63) - /* * Due to lack of segmentation in Linux the effective address (offset) * is the same as the linear address, allowing us to merge the LIP and EIP @@ -200,26 +241,84 @@ void intel_pmu_lbr_read(void) intel_pmu_lbr_read_64(cpuc); } +/* + * Map interface branch filters onto LBR filters + */ +static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = { + [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY, + [PERF_SAMPLE_BRANCH_USER] = LBR_USER, + [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL, + [PERF_SAMPLE_BRANCH_HV] = LBR_IGN, + [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_REL_JMP + | LBR_IND_JMP | LBR_FAR, + /* + * NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches + */ + [PERF_SAMPLE_BRANCH_ANY_CALL] = + LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR, + /* + * NHM/WSM erratum: must include IND_JMP to capture IND_CALL + */ + [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL | LBR_IND_JMP, +}; + +static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = { + [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY, + [PERF_SAMPLE_BRANCH_USER] = LBR_USER, + [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL, + [PERF_SAMPLE_BRANCH_HV] = LBR_IGN, + [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_FAR, + [PERF_SAMPLE_BRANCH_ANY_CALL] = LBR_REL_CALL | LBR_IND_CALL + | LBR_FAR, + [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL, +}; + +/* core */ void intel_pmu_lbr_init_core(void) { x86_pmu.lbr_nr = 4; x86_pmu.lbr_tos = MSR_LBR_TOS; x86_pmu.lbr_from = MSR_LBR_CORE_FROM; x86_pmu.lbr_to = MSR_LBR_CORE_TO; + + pr_cont("4-deep LBR, "); } +/* nehalem/westmere */ void intel_pmu_lbr_init_nhm(void) { x86_pmu.lbr_nr = 16; x86_pmu.lbr_tos = MSR_LBR_TOS; x86_pmu.lbr_from = MSR_LBR_NHM_FROM; x86_pmu.lbr_to = MSR_LBR_NHM_TO; + + x86_pmu.lbr_sel_mask = LBR_SEL_MASK; + x86_pmu.lbr_sel_map = nhm_lbr_sel_map; + + pr_cont("16-deep LBR, "); } +/* sandy bridge */ +void intel_pmu_lbr_init_snb(void) +{ + x86_pmu.lbr_nr = 16; + x86_pmu.lbr_tos = MSR_LBR_TOS; + x86_pmu.lbr_from = MSR_LBR_NHM_FROM; + x86_pmu.lbr_to = MSR_LBR_NHM_TO; + + x86_pmu.lbr_sel_mask = LBR_SEL_MASK; + x86_pmu.lbr_sel_map = snb_lbr_sel_map; + + pr_cont("16-deep LBR, "); +} + +/* atom */ void intel_pmu_lbr_init_atom(void) { x86_pmu.lbr_nr = 8; x86_pmu.lbr_tos = MSR_LBR_TOS; x86_pmu.lbr_from = MSR_LBR_CORE_FROM; x86_pmu.lbr_to = MSR_LBR_CORE_TO; + + pr_cont("8-deep LBR, "); } -- cgit v1.2.3 From 88c9a65e13f393fd60d8b9e9c659a34f9e39967d Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 9 Feb 2012 23:20:56 +0100 Subject: perf/x86: Disable LBR support for older Intel Atom processors The patch adds a restriction for Intel Atom LBR support. Only steppings 10 (PineView) and more recent are supported. Older models do not have a functional LBR. Their LBR does not freeze on PMU interrupt which makes LBR unusable in the context of perf_events. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1328826068-11713-7-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_lbr.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index e54a063b2863..07f0ff88e443 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -315,6 +315,16 @@ void intel_pmu_lbr_init_snb(void) /* atom */ void intel_pmu_lbr_init_atom(void) { + /* + * only models starting at stepping 10 seems + * to have an operational LBR which can freeze + * on PMU interrupt + */ + if (boot_cpu_data.x86_mask < 10) { + pr_cont("LBR disabled due to erratum"); + return; + } + x86_pmu.lbr_nr = 8; x86_pmu.lbr_tos = MSR_LBR_TOS; x86_pmu.lbr_from = MSR_LBR_CORE_FROM; -- cgit v1.2.3 From 60ce0fbd072695866cb27b729690ab59dce705a5 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 9 Feb 2012 23:20:57 +0100 Subject: perf/x86: Implement PERF_SAMPLE_BRANCH for Intel CPUs This patch implements PERF_SAMPLE_BRANCH support for Intel x86processors. It connects PERF_SAMPLE_BRANCH to the actual LBR. The patch adds the hooks in the PMU irq handler to save the LBR on counter overflow for both regular and PEBS modes. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1328826068-11713-8-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.h | 2 + arch/x86/kernel/cpu/perf_event_intel.c | 35 ++++++++++++ arch/x86/kernel/cpu/perf_event_intel_ds.c | 10 ++-- arch/x86/kernel/cpu/perf_event_intel_lbr.c | 86 +++++++++++++++++++++++++++++- 4 files changed, 125 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 4e948976aefb..ef7419cbd13d 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -541,6 +541,8 @@ void intel_pmu_lbr_init_atom(void); void intel_pmu_lbr_init_snb(void); +int intel_pmu_setup_lbr_filter(struct perf_event *event); + int p4_pmu_init(void); int p6_pmu_init(void); diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index b0db01692441..7cc1e2dcc4dd 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -727,6 +727,19 @@ static __initconst const u64 atom_hw_cache_event_ids }, }; +static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event) +{ + /* user explicitly requested branch sampling */ + if (has_branch_stack(event)) + return true; + + /* implicit branch sampling to correct PEBS skid */ + if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1) + return true; + + return false; +} + static void intel_pmu_disable_all(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); @@ -881,6 +894,13 @@ static void intel_pmu_disable_event(struct perf_event *event) cpuc->intel_ctrl_guest_mask &= ~(1ull << hwc->idx); cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx); + /* + * must disable before any actual event + * because any event may be combined with LBR + */ + if (intel_pmu_needs_lbr_smpl(event)) + intel_pmu_lbr_disable(event); + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { intel_pmu_disable_fixed(hwc); return; @@ -935,6 +955,12 @@ static void intel_pmu_enable_event(struct perf_event *event) intel_pmu_enable_bts(hwc->config); return; } + /* + * must enabled before any actual event + * because any event may be combined with LBR + */ + if (intel_pmu_needs_lbr_smpl(event)) + intel_pmu_lbr_enable(event); if (event->attr.exclude_host) cpuc->intel_ctrl_guest_mask |= (1ull << hwc->idx); @@ -1057,6 +1083,9 @@ again: data.period = event->hw.last_period; + if (has_branch_stack(event)) + data.br_stack = &cpuc->lbr_stack; + if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); } @@ -1305,6 +1334,12 @@ static int intel_pmu_hw_config(struct perf_event *event) event->hw.config = alt_config; } + if (intel_pmu_needs_lbr_smpl(event)) { + ret = intel_pmu_setup_lbr_filter(event); + if (ret) + return ret; + } + if (event->attr.type != PERF_TYPE_RAW) return 0; diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index d6bd49faa40c..ee7e3c8d9d6a 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -439,9 +439,6 @@ void intel_pmu_pebs_enable(struct perf_event *event) hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT; cpuc->pebs_enabled |= 1ULL << hwc->idx; - - if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1) - intel_pmu_lbr_enable(event); } void intel_pmu_pebs_disable(struct perf_event *event) @@ -454,9 +451,6 @@ void intel_pmu_pebs_disable(struct perf_event *event) wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); hwc->config |= ARCH_PERFMON_EVENTSEL_INT; - - if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1) - intel_pmu_lbr_disable(event); } void intel_pmu_pebs_enable_all(void) @@ -572,6 +566,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event, * both formats and we don't use the other fields in this * routine. */ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct pebs_record_core *pebs = __pebs; struct perf_sample_data data; struct pt_regs regs; @@ -602,6 +597,9 @@ static void __intel_pmu_pebs_event(struct perf_event *event, else regs.flags &= ~PERF_EFLAGS_EXACT; + if (has_branch_stack(event)) + data.br_stack = &cpuc->lbr_stack; + if (perf_event_overflow(event, &data, ®s)) x86_pmu_stop(event, 0); } diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 07f0ff88e443..d0fb864ff2b0 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -56,6 +56,10 @@ enum { #define LBR_FROM_FLAG_MISPRED (1ULL << 63) +#define for_each_branch_sample_type(x) \ + for ((x) = PERF_SAMPLE_BRANCH_USER; \ + (x) < PERF_SAMPLE_BRANCH_MAX; (x) <<= 1) + /* * We only support LBR implementations that have FREEZE_LBRS_ON_PMI * otherwise it becomes near impossible to get a reliable stack. @@ -64,6 +68,10 @@ enum { static void __intel_pmu_lbr_enable(void) { u64 debugctl; + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + if (cpuc->lbr_sel) + wrmsrl(MSR_LBR_SELECT, cpuc->lbr_sel->config); rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); @@ -119,7 +127,6 @@ void intel_pmu_lbr_enable(struct perf_event *event) * Reset the LBR stack if we changed task context to * avoid data leaks. */ - if (event->ctx->task && cpuc->lbr_context != event->ctx) { intel_pmu_lbr_reset(); cpuc->lbr_context = event->ctx; @@ -138,8 +145,11 @@ void intel_pmu_lbr_disable(struct perf_event *event) cpuc->lbr_users--; WARN_ON_ONCE(cpuc->lbr_users < 0); - if (cpuc->enabled && !cpuc->lbr_users) + if (cpuc->enabled && !cpuc->lbr_users) { __intel_pmu_lbr_disable(); + /* avoid stale pointer */ + cpuc->lbr_context = NULL; + } } void intel_pmu_lbr_enable_all(void) @@ -158,6 +168,9 @@ void intel_pmu_lbr_disable_all(void) __intel_pmu_lbr_disable(); } +/* + * TOS = most recently recorded branch + */ static inline u64 intel_pmu_lbr_tos(void) { u64 tos; @@ -241,6 +254,75 @@ void intel_pmu_lbr_read(void) intel_pmu_lbr_read_64(cpuc); } +/* + * setup the HW LBR filter + * Used only when available, may not be enough to disambiguate + * all branches, may need the help of the SW filter + */ +static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event) +{ + struct hw_perf_event_extra *reg; + u64 br_type = event->attr.branch_sample_type; + u64 mask = 0, m; + u64 v; + + for_each_branch_sample_type(m) { + if (!(br_type & m)) + continue; + + v = x86_pmu.lbr_sel_map[m]; + if (v == LBR_NOT_SUPP) + return -EOPNOTSUPP; + mask |= v; + + if (m == PERF_SAMPLE_BRANCH_ANY) + break; + } + reg = &event->hw.branch_reg; + reg->idx = EXTRA_REG_LBR; + + /* LBR_SELECT operates in suppress mode so invert mask */ + reg->config = ~mask & x86_pmu.lbr_sel_mask; + + return 0; +} + +/* + * all the bits supported on some flavor of x86LBR + * we ignore BRANCH_HV because it is not supported + */ +#define PERF_SAMPLE_BRANCH_X86_ALL \ + (PERF_SAMPLE_BRANCH_ANY |\ + PERF_SAMPLE_BRANCH_USER |\ + PERF_SAMPLE_BRANCH_KERNEL) + +int intel_pmu_setup_lbr_filter(struct perf_event *event) +{ + u64 br_type = event->attr.branch_sample_type; + + /* + * no LBR on this PMU + */ + if (!x86_pmu.lbr_nr) + return -EOPNOTSUPP; + + /* + * if no LBR HW filter, users can only + * capture all branches + */ + if (!x86_pmu.lbr_sel_map) { + if (br_type != PERF_SAMPLE_BRANCH_X86_ALL) + return -EOPNOTSUPP; + return 0; + } + /* + * we ignore branch priv levels we do not + * know about: BRANCH_HV + */ + + return intel_pmu_setup_hw_lbr_filter(event); +} + /* * Map interface branch filters onto LBR filters */ -- cgit v1.2.3 From 3e702ff6d1ea12dcf1c798ecb61e7f3a1579df42 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 9 Feb 2012 23:20:58 +0100 Subject: perf/x86: Add LBR software filter support for Intel CPUs This patch adds an internal sofware filter to complement the (optional) LBR hardware filter. The software filter is necessary: - as a substitute when there is no HW LBR filter (e.g., Atom, Core) - to complement HW LBR filter in case of errata (e.g., Nehalem/Westmere) - to provide finer grain filtering (e.g., all processors) Sometimes the LBR HW filter cannot distinguish between two types of branches. For instance, to capture syscall as CALLS, it is necessary to enable the LBR_FAR filter which will also capture JMP instructions. Thus, a second pass is necessary to filter those out, this is what the SW filter can do. The SW filter is built on top of the internal x86 disassembler. It is a best effort filter especially for user level code. It is subject to the availability of the text page of the program. The SW filter is enabled on all Intel processors. It is bypassed when the user is capturing all branches at all priv levels. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1328826068-11713-9-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.h | 10 + arch/x86/kernel/cpu/perf_event_intel_ds.c | 12 +- arch/x86/kernel/cpu/perf_event_intel_lbr.c | 332 +++++++++++++++++++++++++++-- 3 files changed, 321 insertions(+), 33 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index ef7419cbd13d..f104c054dc5c 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -132,6 +132,7 @@ struct cpu_hw_events { struct perf_branch_stack lbr_stack; struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; struct er_account *lbr_sel; + u64 br_sel; /* * Intel host/guest exclude bits @@ -459,6 +460,15 @@ extern struct event_constraint emptyconstraint; extern struct event_constraint unconstrained; +static inline bool kernel_ip(unsigned long ip) +{ +#ifdef CONFIG_X86_32 + return ip > PAGE_OFFSET; +#else + return (long)ip < 0; +#endif +} + #ifdef CONFIG_CPU_SUP_AMD int amd_pmu_init(void); diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index ee7e3c8d9d6a..7f64df19e7dd 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -3,6 +3,7 @@ #include #include +#include #include "perf_event.h" @@ -469,17 +470,6 @@ void intel_pmu_pebs_disable_all(void) wrmsrl(MSR_IA32_PEBS_ENABLE, 0); } -#include - -static inline bool kernel_ip(unsigned long ip) -{ -#ifdef CONFIG_X86_32 - return ip > PAGE_OFFSET; -#else - return (long)ip < 0; -#endif -} - static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index d0fb864ff2b0..520b4265fcd2 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -3,6 +3,7 @@ #include #include +#include #include "perf_event.h" @@ -60,6 +61,53 @@ enum { for ((x) = PERF_SAMPLE_BRANCH_USER; \ (x) < PERF_SAMPLE_BRANCH_MAX; (x) <<= 1) +/* + * x86control flow change classification + * x86control flow changes include branches, interrupts, traps, faults + */ +enum { + X86_BR_NONE = 0, /* unknown */ + + X86_BR_USER = 1 << 0, /* branch target is user */ + X86_BR_KERNEL = 1 << 1, /* branch target is kernel */ + + X86_BR_CALL = 1 << 2, /* call */ + X86_BR_RET = 1 << 3, /* return */ + X86_BR_SYSCALL = 1 << 4, /* syscall */ + X86_BR_SYSRET = 1 << 5, /* syscall return */ + X86_BR_INT = 1 << 6, /* sw interrupt */ + X86_BR_IRET = 1 << 7, /* return from interrupt */ + X86_BR_JCC = 1 << 8, /* conditional */ + X86_BR_JMP = 1 << 9, /* jump */ + X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */ + X86_BR_IND_CALL = 1 << 11,/* indirect calls */ +}; + +#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL) + +#define X86_BR_ANY \ + (X86_BR_CALL |\ + X86_BR_RET |\ + X86_BR_SYSCALL |\ + X86_BR_SYSRET |\ + X86_BR_INT |\ + X86_BR_IRET |\ + X86_BR_JCC |\ + X86_BR_JMP |\ + X86_BR_IRQ |\ + X86_BR_IND_CALL) + +#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY) + +#define X86_BR_ANY_CALL \ + (X86_BR_CALL |\ + X86_BR_IND_CALL |\ + X86_BR_SYSCALL |\ + X86_BR_IRQ |\ + X86_BR_INT) + +static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc); + /* * We only support LBR implementations that have FREEZE_LBRS_ON_PMI * otherwise it becomes near impossible to get a reliable stack. @@ -131,6 +179,7 @@ void intel_pmu_lbr_enable(struct perf_event *event) intel_pmu_lbr_reset(); cpuc->lbr_context = event->ctx; } + cpuc->br_sel = event->hw.branch_reg.reg; cpuc->lbr_users++; } @@ -252,6 +301,44 @@ void intel_pmu_lbr_read(void) intel_pmu_lbr_read_32(cpuc); else intel_pmu_lbr_read_64(cpuc); + + intel_pmu_lbr_filter(cpuc); +} + +/* + * SW filter is used: + * - in case there is no HW filter + * - in case the HW filter has errata or limitations + */ +static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event) +{ + u64 br_type = event->attr.branch_sample_type; + int mask = 0; + + if (br_type & PERF_SAMPLE_BRANCH_USER) + mask |= X86_BR_USER; + + if (br_type & PERF_SAMPLE_BRANCH_KERNEL) + mask |= X86_BR_KERNEL; + + /* we ignore BRANCH_HV here */ + + if (br_type & PERF_SAMPLE_BRANCH_ANY) + mask |= X86_BR_ANY; + + if (br_type & PERF_SAMPLE_BRANCH_ANY_CALL) + mask |= X86_BR_ANY_CALL; + + if (br_type & PERF_SAMPLE_BRANCH_ANY_RETURN) + mask |= X86_BR_RET | X86_BR_IRET | X86_BR_SYSRET; + + if (br_type & PERF_SAMPLE_BRANCH_IND_CALL) + mask |= X86_BR_IND_CALL; + /* + * stash actual user request into reg, it may + * be used by fixup code for some CPU + */ + event->hw.branch_reg.reg = mask; } /* @@ -273,10 +360,9 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event) v = x86_pmu.lbr_sel_map[m]; if (v == LBR_NOT_SUPP) return -EOPNOTSUPP; - mask |= v; - if (m == PERF_SAMPLE_BRANCH_ANY) - break; + if (v != LBR_IGN) + mask |= v; } reg = &event->hw.branch_reg; reg->idx = EXTRA_REG_LBR; @@ -287,18 +373,9 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event) return 0; } -/* - * all the bits supported on some flavor of x86LBR - * we ignore BRANCH_HV because it is not supported - */ -#define PERF_SAMPLE_BRANCH_X86_ALL \ - (PERF_SAMPLE_BRANCH_ANY |\ - PERF_SAMPLE_BRANCH_USER |\ - PERF_SAMPLE_BRANCH_KERNEL) - int intel_pmu_setup_lbr_filter(struct perf_event *event) { - u64 br_type = event->attr.branch_sample_type; + int ret = 0; /* * no LBR on this PMU @@ -307,20 +384,210 @@ int intel_pmu_setup_lbr_filter(struct perf_event *event) return -EOPNOTSUPP; /* - * if no LBR HW filter, users can only - * capture all branches + * setup SW LBR filter */ - if (!x86_pmu.lbr_sel_map) { - if (br_type != PERF_SAMPLE_BRANCH_X86_ALL) - return -EOPNOTSUPP; - return 0; + intel_pmu_setup_sw_lbr_filter(event); + + /* + * setup HW LBR filter, if any + */ + if (x86_pmu.lbr_sel_map) + ret = intel_pmu_setup_hw_lbr_filter(event); + + return ret; +} + +/* + * return the type of control flow change at address "from" + * intruction is not necessarily a branch (in case of interrupt). + * + * The branch type returned also includes the priv level of the + * target of the control flow change (X86_BR_USER, X86_BR_KERNEL). + * + * If a branch type is unknown OR the instruction cannot be + * decoded (e.g., text page not present), then X86_BR_NONE is + * returned. + */ +static int branch_type(unsigned long from, unsigned long to) +{ + struct insn insn; + void *addr; + int bytes, size = MAX_INSN_SIZE; + int ret = X86_BR_NONE; + int ext, to_plm, from_plm; + u8 buf[MAX_INSN_SIZE]; + int is64 = 0; + + to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER; + from_plm = kernel_ip(from) ? X86_BR_KERNEL : X86_BR_USER; + + /* + * maybe zero if lbr did not fill up after a reset by the time + * we get a PMU interrupt + */ + if (from == 0 || to == 0) + return X86_BR_NONE; + + if (from_plm == X86_BR_USER) { + /* + * can happen if measuring at the user level only + * and we interrupt in a kernel thread, e.g., idle. + */ + if (!current->mm) + return X86_BR_NONE; + + /* may fail if text not present */ + bytes = copy_from_user_nmi(buf, (void __user *)from, size); + if (bytes != size) + return X86_BR_NONE; + + addr = buf; + } else + addr = (void *)from; + + /* + * decoder needs to know the ABI especially + * on 64-bit systems running 32-bit apps + */ +#ifdef CONFIG_X86_64 + is64 = kernel_ip((unsigned long)addr) || !test_thread_flag(TIF_IA32); +#endif + insn_init(&insn, addr, is64); + insn_get_opcode(&insn); + + switch (insn.opcode.bytes[0]) { + case 0xf: + switch (insn.opcode.bytes[1]) { + case 0x05: /* syscall */ + case 0x34: /* sysenter */ + ret = X86_BR_SYSCALL; + break; + case 0x07: /* sysret */ + case 0x35: /* sysexit */ + ret = X86_BR_SYSRET; + break; + case 0x80 ... 0x8f: /* conditional */ + ret = X86_BR_JCC; + break; + default: + ret = X86_BR_NONE; + } + break; + case 0x70 ... 0x7f: /* conditional */ + ret = X86_BR_JCC; + break; + case 0xc2: /* near ret */ + case 0xc3: /* near ret */ + case 0xca: /* far ret */ + case 0xcb: /* far ret */ + ret = X86_BR_RET; + break; + case 0xcf: /* iret */ + ret = X86_BR_IRET; + break; + case 0xcc ... 0xce: /* int */ + ret = X86_BR_INT; + break; + case 0xe8: /* call near rel */ + case 0x9a: /* call far absolute */ + ret = X86_BR_CALL; + break; + case 0xe0 ... 0xe3: /* loop jmp */ + ret = X86_BR_JCC; + break; + case 0xe9 ... 0xeb: /* jmp */ + ret = X86_BR_JMP; + break; + case 0xff: /* call near absolute, call far absolute ind */ + insn_get_modrm(&insn); + ext = (insn.modrm.bytes[0] >> 3) & 0x7; + switch (ext) { + case 2: /* near ind call */ + case 3: /* far ind call */ + ret = X86_BR_IND_CALL; + break; + case 4: + case 5: + ret = X86_BR_JMP; + break; + } + break; + default: + ret = X86_BR_NONE; } /* - * we ignore branch priv levels we do not - * know about: BRANCH_HV + * interrupts, traps, faults (and thus ring transition) may + * occur on any instructions. Thus, to classify them correctly, + * we need to first look at the from and to priv levels. If they + * are different and to is in the kernel, then it indicates + * a ring transition. If the from instruction is not a ring + * transition instr (syscall, systenter, int), then it means + * it was a irq, trap or fault. + * + * we have no way of detecting kernel to kernel faults. + */ + if (from_plm == X86_BR_USER && to_plm == X86_BR_KERNEL + && ret != X86_BR_SYSCALL && ret != X86_BR_INT) + ret = X86_BR_IRQ; + + /* + * branch priv level determined by target as + * is done by HW when LBR_SELECT is implemented */ + if (ret != X86_BR_NONE) + ret |= to_plm; - return intel_pmu_setup_hw_lbr_filter(event); + return ret; +} + +/* + * implement actual branch filter based on user demand. + * Hardware may not exactly satisfy that request, thus + * we need to inspect opcodes. Mismatched branches are + * discarded. Therefore, the number of branches returned + * in PERF_SAMPLE_BRANCH_STACK sample may vary. + */ +static void +intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) +{ + u64 from, to; + int br_sel = cpuc->br_sel; + int i, j, type; + bool compress = false; + + /* if sampling all branches, then nothing to filter */ + if ((br_sel & X86_BR_ALL) == X86_BR_ALL) + return; + + for (i = 0; i < cpuc->lbr_stack.nr; i++) { + + from = cpuc->lbr_entries[i].from; + to = cpuc->lbr_entries[i].to; + + type = branch_type(from, to); + + /* if type does not correspond, then discard */ + if (type == X86_BR_NONE || (br_sel & type) != type) { + cpuc->lbr_entries[i].from = 0; + compress = true; + } + } + + if (!compress) + return; + + /* remove all entries with from=0 */ + for (i = 0; i < cpuc->lbr_stack.nr; ) { + if (!cpuc->lbr_entries[i].from) { + j = i; + while (++j < cpuc->lbr_stack.nr) + cpuc->lbr_entries[j-1] = cpuc->lbr_entries[j]; + cpuc->lbr_stack.nr--; + if (!cpuc->lbr_entries[i].from) + continue; + } + i++; + } } /* @@ -363,6 +630,10 @@ void intel_pmu_lbr_init_core(void) x86_pmu.lbr_from = MSR_LBR_CORE_FROM; x86_pmu.lbr_to = MSR_LBR_CORE_TO; + /* + * SW branch filter usage: + * - compensate for lack of HW filter + */ pr_cont("4-deep LBR, "); } @@ -377,6 +648,13 @@ void intel_pmu_lbr_init_nhm(void) x86_pmu.lbr_sel_mask = LBR_SEL_MASK; x86_pmu.lbr_sel_map = nhm_lbr_sel_map; + /* + * SW branch filter usage: + * - workaround LBR_SEL errata (see above) + * - support syscall, sysret capture. + * That requires LBR_FAR but that means far + * jmp need to be filtered out + */ pr_cont("16-deep LBR, "); } @@ -391,6 +669,12 @@ void intel_pmu_lbr_init_snb(void) x86_pmu.lbr_sel_mask = LBR_SEL_MASK; x86_pmu.lbr_sel_map = snb_lbr_sel_map; + /* + * SW branch filter usage: + * - support syscall, sysret capture. + * That requires LBR_FAR but that means far + * jmp need to be filtered out + */ pr_cont("16-deep LBR, "); } @@ -412,5 +696,9 @@ void intel_pmu_lbr_init_atom(void) x86_pmu.lbr_from = MSR_LBR_CORE_FROM; x86_pmu.lbr_to = MSR_LBR_CORE_TO; + /* + * SW branch filter usage: + * - compensate for lack of HW filter + */ pr_cont("8-deep LBR, "); } -- cgit v1.2.3 From 2481c5fa6db0237e4f0168f88913178b2b495b7c Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 9 Feb 2012 23:20:59 +0100 Subject: perf: Disable PERF_SAMPLE_BRANCH_* when not supported PERF_SAMPLE_BRANCH_* is disabled for: - SW events (sw counters, tracepoints) - HW breakpoints - ALL but Intel x86 architecture - AMD64 processors Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1328826068-11713-10-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/alpha/kernel/perf_event.c | 4 ++++ arch/arm/kernel/perf_event.c | 4 ++++ arch/mips/kernel/perf_event_mipsxx.c | 4 ++++ arch/powerpc/kernel/perf_event.c | 4 ++++ arch/sh/kernel/perf_event.c | 4 ++++ arch/sparc/kernel/perf_event.c | 4 ++++ arch/x86/kernel/cpu/perf_event_amd.c | 3 +++ kernel/events/core.c | 24 ++++++++++++++++++++++++ kernel/events/hw_breakpoint.c | 6 ++++++ 9 files changed, 57 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/alpha/kernel/perf_event.c b/arch/alpha/kernel/perf_event.c index 8143cd7cdbfb..0dae252f7a33 100644 --- a/arch/alpha/kernel/perf_event.c +++ b/arch/alpha/kernel/perf_event.c @@ -685,6 +685,10 @@ static int alpha_pmu_event_init(struct perf_event *event) { int err; + /* does not support taken branch sampling */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + switch (event->attr.type) { case PERF_TYPE_RAW: case PERF_TYPE_HARDWARE: diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c index 5bb91bf3d47f..a23c42abc694 100644 --- a/arch/arm/kernel/perf_event.c +++ b/arch/arm/kernel/perf_event.c @@ -539,6 +539,10 @@ static int armpmu_event_init(struct perf_event *event) int err = 0; atomic_t *active_events = &armpmu->active_events; + /* does not support taken branch sampling */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + if (armpmu->map_event(event) == -ENOENT) return -ENOENT; diff --git a/arch/mips/kernel/perf_event_mipsxx.c b/arch/mips/kernel/perf_event_mipsxx.c index e3b897acfbc0..811084f4e422 100644 --- a/arch/mips/kernel/perf_event_mipsxx.c +++ b/arch/mips/kernel/perf_event_mipsxx.c @@ -606,6 +606,10 @@ static int mipspmu_event_init(struct perf_event *event) { int err = 0; + /* does not support taken branch sampling */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + switch (event->attr.type) { case PERF_TYPE_RAW: case PERF_TYPE_HARDWARE: diff --git a/arch/powerpc/kernel/perf_event.c b/arch/powerpc/kernel/perf_event.c index f04c2301725e..c2e27ede07ec 100644 --- a/arch/powerpc/kernel/perf_event.c +++ b/arch/powerpc/kernel/perf_event.c @@ -1084,6 +1084,10 @@ static int power_pmu_event_init(struct perf_event *event) if (!ppmu) return -ENOENT; + /* does not support taken branch sampling */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + switch (event->attr.type) { case PERF_TYPE_HARDWARE: ev = event->attr.config; diff --git a/arch/sh/kernel/perf_event.c b/arch/sh/kernel/perf_event.c index 10b14e3a7eb8..068b8a2759b5 100644 --- a/arch/sh/kernel/perf_event.c +++ b/arch/sh/kernel/perf_event.c @@ -310,6 +310,10 @@ static int sh_pmu_event_init(struct perf_event *event) { int err; + /* does not support taken branch sampling */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + switch (event->attr.type) { case PERF_TYPE_RAW: case PERF_TYPE_HW_CACHE: diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index 614da624330c..8e16a4a21582 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -1105,6 +1105,10 @@ static int sparc_pmu_event_init(struct perf_event *event) if (atomic_read(&nmi_active) < 0) return -ENODEV; + /* does not support taken branch sampling */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + switch (attr->type) { case PERF_TYPE_HARDWARE: if (attr->config >= sparc_pmu->max_events) diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 67250a52430b..dd002faff7a6 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -139,6 +139,9 @@ static int amd_pmu_hw_config(struct perf_event *event) if (ret) return ret; + if (has_branch_stack(event)) + return -EOPNOTSUPP; + if (event->attr.exclude_host && event->attr.exclude_guest) /* * When HO == GO == 1 the hardware treats that as GO == HO == 0 diff --git a/kernel/events/core.c b/kernel/events/core.c index 5820efdf47cd..242bb51c67f2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5044,6 +5044,12 @@ static int perf_swevent_init(struct perf_event *event) if (event->attr.type != PERF_TYPE_SOFTWARE) return -ENOENT; + /* + * no branch sampling for software events + */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + switch (event_id) { case PERF_COUNT_SW_CPU_CLOCK: case PERF_COUNT_SW_TASK_CLOCK: @@ -5154,6 +5160,12 @@ static int perf_tp_event_init(struct perf_event *event) if (event->attr.type != PERF_TYPE_TRACEPOINT) return -ENOENT; + /* + * no branch sampling for tracepoint events + */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + err = perf_trace_init(event); if (err) return err; @@ -5379,6 +5391,12 @@ static int cpu_clock_event_init(struct perf_event *event) if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) return -ENOENT; + /* + * no branch sampling for software events + */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + perf_swevent_init_hrtimer(event); return 0; @@ -5453,6 +5471,12 @@ static int task_clock_event_init(struct perf_event *event) if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) return -ENOENT; + /* + * no branch sampling for software events + */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + perf_swevent_init_hrtimer(event); return 0; diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 3330022a7ac1..bb38c4d3ee12 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -581,6 +581,12 @@ static int hw_breakpoint_event_init(struct perf_event *bp) if (bp->attr.type != PERF_TYPE_BREAKPOINT) return -ENOENT; + /* + * no branch sampling for breakpoint events + */ + if (has_branch_stack(bp)) + return -EOPNOTSUPP; + err = register_perf_hw_breakpoint(bp); if (err) return err; -- cgit v1.2.3 From d010b3326cf06b3406cdd88af16dcf4e4b6fec2e Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 9 Feb 2012 23:21:00 +0100 Subject: perf: Add callback to flush branch_stack on context switch With branch stack sampling, it is possible to filter by priv levels. In system-wide mode, that means it is possible to capture only user level branches. The builtin SW LBR filter needs to disassemble code based on LBR captured addresses. For that, it needs to know the task the addresses are associated with. Because of context switches, the content of the branch stack buffer may contain addresses from different tasks. We need a callback on context switch to either flush the branch stack or save it. This patch adds a new callback in struct pmu which is called during context switches. The callback is called only when necessary. That is when a system-wide context has, at least, one event which uses PERF_SAMPLE_BRANCH_STACK. The callback is never called for per-thread context. In this version, the Intel x86 code simply flushes (resets) the LBR on context switches (fills it with zeroes). Those zeroed branches are then filtered out by the SW filter. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1328826068-11713-11-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 21 ++++++--- arch/x86/kernel/cpu/perf_event.h | 1 + arch/x86/kernel/cpu/perf_event_intel.c | 13 ++++++ include/linux/perf_event.h | 9 +++- kernel/events/core.c | 85 ++++++++++++++++++++++++++++++++++ 5 files changed, 121 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index cea567483274..0a18d16cb58d 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1671,25 +1671,32 @@ static const struct attribute_group *x86_pmu_attr_groups[] = { NULL, }; +static void x86_pmu_flush_branch_stack(void) +{ + if (x86_pmu.flush_branch_stack) + x86_pmu.flush_branch_stack(); +} + static struct pmu pmu = { - .pmu_enable = x86_pmu_enable, - .pmu_disable = x86_pmu_disable, + .pmu_enable = x86_pmu_enable, + .pmu_disable = x86_pmu_disable, .attr_groups = x86_pmu_attr_groups, .event_init = x86_pmu_event_init, - .add = x86_pmu_add, - .del = x86_pmu_del, - .start = x86_pmu_start, - .stop = x86_pmu_stop, - .read = x86_pmu_read, + .add = x86_pmu_add, + .del = x86_pmu_del, + .start = x86_pmu_start, + .stop = x86_pmu_stop, + .read = x86_pmu_read, .start_txn = x86_pmu_start_txn, .cancel_txn = x86_pmu_cancel_txn, .commit_txn = x86_pmu_commit_txn, .event_idx = x86_pmu_event_idx, + .flush_branch_stack = x86_pmu_flush_branch_stack, }; void perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now) diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index f104c054dc5c..74387c12dc72 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -324,6 +324,7 @@ struct x86_pmu { void (*cpu_starting)(int cpu); void (*cpu_dying)(int cpu); void (*cpu_dead)(int cpu); + void (*flush_branch_stack)(void); /* * Intel Arch Perfmon v2+ diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 7cc1e2dcc4dd..6627089232a7 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1539,6 +1539,18 @@ static void intel_pmu_cpu_dying(int cpu) fini_debug_store_on_cpu(cpu); } +static void intel_pmu_flush_branch_stack(void) +{ + /* + * Intel LBR does not tag entries with the + * PID of the current task, then we need to + * flush it on ctxsw + * For now, we simply reset it + */ + if (x86_pmu.lbr_nr) + intel_pmu_lbr_reset(); +} + static __initconst const struct x86_pmu intel_pmu = { .name = "Intel", .handle_irq = intel_pmu_handle_irq, @@ -1566,6 +1578,7 @@ static __initconst const struct x86_pmu intel_pmu = { .cpu_starting = intel_pmu_cpu_starting, .cpu_dying = intel_pmu_cpu_dying, .guest_get_msrs = intel_guest_get_msrs, + .flush_branch_stack = intel_pmu_flush_branch_stack, }; static __init void intel_clovertown_quirk(void) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 5fc494f4a094..fbbf5e598368 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -746,6 +746,11 @@ struct pmu { * if no implementation is provided it will default to: event->hw.idx + 1. */ int (*event_idx) (struct perf_event *event); /*optional */ + + /* + * flush branch stack on context-switches (needed in cpu-wide mode) + */ + void (*flush_branch_stack) (void); }; /** @@ -979,7 +984,8 @@ struct perf_event_context { u64 parent_gen; u64 generation; int pin_count; - int nr_cgroups; /* cgroup events present */ + int nr_cgroups; /* cgroup evts */ + int nr_branch_stack; /* branch_stack evt */ struct rcu_head rcu_head; }; @@ -1044,6 +1050,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, extern u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running); + struct perf_sample_data { u64 type; diff --git a/kernel/events/core.c b/kernel/events/core.c index 242bb51c67f2..c61234b1a988 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -137,6 +137,7 @@ enum event_type_t { */ struct static_key_deferred perf_sched_events __read_mostly; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); +static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events); static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; @@ -888,6 +889,9 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) if (is_cgroup_event(event)) ctx->nr_cgroups++; + if (has_branch_stack(event)) + ctx->nr_branch_stack++; + list_add_rcu(&event->event_entry, &ctx->event_list); if (!ctx->nr_events) perf_pmu_rotate_start(ctx->pmu); @@ -1027,6 +1031,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) cpuctx->cgrp = NULL; } + if (has_branch_stack(event)) + ctx->nr_branch_stack--; + ctx->nr_events--; if (event->attr.inherit_stat) ctx->nr_stat--; @@ -2201,6 +2208,66 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, perf_pmu_rotate_start(ctx->pmu); } +/* + * When sampling the branck stack in system-wide, it may be necessary + * to flush the stack on context switch. This happens when the branch + * stack does not tag its entries with the pid of the current task. + * Otherwise it becomes impossible to associate a branch entry with a + * task. This ambiguity is more likely to appear when the branch stack + * supports priv level filtering and the user sets it to monitor only + * at the user level (which could be a useful measurement in system-wide + * mode). In that case, the risk is high of having a branch stack with + * branch from multiple tasks. Flushing may mean dropping the existing + * entries or stashing them somewhere in the PMU specific code layer. + * + * This function provides the context switch callback to the lower code + * layer. It is invoked ONLY when there is at least one system-wide context + * with at least one active event using taken branch sampling. + */ +static void perf_branch_stack_sched_in(struct task_struct *prev, + struct task_struct *task) +{ + struct perf_cpu_context *cpuctx; + struct pmu *pmu; + unsigned long flags; + + /* no need to flush branch stack if not changing task */ + if (prev == task) + return; + + local_irq_save(flags); + + rcu_read_lock(); + + list_for_each_entry_rcu(pmu, &pmus, entry) { + cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + + /* + * check if the context has at least one + * event using PERF_SAMPLE_BRANCH_STACK + */ + if (cpuctx->ctx.nr_branch_stack > 0 + && pmu->flush_branch_stack) { + + pmu = cpuctx->ctx.pmu; + + perf_ctx_lock(cpuctx, cpuctx->task_ctx); + + perf_pmu_disable(pmu); + + pmu->flush_branch_stack(); + + perf_pmu_enable(pmu); + + perf_ctx_unlock(cpuctx, cpuctx->task_ctx); + } + } + + rcu_read_unlock(); + + local_irq_restore(flags); +} + /* * Called from scheduler to add the events of the current task * with interrupts disabled. @@ -2232,6 +2299,10 @@ void __perf_event_task_sched_in(struct task_struct *prev, */ if (atomic_read(&__get_cpu_var(perf_cgroup_events))) perf_cgroup_sched_in(prev, task); + + /* check for system-wide branch_stack events */ + if (atomic_read(&__get_cpu_var(perf_branch_stack_events))) + perf_branch_stack_sched_in(prev, task); } static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) @@ -2798,6 +2869,14 @@ static void free_event(struct perf_event *event) atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); static_key_slow_dec_deferred(&perf_sched_events); } + + if (has_branch_stack(event)) { + static_key_slow_dec_deferred(&perf_sched_events); + /* is system-wide event */ + if (!(event->attach_state & PERF_ATTACH_TASK)) + atomic_dec(&per_cpu(perf_branch_stack_events, + event->cpu)); + } } if (event->rb) { @@ -5924,6 +6003,12 @@ done: return ERR_PTR(err); } } + if (has_branch_stack(event)) { + static_key_slow_inc(&perf_sched_events.key); + if (!(event->attach_state & PERF_ATTACH_TASK)) + atomic_inc(&per_cpu(perf_branch_stack_events, + event->cpu)); + } } return event; -- cgit v1.2.3 From 55283e2537714f9370c4ab847d170acf223daf90 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Mon, 5 Mar 2012 15:32:11 -0800 Subject: x32: Add ptrace for x32 X32 ptrace is a hybrid of 64bit ptrace and compat ptrace with 32bit address and longs. It use 64bit ptrace to access the full 64bit registers. PTRACE_PEEKUSR and PTRACE_POKEUSR are only allowed to access segment and debug registers. PTRACE_PEEKUSR returns the lower 32bits and PTRACE_POKEUSR zero-extends 32bit value to 64bit. It works since the upper 32bits of segment and debug registers of x32 process are always zero. GDB only uses PTRACE_PEEKUSR and PTRACE_POKEUSR to access segment and debug registers. [ hpa: changed TIF_X32 test to use !is_ia32_task() instead, and moved the system call number to the now-unused 521 slot. ] Signed-off-by: "H.J. Lu" Signed-off-by: H. Peter Anvin Cc: Roland McGrath Cc: Oleg Nesterov Link: http://lkml.kernel.org/r/1329696488-16970-1-git-send-email-hpa@zytor.com --- arch/x86/kernel/ptrace.c | 99 ++++++++++++++++++++++++++++++++++++++++ arch/x86/syscalls/syscall_64.tbl | 4 +- 2 files changed, 101 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 50267386b766..93e7877a19c4 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1130,6 +1130,100 @@ static int genregs32_set(struct task_struct *target, return ret; } +#ifdef CONFIG_X86_X32_ABI +static long x32_arch_ptrace(struct task_struct *child, + compat_long_t request, compat_ulong_t caddr, + compat_ulong_t cdata) +{ + unsigned long addr = caddr; + unsigned long data = cdata; + void __user *datap = compat_ptr(data); + int ret; + + switch (request) { + /* Read 32bits at location addr in the USER area. Only allow + to return the lower 32bits of segment and debug registers. */ + case PTRACE_PEEKUSR: { + u32 tmp; + + ret = -EIO; + if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user) || + addr < offsetof(struct user_regs_struct, cs)) + break; + + tmp = 0; /* Default return condition */ + if (addr < sizeof(struct user_regs_struct)) + tmp = getreg(child, addr); + else if (addr >= offsetof(struct user, u_debugreg[0]) && + addr <= offsetof(struct user, u_debugreg[7])) { + addr -= offsetof(struct user, u_debugreg[0]); + tmp = ptrace_get_debugreg(child, addr / sizeof(data)); + } + ret = put_user(tmp, (__u32 __user *)datap); + break; + } + + /* Write the word at location addr in the USER area. Only allow + to update segment and debug registers with the upper 32bits + zero-extended. */ + case PTRACE_POKEUSR: + ret = -EIO; + if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user) || + addr < offsetof(struct user_regs_struct, cs)) + break; + + if (addr < sizeof(struct user_regs_struct)) + ret = putreg(child, addr, data); + else if (addr >= offsetof(struct user, u_debugreg[0]) && + addr <= offsetof(struct user, u_debugreg[7])) { + addr -= offsetof(struct user, u_debugreg[0]); + ret = ptrace_set_debugreg(child, + addr / sizeof(data), data); + } + break; + + case PTRACE_GETREGS: /* Get all gp regs from the child. */ + return copy_regset_to_user(child, + task_user_regset_view(current), + REGSET_GENERAL, + 0, sizeof(struct user_regs_struct), + datap); + + case PTRACE_SETREGS: /* Set all gp regs in the child. */ + return copy_regset_from_user(child, + task_user_regset_view(current), + REGSET_GENERAL, + 0, sizeof(struct user_regs_struct), + datap); + + case PTRACE_GETFPREGS: /* Get the child FPU state. */ + return copy_regset_to_user(child, + task_user_regset_view(current), + REGSET_FP, + 0, sizeof(struct user_i387_struct), + datap); + + case PTRACE_SETFPREGS: /* Set the child FPU state. */ + return copy_regset_from_user(child, + task_user_regset_view(current), + REGSET_FP, + 0, sizeof(struct user_i387_struct), + datap); + + /* normal 64bit interface to access TLS data. + Works just like arch_prctl, except that the arguments + are reversed. */ + case PTRACE_ARCH_PRCTL: + return do_arch_prctl(child, data, addr); + + default: + return compat_ptrace_request(child, request, addr, data); + } + + return ret; +} +#endif + long compat_arch_ptrace(struct task_struct *child, compat_long_t request, compat_ulong_t caddr, compat_ulong_t cdata) { @@ -1139,6 +1233,11 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, int ret; __u32 val; +#ifdef CONFIG_X86_X32_ABI + if (!is_ia32_task()) + return x32_arch_ptrace(child, request, caddr, cdata); +#endif + switch (request) { case PTRACE_PEEKUSR: ret = getreg32(child, addr, &val); diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl index 0d778b800884..dd29a9ea27c5 100644 --- a/arch/x86/syscalls/syscall_64.tbl +++ b/arch/x86/syscalls/syscall_64.tbl @@ -107,7 +107,7 @@ 98 common getrusage sys_getrusage 99 common sysinfo sys_sysinfo 100 common times sys_times -101 common ptrace sys_ptrace +101 64 ptrace sys_ptrace 102 common getuid sys_getuid 103 common syslog sys_syslog 104 common getgid sys_getgid @@ -331,7 +331,7 @@ 518 x32 sendmsg compat_sys_sendmsg 519 x32 recvmsg compat_sys_recvmsg 520 x32 execve stub_x32_execve -# 521 available +521 x32 ptrace compat_sys_ptrace 522 x32 rt_sigpending sys32_rt_sigpending 523 x32 rt_sigtimedwait compat_sys_rt_sigtimedwait 524 x32 rt_sigqueueinfo sys32_rt_sigqueueinfo -- cgit v1.2.3 From 86b4ce3156c0dc140907ad03639564000cde694f Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 5 Mar 2012 22:32:09 +0900 Subject: x86/kprobes: Fix instruction recovery on optimized path Current probed-instruction recovery expects that only breakpoint instruction modifies instruction. However, since kprobes jump optimization can replace original instructions with a jump, that expectation is not enough. And it may cause instruction decoding failure on the function where an optimized probe already exists. This bug can reproduce easily as below: 1) find a target function address (any kprobe-able function is OK) $ grep __secure_computing /proc/kallsyms ffffffff810c19d0 T __secure_computing 2) decode the function $ objdump -d vmlinux --start-address=0xffffffff810c19d0 --stop-address=0xffffffff810c19eb vmlinux: file format elf64-x86-64 Disassembly of section .text: ffffffff810c19d0 <__secure_computing>: ffffffff810c19d0: 55 push %rbp ffffffff810c19d1: 48 89 e5 mov %rsp,%rbp ffffffff810c19d4: e8 67 8f 72 00 callq ffffffff817ea940 ffffffff810c19d9: 65 48 8b 04 25 40 b8 mov %gs:0xb840,%rax ffffffff810c19e0: 00 00 ffffffff810c19e2: 83 b8 88 05 00 00 01 cmpl $0x1,0x588(%rax) ffffffff810c19e9: 74 05 je ffffffff810c19f0 <__secure_computing+0x20> 3) put a kprobe-event at an optimize-able place, where no call/jump places within the 5 bytes. $ su - # cd /sys/kernel/debug/tracing # echo p __secure_computing+0x9 > kprobe_events 4) enable it and check it is optimized. # echo 1 > events/kprobes/p___secure_computing_9/enable # cat ../kprobes/list ffffffff810c19d9 k __secure_computing+0x9 [OPTIMIZED] 5) put another kprobe on an instruction after previous probe in the same function. # echo p __secure_computing+0x12 >> kprobe_events bash: echo: write error: Invalid argument # dmesg | tail -n 1 [ 1666.500016] Probing address(0xffffffff810c19e2) is not an instruction boundary. 6) however, if the kprobes optimization is disabled, it works. # echo 0 > /proc/sys/debug/kprobes-optimization # cat ../kprobes/list ffffffff810c19d9 k __secure_computing+0x9 # echo p __secure_computing+0x12 >> kprobe_events (no error) This is because kprobes doesn't recover the instruction which is overwritten with a relative jump by another kprobe when finding instruction boundary. It only recovers the breakpoint instruction. This patch fixes kprobes to recover such instructions. With this fix: # echo p __secure_computing+0x9 > kprobe_events # echo 1 > events/kprobes/p___secure_computing_9/enable # cat ../kprobes/list ffffffff810c1aa9 k __secure_computing+0x9 [OPTIMIZED] # echo p __secure_computing+0x12 >> kprobe_events # cat ../kprobes/list ffffffff810c1aa9 k __secure_computing+0x9 [OPTIMIZED] ffffffff810c1ab2 k __secure_computing+0x12 [DISABLED] Changes in v4: - Fix a bug to ensure optimized probe is really optimized by jump. - Remove kprobe_optready() dependency. - Cleanup code for preparing optprobe separation. Changes in v3: - Fix a build error when CONFIG_OPTPROBE=n. (Thanks, Ingo!) To fix the error, split optprobe instruction recovering path from kprobes path. - Cleanup comments/styles. Changes in v2: - Fix a bug to recover original instruction address in RIP-relative instruction fixup. - Moved on tip/master. Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: yrl.pp-manager.tt@hitachi.com Cc: systemtap@sourceware.org Cc: anderson@redhat.com Link: http://lkml.kernel.org/r/20120305133209.5982.36568.stgit@localhost.localdomain Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes.c | 140 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 97 insertions(+), 43 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 7da647d8b64c..6bec22f514b5 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -207,13 +207,15 @@ retry: } } -/* Recover the probed instruction at addr for further analysis. */ -static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr) +static unsigned long __recover_probed_insn(kprobe_opcode_t *buf, + unsigned long addr) { struct kprobe *kp; + kp = get_kprobe((void *)addr); + /* There is no probe, return original address */ if (!kp) - return -EINVAL; + return addr; /* * Basically, kp->ainsn.insn has an original instruction. @@ -230,14 +232,76 @@ static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr) */ memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); buf[0] = kp->opcode; - return 0; + return (unsigned long)buf; +} + +#ifdef CONFIG_OPTPROBES +static unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, + unsigned long addr) +{ + struct optimized_kprobe *op; + struct kprobe *kp; + long offs; + int i; + + for (i = 0; i < RELATIVEJUMP_SIZE; i++) { + kp = get_kprobe((void *)addr - i); + /* This function only handles jump-optimized kprobe */ + if (kp && kprobe_optimized(kp)) { + op = container_of(kp, struct optimized_kprobe, kp); + /* If op->list is not empty, op is under optimizing */ + if (list_empty(&op->list)) + goto found; + } + } + + return addr; +found: + /* + * If the kprobe can be optimized, original bytes which can be + * overwritten by jump destination address. In this case, original + * bytes must be recovered from op->optinsn.copied_insn buffer. + */ + memcpy(buf, (void *)addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); + if (addr == (unsigned long)kp->addr) { + buf[0] = kp->opcode; + memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); + } else { + offs = addr - (unsigned long)kp->addr - 1; + memcpy(buf, op->optinsn.copied_insn + offs, RELATIVE_ADDR_SIZE - offs); + } + + return (unsigned long)buf; +} +#else +static inline unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, + unsigned long addr) +{ + return addr; +} +#endif + +/* + * Recover the probed instruction at addr for further analysis. + * Caller must lock kprobes by kprobe_mutex, or disable preemption + * for preventing to release referencing kprobes. + */ +static unsigned long recover_probed_instruction(kprobe_opcode_t *buf, + unsigned long addr) +{ + unsigned long __addr; + + __addr = __recover_optprobed_insn(buf, addr); + if (__addr != addr) + return __addr; + + return __recover_probed_insn(buf, addr); } /* Check if paddr is at an instruction boundary */ static int __kprobes can_probe(unsigned long paddr) { - int ret; - unsigned long addr, offset = 0; + unsigned long addr, __addr, offset = 0; struct insn insn; kprobe_opcode_t buf[MAX_INSN_SIZE]; @@ -247,26 +311,24 @@ static int __kprobes can_probe(unsigned long paddr) /* Decode instructions */ addr = paddr - offset; while (addr < paddr) { - kernel_insn_init(&insn, (void *)addr); - insn_get_opcode(&insn); - /* * Check if the instruction has been modified by another * kprobe, in which case we replace the breakpoint by the * original instruction in our buffer. + * Also, jump optimization will change the breakpoint to + * relative-jump. Since the relative-jump itself is + * normally used, we just go through if there is no kprobe. */ - if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) { - ret = recover_probed_instruction(buf, addr); - if (ret) - /* - * Another debugging subsystem might insert - * this breakpoint. In that case, we can't - * recover it. - */ - return 0; - kernel_insn_init(&insn, buf); - } + __addr = recover_probed_instruction(buf, addr); + kernel_insn_init(&insn, (void *)__addr); insn_get_length(&insn); + + /* + * Another debugging subsystem might insert this breakpoint. + * In that case, we can't recover it. + */ + if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) + return 0; addr += insn.length; } @@ -302,21 +364,17 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover) { struct insn insn; - int ret; kprobe_opcode_t buf[MAX_INSN_SIZE]; + u8 *orig_src = src; /* Back up original src for RIP calculation */ + + if (recover) + src = (u8 *)recover_probed_instruction(buf, (unsigned long)src); kernel_insn_init(&insn, src); - if (recover) { - insn_get_opcode(&insn); - if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) { - ret = recover_probed_instruction(buf, - (unsigned long)src); - if (ret) - return 0; - kernel_insn_init(&insn, buf); - } - } insn_get_length(&insn); + /* Another subsystem puts a breakpoint, failed to recover */ + if (recover && insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) + return 0; memcpy(dest, insn.kaddr, insn.length); #ifdef CONFIG_X86_64 @@ -337,8 +395,7 @@ static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover) * extension of the original signed 32-bit displacement would * have given. */ - newdisp = (u8 *) src + (s64) insn.displacement.value - - (u8 *) dest; + newdisp = (u8 *) orig_src + (s64) insn.displacement.value - (u8 *) dest; BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */ disp = (u8 *) dest + insn_offset_displacement(&insn); *(s32 *) disp = (s32) newdisp; @@ -1271,8 +1328,7 @@ static int insn_jump_into_range(struct insn *insn, unsigned long start, int len) /* Decode whole function to ensure any instructions don't jump into target */ static int __kprobes can_optimize(unsigned long paddr) { - int ret; - unsigned long addr, size = 0, offset = 0; + unsigned long addr, __addr, size = 0, offset = 0; struct insn insn; kprobe_opcode_t buf[MAX_INSN_SIZE]; @@ -1301,15 +1357,12 @@ static int __kprobes can_optimize(unsigned long paddr) * we can't optimize kprobe in this function. */ return 0; - kernel_insn_init(&insn, (void *)addr); - insn_get_opcode(&insn); - if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) { - ret = recover_probed_instruction(buf, addr); - if (ret) - return 0; - kernel_insn_init(&insn, buf); - } + __addr = recover_probed_instruction(buf, addr); + kernel_insn_init(&insn, (void *)__addr); insn_get_length(&insn); + /* Another subsystem puts a breakpoint */ + if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) + return 0; /* Recover address */ insn.kaddr = (void *)addr; insn.next_byte = (void *)(addr + insn.length); @@ -1366,6 +1419,7 @@ void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op) /* * Copy replacing target instructions * Target instructions MUST be relocatable (checked inside) + * This is called when new aggr(opt)probe is allocated or reused. */ int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op) { -- cgit v1.2.3 From 464846888d9aad186cab3acdae6b654f9eb19772 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 5 Mar 2012 22:32:16 +0900 Subject: x86/kprobes: Fix a bug which can modify kernel code permanently Fix a bug in kprobes which can modify kernel code permanently at run-time. In the result, kernel can crash when it executes the modified code. This bug can happen when we put two probes enough near and the first probe is optimized. When the second probe is set up, it copies a byte which is already modified by the first probe, and executes it when the probe is hit. Even worse, the first probe and the second probe are removed respectively, the second probe writes back the copied (modified) instruction. To fix this bug, kprobes always recovers the original code and copies the first byte from recovered instruction. Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: yrl.pp-manager.tt@hitachi.com Cc: systemtap@sourceware.org Cc: anderson@redhat.com Link: http://lkml.kernel.org/r/20120305133215.5982.31991.stgit@localhost.localdomain Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes.c | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 6bec22f514b5..ca6d450bee7e 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -361,19 +361,15 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) * If not, return null. * Only applicable to 64-bit x86. */ -static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover) +static int __kprobes __copy_instruction(u8 *dest, u8 *src) { struct insn insn; kprobe_opcode_t buf[MAX_INSN_SIZE]; - u8 *orig_src = src; /* Back up original src for RIP calculation */ - if (recover) - src = (u8 *)recover_probed_instruction(buf, (unsigned long)src); - - kernel_insn_init(&insn, src); + kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, (unsigned long)src)); insn_get_length(&insn); /* Another subsystem puts a breakpoint, failed to recover */ - if (recover && insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) + if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) return 0; memcpy(dest, insn.kaddr, insn.length); @@ -395,7 +391,7 @@ static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover) * extension of the original signed 32-bit displacement would * have given. */ - newdisp = (u8 *) orig_src + (s64) insn.displacement.value - (u8 *) dest; + newdisp = (u8 *) src + (s64) insn.displacement.value - (u8 *) dest; BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */ disp = (u8 *) dest + insn_offset_displacement(&insn); *(s32 *) disp = (s32) newdisp; @@ -406,18 +402,20 @@ static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover) static void __kprobes arch_copy_kprobe(struct kprobe *p) { + /* Copy an instruction with recovering if other optprobe modifies it.*/ + __copy_instruction(p->ainsn.insn, p->addr); + /* - * Copy an instruction without recovering int3, because it will be - * put by another subsystem. + * __copy_instruction can modify the displacement of the instruction, + * but it doesn't affect boostable check. */ - __copy_instruction(p->ainsn.insn, p->addr, 0); - - if (can_boost(p->addr)) + if (can_boost(p->ainsn.insn)) p->ainsn.boostable = 0; else p->ainsn.boostable = -1; - p->opcode = *p->addr; + /* Also, displacement change doesn't affect the first byte */ + p->opcode = p->ainsn.insn[0]; } int __kprobes arch_prepare_kprobe(struct kprobe *p) @@ -1276,7 +1274,7 @@ static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src) int len = 0, ret; while (len < RELATIVEJUMP_SIZE) { - ret = __copy_instruction(dest + len, src + len, 1); + ret = __copy_instruction(dest + len, src + len); if (!ret || !can_boost(dest + len)) return -EINVAL; len += ret; @@ -1328,7 +1326,7 @@ static int insn_jump_into_range(struct insn *insn, unsigned long start, int len) /* Decode whole function to ensure any instructions don't jump into target */ static int __kprobes can_optimize(unsigned long paddr) { - unsigned long addr, __addr, size = 0, offset = 0; + unsigned long addr, size = 0, offset = 0; struct insn insn; kprobe_opcode_t buf[MAX_INSN_SIZE]; @@ -1357,8 +1355,7 @@ static int __kprobes can_optimize(unsigned long paddr) * we can't optimize kprobe in this function. */ return 0; - __addr = recover_probed_instruction(buf, addr); - kernel_insn_init(&insn, (void *)__addr); + kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, addr)); insn_get_length(&insn); /* Another subsystem puts a breakpoint */ if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) -- cgit v1.2.3 From 3f33ab1c0c741bfab2138c14ba1918a7905a1e8b Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 5 Mar 2012 22:32:22 +0900 Subject: x86/kprobes: Split out optprobe related code to kprobes-opt.c Split out optprobe related code to arch/x86/kernel/kprobes-opt.c for maintenanceability. Signed-off-by: Masami Hiramatsu Suggested-by: Ingo Molnar Cc: Ananth N Mavinakayanahalli Cc: yrl.pp-manager.tt@hitachi.com Cc: systemtap@sourceware.org Cc: anderson@redhat.com Link: http://lkml.kernel.org/r/20120305133222.5982.54794.stgit@localhost.localdomain [ Tidied up the code a tiny bit ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 1 + arch/x86/kernel/kprobes-common.h | 102 +++++++ arch/x86/kernel/kprobes-opt.c | 512 ++++++++++++++++++++++++++++++++ arch/x86/kernel/kprobes.c | 625 ++------------------------------------- 4 files changed, 646 insertions(+), 594 deletions(-) create mode 100644 arch/x86/kernel/kprobes-common.h create mode 100644 arch/x86/kernel/kprobes-opt.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 5369059c07a9..532d2e090e6f 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -69,6 +69,7 @@ obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o obj-$(CONFIG_KPROBES) += kprobes.o +obj-$(CONFIG_OPTPROBES) += kprobes-opt.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o obj-$(CONFIG_KGDB) += kgdb.o diff --git a/arch/x86/kernel/kprobes-common.h b/arch/x86/kernel/kprobes-common.h new file mode 100644 index 000000000000..3230b68ef29a --- /dev/null +++ b/arch/x86/kernel/kprobes-common.h @@ -0,0 +1,102 @@ +#ifndef __X86_KERNEL_KPROBES_COMMON_H +#define __X86_KERNEL_KPROBES_COMMON_H + +/* Kprobes and Optprobes common header */ + +#ifdef CONFIG_X86_64 +#define SAVE_REGS_STRING \ + /* Skip cs, ip, orig_ax. */ \ + " subq $24, %rsp\n" \ + " pushq %rdi\n" \ + " pushq %rsi\n" \ + " pushq %rdx\n" \ + " pushq %rcx\n" \ + " pushq %rax\n" \ + " pushq %r8\n" \ + " pushq %r9\n" \ + " pushq %r10\n" \ + " pushq %r11\n" \ + " pushq %rbx\n" \ + " pushq %rbp\n" \ + " pushq %r12\n" \ + " pushq %r13\n" \ + " pushq %r14\n" \ + " pushq %r15\n" +#define RESTORE_REGS_STRING \ + " popq %r15\n" \ + " popq %r14\n" \ + " popq %r13\n" \ + " popq %r12\n" \ + " popq %rbp\n" \ + " popq %rbx\n" \ + " popq %r11\n" \ + " popq %r10\n" \ + " popq %r9\n" \ + " popq %r8\n" \ + " popq %rax\n" \ + " popq %rcx\n" \ + " popq %rdx\n" \ + " popq %rsi\n" \ + " popq %rdi\n" \ + /* Skip orig_ax, ip, cs */ \ + " addq $24, %rsp\n" +#else +#define SAVE_REGS_STRING \ + /* Skip cs, ip, orig_ax and gs. */ \ + " subl $16, %esp\n" \ + " pushl %fs\n" \ + " pushl %es\n" \ + " pushl %ds\n" \ + " pushl %eax\n" \ + " pushl %ebp\n" \ + " pushl %edi\n" \ + " pushl %esi\n" \ + " pushl %edx\n" \ + " pushl %ecx\n" \ + " pushl %ebx\n" +#define RESTORE_REGS_STRING \ + " popl %ebx\n" \ + " popl %ecx\n" \ + " popl %edx\n" \ + " popl %esi\n" \ + " popl %edi\n" \ + " popl %ebp\n" \ + " popl %eax\n" \ + /* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\ + " addl $24, %esp\n" +#endif + +/* Ensure if the instruction can be boostable */ +extern int can_boost(kprobe_opcode_t *instruction); +/* Recover instruction if given address is probed */ +extern unsigned long recover_probed_instruction(kprobe_opcode_t *buf, + unsigned long addr); +/* + * Copy an instruction and adjust the displacement if the instruction + * uses the %rip-relative addressing mode. + */ +extern int __copy_instruction(u8 *dest, u8 *src); + +/* Generate a relative-jump/call instruction */ +extern void synthesize_reljump(void *from, void *to); +extern void synthesize_relcall(void *from, void *to); + +#ifdef CONFIG_OPTPROBES +extern int arch_init_optprobes(void); +extern int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter); +extern unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr); +#else /* !CONFIG_OPTPROBES */ +static inline int arch_init_optprobes(void) +{ + return 0; +} +static inline int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter) +{ + return 0; +} +static inline unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr) +{ + return addr; +} +#endif +#endif diff --git a/arch/x86/kernel/kprobes-opt.c b/arch/x86/kernel/kprobes-opt.c new file mode 100644 index 000000000000..c5e410eed403 --- /dev/null +++ b/arch/x86/kernel/kprobes-opt.c @@ -0,0 +1,512 @@ +/* + * Kernel Probes Jump Optimization (Optprobes) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2002, 2004 + * Copyright (C) Hitachi Ltd., 2012 + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "kprobes-common.h" + +unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr) +{ + struct optimized_kprobe *op; + struct kprobe *kp; + long offs; + int i; + + for (i = 0; i < RELATIVEJUMP_SIZE; i++) { + kp = get_kprobe((void *)addr - i); + /* This function only handles jump-optimized kprobe */ + if (kp && kprobe_optimized(kp)) { + op = container_of(kp, struct optimized_kprobe, kp); + /* If op->list is not empty, op is under optimizing */ + if (list_empty(&op->list)) + goto found; + } + } + + return addr; +found: + /* + * If the kprobe can be optimized, original bytes which can be + * overwritten by jump destination address. In this case, original + * bytes must be recovered from op->optinsn.copied_insn buffer. + */ + memcpy(buf, (void *)addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); + if (addr == (unsigned long)kp->addr) { + buf[0] = kp->opcode; + memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); + } else { + offs = addr - (unsigned long)kp->addr - 1; + memcpy(buf, op->optinsn.copied_insn + offs, RELATIVE_ADDR_SIZE - offs); + } + + return (unsigned long)buf; +} + +/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */ +static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val) +{ +#ifdef CONFIG_X86_64 + *addr++ = 0x48; + *addr++ = 0xbf; +#else + *addr++ = 0xb8; +#endif + *(unsigned long *)addr = val; +} + +static void __used __kprobes kprobes_optinsn_template_holder(void) +{ + asm volatile ( + ".global optprobe_template_entry\n" + "optprobe_template_entry:\n" +#ifdef CONFIG_X86_64 + /* We don't bother saving the ss register */ + " pushq %rsp\n" + " pushfq\n" + SAVE_REGS_STRING + " movq %rsp, %rsi\n" + ".global optprobe_template_val\n" + "optprobe_template_val:\n" + ASM_NOP5 + ASM_NOP5 + ".global optprobe_template_call\n" + "optprobe_template_call:\n" + ASM_NOP5 + /* Move flags to rsp */ + " movq 144(%rsp), %rdx\n" + " movq %rdx, 152(%rsp)\n" + RESTORE_REGS_STRING + /* Skip flags entry */ + " addq $8, %rsp\n" + " popfq\n" +#else /* CONFIG_X86_32 */ + " pushf\n" + SAVE_REGS_STRING + " movl %esp, %edx\n" + ".global optprobe_template_val\n" + "optprobe_template_val:\n" + ASM_NOP5 + ".global optprobe_template_call\n" + "optprobe_template_call:\n" + ASM_NOP5 + RESTORE_REGS_STRING + " addl $4, %esp\n" /* skip cs */ + " popf\n" +#endif + ".global optprobe_template_end\n" + "optprobe_template_end:\n"); +} + +#define TMPL_MOVE_IDX \ + ((long)&optprobe_template_val - (long)&optprobe_template_entry) +#define TMPL_CALL_IDX \ + ((long)&optprobe_template_call - (long)&optprobe_template_entry) +#define TMPL_END_IDX \ + ((long)&optprobe_template_end - (long)&optprobe_template_entry) + +#define INT3_SIZE sizeof(kprobe_opcode_t) + +/* Optimized kprobe call back function: called from optinsn */ +static void __kprobes optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs) +{ + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + unsigned long flags; + + /* This is possible if op is under delayed unoptimizing */ + if (kprobe_disabled(&op->kp)) + return; + + local_irq_save(flags); + if (kprobe_running()) { + kprobes_inc_nmissed_count(&op->kp); + } else { + /* Save skipped registers */ +#ifdef CONFIG_X86_64 + regs->cs = __KERNEL_CS; +#else + regs->cs = __KERNEL_CS | get_kernel_rpl(); + regs->gs = 0; +#endif + regs->ip = (unsigned long)op->kp.addr + INT3_SIZE; + regs->orig_ax = ~0UL; + + __this_cpu_write(current_kprobe, &op->kp); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + opt_pre_handler(&op->kp, regs); + __this_cpu_write(current_kprobe, NULL); + } + local_irq_restore(flags); +} + +static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src) +{ + int len = 0, ret; + + while (len < RELATIVEJUMP_SIZE) { + ret = __copy_instruction(dest + len, src + len); + if (!ret || !can_boost(dest + len)) + return -EINVAL; + len += ret; + } + /* Check whether the address range is reserved */ + if (ftrace_text_reserved(src, src + len - 1) || + alternatives_text_reserved(src, src + len - 1) || + jump_label_text_reserved(src, src + len - 1)) + return -EBUSY; + + return len; +} + +/* Check whether insn is indirect jump */ +static int __kprobes insn_is_indirect_jump(struct insn *insn) +{ + return ((insn->opcode.bytes[0] == 0xff && + (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */ + insn->opcode.bytes[0] == 0xea); /* Segment based jump */ +} + +/* Check whether insn jumps into specified address range */ +static int insn_jump_into_range(struct insn *insn, unsigned long start, int len) +{ + unsigned long target = 0; + + switch (insn->opcode.bytes[0]) { + case 0xe0: /* loopne */ + case 0xe1: /* loope */ + case 0xe2: /* loop */ + case 0xe3: /* jcxz */ + case 0xe9: /* near relative jump */ + case 0xeb: /* short relative jump */ + break; + case 0x0f: + if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */ + break; + return 0; + default: + if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */ + break; + return 0; + } + target = (unsigned long)insn->next_byte + insn->immediate.value; + + return (start <= target && target <= start + len); +} + +/* Decode whole function to ensure any instructions don't jump into target */ +static int __kprobes can_optimize(unsigned long paddr) +{ + unsigned long addr, size = 0, offset = 0; + struct insn insn; + kprobe_opcode_t buf[MAX_INSN_SIZE]; + + /* Lookup symbol including addr */ + if (!kallsyms_lookup_size_offset(paddr, &size, &offset)) + return 0; + + /* + * Do not optimize in the entry code due to the unstable + * stack handling. + */ + if ((paddr >= (unsigned long)__entry_text_start) && + (paddr < (unsigned long)__entry_text_end)) + return 0; + + /* Check there is enough space for a relative jump. */ + if (size - offset < RELATIVEJUMP_SIZE) + return 0; + + /* Decode instructions */ + addr = paddr - offset; + while (addr < paddr - offset + size) { /* Decode until function end */ + if (search_exception_tables(addr)) + /* + * Since some fixup code will jumps into this function, + * we can't optimize kprobe in this function. + */ + return 0; + kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, addr)); + insn_get_length(&insn); + /* Another subsystem puts a breakpoint */ + if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) + return 0; + /* Recover address */ + insn.kaddr = (void *)addr; + insn.next_byte = (void *)(addr + insn.length); + /* Check any instructions don't jump into target */ + if (insn_is_indirect_jump(&insn) || + insn_jump_into_range(&insn, paddr + INT3_SIZE, + RELATIVE_ADDR_SIZE)) + return 0; + addr += insn.length; + } + + return 1; +} + +/* Check optimized_kprobe can actually be optimized. */ +int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op) +{ + int i; + struct kprobe *p; + + for (i = 1; i < op->optinsn.size; i++) { + p = get_kprobe(op->kp.addr + i); + if (p && !kprobe_disabled(p)) + return -EEXIST; + } + + return 0; +} + +/* Check the addr is within the optimized instructions. */ +int __kprobes +arch_within_optimized_kprobe(struct optimized_kprobe *op, unsigned long addr) +{ + return ((unsigned long)op->kp.addr <= addr && + (unsigned long)op->kp.addr + op->optinsn.size > addr); +} + +/* Free optimized instruction slot */ +static __kprobes +void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty) +{ + if (op->optinsn.insn) { + free_optinsn_slot(op->optinsn.insn, dirty); + op->optinsn.insn = NULL; + op->optinsn.size = 0; + } +} + +void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op) +{ + __arch_remove_optimized_kprobe(op, 1); +} + +/* + * Copy replacing target instructions + * Target instructions MUST be relocatable (checked inside) + * This is called when new aggr(opt)probe is allocated or reused. + */ +int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op) +{ + u8 *buf; + int ret; + long rel; + + if (!can_optimize((unsigned long)op->kp.addr)) + return -EILSEQ; + + op->optinsn.insn = get_optinsn_slot(); + if (!op->optinsn.insn) + return -ENOMEM; + + /* + * Verify if the address gap is in 2GB range, because this uses + * a relative jump. + */ + rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE; + if (abs(rel) > 0x7fffffff) + return -ERANGE; + + buf = (u8 *)op->optinsn.insn; + + /* Copy instructions into the out-of-line buffer */ + ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr); + if (ret < 0) { + __arch_remove_optimized_kprobe(op, 0); + return ret; + } + op->optinsn.size = ret; + + /* Copy arch-dep-instance from template */ + memcpy(buf, &optprobe_template_entry, TMPL_END_IDX); + + /* Set probe information */ + synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op); + + /* Set probe function call */ + synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback); + + /* Set returning jmp instruction at the tail of out-of-line buffer */ + synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size, + (u8 *)op->kp.addr + op->optinsn.size); + + flush_icache_range((unsigned long) buf, + (unsigned long) buf + TMPL_END_IDX + + op->optinsn.size + RELATIVEJUMP_SIZE); + return 0; +} + +#define MAX_OPTIMIZE_PROBES 256 +static struct text_poke_param *jump_poke_params; +static struct jump_poke_buffer { + u8 buf[RELATIVEJUMP_SIZE]; +} *jump_poke_bufs; + +static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm, + u8 *insn_buf, + struct optimized_kprobe *op) +{ + s32 rel = (s32)((long)op->optinsn.insn - + ((long)op->kp.addr + RELATIVEJUMP_SIZE)); + + /* Backup instructions which will be replaced by jump address */ + memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE, + RELATIVE_ADDR_SIZE); + + insn_buf[0] = RELATIVEJUMP_OPCODE; + *(s32 *)(&insn_buf[1]) = rel; + + tprm->addr = op->kp.addr; + tprm->opcode = insn_buf; + tprm->len = RELATIVEJUMP_SIZE; +} + +/* + * Replace breakpoints (int3) with relative jumps. + * Caller must call with locking kprobe_mutex and text_mutex. + */ +void __kprobes arch_optimize_kprobes(struct list_head *oplist) +{ + struct optimized_kprobe *op, *tmp; + int c = 0; + + list_for_each_entry_safe(op, tmp, oplist, list) { + WARN_ON(kprobe_disabled(&op->kp)); + /* Setup param */ + setup_optimize_kprobe(&jump_poke_params[c], + jump_poke_bufs[c].buf, op); + list_del_init(&op->list); + if (++c >= MAX_OPTIMIZE_PROBES) + break; + } + + /* + * text_poke_smp doesn't support NMI/MCE code modifying. + * However, since kprobes itself also doesn't support NMI/MCE + * code probing, it's not a problem. + */ + text_poke_smp_batch(jump_poke_params, c); +} + +static void __kprobes setup_unoptimize_kprobe(struct text_poke_param *tprm, + u8 *insn_buf, + struct optimized_kprobe *op) +{ + /* Set int3 to first byte for kprobes */ + insn_buf[0] = BREAKPOINT_INSTRUCTION; + memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); + + tprm->addr = op->kp.addr; + tprm->opcode = insn_buf; + tprm->len = RELATIVEJUMP_SIZE; +} + +/* + * Recover original instructions and breakpoints from relative jumps. + * Caller must call with locking kprobe_mutex. + */ +extern void arch_unoptimize_kprobes(struct list_head *oplist, + struct list_head *done_list) +{ + struct optimized_kprobe *op, *tmp; + int c = 0; + + list_for_each_entry_safe(op, tmp, oplist, list) { + /* Setup param */ + setup_unoptimize_kprobe(&jump_poke_params[c], + jump_poke_bufs[c].buf, op); + list_move(&op->list, done_list); + if (++c >= MAX_OPTIMIZE_PROBES) + break; + } + + /* + * text_poke_smp doesn't support NMI/MCE code modifying. + * However, since kprobes itself also doesn't support NMI/MCE + * code probing, it's not a problem. + */ + text_poke_smp_batch(jump_poke_params, c); +} + +/* Replace a relative jump with a breakpoint (int3). */ +void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op) +{ + u8 buf[RELATIVEJUMP_SIZE]; + + /* Set int3 to first byte for kprobes */ + buf[0] = BREAKPOINT_INSTRUCTION; + memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); + text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE); +} + +int __kprobes +setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter) +{ + struct optimized_kprobe *op; + + if (p->flags & KPROBE_FLAG_OPTIMIZED) { + /* This kprobe is really able to run optimized path. */ + op = container_of(p, struct optimized_kprobe, kp); + /* Detour through copied instructions */ + regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX; + if (!reenter) + reset_current_kprobe(); + preempt_enable_no_resched(); + return 1; + } + return 0; +} + +int __kprobes arch_init_optprobes(void) +{ + /* Allocate code buffer and parameter array */ + jump_poke_bufs = kmalloc(sizeof(struct jump_poke_buffer) * + MAX_OPTIMIZE_PROBES, GFP_KERNEL); + if (!jump_poke_bufs) + return -ENOMEM; + + jump_poke_params = kmalloc(sizeof(struct text_poke_param) * + MAX_OPTIMIZE_PROBES, GFP_KERNEL); + if (!jump_poke_params) { + kfree(jump_poke_bufs); + jump_poke_bufs = NULL; + return -ENOMEM; + } + + return 0; +} diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index ca6d450bee7e..e213fc8408d2 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -30,16 +30,15 @@ * and Prasanna S Panchamukhi * added function-return probes. * 2005-May Rusty Lynch - * Added function return probes functionality + * Added function return probes functionality * 2006-Feb Masami Hiramatsu added - * kprobe-booster and kretprobe-booster for i386. + * kprobe-booster and kretprobe-booster for i386. * 2007-Dec Masami Hiramatsu added kprobe-booster - * and kretprobe-booster for x86-64 + * and kretprobe-booster for x86-64 * 2007-Dec Masami Hiramatsu , Arjan van de Ven - * and Jim Keniston - * unified x86 kprobes code. + * and Jim Keniston + * unified x86 kprobes code. */ - #include #include #include @@ -59,6 +58,8 @@ #include #include +#include "kprobes-common.h" + void jprobe_return_end(void); DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; @@ -108,6 +109,7 @@ struct kretprobe_blackpoint kretprobe_blacklist[] = { doesn't switch kernel stack.*/ {NULL, NULL} /* Terminator */ }; + const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist); static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op) @@ -123,11 +125,17 @@ static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op) } /* Insert a jump instruction at address 'from', which jumps to address 'to'.*/ -static void __kprobes synthesize_reljump(void *from, void *to) +void __kprobes synthesize_reljump(void *from, void *to) { __synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE); } +/* Insert a call instruction at address 'from', which calls address 'to'.*/ +void __kprobes synthesize_relcall(void *from, void *to) +{ + __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE); +} + /* * Skip the prefixes of the instruction. */ @@ -151,7 +159,7 @@ static kprobe_opcode_t *__kprobes skip_prefixes(kprobe_opcode_t *insn) * Returns non-zero if opcode is boostable. * RIP relative instructions are adjusted at copying time in 64 bits mode */ -static int __kprobes can_boost(kprobe_opcode_t *opcodes) +int __kprobes can_boost(kprobe_opcode_t *opcodes) { kprobe_opcode_t opcode; kprobe_opcode_t *orig_opcodes = opcodes; @@ -207,8 +215,8 @@ retry: } } -static unsigned long __recover_probed_insn(kprobe_opcode_t *buf, - unsigned long addr) +static unsigned long +__recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr) { struct kprobe *kp; @@ -235,59 +243,12 @@ static unsigned long __recover_probed_insn(kprobe_opcode_t *buf, return (unsigned long)buf; } -#ifdef CONFIG_OPTPROBES -static unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, - unsigned long addr) -{ - struct optimized_kprobe *op; - struct kprobe *kp; - long offs; - int i; - - for (i = 0; i < RELATIVEJUMP_SIZE; i++) { - kp = get_kprobe((void *)addr - i); - /* This function only handles jump-optimized kprobe */ - if (kp && kprobe_optimized(kp)) { - op = container_of(kp, struct optimized_kprobe, kp); - /* If op->list is not empty, op is under optimizing */ - if (list_empty(&op->list)) - goto found; - } - } - - return addr; -found: - /* - * If the kprobe can be optimized, original bytes which can be - * overwritten by jump destination address. In this case, original - * bytes must be recovered from op->optinsn.copied_insn buffer. - */ - memcpy(buf, (void *)addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); - if (addr == (unsigned long)kp->addr) { - buf[0] = kp->opcode; - memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); - } else { - offs = addr - (unsigned long)kp->addr - 1; - memcpy(buf, op->optinsn.copied_insn + offs, RELATIVE_ADDR_SIZE - offs); - } - - return (unsigned long)buf; -} -#else -static inline unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, - unsigned long addr) -{ - return addr; -} -#endif - /* * Recover the probed instruction at addr for further analysis. * Caller must lock kprobes by kprobe_mutex, or disable preemption * for preventing to release referencing kprobes. */ -static unsigned long recover_probed_instruction(kprobe_opcode_t *buf, - unsigned long addr) +unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr) { unsigned long __addr; @@ -361,7 +322,7 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) * If not, return null. * Only applicable to 64-bit x86. */ -static int __kprobes __copy_instruction(u8 *dest, u8 *src) +int __kprobes __copy_instruction(u8 *dest, u8 *src) { struct insn insn; kprobe_opcode_t buf[MAX_INSN_SIZE]; @@ -497,8 +458,8 @@ static void __kprobes restore_btf(void) } } -void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, - struct pt_regs *regs) +void __kprobes +arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) { unsigned long *sara = stack_addr(regs); @@ -508,16 +469,8 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, *sara = (unsigned long) &kretprobe_trampoline; } -#ifdef CONFIG_OPTPROBES -static int __kprobes setup_detour_execution(struct kprobe *p, - struct pt_regs *regs, - int reenter); -#else -#define setup_detour_execution(p, regs, reenter) (0) -#endif - -static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs, - struct kprobe_ctlblk *kcb, int reenter) +static void __kprobes +setup_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb, int reenter) { if (setup_detour_execution(p, regs, reenter)) return; @@ -559,8 +512,8 @@ static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs, * within the handler. We save the original kprobes variables and just single * step on the instruction of the new probe without calling any user handlers. */ -static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs, - struct kprobe_ctlblk *kcb) +static int __kprobes +reenter_kprobe(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb) { switch (kcb->kprobe_status) { case KPROBE_HIT_SSDONE: @@ -655,69 +608,6 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) return 0; } -#ifdef CONFIG_X86_64 -#define SAVE_REGS_STRING \ - /* Skip cs, ip, orig_ax. */ \ - " subq $24, %rsp\n" \ - " pushq %rdi\n" \ - " pushq %rsi\n" \ - " pushq %rdx\n" \ - " pushq %rcx\n" \ - " pushq %rax\n" \ - " pushq %r8\n" \ - " pushq %r9\n" \ - " pushq %r10\n" \ - " pushq %r11\n" \ - " pushq %rbx\n" \ - " pushq %rbp\n" \ - " pushq %r12\n" \ - " pushq %r13\n" \ - " pushq %r14\n" \ - " pushq %r15\n" -#define RESTORE_REGS_STRING \ - " popq %r15\n" \ - " popq %r14\n" \ - " popq %r13\n" \ - " popq %r12\n" \ - " popq %rbp\n" \ - " popq %rbx\n" \ - " popq %r11\n" \ - " popq %r10\n" \ - " popq %r9\n" \ - " popq %r8\n" \ - " popq %rax\n" \ - " popq %rcx\n" \ - " popq %rdx\n" \ - " popq %rsi\n" \ - " popq %rdi\n" \ - /* Skip orig_ax, ip, cs */ \ - " addq $24, %rsp\n" -#else -#define SAVE_REGS_STRING \ - /* Skip cs, ip, orig_ax and gs. */ \ - " subl $16, %esp\n" \ - " pushl %fs\n" \ - " pushl %es\n" \ - " pushl %ds\n" \ - " pushl %eax\n" \ - " pushl %ebp\n" \ - " pushl %edi\n" \ - " pushl %esi\n" \ - " pushl %edx\n" \ - " pushl %ecx\n" \ - " pushl %ebx\n" -#define RESTORE_REGS_STRING \ - " popl %ebx\n" \ - " popl %ecx\n" \ - " popl %edx\n" \ - " popl %esi\n" \ - " popl %edi\n" \ - " popl %ebp\n" \ - " popl %eax\n" \ - /* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\ - " addl $24, %esp\n" -#endif - /* * When a retprobed function returns, this code saves registers and * calls trampoline_handler() runs, which calls the kretprobe's handler. @@ -871,8 +761,8 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs) * jump instruction after the copied instruction, that jumps to the next * instruction after the probepoint. */ -static void __kprobes resume_execution(struct kprobe *p, - struct pt_regs *regs, struct kprobe_ctlblk *kcb) +static void __kprobes +resume_execution(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb) { unsigned long *tos = stack_addr(regs); unsigned long copy_ip = (unsigned long)p->ainsn.insn; @@ -1051,8 +941,8 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) /* * Wrapper routine for handling exceptions. */ -int __kprobes kprobe_exceptions_notify(struct notifier_block *self, - unsigned long val, void *data) +int __kprobes +kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *data) { struct die_args *args = data; int ret = NOTIFY_DONE; @@ -1162,462 +1052,9 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) return 0; } - -#ifdef CONFIG_OPTPROBES - -/* Insert a call instruction at address 'from', which calls address 'to'.*/ -static void __kprobes synthesize_relcall(void *from, void *to) -{ - __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE); -} - -/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */ -static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr, - unsigned long val) -{ -#ifdef CONFIG_X86_64 - *addr++ = 0x48; - *addr++ = 0xbf; -#else - *addr++ = 0xb8; -#endif - *(unsigned long *)addr = val; -} - -static void __used __kprobes kprobes_optinsn_template_holder(void) -{ - asm volatile ( - ".global optprobe_template_entry\n" - "optprobe_template_entry: \n" -#ifdef CONFIG_X86_64 - /* We don't bother saving the ss register */ - " pushq %rsp\n" - " pushfq\n" - SAVE_REGS_STRING - " movq %rsp, %rsi\n" - ".global optprobe_template_val\n" - "optprobe_template_val: \n" - ASM_NOP5 - ASM_NOP5 - ".global optprobe_template_call\n" - "optprobe_template_call: \n" - ASM_NOP5 - /* Move flags to rsp */ - " movq 144(%rsp), %rdx\n" - " movq %rdx, 152(%rsp)\n" - RESTORE_REGS_STRING - /* Skip flags entry */ - " addq $8, %rsp\n" - " popfq\n" -#else /* CONFIG_X86_32 */ - " pushf\n" - SAVE_REGS_STRING - " movl %esp, %edx\n" - ".global optprobe_template_val\n" - "optprobe_template_val: \n" - ASM_NOP5 - ".global optprobe_template_call\n" - "optprobe_template_call: \n" - ASM_NOP5 - RESTORE_REGS_STRING - " addl $4, %esp\n" /* skip cs */ - " popf\n" -#endif - ".global optprobe_template_end\n" - "optprobe_template_end: \n"); -} - -#define TMPL_MOVE_IDX \ - ((long)&optprobe_template_val - (long)&optprobe_template_entry) -#define TMPL_CALL_IDX \ - ((long)&optprobe_template_call - (long)&optprobe_template_entry) -#define TMPL_END_IDX \ - ((long)&optprobe_template_end - (long)&optprobe_template_entry) - -#define INT3_SIZE sizeof(kprobe_opcode_t) - -/* Optimized kprobe call back function: called from optinsn */ -static void __kprobes optimized_callback(struct optimized_kprobe *op, - struct pt_regs *regs) -{ - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - unsigned long flags; - - /* This is possible if op is under delayed unoptimizing */ - if (kprobe_disabled(&op->kp)) - return; - - local_irq_save(flags); - if (kprobe_running()) { - kprobes_inc_nmissed_count(&op->kp); - } else { - /* Save skipped registers */ -#ifdef CONFIG_X86_64 - regs->cs = __KERNEL_CS; -#else - regs->cs = __KERNEL_CS | get_kernel_rpl(); - regs->gs = 0; -#endif - regs->ip = (unsigned long)op->kp.addr + INT3_SIZE; - regs->orig_ax = ~0UL; - - __this_cpu_write(current_kprobe, &op->kp); - kcb->kprobe_status = KPROBE_HIT_ACTIVE; - opt_pre_handler(&op->kp, regs); - __this_cpu_write(current_kprobe, NULL); - } - local_irq_restore(flags); -} - -static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src) -{ - int len = 0, ret; - - while (len < RELATIVEJUMP_SIZE) { - ret = __copy_instruction(dest + len, src + len); - if (!ret || !can_boost(dest + len)) - return -EINVAL; - len += ret; - } - /* Check whether the address range is reserved */ - if (ftrace_text_reserved(src, src + len - 1) || - alternatives_text_reserved(src, src + len - 1) || - jump_label_text_reserved(src, src + len - 1)) - return -EBUSY; - - return len; -} - -/* Check whether insn is indirect jump */ -static int __kprobes insn_is_indirect_jump(struct insn *insn) -{ - return ((insn->opcode.bytes[0] == 0xff && - (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */ - insn->opcode.bytes[0] == 0xea); /* Segment based jump */ -} - -/* Check whether insn jumps into specified address range */ -static int insn_jump_into_range(struct insn *insn, unsigned long start, int len) -{ - unsigned long target = 0; - - switch (insn->opcode.bytes[0]) { - case 0xe0: /* loopne */ - case 0xe1: /* loope */ - case 0xe2: /* loop */ - case 0xe3: /* jcxz */ - case 0xe9: /* near relative jump */ - case 0xeb: /* short relative jump */ - break; - case 0x0f: - if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */ - break; - return 0; - default: - if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */ - break; - return 0; - } - target = (unsigned long)insn->next_byte + insn->immediate.value; - - return (start <= target && target <= start + len); -} - -/* Decode whole function to ensure any instructions don't jump into target */ -static int __kprobes can_optimize(unsigned long paddr) -{ - unsigned long addr, size = 0, offset = 0; - struct insn insn; - kprobe_opcode_t buf[MAX_INSN_SIZE]; - - /* Lookup symbol including addr */ - if (!kallsyms_lookup_size_offset(paddr, &size, &offset)) - return 0; - - /* - * Do not optimize in the entry code due to the unstable - * stack handling. - */ - if ((paddr >= (unsigned long )__entry_text_start) && - (paddr < (unsigned long )__entry_text_end)) - return 0; - - /* Check there is enough space for a relative jump. */ - if (size - offset < RELATIVEJUMP_SIZE) - return 0; - - /* Decode instructions */ - addr = paddr - offset; - while (addr < paddr - offset + size) { /* Decode until function end */ - if (search_exception_tables(addr)) - /* - * Since some fixup code will jumps into this function, - * we can't optimize kprobe in this function. - */ - return 0; - kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, addr)); - insn_get_length(&insn); - /* Another subsystem puts a breakpoint */ - if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) - return 0; - /* Recover address */ - insn.kaddr = (void *)addr; - insn.next_byte = (void *)(addr + insn.length); - /* Check any instructions don't jump into target */ - if (insn_is_indirect_jump(&insn) || - insn_jump_into_range(&insn, paddr + INT3_SIZE, - RELATIVE_ADDR_SIZE)) - return 0; - addr += insn.length; - } - - return 1; -} - -/* Check optimized_kprobe can actually be optimized. */ -int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op) -{ - int i; - struct kprobe *p; - - for (i = 1; i < op->optinsn.size; i++) { - p = get_kprobe(op->kp.addr + i); - if (p && !kprobe_disabled(p)) - return -EEXIST; - } - - return 0; -} - -/* Check the addr is within the optimized instructions. */ -int __kprobes arch_within_optimized_kprobe(struct optimized_kprobe *op, - unsigned long addr) -{ - return ((unsigned long)op->kp.addr <= addr && - (unsigned long)op->kp.addr + op->optinsn.size > addr); -} - -/* Free optimized instruction slot */ -static __kprobes -void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty) -{ - if (op->optinsn.insn) { - free_optinsn_slot(op->optinsn.insn, dirty); - op->optinsn.insn = NULL; - op->optinsn.size = 0; - } -} - -void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op) -{ - __arch_remove_optimized_kprobe(op, 1); -} - -/* - * Copy replacing target instructions - * Target instructions MUST be relocatable (checked inside) - * This is called when new aggr(opt)probe is allocated or reused. - */ -int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op) -{ - u8 *buf; - int ret; - long rel; - - if (!can_optimize((unsigned long)op->kp.addr)) - return -EILSEQ; - - op->optinsn.insn = get_optinsn_slot(); - if (!op->optinsn.insn) - return -ENOMEM; - - /* - * Verify if the address gap is in 2GB range, because this uses - * a relative jump. - */ - rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE; - if (abs(rel) > 0x7fffffff) - return -ERANGE; - - buf = (u8 *)op->optinsn.insn; - - /* Copy instructions into the out-of-line buffer */ - ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr); - if (ret < 0) { - __arch_remove_optimized_kprobe(op, 0); - return ret; - } - op->optinsn.size = ret; - - /* Copy arch-dep-instance from template */ - memcpy(buf, &optprobe_template_entry, TMPL_END_IDX); - - /* Set probe information */ - synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op); - - /* Set probe function call */ - synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback); - - /* Set returning jmp instruction at the tail of out-of-line buffer */ - synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size, - (u8 *)op->kp.addr + op->optinsn.size); - - flush_icache_range((unsigned long) buf, - (unsigned long) buf + TMPL_END_IDX + - op->optinsn.size + RELATIVEJUMP_SIZE); - return 0; -} - -#define MAX_OPTIMIZE_PROBES 256 -static struct text_poke_param *jump_poke_params; -static struct jump_poke_buffer { - u8 buf[RELATIVEJUMP_SIZE]; -} *jump_poke_bufs; - -static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm, - u8 *insn_buf, - struct optimized_kprobe *op) -{ - s32 rel = (s32)((long)op->optinsn.insn - - ((long)op->kp.addr + RELATIVEJUMP_SIZE)); - - /* Backup instructions which will be replaced by jump address */ - memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE, - RELATIVE_ADDR_SIZE); - - insn_buf[0] = RELATIVEJUMP_OPCODE; - *(s32 *)(&insn_buf[1]) = rel; - - tprm->addr = op->kp.addr; - tprm->opcode = insn_buf; - tprm->len = RELATIVEJUMP_SIZE; -} - -/* - * Replace breakpoints (int3) with relative jumps. - * Caller must call with locking kprobe_mutex and text_mutex. - */ -void __kprobes arch_optimize_kprobes(struct list_head *oplist) -{ - struct optimized_kprobe *op, *tmp; - int c = 0; - - list_for_each_entry_safe(op, tmp, oplist, list) { - WARN_ON(kprobe_disabled(&op->kp)); - /* Setup param */ - setup_optimize_kprobe(&jump_poke_params[c], - jump_poke_bufs[c].buf, op); - list_del_init(&op->list); - if (++c >= MAX_OPTIMIZE_PROBES) - break; - } - - /* - * text_poke_smp doesn't support NMI/MCE code modifying. - * However, since kprobes itself also doesn't support NMI/MCE - * code probing, it's not a problem. - */ - text_poke_smp_batch(jump_poke_params, c); -} - -static void __kprobes setup_unoptimize_kprobe(struct text_poke_param *tprm, - u8 *insn_buf, - struct optimized_kprobe *op) -{ - /* Set int3 to first byte for kprobes */ - insn_buf[0] = BREAKPOINT_INSTRUCTION; - memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); - - tprm->addr = op->kp.addr; - tprm->opcode = insn_buf; - tprm->len = RELATIVEJUMP_SIZE; -} - -/* - * Recover original instructions and breakpoints from relative jumps. - * Caller must call with locking kprobe_mutex. - */ -extern void arch_unoptimize_kprobes(struct list_head *oplist, - struct list_head *done_list) -{ - struct optimized_kprobe *op, *tmp; - int c = 0; - - list_for_each_entry_safe(op, tmp, oplist, list) { - /* Setup param */ - setup_unoptimize_kprobe(&jump_poke_params[c], - jump_poke_bufs[c].buf, op); - list_move(&op->list, done_list); - if (++c >= MAX_OPTIMIZE_PROBES) - break; - } - - /* - * text_poke_smp doesn't support NMI/MCE code modifying. - * However, since kprobes itself also doesn't support NMI/MCE - * code probing, it's not a problem. - */ - text_poke_smp_batch(jump_poke_params, c); -} - -/* Replace a relative jump with a breakpoint (int3). */ -void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op) -{ - u8 buf[RELATIVEJUMP_SIZE]; - - /* Set int3 to first byte for kprobes */ - buf[0] = BREAKPOINT_INSTRUCTION; - memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); - text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE); -} - -static int __kprobes setup_detour_execution(struct kprobe *p, - struct pt_regs *regs, - int reenter) -{ - struct optimized_kprobe *op; - - if (p->flags & KPROBE_FLAG_OPTIMIZED) { - /* This kprobe is really able to run optimized path. */ - op = container_of(p, struct optimized_kprobe, kp); - /* Detour through copied instructions */ - regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX; - if (!reenter) - reset_current_kprobe(); - preempt_enable_no_resched(); - return 1; - } - return 0; -} - -static int __kprobes init_poke_params(void) -{ - /* Allocate code buffer and parameter array */ - jump_poke_bufs = kmalloc(sizeof(struct jump_poke_buffer) * - MAX_OPTIMIZE_PROBES, GFP_KERNEL); - if (!jump_poke_bufs) - return -ENOMEM; - - jump_poke_params = kmalloc(sizeof(struct text_poke_param) * - MAX_OPTIMIZE_PROBES, GFP_KERNEL); - if (!jump_poke_params) { - kfree(jump_poke_bufs); - jump_poke_bufs = NULL; - return -ENOMEM; - } - - return 0; -} -#else /* !CONFIG_OPTPROBES */ -static int __kprobes init_poke_params(void) -{ - return 0; -} -#endif - int __init arch_init_kprobes(void) { - return init_poke_params(); + return arch_init_optprobes(); } int __kprobes arch_trampoline_kprobe(struct kprobe *p) -- cgit v1.2.3 From b11e3d782b9c065b3b2fb543bfb0d97801822dc0 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Wed, 7 Mar 2012 11:44:29 +0100 Subject: x86, mce: Fix rcu splat in drain_mce_log_buffer() While booting, the following message is seen: [ 21.665087] =============================== [ 21.669439] [ INFO: suspicious RCU usage. ] [ 21.673798] 3.2.0-0.0.0.28.36b5ec9-default #2 Not tainted [ 21.681353] ------------------------------- [ 21.685864] arch/x86/kernel/cpu/mcheck/mce.c:194 suspicious rcu_dereference_index_check() usage! [ 21.695013] [ 21.695014] other info that might help us debug this: [ 21.695016] [ 21.703488] [ 21.703489] rcu_scheduler_active = 1, debug_locks = 1 [ 21.710426] 3 locks held by modprobe/2139: [ 21.714754] #0: (&__lockdep_no_validate__){......}, at: [] __driver_attach+0x53/0xa0 [ 21.725020] #1: [ 21.725323] ioatdma: Intel(R) QuickData Technology Driver 4.00 [ 21.733206] (&__lockdep_no_validate__){......}, at: [] __driver_attach+0x61/0xa0 [ 21.743015] #2: (i7core_edac_lock){+.+.+.}, at: [] i7core_probe+0x1f/0x5c0 [i7core_edac] [ 21.753708] [ 21.753709] stack backtrace: [ 21.758429] Pid: 2139, comm: modprobe Not tainted 3.2.0-0.0.0.28.36b5ec9-default #2 [ 21.768253] Call Trace: [ 21.770838] [] lockdep_rcu_suspicious+0xcd/0x100 [ 21.777366] [] drain_mcelog_buffer+0x191/0x1b0 [ 21.783715] [] mce_register_decode_chain+0x18/0x20 [ 21.790430] [] i7core_register_mci+0x2fb/0x3e4 [i7core_edac] [ 21.798003] [] i7core_probe+0xd4/0x5c0 [i7core_edac] [ 21.804809] [] local_pci_probe+0x5b/0xe0 [ 21.810631] [] __pci_device_probe+0xd9/0xe0 [ 21.816650] [] ? get_device+0x14/0x20 [ 21.822178] [] pci_device_probe+0x36/0x60 [ 21.828061] [] really_probe+0x7a/0x2b0 [ 21.833676] [] driver_probe_device+0x63/0xc0 [ 21.839868] [] __driver_attach+0x9b/0xa0 [ 21.845718] [] ? driver_probe_device+0xc0/0xc0 [ 21.852027] [] bus_for_each_dev+0x68/0x90 [ 21.857876] [] driver_attach+0x1c/0x20 [ 21.863462] [] bus_add_driver+0x16d/0x2b0 [ 21.869377] [] driver_register+0x7c/0x160 [ 21.875220] [] __pci_register_driver+0x6a/0xf0 [ 21.881494] [] ? 0xffffffffa01fdfff [ 21.886846] [] i7core_init+0x47/0x1000 [i7core_edac] [ 21.893737] [] do_one_initcall+0x3e/0x180 [ 21.899670] [] sys_init_module+0xc5/0x220 [ 21.905542] [] system_call_fastpath+0x16/0x1b Fix this by using ACCESS_ONCE() instead of rcu_dereference_check_mce() over mcelog.next. Since the access to each entry is controlled by the ->finished field, ACCESS_ONCE() should work just fine. An rcu_dereference is unnecessary here. Signed-off-by: Srivatsa S. Bhat Suggested-by: Paul E. McKenney Cc: Tony Luck Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 5a11ae2e9e91..db590aff874c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -191,7 +191,7 @@ static void drain_mcelog_buffer(void) { unsigned int next, i, prev = 0; - next = rcu_dereference_check_mce(mcelog.next); + next = ACCESS_ONCE(mcelog.next); do { struct mce *m; -- cgit v1.2.3 From c7e23289a6aa95048a78b252b462f24ca6cf7f96 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 8 Mar 2012 09:23:14 +0000 Subject: x86/32: Print control and debug registers for kerenel context While for a user mode register dump it may be reasonable to skip those (albeit x86-64 doesn't do so), for kernel mode dumps these should be printed to make sure all information possibly necessary for analysis is available. Signed-off-by: Jan Beulich Link: http://lkml.kernel.org/r/4F58889202000078000770E7@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index c99f9ed013d5..88ec9129271d 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -87,7 +87,7 @@ void show_registers(struct pt_regs *regs) int i; print_modules(); - __show_regs(regs, 0); + __show_regs(regs, !user_mode_vm(regs)); printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n", TASK_COMM_LEN, current->comm, task_pid_nr(current), -- cgit v1.2.3 From a240ada241dafe290e7532d1ddeb98fdf1419068 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 8 Mar 2012 09:24:57 +0000 Subject: x86: Include probe_roms.h in probe_roms.c ... to ensure that declarations and definitions are in sync. Signed-off-by: Jan Beulich Link: http://lkml.kernel.org/r/4F5888F902000078000770F1@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/probe_roms.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c index 34e06e84ce31..0bc72e2069e3 100644 --- a/arch/x86/kernel/probe_roms.c +++ b/arch/x86/kernel/probe_roms.c @@ -12,6 +12,7 @@ #include #include +#include #include #include #include -- cgit v1.2.3 From c94082656dac74257f63e91f78d5d458ac781fa5 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 9 Mar 2012 16:07:10 -0800 Subject: x86: Use enum instead of literals for trap values The traps are referred to by their numbers and it can be difficult to understand them while reading the code without context. This patch adds enumeration of the trap numbers and replaces the numbers with the correct enum for x86. Signed-off-by: Kees Cook Link: http://lkml.kernel.org/r/20120310000710.GA32667@www.outflux.net Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/traps.h | 25 +++++++++ arch/x86/kernel/irqinit.c | 2 +- arch/x86/kernel/traps.c | 123 +++++++++++++++++++++++-------------------- 3 files changed, 91 insertions(+), 59 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 0012d0902c5f..88eae2aec619 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -89,4 +89,29 @@ asmlinkage void smp_thermal_interrupt(void); asmlinkage void mce_threshold_interrupt(void); #endif +/* Interrupts/Exceptions */ +enum { + X86_TRAP_DE = 0, /* 0, Divide-by-zero */ + X86_TRAP_DB, /* 1, Debug */ + X86_TRAP_NMI, /* 2, Non-maskable Interrupt */ + X86_TRAP_BP, /* 3, Breakpoint */ + X86_TRAP_OF, /* 4, Overflow */ + X86_TRAP_BR, /* 5, Bound Range Exceeded */ + X86_TRAP_UD, /* 6, Invalid Opcode */ + X86_TRAP_NM, /* 7, Device Not Available */ + X86_TRAP_DF, /* 8, Double Fault */ + X86_TRAP_OLD_MF, /* 9, Coprocessor Segment Overrun */ + X86_TRAP_TS, /* 10, Invalid TSS */ + X86_TRAP_NP, /* 11, Segment Not Present */ + X86_TRAP_SS, /* 12, Stack Segment Fault */ + X86_TRAP_GP, /* 13, General Protection Fault */ + X86_TRAP_PF, /* 14, Page Fault */ + X86_TRAP_SPURIOUS, /* 15, Spurious Interrupt */ + X86_TRAP_MF, /* 16, x87 Floating-Point Exception */ + X86_TRAP_AC, /* 17, Alignment Check */ + X86_TRAP_MC, /* 18, Machine Check */ + X86_TRAP_XF, /* 19, SIMD Floating-Point Exception */ + X86_TRAP_IRET = 32, /* 32, IRET Exception */ +}; + #endif /* _ASM_X86_TRAPS_H */ diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 313fb5cddbce..7b77062dea11 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -61,7 +61,7 @@ static irqreturn_t math_error_irq(int cpl, void *dev_id) outb(0, 0xF0); if (ignore_fpu_irq || !boot_cpu_data.hard_math) return IRQ_NONE; - math_error(get_irq_regs(), 0, 16); + math_error(get_irq_regs(), 0, X86_TRAP_MF); return IRQ_HANDLED; } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 4bbe04d96744..037fc2bc5316 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -119,7 +119,7 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, * traps 0, 1, 3, 4, and 5 should be forwarded to vm86. * On nmi (interrupt 2), do_trap should not be called. */ - if (trapnr < 6) + if (trapnr < X86_TRAP_UD) goto vm86_trap; goto trap_signal; } @@ -203,27 +203,31 @@ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ do_trap(trapnr, signr, str, regs, error_code, &info); \ } -DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) -DO_ERROR(4, SIGSEGV, "overflow", overflow) -DO_ERROR(5, SIGSEGV, "bounds", bounds) -DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip) -DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) -DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) -DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) +DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error, FPE_INTDIV, + regs->ip) +DO_ERROR(X86_TRAP_OF, SIGSEGV, "overflow", overflow) +DO_ERROR(X86_TRAP_BR, SIGSEGV, "bounds", bounds) +DO_ERROR_INFO(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, + regs->ip) +DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun", + coprocessor_segment_overrun) +DO_ERROR(X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS) +DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present) #ifdef CONFIG_X86_32 -DO_ERROR(12, SIGBUS, "stack segment", stack_segment) +DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment) #endif -DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) +DO_ERROR_INFO(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check, + BUS_ADRALN, 0) #ifdef CONFIG_X86_64 /* Runs on IST stack */ dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code) { if (notify_die(DIE_TRAP, "stack segment", regs, error_code, - 12, SIGBUS) == NOTIFY_STOP) + X86_TRAP_SS, SIGBUS) == NOTIFY_STOP) return; preempt_conditional_sti(regs); - do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL); + do_trap(X86_TRAP_SS, SIGBUS, "stack segment", regs, error_code, NULL); preempt_conditional_cli(regs); } @@ -233,10 +237,10 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) struct task_struct *tsk = current; /* Return not checked because double check cannot be ignored */ - notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV); + notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); tsk->thread.error_code = error_code; - tsk->thread.trap_no = 8; + tsk->thread.trap_no = X86_TRAP_DF; /* * This is always a kernel trap and never fixable (and thus must @@ -264,7 +268,7 @@ do_general_protection(struct pt_regs *regs, long error_code) goto gp_in_kernel; tsk->thread.error_code = error_code; - tsk->thread.trap_no = 13; + tsk->thread.trap_no = X86_TRAP_GP; if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && printk_ratelimit()) { @@ -291,9 +295,9 @@ gp_in_kernel: return; tsk->thread.error_code = error_code; - tsk->thread.trap_no = 13; - if (notify_die(DIE_GPF, "general protection fault", regs, - error_code, 13, SIGSEGV) == NOTIFY_STOP) + tsk->thread.trap_no = X86_TRAP_GP; + if (notify_die(DIE_GPF, "general protection fault", regs, error_code, + X86_TRAP_GP, SIGSEGV) == NOTIFY_STOP) return; die("general protection fault", regs, error_code); } @@ -302,13 +306,13 @@ gp_in_kernel: dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) { #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP - if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) - == NOTIFY_STOP) + if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, + SIGTRAP) == NOTIFY_STOP) return; #endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */ - if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) - == NOTIFY_STOP) + if (notify_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, + SIGTRAP) == NOTIFY_STOP) return; /* @@ -317,7 +321,7 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) */ debug_stack_usage_inc(); preempt_conditional_sti(regs); - do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); + do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL); preempt_conditional_cli(regs); debug_stack_usage_dec(); } @@ -422,8 +426,8 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) preempt_conditional_sti(regs); if (regs->flags & X86_VM_MASK) { - handle_vm86_trap((struct kernel_vm86_regs *) regs, - error_code, 1); + handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, + X86_TRAP_DB); preempt_conditional_cli(regs); debug_stack_usage_dec(); return; @@ -460,7 +464,8 @@ void math_error(struct pt_regs *regs, int error_code, int trapnr) struct task_struct *task = current; siginfo_t info; unsigned short err; - char *str = (trapnr == 16) ? "fpu exception" : "simd exception"; + char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" : + "simd exception"; if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, SIGFPE) == NOTIFY_STOP) return; @@ -485,7 +490,7 @@ void math_error(struct pt_regs *regs, int error_code, int trapnr) info.si_signo = SIGFPE; info.si_errno = 0; info.si_addr = (void __user *)regs->ip; - if (trapnr == 16) { + if (trapnr == X86_TRAP_MF) { unsigned short cwd, swd; /* * (~cwd & swd) will mask out exceptions that are not set to unmasked @@ -529,10 +534,11 @@ void math_error(struct pt_regs *regs, int error_code, int trapnr) info.si_code = FPE_FLTRES; } else { /* - * If we're using IRQ 13, or supposedly even some trap 16 - * implementations, it's possible we get a spurious trap... + * If we're using IRQ 13, or supposedly even some trap + * X86_TRAP_MF implementations, it's possible + * we get a spurious trap, which is not an error. */ - return; /* Spurious trap, no error */ + return; } force_sig_info(SIGFPE, &info, task); } @@ -543,13 +549,13 @@ dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) ignore_fpu_irq = 1; #endif - math_error(regs, error_code, 16); + math_error(regs, error_code, X86_TRAP_MF); } dotraplinkage void do_simd_coprocessor_error(struct pt_regs *regs, long error_code) { - math_error(regs, error_code, 19); + math_error(regs, error_code, X86_TRAP_XF); } dotraplinkage void @@ -643,20 +649,21 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) info.si_errno = 0; info.si_code = ILL_BADSTK; info.si_addr = NULL; - if (notify_die(DIE_TRAP, "iret exception", - regs, error_code, 32, SIGILL) == NOTIFY_STOP) + if (notify_die(DIE_TRAP, "iret exception", regs, error_code, + X86_TRAP_IRET, SIGILL) == NOTIFY_STOP) return; - do_trap(32, SIGILL, "iret exception", regs, error_code, &info); + do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code, + &info); } #endif /* Set of traps needed for early debugging. */ void __init early_trap_init(void) { - set_intr_gate_ist(1, &debug, DEBUG_STACK); + set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK); /* int3 can be called from all */ - set_system_intr_gate_ist(3, &int3, DEBUG_STACK); - set_intr_gate(14, &page_fault); + set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK); + set_intr_gate(X86_TRAP_PF, &page_fault); load_idt(&idt_descr); } @@ -672,30 +679,30 @@ void __init trap_init(void) early_iounmap(p, 4); #endif - set_intr_gate(0, ÷_error); - set_intr_gate_ist(2, &nmi, NMI_STACK); + set_intr_gate(X86_TRAP_DE, ÷_error); + set_intr_gate_ist(X86_TRAP_NMI, &nmi, NMI_STACK); /* int4 can be called from all */ - set_system_intr_gate(4, &overflow); - set_intr_gate(5, &bounds); - set_intr_gate(6, &invalid_op); - set_intr_gate(7, &device_not_available); + set_system_intr_gate(X86_TRAP_OF, &overflow); + set_intr_gate(X86_TRAP_BR, &bounds); + set_intr_gate(X86_TRAP_UD, &invalid_op); + set_intr_gate(X86_TRAP_NM, &device_not_available); #ifdef CONFIG_X86_32 - set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS); + set_task_gate(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS); #else - set_intr_gate_ist(8, &double_fault, DOUBLEFAULT_STACK); + set_intr_gate_ist(X86_TRAP_DF, &double_fault, DOUBLEFAULT_STACK); #endif - set_intr_gate(9, &coprocessor_segment_overrun); - set_intr_gate(10, &invalid_TSS); - set_intr_gate(11, &segment_not_present); - set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK); - set_intr_gate(13, &general_protection); - set_intr_gate(15, &spurious_interrupt_bug); - set_intr_gate(16, &coprocessor_error); - set_intr_gate(17, &alignment_check); + set_intr_gate(X86_TRAP_OLD_MF, &coprocessor_segment_overrun); + set_intr_gate(X86_TRAP_TS, &invalid_TSS); + set_intr_gate(X86_TRAP_NP, &segment_not_present); + set_intr_gate_ist(X86_TRAP_SS, &stack_segment, STACKFAULT_STACK); + set_intr_gate(X86_TRAP_GP, &general_protection); + set_intr_gate(X86_TRAP_SPURIOUS, &spurious_interrupt_bug); + set_intr_gate(X86_TRAP_MF, &coprocessor_error); + set_intr_gate(X86_TRAP_AC, &alignment_check); #ifdef CONFIG_X86_MCE - set_intr_gate_ist(18, &machine_check, MCE_STACK); + set_intr_gate_ist(X86_TRAP_MC, &machine_check, MCE_STACK); #endif - set_intr_gate(19, &simd_coprocessor_error); + set_intr_gate(X86_TRAP_XF, &simd_coprocessor_error); /* Reserve all the builtin and the syscall vector: */ for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) @@ -720,7 +727,7 @@ void __init trap_init(void) #ifdef CONFIG_X86_64 memcpy(&nmi_idt_table, &idt_table, IDT_ENTRIES * 16); - set_nmi_gate(1, &debug); - set_nmi_gate(3, &int3); + set_nmi_gate(X86_TRAP_DB, &debug); + set_nmi_gate(X86_TRAP_BP, &int3); #endif } -- cgit v1.2.3 From 5fbd036b552f633abb394a319f7c62a5c86a9cd7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Dec 2011 17:09:22 +0100 Subject: sched: Cleanup cpu_active madness Stepan found: CPU0 CPUn _cpu_up() __cpu_up() boostrap() notify_cpu_starting() set_cpu_online() while (!cpu_active()) cpu_relax() smp_call_function(.wait=1) /* we find cpu_online() is true */ arch_send_call_function_ipi_mask() /* wait-forever-more */ local_irq_enable() cpu_notify(CPU_ONLINE) sched_cpu_active() set_cpu_active() Now the purpose of cpu_active is mostly with bringing down a cpu, where we mark it !active to avoid the load-balancer from moving tasks to it while we tear down the cpu. This is required because we only update the sched_domain tree after we brought the cpu-down. And this is needed so that some tasks can still run while we bring it down, we just don't want new tasks to appear. On cpu-up however the sched_domain tree doesn't yet include the new cpu, so its invisible to the load-balancer, regardless of the active state. So instead of setting the active state after we boot the new cpu (and consequently having to wait for it before enabling interrupts) set the cpu active before we set it online and avoid the whole mess. Reported-by: Stepan Moskovchenko Signed-off-by: Peter Zijlstra Acked-by: Thomas Gleixner Link: http://lkml.kernel.org/r/1323965362.18942.71.camel@twins Signed-off-by: Ingo Molnar --- arch/arm/kernel/smp.c | 7 ------- arch/hexagon/kernel/smp.c | 2 -- arch/s390/kernel/smp.c | 6 ------ arch/x86/kernel/smpboot.c | 13 ------------- kernel/sched/core.c | 2 +- 5 files changed, 1 insertion(+), 29 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index cdeb727527d3..d616ed51e7a7 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -295,13 +295,6 @@ asmlinkage void __cpuinit secondary_start_kernel(void) */ percpu_timer_setup(); - while (!cpu_active(cpu)) - cpu_relax(); - - /* - * cpu_active bit is set, so it's safe to enalbe interrupts - * now. - */ local_irq_enable(); local_fiq_enable(); diff --git a/arch/hexagon/kernel/smp.c b/arch/hexagon/kernel/smp.c index c871a2cffaef..0123c63e9a3a 100644 --- a/arch/hexagon/kernel/smp.c +++ b/arch/hexagon/kernel/smp.c @@ -179,8 +179,6 @@ void __cpuinit start_secondary(void) printk(KERN_INFO "%s cpu %d\n", __func__, current_thread_info()->cpu); set_cpu_online(cpu, true); - while (!cpumask_test_cpu(cpu, cpu_active_mask)) - cpu_relax(); local_irq_enable(); cpu_idle(); diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 2398ce6b15ae..b0e28c47ab83 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -550,12 +550,6 @@ int __cpuinit start_secondary(void *cpuvoid) S390_lowcore.restart_psw.addr = PSW_ADDR_AMODE | (unsigned long) psw_restart_int_handler; __ctl_set_bit(0, 28); /* Enable lowcore protection */ - /* - * Wait until the cpu which brought this one up marked it - * active before enabling interrupts. - */ - while (!cpumask_test_cpu(smp_processor_id(), cpu_active_mask)) - cpu_relax(); local_irq_enable(); /* cpu_idle will call schedule for us */ cpu_idle(); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 66d250c00d11..58f78165d308 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -291,19 +291,6 @@ notrace static void __cpuinit start_secondary(void *unused) per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; x86_platform.nmi_init(); - /* - * Wait until the cpu which brought this one up marked it - * online before enabling interrupts. If we don't do that then - * we can end up waking up the softirq thread before this cpu - * reached the active state, which makes the scheduler unhappy - * and schedule the softirq thread on the wrong cpu. This is - * only observable with forced threaded interrupts, but in - * theory it could also happen w/o them. It's just way harder - * to achieve. - */ - while (!cpumask_test_cpu(smp_processor_id(), cpu_active_mask)) - cpu_relax(); - /* enable local interrupts */ local_irq_enable(); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 95545126be1c..b1ccce819ce2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5410,7 +5410,7 @@ static int __cpuinit sched_cpu_active(struct notifier_block *nfb, unsigned long action, void *hcpu) { switch (action & ~CPU_TASKS_FROZEN) { - case CPU_ONLINE: + case CPU_STARTING: case CPU_DOWN_FAILED: set_cpu_active((long)hcpu, true); return NOTIFY_OK; -- cgit v1.2.3 From f9b4eeb809c6d031cc9561cc34dd691701cb2c2a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 12 Mar 2012 12:44:35 +0100 Subject: perf/x86: Prettify pmu config literals I got somewhat tired of having to decode hex numbers.. Signed-off-by: Peter Zijlstra Acked-by: Thomas Gleixner Cc: Stephane Eranian Cc: Robert Richter Link: http://lkml.kernel.org/n/tip-0vsy1sgywc4uar3mu1szm0rg@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.h | 23 +++++++++++++++++++++++ arch/x86/kernel/cpu/perf_event_intel.c | 21 ++++++++++++++------- 2 files changed, 37 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 82db83b5c3bc..66fda0c26402 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -268,6 +268,29 @@ struct x86_pmu_quirk { void (*func)(void); }; +union x86_pmu_config { + struct { + u64 event:8, + umask:8, + usr:1, + os:1, + edge:1, + pc:1, + interrupt:1, + __reserved1:1, + en:1, + inv:1, + cmask:8, + event2:4, + __reserved2:4, + go:1, + ho:1; + } bits; + u64 value; +}; + +#define X86_CONFIG(args...) ((union x86_pmu_config){.bits = {args}}).value + /* * struct x86_pmu - generic x86 pmu */ diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 61d4f79a550e..4bd9c9ef9d42 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1288,7 +1288,8 @@ static int intel_pmu_hw_config(struct perf_event *event) * * Thereby we gain a PEBS capable cycle counter. */ - u64 alt_config = 0x108000c0; /* INST_RETIRED.TOTAL_CYCLES */ + u64 alt_config = X86_CONFIG(.event=0xc0, .inv=1, .cmask=16); + alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK); event->hw.config = alt_config; @@ -1690,9 +1691,11 @@ __init int intel_pmu_init(void) x86_pmu.extra_regs = intel_nehalem_extra_regs; /* UOPS_ISSUED.STALLED_CYCLES */ - intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = + X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1); /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ - intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1; + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = + X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1); x86_add_quirk(intel_nehalem_quirk); @@ -1727,9 +1730,11 @@ __init int intel_pmu_init(void) x86_pmu.er_flags |= ERF_HAS_RSP_1; /* UOPS_ISSUED.STALLED_CYCLES */ - intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = + X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1); /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ - intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1; + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = + X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1); pr_cont("Westmere events, "); break; @@ -1750,9 +1755,11 @@ __init int intel_pmu_init(void) x86_pmu.er_flags |= ERF_NO_HT_SHARING; /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */ - intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = + X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1); /* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/ - intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x18001b1; + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = + X86_CONFIG(.event=0xb1, .umask=0x01, .inv=1, .cmask=1); pr_cont("SandyBridge events, "); break; -- cgit v1.2.3 From 73d63d038ee9f769f5e5b46792d227fe20e442c5 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 12 Mar 2012 11:36:33 -0700 Subject: x86/ioapic: Add register level checks to detect bogus io-apic entries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With the recent changes to clear_IO_APIC_pin() which tries to clear remoteIRR bit explicitly, some of the users started to see "Unable to reset IRR for apic .." messages. Close look shows that these are related to bogus IO-APIC entries which return's all 1's for their io-apic registers. And the above mentioned error messages are benign. But kernel should have ignored such io-apic's in the first place. Check if register 0, 1, 2 of the listed io-apic are all 1's and ignore such io-apic. Reported-by: Álvaro Castillo Tested-by: Jon Dufresne Signed-off-by: Suresh Siddha Cc: yinghai@kernel.org Cc: kernel-team@fedoraproject.org Cc: Josh Boyer Cc: Link: http://lkml.kernel.org/r/1331577393.31585.94.camel@sbsiddha-desk.sc.intel.com [ Performed minor cleanup of affected code. ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 40 ++++++++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index fb072754bc1d..6d10a66fc5a9 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3967,18 +3967,36 @@ int mp_find_ioapic_pin(int ioapic, u32 gsi) static __init int bad_ioapic(unsigned long address) { if (nr_ioapics >= MAX_IO_APICS) { - printk(KERN_WARNING "WARNING: Max # of I/O APICs (%d) exceeded " - "(found %d), skipping\n", MAX_IO_APICS, nr_ioapics); + pr_warn("WARNING: Max # of I/O APICs (%d) exceeded (found %d), skipping\n", + MAX_IO_APICS, nr_ioapics); return 1; } if (!address) { - printk(KERN_WARNING "WARNING: Bogus (zero) I/O APIC address" - " found in table, skipping!\n"); + pr_warn("WARNING: Bogus (zero) I/O APIC address found in table, skipping!\n"); return 1; } return 0; } +static __init int bad_ioapic_register(int idx) +{ + union IO_APIC_reg_00 reg_00; + union IO_APIC_reg_01 reg_01; + union IO_APIC_reg_02 reg_02; + + reg_00.raw = io_apic_read(idx, 0); + reg_01.raw = io_apic_read(idx, 1); + reg_02.raw = io_apic_read(idx, 2); + + if (reg_00.raw == -1 && reg_01.raw == -1 && reg_02.raw == -1) { + pr_warn("I/O APIC 0x%x registers return all ones, skipping!\n", + mpc_ioapic_addr(idx)); + return 1; + } + + return 0; +} + void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) { int idx = 0; @@ -3995,6 +4013,12 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) ioapics[idx].mp_config.apicaddr = address; set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); + + if (bad_ioapic_register(idx)) { + clear_fixmap(FIX_IO_APIC_BASE_0 + idx); + return; + } + ioapics[idx].mp_config.apicid = io_apic_unique_id(id); ioapics[idx].mp_config.apicver = io_apic_get_version(idx); @@ -4015,10 +4039,10 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) if (gsi_cfg->gsi_end >= gsi_top) gsi_top = gsi_cfg->gsi_end + 1; - printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " - "GSI %d-%d\n", idx, mpc_ioapic_id(idx), - mpc_ioapic_ver(idx), mpc_ioapic_addr(idx), - gsi_cfg->gsi_base, gsi_cfg->gsi_end); + pr_info("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, GSI %d-%d\n", + idx, mpc_ioapic_id(idx), + mpc_ioapic_ver(idx), mpc_ioapic_addr(idx), + gsi_cfg->gsi_base, gsi_cfg->gsi_end); nr_ioapics++; } -- cgit v1.2.3 From 51e7dc7011c99e1e5294658c7b551b92ca069985 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Mon, 12 Mar 2012 14:55:55 +0530 Subject: x86: Rename trap_no to trap_nr in thread_struct There are precedences of trap number being referred to as trap_nr. However thread struct refers trap number as trap_no. Change it to trap_nr. Also use enum instead of left-over literals for trap values. This is pure cleanup, no functional change intended. Suggested-by: Ingo Molnar Signed-off-by: Srikar Dronamraju Cc: Linus Torvalds Cc: Ananth N Mavinakayanahalli Cc: Jim Keniston Cc: Linux-mm Cc: Oleg Nesterov Cc: Andi Kleen Cc: Christoph Hellwig Cc: Steven Rostedt Cc: Arnaldo Carvalho de Melo Cc: Masami Hiramatsu Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120312092555.5379.942.sendpatchset@srdronam.in.ibm.com [ Fixed the math-emu build ] Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32_signal.c | 2 +- arch/x86/include/asm/processor.h | 2 +- arch/x86/kernel/dumpstack.c | 2 +- arch/x86/kernel/ptrace.c | 3 ++- arch/x86/kernel/signal.c | 2 +- arch/x86/kernel/traps.c | 16 ++++++++-------- arch/x86/kernel/vm86_32.c | 2 +- arch/x86/kernel/vsyscall_64.c | 2 +- arch/x86/math-emu/fpu_entry.c | 5 +++-- arch/x86/mm/fault.c | 10 +++++----- 10 files changed, 24 insertions(+), 22 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index bc09ed2a8b97..45b4fdd4e1da 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -345,7 +345,7 @@ static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, put_user_ex(regs->dx, &sc->dx); put_user_ex(regs->cx, &sc->cx); put_user_ex(regs->ax, &sc->ax); - put_user_ex(current->thread.trap_no, &sc->trapno); + put_user_ex(current->thread.trap_nr, &sc->trapno); put_user_ex(current->thread.error_code, &sc->err); put_user_ex(regs->ip, &sc->ip); put_user_ex(regs->cs, (unsigned int __user *)&sc->cs); diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 02ce0b379647..f6d0d2eb0832 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -453,7 +453,7 @@ struct thread_struct { unsigned long ptrace_dr7; /* Fault info: */ unsigned long cr2; - unsigned long trap_no; + unsigned long trap_nr; unsigned long error_code; /* floating point and extended processor state */ struct fpu fpu; diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 4025fe4f928f..28f98706b08b 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -265,7 +265,7 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) #endif printk("\n"); if (notify_die(DIE_OOPS, str, regs, err, - current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) + current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP) return 1; show_registers(regs); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 93e7877a19c4..6fb330adc7c7 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -33,6 +33,7 @@ #include #include #include +#include #include "tls.h" @@ -1425,7 +1426,7 @@ static void fill_sigtrap_info(struct task_struct *tsk, int error_code, int si_code, struct siginfo *info) { - tsk->thread.trap_no = 1; + tsk->thread.trap_nr = X86_TRAP_DB; tsk->thread.error_code = error_code; memset(info, 0, sizeof(*info)); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index c3846b6fb726..9c73acc1c860 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -150,7 +150,7 @@ int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, put_user_ex(regs->r15, &sc->r15); #endif /* CONFIG_X86_64 */ - put_user_ex(current->thread.trap_no, &sc->trapno); + put_user_ex(current->thread.trap_nr, &sc->trapno); put_user_ex(current->thread.error_code, &sc->err); put_user_ex(regs->ip, &sc->ip); #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 037fc2bc5316..c6d17ad59b8a 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -132,7 +132,7 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, trap_signal: #endif /* - * We want error_code and trap_no set for userspace faults and + * We want error_code and trap_nr set for userspace faults and * kernelspace faults which result in die(), but not * kernelspace faults which are fixed up. die() gives the * process no chance to handle the signal and notice the @@ -141,7 +141,7 @@ trap_signal: * delivered, faults. See also do_general_protection below. */ tsk->thread.error_code = error_code; - tsk->thread.trap_no = trapnr; + tsk->thread.trap_nr = trapnr; #ifdef CONFIG_X86_64 if (show_unhandled_signals && unhandled_signal(tsk, signr) && @@ -164,7 +164,7 @@ trap_signal: kernel_trap: if (!fixup_exception(regs)) { tsk->thread.error_code = error_code; - tsk->thread.trap_no = trapnr; + tsk->thread.trap_nr = trapnr; die(str, regs, error_code); } return; @@ -240,7 +240,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); tsk->thread.error_code = error_code; - tsk->thread.trap_no = X86_TRAP_DF; + tsk->thread.trap_nr = X86_TRAP_DF; /* * This is always a kernel trap and never fixable (and thus must @@ -268,7 +268,7 @@ do_general_protection(struct pt_regs *regs, long error_code) goto gp_in_kernel; tsk->thread.error_code = error_code; - tsk->thread.trap_no = X86_TRAP_GP; + tsk->thread.trap_nr = X86_TRAP_GP; if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && printk_ratelimit()) { @@ -295,7 +295,7 @@ gp_in_kernel: return; tsk->thread.error_code = error_code; - tsk->thread.trap_no = X86_TRAP_GP; + tsk->thread.trap_nr = X86_TRAP_GP; if (notify_die(DIE_GPF, "general protection fault", regs, error_code, X86_TRAP_GP, SIGSEGV) == NOTIFY_STOP) return; @@ -475,7 +475,7 @@ void math_error(struct pt_regs *regs, int error_code, int trapnr) { if (!fixup_exception(regs)) { task->thread.error_code = error_code; - task->thread.trap_no = trapnr; + task->thread.trap_nr = trapnr; die(str, regs, error_code); } return; @@ -485,7 +485,7 @@ void math_error(struct pt_regs *regs, int error_code, int trapnr) * Save the info for the exception handler and clear the error. */ save_init_fpu(task); - task->thread.trap_no = trapnr; + task->thread.trap_nr = trapnr; task->thread.error_code = error_code; info.si_signo = SIGFPE; info.si_errno = 0; diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index b466cab5ba15..a1315ab2d6b9 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -567,7 +567,7 @@ int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno) } if (trapno != 1) return 1; /* we let this handle by the calling routine */ - current->thread.trap_no = trapno; + current->thread.trap_nr = trapno; current->thread.error_code = error_code; force_sig(SIGTRAP, current); return 0; diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index b07ba9393564..327509b95e0e 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -153,7 +153,7 @@ static bool write_ok_or_segv(unsigned long ptr, size_t size) thread->error_code = 6; /* user fault, no page, write */ thread->cr2 = ptr; - thread->trap_no = 14; + thread->trap_nr = X86_TRAP_PF; memset(&info, 0, sizeof(info)); info.si_signo = SIGSEGV; diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index 7718541541d4..9b868124128d 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -28,6 +28,7 @@ #include #include +#include #include #include #include @@ -269,7 +270,7 @@ void math_emulate(struct math_emu_info *info) FPU_EIP = FPU_ORIG_EIP; /* Point to current FPU instruction. */ RE_ENTRANT_CHECK_OFF; - current->thread.trap_no = 16; + current->thread.trap_nr = X86_TRAP_MF; current->thread.error_code = 0; send_sig(SIGFPE, current, 1); return; @@ -662,7 +663,7 @@ static int valid_prefix(u_char *Byte, u_char __user **fpu_eip, void math_abort(struct math_emu_info *info, unsigned int signal) { FPU_EIP = FPU_ORIG_EIP; - current->thread.trap_no = 16; + current->thread.trap_nr = X86_TRAP_MF; current->thread.error_code = 0; send_sig(signal, current, 1); RE_ENTRANT_CHECK_OFF; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index f0b4caf85c1a..3ecfd1aaf214 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -615,7 +615,7 @@ pgtable_bad(struct pt_regs *regs, unsigned long error_code, dump_pagetable(address); tsk->thread.cr2 = address; - tsk->thread.trap_no = 14; + tsk->thread.trap_nr = X86_TRAP_PF; tsk->thread.error_code = error_code; if (__die("Bad pagetable", regs, error_code)) @@ -636,7 +636,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, /* Are we prepared to handle this kernel fault? */ if (fixup_exception(regs)) { if (current_thread_info()->sig_on_uaccess_error && signal) { - tsk->thread.trap_no = 14; + tsk->thread.trap_nr = X86_TRAP_PF; tsk->thread.error_code = error_code | PF_USER; tsk->thread.cr2 = address; @@ -676,7 +676,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); tsk->thread.cr2 = address; - tsk->thread.trap_no = 14; + tsk->thread.trap_nr = X86_TRAP_PF; tsk->thread.error_code = error_code; sig = SIGKILL; @@ -754,7 +754,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, /* Kernel addresses are always protection faults: */ tsk->thread.cr2 = address; tsk->thread.error_code = error_code | (address >= TASK_SIZE); - tsk->thread.trap_no = 14; + tsk->thread.trap_nr = X86_TRAP_PF; force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0); @@ -838,7 +838,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, tsk->thread.cr2 = address; tsk->thread.error_code = error_code; - tsk->thread.trap_no = 14; + tsk->thread.trap_nr = X86_TRAP_PF; #ifdef CONFIG_MEMORY_FAILURE if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { -- cgit v1.2.3 From 9993bc635d01a6ee7f6b833b4ee65ce7c06350b1 Mon Sep 17 00:00:00 2001 From: Salman Qazi Date: Fri, 9 Mar 2012 16:41:01 -0800 Subject: sched/x86: Fix overflow in cyc2ns_offset When a machine boots up, the TSC generally gets reset. However, when kexec is used to boot into a kernel, the TSC value would be carried over from the previous kernel. The computation of cycns_offset in set_cyc2ns_scale is prone to an overflow, if the machine has been up more than 208 days prior to the kexec. The overflow happens when we multiply *scale, even though there is enough room to store the final answer. We fix this issue by decomposing tsc_now into the quotient and remainder of division by CYC2NS_SCALE_FACTOR and then performing the multiplication separately on the two components. Refactor code to share the calculation with the previous fix in __cycles_2_ns(). Signed-off-by: Salman Qazi Acked-by: John Stultz Acked-by: Peter Zijlstra Cc: Paul Turner Cc: john stultz Link: http://lkml.kernel.org/r/20120310004027.19291.88460.stgit@dungbeetle.mtv.corp.google.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/timer.h | 8 ++------ arch/x86/kernel/tsc.c | 3 ++- include/linux/kernel.h | 13 +++++++++++++ 3 files changed, 17 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index 431793e5d484..34baa0eb5d0c 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h @@ -57,14 +57,10 @@ DECLARE_PER_CPU(unsigned long long, cyc2ns_offset); static inline unsigned long long __cycles_2_ns(unsigned long long cyc) { - unsigned long long quot; - unsigned long long rem; int cpu = smp_processor_id(); unsigned long long ns = per_cpu(cyc2ns_offset, cpu); - quot = (cyc >> CYC2NS_SCALE_FACTOR); - rem = cyc & ((1ULL << CYC2NS_SCALE_FACTOR) - 1); - ns += quot * per_cpu(cyc2ns, cpu) + - ((rem * per_cpu(cyc2ns, cpu)) >> CYC2NS_SCALE_FACTOR); + ns += mult_frac(cyc, per_cpu(cyc2ns, cpu), + (1UL << CYC2NS_SCALE_FACTOR)); return ns; } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index a62c201c97ec..183c5925a9fe 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -620,7 +620,8 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) if (cpu_khz) { *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz; - *offset = ns_now - (tsc_now * *scale >> CYC2NS_SCALE_FACTOR); + *offset = ns_now - mult_frac(tsc_now, *scale, + (1UL << CYC2NS_SCALE_FACTOR)); } sched_clock_idle_wakeup_event(0); diff --git a/include/linux/kernel.h b/include/linux/kernel.h index e8343422240a..d801acb5e680 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -85,6 +85,19 @@ } \ ) +/* + * Multiplies an integer by a fraction, while avoiding unnecessary + * overflow or loss of precision. + */ +#define mult_frac(x, numer, denom)( \ +{ \ + typeof(x) quot = (x) / (denom); \ + typeof(x) rem = (x) % (denom); \ + (quot * (numer)) + ((rem * (numer)) / (denom)); \ +} \ +) + + #define _RET_IP_ (unsigned long)__builtin_return_address(0) #define _THIS_IP_ ({ __label__ __here; __here: (unsigned long)&&__here; }) -- cgit v1.2.3 From fa63030e9c79e37b4d4e63b39ffb09cfb7aa0fe4 Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Wed, 14 Mar 2012 15:17:34 +0800 Subject: x86/platform: Move APIC ID validity check into platform APIC code Move APIC ID validity check into platform APIC code, so it can be overridden when needed. For NumaChip systems, always trust MADT, as it's constructed with high APIC IDs. Behaviour verifies on standard x86 systems and on NumaChip systems with this, and compile-tested with allyesconfig. Signed-off-by: Daniel J Blueman Reviewed-by: Steffen Persvold Cc: Yinghai Lu Cc: H. Peter Anvin Cc: Suresh Siddha Link: http://lkml.kernel.org/r/1331709454-27966-1-git-send-email-daniel@numascale-asia.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 6 ++++++ arch/x86/kernel/apic/apic_flat_64.c | 2 ++ arch/x86/kernel/apic/apic_noop.c | 1 + arch/x86/kernel/apic/apic_numachip.c | 10 +++++++++- arch/x86/kernel/apic/bigsmp_32.c | 1 + arch/x86/kernel/apic/es7000_32.c | 2 ++ arch/x86/kernel/apic/numaq_32.c | 1 + arch/x86/kernel/apic/probe_32.c | 1 + arch/x86/kernel/apic/summit_32.c | 1 + arch/x86/kernel/apic/x2apic_cluster.c | 1 + arch/x86/kernel/apic/x2apic_phys.c | 1 + arch/x86/kernel/apic/x2apic_uv_x.c | 1 + arch/x86/kernel/smpboot.c | 2 +- 13 files changed, 28 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 3ab9bdd87e79..a9371c91718c 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -288,6 +288,7 @@ struct apic { int (*probe)(void); int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id); + int (*apic_id_valid)(int apicid); int (*apic_id_registered)(void); u32 irq_delivery_mode; @@ -532,6 +533,11 @@ static inline unsigned int read_apic_id(void) return apic->get_apic_id(reg); } +static inline int default_apic_id_valid(int apicid) +{ + return x2apic_mode || (apicid < 255); +} + extern void default_setup_apic_routing(void); extern struct apic apic_noop; diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 8c3cdded6f2b..359b6899a36c 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -180,6 +180,7 @@ static struct apic apic_flat = { .name = "flat", .probe = flat_probe, .acpi_madt_oem_check = flat_acpi_madt_oem_check, + .apic_id_valid = default_apic_id_valid, .apic_id_registered = flat_apic_id_registered, .irq_delivery_mode = dest_LowestPrio, @@ -337,6 +338,7 @@ static struct apic apic_physflat = { .name = "physical flat", .probe = physflat_probe, .acpi_madt_oem_check = physflat_acpi_madt_oem_check, + .apic_id_valid = default_apic_id_valid, .apic_id_registered = flat_apic_id_registered, .irq_delivery_mode = dest_Fixed, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 775b82bc655c..634ae6cdd5c9 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -124,6 +124,7 @@ struct apic apic_noop = { .probe = noop_probe, .acpi_madt_oem_check = NULL, + .apic_id_valid = default_apic_id_valid, .apic_id_registered = noop_apic_id_registered, .irq_delivery_mode = dest_LowestPrio, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 09d3d8c1cd99..d9ea5f331ac5 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -56,6 +56,12 @@ static unsigned int read_xapic_id(void) return get_apic_id(apic_read(APIC_ID)); } +static int numachip_apic_id_valid(int apicid) +{ + /* Trust what bootloader passes in MADT */ + return 1; +} + static int numachip_apic_id_registered(void) { return physid_isset(read_xapic_id(), phys_cpu_present_map); @@ -223,10 +229,11 @@ static int __init numachip_system_init(void) } early_initcall(numachip_system_init); -static int numachip_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +static int __cpuinit numachip_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { if (!strncmp(oem_id, "NUMASC", 6)) { numachip_system = 1; + setup_force_cpu_cap(X86_FEATURE_X2APIC); return 1; } @@ -238,6 +245,7 @@ static struct apic apic_numachip __refconst = { .name = "NumaConnect system", .probe = numachip_probe, .acpi_madt_oem_check = numachip_acpi_madt_oem_check, + .apic_id_valid = numachip_apic_id_valid, .apic_id_registered = numachip_apic_id_registered, .irq_delivery_mode = dest_Fixed, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 521bead01137..0cdec7065aff 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -198,6 +198,7 @@ static struct apic apic_bigsmp = { .name = "bigsmp", .probe = probe_bigsmp, .acpi_madt_oem_check = NULL, + .apic_id_valid = default_apic_id_valid, .apic_id_registered = bigsmp_apic_id_registered, .irq_delivery_mode = dest_Fixed, diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 5d513bc47b6b..e42d1d3b9134 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -625,6 +625,7 @@ static struct apic __refdata apic_es7000_cluster = { .name = "es7000", .probe = probe_es7000, .acpi_madt_oem_check = es7000_acpi_madt_oem_check_cluster, + .apic_id_valid = default_apic_id_valid, .apic_id_registered = es7000_apic_id_registered, .irq_delivery_mode = dest_LowestPrio, @@ -690,6 +691,7 @@ static struct apic __refdata apic_es7000 = { .name = "es7000", .probe = probe_es7000, .acpi_madt_oem_check = es7000_acpi_madt_oem_check, + .apic_id_valid = default_apic_id_valid, .apic_id_registered = es7000_apic_id_registered, .irq_delivery_mode = dest_Fixed, diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index c4a61ca1349a..00d2422ca7c9 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -478,6 +478,7 @@ static struct apic __refdata apic_numaq = { .name = "NUMAQ", .probe = probe_numaq, .acpi_madt_oem_check = NULL, + .apic_id_valid = default_apic_id_valid, .apic_id_registered = numaq_apic_id_registered, .irq_delivery_mode = dest_LowestPrio, diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 0787bb3412f4..ff2c1b9aac4d 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -92,6 +92,7 @@ static struct apic apic_default = { .name = "default", .probe = probe_default, .acpi_madt_oem_check = NULL, + .apic_id_valid = default_apic_id_valid, .apic_id_registered = default_apic_id_registered, .irq_delivery_mode = dest_LowestPrio, diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index 19114423c58c..fea000b27f07 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -496,6 +496,7 @@ static struct apic apic_summit = { .name = "summit", .probe = probe_summit, .acpi_madt_oem_check = summit_acpi_madt_oem_check, + .apic_id_valid = default_apic_id_valid, .apic_id_registered = summit_apic_id_registered, .irq_delivery_mode = dest_LowestPrio, diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 500795875827..9193713060a9 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -213,6 +213,7 @@ static struct apic apic_x2apic_cluster = { .name = "cluster x2apic", .probe = x2apic_cluster_probe, .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, + .apic_id_valid = default_apic_id_valid, .apic_id_registered = x2apic_apic_id_registered, .irq_delivery_mode = dest_LowestPrio, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index f5373dfde21e..bcd1db6eaca9 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -119,6 +119,7 @@ static struct apic apic_x2apic_phys = { .name = "physical x2apic", .probe = x2apic_phys_probe, .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, + .apic_id_valid = default_apic_id_valid, .apic_id_registered = x2apic_apic_id_registered, .irq_delivery_mode = dest_Fixed, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 79b05b88aa19..fc4771425852 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -351,6 +351,7 @@ static struct apic __refdata apic_x2apic_uv_x = { .name = "UV large system", .probe = uv_probe, .acpi_madt_oem_check = uv_acpi_madt_oem_check, + .apic_id_valid = default_apic_id_valid, .apic_id_registered = uv_apic_id_registered, .irq_delivery_mode = dest_Fixed, diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 66d250c00d11..d279e6e1d1b7 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -847,7 +847,7 @@ int __cpuinit native_cpu_up(unsigned int cpu) if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid || !physid_isset(apicid, phys_cpu_present_map) || - (!x2apic_mode && apicid >= 255)) { + !apic->apic_id_valid(apicid)) { printk(KERN_ERR "%s: bad cpu %d\n", __func__, cpu); return -EINVAL; } -- cgit v1.2.3 From 6c260d586343f7f78239d90aa9e2cfed02f74ff3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Feb 2012 19:46:04 +0000 Subject: x86: vdso: Remove bogus locking in update_vsyscall_tz() Changing the sequence count in update_vsyscall_tz() is completely pointless. The vdso code copies the data unprotected. There is no point to change this as sys_tz is nowhere protected at all. See sys_gettimeofday(). Reviewed-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- arch/x86/kernel/vsyscall_64.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index b07ba9393564..33385c18e5d3 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -80,12 +80,7 @@ early_param("vsyscall", vsyscall_setup); void update_vsyscall_tz(void) { - unsigned long flags; - - write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); - /* sys_tz has changed */ vsyscall_gtod_data.sys_tz = sys_tz; - write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, -- cgit v1.2.3 From 2ab516575f2f273b19d95140d02c54612201e80a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Feb 2012 19:46:04 +0000 Subject: x86: vdso: Use seqcount instead of seqlock The update of the vdso data happens under xtime_lock, so adding a nested lock is pointless. Just use a seqcount to sync the readers. Reviewed-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- arch/x86/include/asm/vgtod.h | 2 +- arch/x86/kernel/vsyscall_64.c | 11 +++-------- arch/x86/vdso/vclock_gettime.c | 16 ++++++++-------- 3 files changed, 12 insertions(+), 17 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index 815285bcaceb..1f007178c813 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -5,7 +5,7 @@ #include struct vsyscall_gtod_data { - seqlock_t lock; + seqcount_t seq; /* open coded 'struct timespec' */ time_t wall_time_sec; diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 33385c18e5d3..cdc95a707cd1 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -52,10 +52,7 @@ #include "vsyscall_trace.h" DEFINE_VVAR(int, vgetcpu_mode); -DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) = -{ - .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock), -}; +DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data); static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE; @@ -86,9 +83,7 @@ void update_vsyscall_tz(void) void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, struct clocksource *clock, u32 mult) { - unsigned long flags; - - write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); + write_seqcount_begin(&vsyscall_gtod_data.seq); /* copy vsyscall data */ vsyscall_gtod_data.clock.vclock_mode = clock->archdata.vclock_mode; @@ -101,7 +96,7 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, vsyscall_gtod_data.wall_to_monotonic = *wtm; vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); - write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); + write_seqcount_end(&vsyscall_gtod_data.seq); } static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 7eeb1f6188ee..944c5e5d6b6a 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -100,12 +100,12 @@ notrace static noinline int do_realtime(struct timespec *ts) int mode; do { - seq = read_seqbegin(>od->lock); + seq = read_seqcount_begin(>od->seq); mode = gtod->clock.vclock_mode; ts->tv_sec = gtod->wall_time_sec; ts->tv_nsec = gtod->wall_time_nsec; ns = vgetns(); - } while (unlikely(read_seqretry(>od->lock, seq))); + } while (unlikely(read_seqcount_retry(>od->seq, seq))); timespec_add_ns(ts, ns); return mode; @@ -117,13 +117,13 @@ notrace static noinline int do_monotonic(struct timespec *ts) int mode; do { - seq = read_seqbegin(>od->lock); + seq = read_seqcount_begin(>od->seq); mode = gtod->clock.vclock_mode; secs = gtod->wall_time_sec; ns = gtod->wall_time_nsec + vgetns(); secs += gtod->wall_to_monotonic.tv_sec; ns += gtod->wall_to_monotonic.tv_nsec; - } while (unlikely(read_seqretry(>od->lock, seq))); + } while (unlikely(read_seqcount_retry(>od->seq, seq))); /* wall_time_nsec, vgetns(), and wall_to_monotonic.tv_nsec * are all guaranteed to be nonnegative. @@ -142,10 +142,10 @@ notrace static noinline int do_realtime_coarse(struct timespec *ts) { unsigned long seq; do { - seq = read_seqbegin(>od->lock); + seq = read_seqcount_begin(>od->seq); ts->tv_sec = gtod->wall_time_coarse.tv_sec; ts->tv_nsec = gtod->wall_time_coarse.tv_nsec; - } while (unlikely(read_seqretry(>od->lock, seq))); + } while (unlikely(read_seqcount_retry(>od->seq, seq))); return 0; } @@ -153,12 +153,12 @@ notrace static noinline int do_monotonic_coarse(struct timespec *ts) { unsigned long seq, ns, secs; do { - seq = read_seqbegin(>od->lock); + seq = read_seqcount_begin(>od->seq); secs = gtod->wall_time_coarse.tv_sec; ns = gtod->wall_time_coarse.tv_nsec; secs += gtod->wall_to_monotonic.tv_sec; ns += gtod->wall_to_monotonic.tv_nsec; - } while (unlikely(read_seqretry(>od->lock, seq))); + } while (unlikely(read_seqcount_retry(>od->seq, seq))); /* wall_time_nsec and wall_to_monotonic.tv_nsec are * guaranteed to be between 0 and NSEC_PER_SEC. -- cgit v1.2.3 From 57779dc2b3b75bee05ef5d1ada47f615f7a13932 Mon Sep 17 00:00:00 2001 From: Alok Kataria Date: Tue, 21 Feb 2012 18:19:55 -0800 Subject: x86, tsc: Skip refined tsc calibration on systems with reliable TSC While running the latest Linux as guest under VMware in highly over-committed situations, we have seen cases when the refined TSC algorithm fails to get a valid tsc_start value in tsc_refine_calibration_work from multiple attempts. As a result the kernel keeps on scheduling the tsc_irqwork task for later. Subsequently after several attempts when it gets a valid start value it goes through the refined calibration and either bails out or uses the new results. Given that the kernel originally read the TSC frequency from the platform, which is the best it can get, I don't think there is much value in refining it. So for systems which get the TSC frequency from the platform we should skip the refined tsc algorithm. We can use the TSC_RELIABLE cpu cap flag to detect this, right now it is set only on VMware and for Moorestown Penwell both of which have there own TSC calibration methods. Signed-off-by: Alok N Kataria Cc: John Stultz Cc: Dirk Brandewie Cc: Alan Cox Cc: stable@kernel.org [jstultz: Reworked to simply not schedule the refining work, rather then scheduling the work and bombing out later] Signed-off-by: John Stultz --- arch/x86/kernel/tsc.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index a62c201c97ec..6fcfcb3865c2 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -932,6 +932,16 @@ static int __init init_tsc_clocksource(void) clocksource_tsc.rating = 0; clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; } + + /* + * Trust the results of the earlier calibration on systems + * exporting a reliable TSC. + */ + if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { + clocksource_register_khz(&clocksource_tsc, tsc_khz); + return 0; + } + schedule_delayed_work(&tsc_irqwork, 0); return 0; } -- cgit v1.2.3 From 943bc7e110f269f88dc92bbf249adbd384d35f1c Mon Sep 17 00:00:00 2001 From: Steffen Persvold Date: Thu, 15 Mar 2012 12:16:28 +0100 Subject: x86: Fix section warnings Fix the following section warnings : WARNING: vmlinux.o(.text+0x49dbc): Section mismatch in reference from the function acpi_map_cpu2node() to the variable .cpuinit.data:__apicid_to_node The function acpi_map_cpu2node() references the variable __cpuinitdata __apicid_to_node. This is often because acpi_map_cpu2node lacks a __cpuinitdata annotation or the annotation of __apicid_to_node is wrong. WARNING: vmlinux.o(.text+0x49dc1): Section mismatch in reference from the function acpi_map_cpu2node() to the function .cpuinit.text:numa_set_node() The function acpi_map_cpu2node() references the function __cpuinit numa_set_node(). This is often because acpi_map_cpu2node lacks a __cpuinit annotation or the annotation of numa_set_node is wrong. WARNING: vmlinux.o(.text+0x526e77): Section mismatch in reference from the function prealloc_protection_domains() to the function .init.text:alloc_passthrough_domain() The function prealloc_protection_domains() references the function __init alloc_passthrough_domain(). This is often because prealloc_protection_domains lacks a __init annotation or the annotation of alloc_passthrough_domain is wrong. Signed-off-by: Steffen Persvold Link: http://lkml.kernel.org/r/1331810188-24785-1-git-send-email-sp@numascale.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 2 +- drivers/iommu/amd_iommu.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index ce664f33ea8e..406ed77216d0 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -593,7 +593,7 @@ void __init acpi_set_irq_model_ioapic(void) #ifdef CONFIG_ACPI_HOTPLUG_CPU #include -static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) +static void __cpuinitdata acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) { #ifdef CONFIG_ACPI_NUMA int nid; diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index f75e0608be5b..ae2ec929e52f 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -2804,7 +2804,7 @@ static int amd_iommu_dma_supported(struct device *dev, u64 mask) * we don't need to preallocate the protection domains anymore. * For now we have to. */ -static void prealloc_protection_domains(void) +static void __init prealloc_protection_domains(void) { struct iommu_dev_data *dev_data; struct dma_ops_domain *dma_dom; -- cgit v1.2.3 From b74f05d61b73af584d0c39121980171389ecfaaa Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Mon, 13 Feb 2012 11:07:27 -0200 Subject: x86: kvmclock: abstract save/restore sched_clock_state Upon resume from hibernation, CPU 0's hvclock area contains the old values for system_time and tsc_timestamp. It is necessary for the hypervisor to update these values with uptodate ones before the CPU uses them. Abstract TSC's save/restore sched_clock_state functions and use restore_state to write to KVM_SYSTEM_TIME MSR, forcing an update. Also move restore_sched_clock_state before __restore_processor_state, since the later calls CONFIG_LOCK_STAT's lockstat_clock (also for TSC). Thanks to Igor Mammedov for tracking it down. Fixes suspend-to-disk with kvmclock. Reviewed-by: Thomas Gleixner Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/tsc.h | 4 ++-- arch/x86/include/asm/x86_init.h | 4 ++++ arch/x86/kernel/kvmclock.c | 11 +++++++++++ arch/x86/kernel/tsc.c | 4 ++-- arch/x86/kernel/x86_init.c | 4 +++- arch/x86/power/cpu.c | 4 ++-- 6 files changed, 24 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 15d99153a96d..c91e8b9d588b 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -61,7 +61,7 @@ extern void check_tsc_sync_source(int cpu); extern void check_tsc_sync_target(void); extern int notsc_setup(char *); -extern void save_sched_clock_state(void); -extern void restore_sched_clock_state(void); +extern void tsc_save_sched_clock_state(void); +extern void tsc_restore_sched_clock_state(void); #endif /* _ASM_X86_TSC_H */ diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 5d0afac2962c..baaca8defec8 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -162,6 +162,8 @@ struct x86_cpuinit_ops { * @is_untracked_pat_range exclude from PAT logic * @nmi_init enable NMI on cpus * @i8042_detect pre-detect if i8042 controller exists + * @save_sched_clock_state: save state for sched_clock() on suspend + * @restore_sched_clock_state: restore state for sched_clock() on resume */ struct x86_platform_ops { unsigned long (*calibrate_tsc)(void); @@ -173,6 +175,8 @@ struct x86_platform_ops { void (*nmi_init)(void); unsigned char (*get_nmi_reason)(void); int (*i8042_detect)(void); + void (*save_sched_clock_state)(void); + void (*restore_sched_clock_state)(void); }; struct pci_dev; diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index ca4e735adc54..f8492da65bfc 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -136,6 +136,15 @@ int kvm_register_clock(char *txt) return ret; } +static void kvm_save_sched_clock_state(void) +{ +} + +static void kvm_restore_sched_clock_state(void) +{ + kvm_register_clock("primary cpu clock, resume"); +} + #ifdef CONFIG_X86_LOCAL_APIC static void __cpuinit kvm_setup_secondary_clock(void) { @@ -195,6 +204,8 @@ void __init kvmclock_init(void) x86_cpuinit.early_percpu_clock_init = kvm_setup_secondary_clock; #endif + x86_platform.save_sched_clock_state = kvm_save_sched_clock_state; + x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state; machine_ops.shutdown = kvm_shutdown; #ifdef CONFIG_KEXEC machine_ops.crash_shutdown = kvm_crash_shutdown; diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index a62c201c97ec..aed2aa1088f1 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -629,7 +629,7 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) static unsigned long long cyc2ns_suspend; -void save_sched_clock_state(void) +void tsc_save_sched_clock_state(void) { if (!sched_clock_stable) return; @@ -645,7 +645,7 @@ void save_sched_clock_state(void) * that sched_clock() continues from the point where it was left off during * suspend. */ -void restore_sched_clock_state(void) +void tsc_restore_sched_clock_state(void) { unsigned long long offset; unsigned long flags; diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 6f2ec53deed0..e9f265fd79ae 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -108,7 +108,9 @@ struct x86_platform_ops x86_platform = { .is_untracked_pat_range = is_ISA_range, .nmi_init = default_nmi_init, .get_nmi_reason = default_get_nmi_reason, - .i8042_detect = default_i8042_detect + .i8042_detect = default_i8042_detect, + .save_sched_clock_state = tsc_save_sched_clock_state, + .restore_sched_clock_state = tsc_restore_sched_clock_state, }; EXPORT_SYMBOL_GPL(x86_platform); diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index f10c0afa1cb4..0e76a2814127 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -114,7 +114,7 @@ static void __save_processor_state(struct saved_context *ctxt) void save_processor_state(void) { __save_processor_state(&saved_context); - save_sched_clock_state(); + x86_platform.save_sched_clock_state(); } #ifdef CONFIG_X86_32 EXPORT_SYMBOL(save_processor_state); @@ -230,8 +230,8 @@ static void __restore_processor_state(struct saved_context *ctxt) /* Needed by apm.c */ void restore_processor_state(void) { + x86_platform.restore_sched_clock_state(); __restore_processor_state(&saved_context); - restore_sched_clock_state(); } #ifdef CONFIG_X86_32 EXPORT_SYMBOL(restore_processor_state); -- cgit v1.2.3 From 8fd75e1216e0ba601a746177e6c102d5593b572f Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 25 Nov 2011 23:14:17 +0800 Subject: x86: remove the second argument of k[un]map_atomic() Acked-by: Avi Kivity Acked-by: Herbert Xu Signed-off-by: Cong Wang --- arch/x86/crypto/aesni-intel_glue.c | 24 ++++++++++++------------ arch/x86/kernel/crash_dump_32.c | 6 +++--- arch/x86/kvm/lapic.c | 8 ++++---- arch/x86/kvm/paging_tmpl.h | 4 ++-- arch/x86/kvm/x86.c | 8 ++++---- arch/x86/lib/usercopy_32.c | 4 ++-- 6 files changed, 27 insertions(+), 27 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 545d0ce59818..152232d2dc6a 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -1107,12 +1107,12 @@ static int __driver_rfc4106_encrypt(struct aead_request *req) one_entry_in_sg = 1; scatterwalk_start(&src_sg_walk, req->src); scatterwalk_start(&assoc_sg_walk, req->assoc); - src = scatterwalk_map(&src_sg_walk, 0); - assoc = scatterwalk_map(&assoc_sg_walk, 0); + src = scatterwalk_map(&src_sg_walk); + assoc = scatterwalk_map(&assoc_sg_walk); dst = src; if (unlikely(req->src != req->dst)) { scatterwalk_start(&dst_sg_walk, req->dst); - dst = scatterwalk_map(&dst_sg_walk, 0); + dst = scatterwalk_map(&dst_sg_walk); } } else { @@ -1136,11 +1136,11 @@ static int __driver_rfc4106_encrypt(struct aead_request *req) * back to the packet. */ if (one_entry_in_sg) { if (unlikely(req->src != req->dst)) { - scatterwalk_unmap(dst, 0); + scatterwalk_unmap(dst); scatterwalk_done(&dst_sg_walk, 0, 0); } - scatterwalk_unmap(src, 0); - scatterwalk_unmap(assoc, 0); + scatterwalk_unmap(src); + scatterwalk_unmap(assoc); scatterwalk_done(&src_sg_walk, 0, 0); scatterwalk_done(&assoc_sg_walk, 0, 0); } else { @@ -1189,12 +1189,12 @@ static int __driver_rfc4106_decrypt(struct aead_request *req) one_entry_in_sg = 1; scatterwalk_start(&src_sg_walk, req->src); scatterwalk_start(&assoc_sg_walk, req->assoc); - src = scatterwalk_map(&src_sg_walk, 0); - assoc = scatterwalk_map(&assoc_sg_walk, 0); + src = scatterwalk_map(&src_sg_walk); + assoc = scatterwalk_map(&assoc_sg_walk); dst = src; if (unlikely(req->src != req->dst)) { scatterwalk_start(&dst_sg_walk, req->dst); - dst = scatterwalk_map(&dst_sg_walk, 0); + dst = scatterwalk_map(&dst_sg_walk); } } else { @@ -1219,11 +1219,11 @@ static int __driver_rfc4106_decrypt(struct aead_request *req) if (one_entry_in_sg) { if (unlikely(req->src != req->dst)) { - scatterwalk_unmap(dst, 0); + scatterwalk_unmap(dst); scatterwalk_done(&dst_sg_walk, 0, 0); } - scatterwalk_unmap(src, 0); - scatterwalk_unmap(assoc, 0); + scatterwalk_unmap(src); + scatterwalk_unmap(assoc); scatterwalk_done(&src_sg_walk, 0, 0); scatterwalk_done(&assoc_sg_walk, 0, 0); } else { diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c index 642f75a68cd5..11891ca7b716 100644 --- a/arch/x86/kernel/crash_dump_32.c +++ b/arch/x86/kernel/crash_dump_32.c @@ -62,16 +62,16 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, if (!userbuf) { memcpy(buf, (vaddr + offset), csize); - kunmap_atomic(vaddr, KM_PTE0); + kunmap_atomic(vaddr); } else { if (!kdump_buf_page) { printk(KERN_WARNING "Kdump: Kdump buffer page not" " allocated\n"); - kunmap_atomic(vaddr, KM_PTE0); + kunmap_atomic(vaddr); return -EFAULT; } copy_page(kdump_buf_page, vaddr); - kunmap_atomic(vaddr, KM_PTE0); + kunmap_atomic(vaddr); if (copy_to_user(buf, (kdump_buf_page + offset), csize)) return -EFAULT; } diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index cfdc6e0ef002..31bfc6927bc0 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1283,9 +1283,9 @@ void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr) return; - vapic = kmap_atomic(vcpu->arch.apic->vapic_page, KM_USER0); + vapic = kmap_atomic(vcpu->arch.apic->vapic_page); data = *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr)); - kunmap_atomic(vapic, KM_USER0); + kunmap_atomic(vapic); apic_set_tpr(vcpu->arch.apic, data & 0xff); } @@ -1310,9 +1310,9 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu) max_isr = 0; data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24); - vapic = kmap_atomic(vcpu->arch.apic->vapic_page, KM_USER0); + vapic = kmap_atomic(vcpu->arch.apic->vapic_page); *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr)) = data; - kunmap_atomic(vapic, KM_USER0); + kunmap_atomic(vapic); } void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 15610285ebb6..df5a70311be8 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -92,9 +92,9 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, if (unlikely(npages != 1)) return -EFAULT; - table = kmap_atomic(page, KM_USER0); + table = kmap_atomic(page); ret = CMPXCHG(&table[index], orig_pte, new_pte); - kunmap_atomic(table, KM_USER0); + kunmap_atomic(table); kvm_release_page_dirty(page); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9cbfc0698118..bb4fd2636bc2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1162,12 +1162,12 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) */ vcpu->hv_clock.version += 2; - shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0); + shared_kaddr = kmap_atomic(vcpu->time_page); memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock, sizeof(vcpu->hv_clock)); - kunmap_atomic(shared_kaddr, KM_USER0); + kunmap_atomic(shared_kaddr); mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); return 0; @@ -3848,7 +3848,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, goto emul_write; } - kaddr = kmap_atomic(page, KM_USER0); + kaddr = kmap_atomic(page); kaddr += offset_in_page(gpa); switch (bytes) { case 1: @@ -3866,7 +3866,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, default: BUG(); } - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); kvm_release_page_dirty(page); if (!exchanged) diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index e218d5df85ff..d9b094ca7aaa 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -760,9 +760,9 @@ survive: break; } - maddr = kmap_atomic(pg, KM_USER0); + maddr = kmap_atomic(pg); memcpy(maddr + offset, from, len); - kunmap_atomic(maddr, KM_USER0); + kunmap_atomic(maddr); set_page_dirty_lock(pg); put_page(pg); up_read(¤t->mm->mmap_sem); -- cgit v1.2.3 From 1a5a9906d4e8d1976b701f889d8f35d54b928f25 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 21 Mar 2012 16:33:42 -0700 Subject: mm: thp: fix pmd_bad() triggering in code paths holding mmap_sem read mode In some cases it may happen that pmd_none_or_clear_bad() is called with the mmap_sem hold in read mode. In those cases the huge page faults can allocate hugepmds under pmd_none_or_clear_bad() and that can trigger a false positive from pmd_bad() that will not like to see a pmd materializing as trans huge. It's not khugepaged causing the problem, khugepaged holds the mmap_sem in write mode (and all those sites must hold the mmap_sem in read mode to prevent pagetables to go away from under them, during code review it seems vm86 mode on 32bit kernels requires that too unless it's restricted to 1 thread per process or UP builds). The race is only with the huge pagefaults that can convert a pmd_none() into a pmd_trans_huge(). Effectively all these pmd_none_or_clear_bad() sites running with mmap_sem in read mode are somewhat speculative with the page faults, and the result is always undefined when they run simultaneously. This is probably why it wasn't common to run into this. For example if the madvise(MADV_DONTNEED) runs zap_page_range() shortly before the page fault, the hugepage will not be zapped, if the page fault runs first it will be zapped. Altering pmd_bad() not to error out if it finds hugepmds won't be enough to fix this, because zap_pmd_range would then proceed to call zap_pte_range (which would be incorrect if the pmd become a pmd_trans_huge()). The simplest way to fix this is to read the pmd in the local stack (regardless of what we read, no need of actual CPU barriers, only compiler barrier needed), and be sure it is not changing under the code that computes its value. Even if the real pmd is changing under the value we hold on the stack, we don't care. If we actually end up in zap_pte_range it means the pmd was not none already and it was not huge, and it can't become huge from under us (khugepaged locking explained above). All we need is to enforce that there is no way anymore that in a code path like below, pmd_trans_huge can be false, but pmd_none_or_clear_bad can run into a hugepmd. The overhead of a barrier() is just a compiler tweak and should not be measurable (I only added it for THP builds). I don't exclude different compiler versions may have prevented the race too by caching the value of *pmd on the stack (that hasn't been verified, but it wouldn't be impossible considering pmd_none_or_clear_bad, pmd_bad, pmd_trans_huge, pmd_none are all inlines and there's no external function called in between pmd_trans_huge and pmd_none_or_clear_bad). if (pmd_trans_huge(*pmd)) { if (next-addr != HPAGE_PMD_SIZE) { VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); split_huge_page_pmd(vma->vm_mm, pmd); } else if (zap_huge_pmd(tlb, vma, pmd, addr)) continue; /* fall through */ } if (pmd_none_or_clear_bad(pmd)) Because this race condition could be exercised without special privileges this was reported in CVE-2012-1179. The race was identified and fully explained by Ulrich who debugged it. I'm quoting his accurate explanation below, for reference. ====== start quote ======= mapcount 0 page_mapcount 1 kernel BUG at mm/huge_memory.c:1384! At some point prior to the panic, a "bad pmd ..." message similar to the following is logged on the console: mm/memory.c:145: bad pmd ffff8800376e1f98(80000000314000e7). The "bad pmd ..." message is logged by pmd_clear_bad() before it clears the page's PMD table entry. 143 void pmd_clear_bad(pmd_t *pmd) 144 { -> 145 pmd_ERROR(*pmd); 146 pmd_clear(pmd); 147 } After the PMD table entry has been cleared, there is an inconsistency between the actual number of PMD table entries that are mapping the page and the page's map count (_mapcount field in struct page). When the page is subsequently reclaimed, __split_huge_page() detects this inconsistency. 1381 if (mapcount != page_mapcount(page)) 1382 printk(KERN_ERR "mapcount %d page_mapcount %d\n", 1383 mapcount, page_mapcount(page)); -> 1384 BUG_ON(mapcount != page_mapcount(page)); The root cause of the problem is a race of two threads in a multithreaded process. Thread B incurs a page fault on a virtual address that has never been accessed (PMD entry is zero) while Thread A is executing an madvise() system call on a virtual address within the same 2 MB (huge page) range. virtual address space .---------------------. | | | | .-|---------------------| | | | | | |<-- B(fault) | | | 2 MB | |/////////////////////|-. huge < |/////////////////////| > A(range) page | |/////////////////////|-' | | | | | | '-|---------------------| | | | | '---------------------' - Thread A is executing an madvise(..., MADV_DONTNEED) system call on the virtual address range "A(range)" shown in the picture. sys_madvise // Acquire the semaphore in shared mode. down_read(¤t->mm->mmap_sem) ... madvise_vma switch (behavior) case MADV_DONTNEED: madvise_dontneed zap_page_range unmap_vmas unmap_page_range zap_pud_range zap_pmd_range // // Assume that this huge page has never been accessed. // I.e. content of the PMD entry is zero (not mapped). // if (pmd_trans_huge(*pmd)) { // We don't get here due to the above assumption. } // // Assume that Thread B incurred a page fault and .---------> // sneaks in here as shown below. | // | if (pmd_none_or_clear_bad(pmd)) | { | if (unlikely(pmd_bad(*pmd))) | pmd_clear_bad | { | pmd_ERROR | // Log "bad pmd ..." message here. | pmd_clear | // Clear the page's PMD entry. | // Thread B incremented the map count | // in page_add_new_anon_rmap(), but | // now the page is no longer mapped | // by a PMD entry (-> inconsistency). | } | } | v - Thread B is handling a page fault on virtual address "B(fault)" shown in the picture. ... do_page_fault __do_page_fault // Acquire the semaphore in shared mode. down_read_trylock(&mm->mmap_sem) ... handle_mm_fault if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) // We get here due to the above assumption (PMD entry is zero). do_huge_pmd_anonymous_page alloc_hugepage_vma // Allocate a new transparent huge page here. ... __do_huge_pmd_anonymous_page ... spin_lock(&mm->page_table_lock) ... page_add_new_anon_rmap // Here we increment the page's map count (starts at -1). atomic_set(&page->_mapcount, 0) set_pmd_at // Here we set the page's PMD entry which will be cleared // when Thread A calls pmd_clear_bad(). ... spin_unlock(&mm->page_table_lock) The mmap_sem does not prevent the race because both threads are acquiring it in shared mode (down_read). Thread B holds the page_table_lock while the page's map count and PMD table entry are updated. However, Thread A does not synchronize on that lock. ====== end quote ======= [akpm@linux-foundation.org: checkpatch fixes] Reported-by: Ulrich Obergfell Signed-off-by: Andrea Arcangeli Acked-by: Johannes Weiner Cc: Mel Gorman Cc: Hugh Dickins Cc: Dave Jones Acked-by: Larry Woodman Acked-by: Rik van Riel Cc: [2.6.38+] Cc: Mark Salter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/vm86_32.c | 2 ++ fs/proc/task_mmu.c | 9 +++++++ include/asm-generic/pgtable.h | 61 +++++++++++++++++++++++++++++++++++++++++++ mm/memcontrol.c | 4 +++ mm/memory.c | 16 +++++++++--- mm/mempolicy.c | 2 +- mm/mincore.c | 2 +- mm/pagewalk.c | 2 +- mm/swapfile.c | 4 +-- 9 files changed, 92 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index b466cab5ba15..328cb37bb827 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -172,6 +172,7 @@ static void mark_screen_rdonly(struct mm_struct *mm) spinlock_t *ptl; int i; + down_write(&mm->mmap_sem); pgd = pgd_offset(mm, 0xA0000); if (pgd_none_or_clear_bad(pgd)) goto out; @@ -190,6 +191,7 @@ static void mark_screen_rdonly(struct mm_struct *mm) } pte_unmap_unlock(pte, ptl); out: + up_write(&mm->mmap_sem); flush_tlb(); } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 7dcd2a250495..3efa7253523e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -409,6 +409,9 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, } else { spin_unlock(&walk->mm->page_table_lock); } + + if (pmd_trans_unstable(pmd)) + return 0; /* * The mmap_sem held all the way back in m_start() is what * keeps khugepaged out of here and from collapsing things @@ -507,6 +510,8 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, struct page *page; split_huge_page_pmd(walk->mm, pmd); + if (pmd_trans_unstable(pmd)) + return 0; pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) { @@ -670,6 +675,8 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, int err = 0; split_huge_page_pmd(walk->mm, pmd); + if (pmd_trans_unstable(pmd)) + return 0; /* find the first VMA at or above 'addr' */ vma = find_vma(walk->mm, addr); @@ -961,6 +968,8 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, spin_unlock(&walk->mm->page_table_lock); } + if (pmd_trans_unstable(pmd)) + return 0; orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); do { struct page *page = can_gather_numa_stats(*pte, md->vma, addr); diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 76bff2bff15e..a03c098b0cce 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -425,6 +425,8 @@ extern void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, unsigned long size); #endif +#ifdef CONFIG_MMU + #ifndef CONFIG_TRANSPARENT_HUGEPAGE static inline int pmd_trans_huge(pmd_t pmd) { @@ -441,7 +443,66 @@ static inline int pmd_write(pmd_t pmd) return 0; } #endif /* __HAVE_ARCH_PMD_WRITE */ +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +/* + * This function is meant to be used by sites walking pagetables with + * the mmap_sem hold in read mode to protect against MADV_DONTNEED and + * transhuge page faults. MADV_DONTNEED can convert a transhuge pmd + * into a null pmd and the transhuge page fault can convert a null pmd + * into an hugepmd or into a regular pmd (if the hugepage allocation + * fails). While holding the mmap_sem in read mode the pmd becomes + * stable and stops changing under us only if it's not null and not a + * transhuge pmd. When those races occurs and this function makes a + * difference vs the standard pmd_none_or_clear_bad, the result is + * undefined so behaving like if the pmd was none is safe (because it + * can return none anyway). The compiler level barrier() is critically + * important to compute the two checks atomically on the same pmdval. + */ +static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd) +{ + /* depend on compiler for an atomic pmd read */ + pmd_t pmdval = *pmd; + /* + * The barrier will stabilize the pmdval in a register or on + * the stack so that it will stop changing under the code. + */ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + barrier(); +#endif + if (pmd_none(pmdval)) + return 1; + if (unlikely(pmd_bad(pmdval))) { + if (!pmd_trans_huge(pmdval)) + pmd_clear_bad(pmd); + return 1; + } + return 0; +} + +/* + * This is a noop if Transparent Hugepage Support is not built into + * the kernel. Otherwise it is equivalent to + * pmd_none_or_trans_huge_or_clear_bad(), and shall only be called in + * places that already verified the pmd is not none and they want to + * walk ptes while holding the mmap sem in read mode (write mode don't + * need this). If THP is not enabled, the pmd can't go away under the + * code even if MADV_DONTNEED runs, but if THP is enabled we need to + * run a pmd_trans_unstable before walking the ptes after + * split_huge_page_pmd returns (because it may have run when the pmd + * become null, but then a page fault can map in a THP and not a + * regular page). + */ +static inline int pmd_trans_unstable(pmd_t *pmd) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + return pmd_none_or_trans_huge_or_clear_bad(pmd); +#else + return 0; #endif +} + +#endif /* CONFIG_MMU */ #endif /* !__ASSEMBLY__ */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 26c6f4ec20f4..37281816ff67 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5230,6 +5230,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, spinlock_t *ptl; split_huge_page_pmd(walk->mm, pmd); + if (pmd_trans_unstable(pmd)) + return 0; pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) @@ -5390,6 +5392,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, spinlock_t *ptl; split_huge_page_pmd(walk->mm, pmd); + if (pmd_trans_unstable(pmd)) + return 0; retry: pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; addr += PAGE_SIZE) { diff --git a/mm/memory.c b/mm/memory.c index 347e5fad1cfa..e01abb908b6b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1247,16 +1247,24 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, do { next = pmd_addr_end(addr, end); if (pmd_trans_huge(*pmd)) { - if (next-addr != HPAGE_PMD_SIZE) { + if (next - addr != HPAGE_PMD_SIZE) { VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); split_huge_page_pmd(vma->vm_mm, pmd); } else if (zap_huge_pmd(tlb, vma, pmd, addr)) - continue; + goto next; /* fall through */ } - if (pmd_none_or_clear_bad(pmd)) - continue; + /* + * Here there can be other concurrent MADV_DONTNEED or + * trans huge page faults running, and if the pmd is + * none or trans huge it can change under us. This is + * because MADV_DONTNEED holds the mmap_sem in read + * mode. + */ + if (pmd_none_or_trans_huge_or_clear_bad(pmd)) + goto next; next = zap_pte_range(tlb, vma, pmd, addr, next, details); +next: cond_resched(); } while (pmd++, addr = next, addr != end); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 47296fee23db..0a3757067631 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -512,7 +512,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, do { next = pmd_addr_end(addr, end); split_huge_page_pmd(vma->vm_mm, pmd); - if (pmd_none_or_clear_bad(pmd)) + if (pmd_none_or_trans_huge_or_clear_bad(pmd)) continue; if (check_pte_range(vma, pmd, addr, next, nodes, flags, private)) diff --git a/mm/mincore.c b/mm/mincore.c index 636a86876ff2..936b4cee8cb1 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -164,7 +164,7 @@ static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud, } /* fall through */ } - if (pmd_none_or_clear_bad(pmd)) + if (pmd_none_or_trans_huge_or_clear_bad(pmd)) mincore_unmapped_range(vma, addr, next, vec); else mincore_pte_range(vma, pmd, addr, next, vec); diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 2f5cf10ff660..aa9701e12714 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -59,7 +59,7 @@ again: continue; split_huge_page_pmd(walk->mm, pmd); - if (pmd_none_or_clear_bad(pmd)) + if (pmd_none_or_trans_huge_or_clear_bad(pmd)) goto again; err = walk_pte_range(pmd, addr, next, walk); if (err) diff --git a/mm/swapfile.c b/mm/swapfile.c index 00a962caab1a..44595a373e42 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -932,9 +932,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); - if (unlikely(pmd_trans_huge(*pmd))) - continue; - if (pmd_none_or_clear_bad(pmd)) + if (pmd_none_or_trans_huge_or_clear_bad(pmd)) continue; ret = unuse_pte_range(vma, pmd, addr, next, entry, page); if (ret) -- cgit v1.2.3 From b716ad953a2bc4a543143c1d9836b7007a4b182f Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 21 Mar 2012 16:33:56 -0700 Subject: mm: search from free_area_cache for the bigger size If the required size is bigger than cached_hole_size it is better to search from free_area_cache - it is easier to get a free region, specifically for the 64 bit process whose address space is large enough Do it just as hugetlb_get_unmapped_area_topdown() in arch/x86/mm/hugetlbpage.c Signed-off-by: Xiao Guangrong Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Michal Hocko Cc: Hillf Danton Cc: Andrea Arcangeli Cc: KAMEZAWA Hiroyuki Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/sys_x86_64.c | 34 +++++++++++++++++----------------- mm/mmap.c | 36 +++++++++++++++++++++--------------- 2 files changed, 38 insertions(+), 32 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 051489082d59..ef59642ff1bf 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -195,7 +195,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, { struct vm_area_struct *vma; struct mm_struct *mm = current->mm; - unsigned long addr = addr0; + unsigned long addr = addr0, start_addr; /* requested length too big for entire address space */ if (len > TASK_SIZE) @@ -223,25 +223,14 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, mm->free_area_cache = mm->mmap_base; } +try_again: /* either no address requested or can't fit in requested address hole */ - addr = mm->free_area_cache; - - /* make sure it can fit in the remaining address space */ - if (addr > len) { - unsigned long tmp_addr = align_addr(addr - len, filp, - ALIGN_TOPDOWN); - - vma = find_vma(mm, tmp_addr); - if (!vma || tmp_addr + len <= vma->vm_start) - /* remember the address as a hint for next time */ - return mm->free_area_cache = tmp_addr; - } - - if (mm->mmap_base < len) - goto bottomup; + start_addr = addr = mm->free_area_cache; - addr = mm->mmap_base-len; + if (addr < len) + goto fail; + addr -= len; do { addr = align_addr(addr, filp, ALIGN_TOPDOWN); @@ -263,6 +252,17 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, addr = vma->vm_start-len; } while (len < vma->vm_start); +fail: + /* + * if hint left us with no space for the requested + * mapping then try again: + */ + if (start_addr != mm->mmap_base) { + mm->free_area_cache = mm->mmap_base; + mm->cached_hole_size = 0; + goto try_again; + } + bottomup: /* * A failed mmap() very likely causes application failure, diff --git a/mm/mmap.c b/mm/mmap.c index 4f31764d838f..9e0c0de2e7e3 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1442,7 +1442,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, { struct vm_area_struct *vma; struct mm_struct *mm = current->mm; - unsigned long addr = addr0; + unsigned long addr = addr0, start_addr; /* requested length too big for entire address space */ if (len > TASK_SIZE) @@ -1466,22 +1466,14 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, mm->free_area_cache = mm->mmap_base; } +try_again: /* either no address requested or can't fit in requested address hole */ - addr = mm->free_area_cache; + start_addr = addr = mm->free_area_cache; - /* make sure it can fit in the remaining address space */ - if (addr > len) { - vma = find_vma(mm, addr-len); - if (!vma || addr <= vma->vm_start) - /* remember the address as a hint for next time */ - return (mm->free_area_cache = addr-len); - } - - if (mm->mmap_base < len) - goto bottomup; - - addr = mm->mmap_base-len; + if (addr < len) + goto fail; + addr -= len; do { /* * Lookup failure means no vma is above this address, @@ -1501,7 +1493,21 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, addr = vma->vm_start-len; } while (len < vma->vm_start); -bottomup: +fail: + /* + * if hint left us with no space for the requested + * mapping then try again: + * + * Note: this is different with the case of bottomup + * which does the fully line-search, but we use find_vma + * here that causes some holes skipped. + */ + if (start_addr != mm->mmap_base) { + mm->free_area_cache = mm->mmap_base; + mm->cached_hole_size = 0; + goto try_again; + } + /* * A failed mmap() very likely causes application failure, * so fall back to the bottom-up function here. This scenario -- cgit v1.2.3 From 639077fb69aec8112e5427210a83d0fb192969f0 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 19 Mar 2012 15:16:48 -0500 Subject: kgdb: x86: Return all segment registers also in 64-bit mode Even if the content is always 0, gdb expects us to return also ds, es, fs, and gs while in x86-64 mode. Do this to avoid ugly errors on "info registers". [jason.wessel@windriver.com: adjust NUMREGBYTES for two new regs] Signed-off-by: Jan Kiszka Signed-off-by: Jason Wessel --- arch/x86/include/asm/kgdb.h | 10 +++++++--- arch/x86/kernel/kgdb.c | 6 ++++-- 2 files changed, 11 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/kgdb.h b/arch/x86/include/asm/kgdb.h index 77e95f54570a..332f98c9111f 100644 --- a/arch/x86/include/asm/kgdb.h +++ b/arch/x86/include/asm/kgdb.h @@ -64,11 +64,15 @@ enum regnames { GDB_PS, /* 17 */ GDB_CS, /* 18 */ GDB_SS, /* 19 */ + GDB_DS, /* 20 */ + GDB_ES, /* 21 */ + GDB_FS, /* 22 */ + GDB_GS, /* 23 */ }; #define GDB_ORIG_AX 57 -#define DBG_MAX_REG_NUM 20 -/* 17 64 bit regs and 3 32 bit regs */ -#define NUMREGBYTES ((17 * 8) + (3 * 4)) +#define DBG_MAX_REG_NUM 24 +/* 17 64 bit regs and 5 32 bit regs */ +#define NUMREGBYTES ((17 * 8) + (5 * 4)) #endif /* ! CONFIG_X86_32 */ static inline void arch_kgdb_breakpoint(void) diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index faba5771acad..fdc37b3d0ce3 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -67,8 +67,6 @@ struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = { "ss", 4, offsetof(struct pt_regs, ss) }, { "ds", 4, offsetof(struct pt_regs, ds) }, { "es", 4, offsetof(struct pt_regs, es) }, - { "fs", 4, -1 }, - { "gs", 4, -1 }, #else { "ax", 8, offsetof(struct pt_regs, ax) }, { "bx", 8, offsetof(struct pt_regs, bx) }, @@ -90,7 +88,11 @@ struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = { "flags", 4, offsetof(struct pt_regs, flags) }, { "cs", 4, offsetof(struct pt_regs, cs) }, { "ss", 4, offsetof(struct pt_regs, ss) }, + { "ds", 4, -1 }, + { "es", 4, -1 }, #endif + { "fs", 4, -1 }, + { "gs", 4, -1 }, }; int dbg_set_reg(int regno, void *mem, struct pt_regs *regs) -- cgit v1.2.3 From 29a2e2836ff9ea65a603c89df217f4198973a74f Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Thu, 22 Mar 2012 21:39:25 +0100 Subject: x86-32: Fix endless loop when processing signals for kernel tasks The problem occurs on !CONFIG_VM86 kernels [1] when a kernel-mode task returns from a system call with a pending signal. A real-life scenario is a child of 'khelper' returning from a failed kernel_execve() in ____call_usermodehelper() [ kernel/kmod.c ]. kernel_execve() fails due to a pending SIGKILL, which is the result of "kill -9 -1" (at least, busybox's init does it upon reboot). The loop is as follows: * syscall_exit_work: - work_pending: // start_of_the_loop - work_notify_sig: - do_notify_resume() - do_signal() - if (!user_mode(regs)) return; - resume_userspace // TIF_SIGPENDING is still set - work_pending // so we call work_pending => goto // start_of_the_loop More information can be found in another LKML thread: http://www.serverphorums.com/read.php?12,457826 [1] the problem was also seen on MIPS. Signed-off-by: Dmitry Adamushko Link: http://lkml.kernel.org/r/1332448765.2299.68.camel@dimm Cc: Oleg Nesterov Cc: Roland McGrath Cc: Andrew Morton Cc: Signed-off-by: H. Peter Anvin --- arch/x86/kernel/entry_32.S | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 79d97e68f042..7b784f4ef1e4 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -98,12 +98,6 @@ #endif .endm -#ifdef CONFIG_VM86 -#define resume_userspace_sig check_userspace -#else -#define resume_userspace_sig resume_userspace -#endif - /* * User gs save/restore * @@ -327,10 +321,19 @@ ret_from_exception: preempt_stop(CLBR_ANY) ret_from_intr: GET_THREAD_INFO(%ebp) -check_userspace: +resume_userspace_sig: +#ifdef CONFIG_VM86 movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS movb PT_CS(%esp), %al andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax +#else + /* + * We can be coming here from a syscall done in the kernel space, + * e.g. a failed kernel_execve(). + */ + movl PT_CS(%esp), %eax + andl $SEGMENT_RPL_MASK, %eax +#endif cmpl $USER_RPL, %eax jb resume_kernel # not returning to v8086 or userspace -- cgit v1.2.3 From 0b8b8078cb4db2ebe9a20f2b2eaeb58988be32bd Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 22 Mar 2012 21:31:43 -0700 Subject: x86: Fix excessive MSR print out when show_msr is not specified Dave found: | During bootup, I now have 162 messages like this.. | [ 0.227346] MSR0000001b: 00000000fee00900 | [ 0.227465] MSR00000021: 0000000000000001 | [ 0.227584] MSR0000002a: 00000000c1c81400 | | commit 21c3fcf3e39353d4f21d50e257cc74f3204b1988 looks suspect. | It claims that it will only print these out if show_msr= is | passed, but that doesn't seem to be the case. Fix it by changing to the version that checks the index. Reported-and-tested-by: Dave Jones Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1332477103-4595-1-git-send-email-yinghai@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index ade9c794ed98..b24032355a76 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -998,7 +998,7 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) else printk(KERN_CONT "\n"); - __print_cpu_msr(); + print_cpu_msr(c); } void __cpuinit print_cpu_msr(struct cpuinfo_x86 *c) -- cgit v1.2.3 From b7157acf429e6aef690646ba964b9ebd25049ec2 Mon Sep 17 00:00:00 2001 From: Steffen Persvold Date: Fri, 16 Mar 2012 20:25:35 +0100 Subject: x86/apic: Add separate apic_id_valid() functions for selected apic drivers As suggested by Suresh Siddha and Yinghai Lu: For x2apic pre-enabled systems, apic driver is set already early through early_acpi_boot_init()/early_acpi_process_madt()/ acpi_parse_madt()/default_acpi_madt_oem_check() path so that apic_id_valid() checking will be sufficient during MADT and SRAT parsing. For non-x2apic pre-enabled systems, all apic ids should be less than 255. This allows us to substitute the checks in arch/x86/kernel/acpi/boot.c::acpi_parse_x2apic() and arch/x86/mm/srat.c::acpi_numa_x2apic_affinity_init() with apic->apic_id_valid(). In addition we can avoid feigning the x2apic cpu feature in the NumaChip apic code. The following apic drivers have separate apic_id_valid() functions which will accept x2apic type IDs : x2apic_phys x2apic_cluster x2apic_uv_x apic_numachip Signed-off-by: Steffen Persvold Cc: Suresh Siddha Cc: Daniel J Blueman Cc: Yinghai Lu Cc: Jack Steiner Link: http://lkml.kernel.org/r/1331925935-13372-1-git-send-email-sp@numascale.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 2 +- arch/x86/include/asm/x2apic.h | 5 +++++ arch/x86/kernel/acpi/boot.c | 2 +- arch/x86/kernel/apic/apic_numachip.c | 3 +-- arch/x86/kernel/apic/x2apic_cluster.c | 2 +- arch/x86/kernel/apic/x2apic_phys.c | 2 +- arch/x86/kernel/apic/x2apic_uv_x.c | 7 ++++++- arch/x86/mm/srat.c | 2 +- 8 files changed, 17 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index a9371c91718c..d3eaac44860a 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -535,7 +535,7 @@ static inline unsigned int read_apic_id(void) static inline int default_apic_id_valid(int apicid) { - return x2apic_mode || (apicid < 255); + return (apicid < 255); } extern void default_setup_apic_routing(void); diff --git a/arch/x86/include/asm/x2apic.h b/arch/x86/include/asm/x2apic.h index 6bf5b8e478c0..92e54abf89e0 100644 --- a/arch/x86/include/asm/x2apic.h +++ b/arch/x86/include/asm/x2apic.h @@ -18,6 +18,11 @@ static const struct cpumask *x2apic_target_cpus(void) return cpu_online_mask; } +static int x2apic_apic_id_valid(int apicid) +{ + return 1; +} + static int x2apic_apic_id_registered(void) { return 1; diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 406ed77216d0..0f42c2f44311 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -239,7 +239,7 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end) * to not preallocating memory for all NR_CPUS * when we use CPU hotplug. */ - if (!cpu_has_x2apic && (apic_id >= 0xff) && enabled) + if (!apic->apic_id_valid(apic_id) && enabled) printk(KERN_WARNING PREFIX "x2apic entry ignored\n"); else acpi_register_lapic(apic_id, enabled); diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index d9ea5f331ac5..899803e03214 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -229,11 +229,10 @@ static int __init numachip_system_init(void) } early_initcall(numachip_system_init); -static int __cpuinit numachip_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +static int numachip_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { if (!strncmp(oem_id, "NUMASC", 6)) { numachip_system = 1; - setup_force_cpu_cap(X86_FEATURE_X2APIC); return 1; } diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 9193713060a9..48f3103b3c93 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -213,7 +213,7 @@ static struct apic apic_x2apic_cluster = { .name = "cluster x2apic", .probe = x2apic_cluster_probe, .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, - .apic_id_valid = default_apic_id_valid, + .apic_id_valid = x2apic_apic_id_valid, .apic_id_registered = x2apic_apic_id_registered, .irq_delivery_mode = dest_LowestPrio, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index bcd1db6eaca9..8a778db45e3a 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -119,7 +119,7 @@ static struct apic apic_x2apic_phys = { .name = "physical x2apic", .probe = x2apic_phys_probe, .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, - .apic_id_valid = default_apic_id_valid, + .apic_id_valid = x2apic_apic_id_valid, .apic_id_registered = x2apic_apic_id_registered, .irq_delivery_mode = dest_Fixed, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index fc4771425852..87bfa69e216e 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -266,6 +266,11 @@ static void uv_send_IPI_all(int vector) uv_send_IPI_mask(cpu_online_mask, vector); } +static int uv_apic_id_valid(int apicid) +{ + return 1; +} + static int uv_apic_id_registered(void) { return 1; @@ -351,7 +356,7 @@ static struct apic __refdata apic_x2apic_uv_x = { .name = "UV large system", .probe = uv_probe, .acpi_madt_oem_check = uv_acpi_madt_oem_check, - .apic_id_valid = default_apic_id_valid, + .apic_id_valid = uv_apic_id_valid, .apic_id_registered = uv_apic_id_registered, .irq_delivery_mode = dest_Fixed, diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index 1c1c4f46a7c1..efb5b4b93711 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -70,7 +70,7 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) return; pxm = pa->proximity_domain; apic_id = pa->apic_id; - if (!cpu_has_x2apic && (apic_id >= 0xff)) { + if (!apic->apic_id_valid(apic_id)) { printk(KERN_INFO "SRAT: PXM %u -> X2APIC 0x%04x ignored\n", pxm, apic_id); return; -- cgit v1.2.3 From 4da7072ad6831a35a11341097ce477e18651bedd Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Tue, 20 Mar 2012 15:19:36 +0100 Subject: x86/io_apic: Move and reenable irq only when CONFIG_GENERIC_PENDING_IRQ=y This patch removes dead code from certain .config variations. When CONFIG_GENERIC_PENDING_IRQ=n irq move and reenable code is never get executed, nor do_unmask_irq variable updates its init value. Move the code under CONFIG_GENERIC_PENDING_IRQ macro. Signed-off-by: Alexander Gordeev Link: http://lkml.kernel.org/r/20120320141935.GA24806@dhcp-26-207.brq.redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 101 +++++++++++++++++++++++++---------------- 1 file changed, 61 insertions(+), 40 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 6d10a66fc5a9..2c428c5d7ca3 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2512,21 +2512,73 @@ static void ack_apic_edge(struct irq_data *data) atomic_t irq_mis_count; -static void ack_apic_level(struct irq_data *data) -{ - struct irq_cfg *cfg = data->chip_data; - int i, do_unmask_irq = 0, irq = data->irq; - unsigned long v; - - irq_complete_move(cfg); #ifdef CONFIG_GENERIC_PENDING_IRQ +static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg) +{ /* If we are moving the irq we need to mask it */ if (unlikely(irqd_is_setaffinity_pending(data))) { - do_unmask_irq = 1; mask_ioapic(cfg); + return true; } + return false; +} + +static inline void ioapic_irqd_unmask(struct irq_data *data, + struct irq_cfg *cfg, bool masked) +{ + if (unlikely(masked)) { + /* Only migrate the irq if the ack has been received. + * + * On rare occasions the broadcast level triggered ack gets + * delayed going to ioapics, and if we reprogram the + * vector while Remote IRR is still set the irq will never + * fire again. + * + * To prevent this scenario we read the Remote IRR bit + * of the ioapic. This has two effects. + * - On any sane system the read of the ioapic will + * flush writes (and acks) going to the ioapic from + * this cpu. + * - We get to see if the ACK has actually been delivered. + * + * Based on failed experiments of reprogramming the + * ioapic entry from outside of irq context starting + * with masking the ioapic entry and then polling until + * Remote IRR was clear before reprogramming the + * ioapic I don't trust the Remote IRR bit to be + * completey accurate. + * + * However there appears to be no other way to plug + * this race, so if the Remote IRR bit is not + * accurate and is causing problems then it is a hardware bug + * and you can go talk to the chipset vendor about it. + */ + if (!io_apic_level_ack_pending(cfg)) + irq_move_masked_irq(data); + unmask_ioapic(cfg); + } +} +#else +static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg) +{ + return false; +} +static inline void ioapic_irqd_unmask(struct irq_data *data, + struct irq_cfg *cfg, bool masked) +{ +} #endif +static void ack_apic_level(struct irq_data *data) +{ + struct irq_cfg *cfg = data->chip_data; + int i, irq = data->irq; + unsigned long v; + bool masked; + + irq_complete_move(cfg); + masked = ioapic_irqd_mask(data, cfg); + /* * It appears there is an erratum which affects at least version 0x11 * of I/O APIC (that's the 82093AA and cores integrated into various @@ -2581,38 +2633,7 @@ static void ack_apic_level(struct irq_data *data) eoi_ioapic_irq(irq, cfg); } - /* Now we can move and renable the irq */ - if (unlikely(do_unmask_irq)) { - /* Only migrate the irq if the ack has been received. - * - * On rare occasions the broadcast level triggered ack gets - * delayed going to ioapics, and if we reprogram the - * vector while Remote IRR is still set the irq will never - * fire again. - * - * To prevent this scenario we read the Remote IRR bit - * of the ioapic. This has two effects. - * - On any sane system the read of the ioapic will - * flush writes (and acks) going to the ioapic from - * this cpu. - * - We get to see if the ACK has actually been delivered. - * - * Based on failed experiments of reprogramming the - * ioapic entry from outside of irq context starting - * with masking the ioapic entry and then polling until - * Remote IRR was clear before reprogramming the - * ioapic I don't trust the Remote IRR bit to be - * completey accurate. - * - * However there appears to be no other way to plug - * this race, so if the Remote IRR bit is not - * accurate and is causing problems then it is a hardware bug - * and you can go talk to the chipset vendor about it. - */ - if (!io_apic_level_ack_pending(cfg)) - irq_move_masked_irq(data); - unmask_ioapic(cfg); - } + ioapic_irqd_unmask(data, cfg, masked); } #ifdef CONFIG_IRQ_REMAP -- cgit v1.2.3 From 91ec87d57fc38c529034e853687dfb7756de5406 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 22 Mar 2012 21:15:51 -0700 Subject: x86-64: Simplify and optimize vdso clock_gettime monotonic variants We used to store the wall-to-monotonic offset and the realtime base. It's faster to precompute the monotonic base. This is about a 3% speedup on Sandy Bridge for CLOCK_MONOTONIC. It's much more impressive for CLOCK_MONOTONIC_COARSE. Signed-off-by: Andy Lutomirski Signed-off-by: John Stultz --- arch/x86/include/asm/vgtod.h | 15 +++++++++------ arch/x86/kernel/vsyscall_64.c | 10 +++++++++- arch/x86/vdso/vclock_gettime.c | 38 ++++++++------------------------------ 3 files changed, 26 insertions(+), 37 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index 1f007178c813..8b38be2de9e1 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -7,11 +7,6 @@ struct vsyscall_gtod_data { seqcount_t seq; - /* open coded 'struct timespec' */ - time_t wall_time_sec; - u32 wall_time_nsec; - - struct timezone sys_tz; struct { /* extract of a clocksource struct */ int vclock_mode; cycle_t cycle_last; @@ -19,8 +14,16 @@ struct vsyscall_gtod_data { u32 mult; u32 shift; } clock; - struct timespec wall_to_monotonic; + + /* open coded 'struct timespec' */ + time_t wall_time_sec; + u32 wall_time_nsec; + u32 monotonic_time_nsec; + time_t monotonic_time_sec; + + struct timezone sys_tz; struct timespec wall_time_coarse; + struct timespec monotonic_time_coarse; }; extern struct vsyscall_gtod_data vsyscall_gtod_data; diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index cdc95a707cd1..4285f1f404c2 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -84,6 +84,7 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, struct clocksource *clock, u32 mult) { write_seqcount_begin(&vsyscall_gtod_data.seq); + struct timespec monotonic; /* copy vsyscall data */ vsyscall_gtod_data.clock.vclock_mode = clock->archdata.vclock_mode; @@ -91,10 +92,17 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, vsyscall_gtod_data.clock.mask = clock->mask; vsyscall_gtod_data.clock.mult = mult; vsyscall_gtod_data.clock.shift = clock->shift; + vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; - vsyscall_gtod_data.wall_to_monotonic = *wtm; + + monotonic = timespec_add(*wall_time, *wtm); + vsyscall_gtod_data.monotonic_time_sec = monotonic.tv_sec; + vsyscall_gtod_data.monotonic_time_nsec = monotonic.tv_nsec; + vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); + vsyscall_gtod_data.monotonic_time_coarse = + timespec_add(vsyscall_gtod_data.wall_time_coarse, *wtm); write_seqcount_end(&vsyscall_gtod_data.seq); } diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 944c5e5d6b6a..6eea70b8f384 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -113,27 +113,17 @@ notrace static noinline int do_realtime(struct timespec *ts) notrace static noinline int do_monotonic(struct timespec *ts) { - unsigned long seq, ns, secs; + unsigned long seq, ns; int mode; do { seq = read_seqcount_begin(>od->seq); mode = gtod->clock.vclock_mode; - secs = gtod->wall_time_sec; - ns = gtod->wall_time_nsec + vgetns(); - secs += gtod->wall_to_monotonic.tv_sec; - ns += gtod->wall_to_monotonic.tv_nsec; + ts->tv_sec = gtod->monotonic_time_sec; + ts->tv_nsec = gtod->monotonic_time_nsec; + ns = vgetns(); } while (unlikely(read_seqcount_retry(>od->seq, seq))); - - /* wall_time_nsec, vgetns(), and wall_to_monotonic.tv_nsec - * are all guaranteed to be nonnegative. - */ - while (ns >= NSEC_PER_SEC) { - ns -= NSEC_PER_SEC; - ++secs; - } - ts->tv_sec = secs; - ts->tv_nsec = ns; + timespec_add_ns(ts, ns); return mode; } @@ -151,25 +141,13 @@ notrace static noinline int do_realtime_coarse(struct timespec *ts) notrace static noinline int do_monotonic_coarse(struct timespec *ts) { - unsigned long seq, ns, secs; + unsigned long seq; do { seq = read_seqcount_begin(>od->seq); - secs = gtod->wall_time_coarse.tv_sec; - ns = gtod->wall_time_coarse.tv_nsec; - secs += gtod->wall_to_monotonic.tv_sec; - ns += gtod->wall_to_monotonic.tv_nsec; + ts->tv_sec = gtod->monotonic_time_coarse.tv_sec; + ts->tv_nsec = gtod->monotonic_time_coarse.tv_nsec; } while (unlikely(read_seqcount_retry(>od->seq, seq))); - /* wall_time_nsec and wall_to_monotonic.tv_nsec are - * guaranteed to be between 0 and NSEC_PER_SEC. - */ - if (ns >= NSEC_PER_SEC) { - ns -= NSEC_PER_SEC; - ++secs; - } - ts->tv_sec = secs; - ts->tv_nsec = ns; - return 0; } -- cgit v1.2.3 From 307b1cd7ecd7f3dc5ce3d3860957f034f0abe4df Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Fri, 23 Mar 2012 15:02:03 -0700 Subject: bitops: rename for_each_set_bit_cont() in favor of analogous list.h function This renames for_each_set_bit_cont() to for_each_set_bit_from() because it is analogous to list_for_each_entry_from() in list.h rather than list_for_each_entry_continue(). This doesn't remove for_each_set_bit_cont() for now. Signed-off-by: Akinobu Mita Cc: Robert Richter Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/perf_event.c | 4 ++-- include/linux/bitops.h | 5 ++++- tools/perf/util/include/linux/bitops.h | 2 +- 3 files changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 0a18d16cb58d..fa2900c0e398 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -643,14 +643,14 @@ static bool __perf_sched_find_counter(struct perf_sched *sched) /* Prefer fixed purpose counters */ if (x86_pmu.num_counters_fixed) { idx = X86_PMC_IDX_FIXED; - for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_MAX) { + for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) { if (!__test_and_set_bit(idx, sched->state.used)) goto done; } } /* Grab the first unused counter starting with idx */ idx = sched->state.counter; - for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_FIXED) { + for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_FIXED) { if (!__test_and_set_bit(idx, sched->state.used)) goto done; } diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 94300fe46cce..a78e358f0c17 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -27,11 +27,14 @@ extern unsigned long __sw_hweight64(__u64 w); (bit) = find_next_bit((addr), (size), (bit) + 1)) /* same as for_each_set_bit() but use bit as value to start with */ -#define for_each_set_bit_cont(bit, addr, size) \ +#define for_each_set_bit_from(bit, addr, size) \ for ((bit) = find_next_bit((addr), (size), (bit)); \ (bit) < (size); \ (bit) = find_next_bit((addr), (size), (bit) + 1)) +#define for_each_set_bit_cont(bit, addr, size) \ + for_each_set_bit_from(bit, addr, size) + static __inline__ int get_bitmask_order(unsigned int count) { int order; diff --git a/tools/perf/util/include/linux/bitops.h b/tools/perf/util/include/linux/bitops.h index 62cdee78db7b..f1584833bd22 100644 --- a/tools/perf/util/include/linux/bitops.h +++ b/tools/perf/util/include/linux/bitops.h @@ -15,7 +15,7 @@ (bit) = find_next_bit((addr), (size), (bit) + 1)) /* same as for_each_set_bit() but use bit as value to start with */ -#define for_each_set_bit_cont(bit, addr, size) \ +#define for_each_set_bit_from(bit, addr, size) \ for ((bit) = find_next_bit((addr), (size), (bit)); \ (bit) < (size); \ (bit) = find_next_bit((addr), (size), (bit) + 1)) -- cgit v1.2.3 From 0b2f4d4d76a09f02fa37bfa57909483448fac771 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Fri, 23 Mar 2012 15:02:06 -0700 Subject: x86: use for_each_clear_bit_from() Use for_each_clear_bit() to iterate over all the cleared bit in a memory region. Signed-off-by: Akinobu Mita Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/irqinit.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 313fb5cddbce..43e2b1cff0a7 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -306,10 +306,10 @@ void __init native_init_IRQ(void) * us. (some of these will be overridden and become * 'special' SMP interrupts) */ - for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { + i = FIRST_EXTERNAL_VECTOR; + for_each_clear_bit_from(i, used_vectors, NR_VECTORS) { /* IA32_SYSCALL_VECTOR could be used in trap_init already. */ - if (!test_bit(i, used_vectors)) - set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); + set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); } if (!acpi_ioapic && !of_ioapic) -- cgit v1.2.3 From 65c0ff4079c011232e795e62c74a0a95512b7ac3 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 23 Mar 2012 14:02:55 -0700 Subject: x86: Stop recursive fault in print_context_stack after stack overflow After printing out the first line of a stack backtrace, print_context_stack() calls print_ftrace_graph_addr() to check if it's making a graph of function calls, usually not the case. But unfortunate ordering of assignments causes this to oops if an earlier stack overflow corrupted threadinfo->task. Reorder to avoid that irritation. ( The fact that there was a stack overflow may often be more interesting than the stack that can now be shown; but integrating that information with this stacktrace is awkward, so leave it to overflow reporting. ) Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Cc: Namhyung Kim Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20120323225648.15DD5A033B@akpm.mtv.corp.google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 4025fe4f928f..90bf130f09bc 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -37,13 +37,16 @@ print_ftrace_graph_addr(unsigned long addr, void *data, const struct stacktrace_ops *ops, struct thread_info *tinfo, int *graph) { - struct task_struct *task = tinfo->task; + struct task_struct *task; unsigned long ret_addr; - int index = task->curr_ret_stack; + int index; if (addr != (unsigned long)return_to_handler) return; + task = tinfo->task; + index = task->curr_ret_stack; + if (!task->ret_stack || index < *graph) return; -- cgit v1.2.3 From 68fe7b23d559763a2e19e5fc1cf7036e4aaecb10 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 Mar 2012 09:29:22 +0100 Subject: x86: vdso: Put declaration before code Sigh, warnings are there for a reason. Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: John Stultz --- arch/x86/kernel/vsyscall_64.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 4285f1f404c2..d5c69860b524 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -83,9 +83,10 @@ void update_vsyscall_tz(void) void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, struct clocksource *clock, u32 mult) { - write_seqcount_begin(&vsyscall_gtod_data.seq); struct timespec monotonic; + write_seqcount_begin(&vsyscall_gtod_data.seq); + /* copy vsyscall data */ vsyscall_gtod_data.clock.vclock_mode = clock->archdata.vclock_mode; vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; -- cgit v1.2.3 From 90e240142bd31ff10aeda5a280a53153f4eff004 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Sun, 25 Mar 2012 23:00:04 +0200 Subject: x86: Merge the x86_32 and x86_64 cpu_idle() functions Both functions are mostly identical. The differences are: - x86_32's cpu_idle() makes use of check_pgt_cache(), which is a nop on both x86_32 and x86_64. - x86_64's cpu_idle() uses enter/__exit_idle/(), on x86_32 these function are a nop. - In contrast to x86_32, x86_64 calls rcu_idle_enter/exit() in the innermost loop because idle notifications need RCU. Calling these function on x86_32 also in the innermost loop does not hurt. So we can merge both functions. Signed-off-by: Richard Weinberger Acked-by: Frederic Weisbecker Cc: paulmck@linux.vnet.ibm.com Cc: josh@joshtriplett.org Cc: tj@kernel.org Link: http://lkml.kernel.org/r/1332709204-22496-1-git-send-email-richard@nod.at Signed-off-by: Ingo Molnar --- arch/x86/include/asm/idle.h | 1 + arch/x86/kernel/process.c | 114 +++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/process_32.c | 58 ---------------------- arch/x86/kernel/process_64.c | 107 ---------------------------------------- 4 files changed, 115 insertions(+), 165 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/idle.h b/arch/x86/include/asm/idle.h index f49253d75710..c5d1785373ed 100644 --- a/arch/x86/include/asm/idle.h +++ b/arch/x86/include/asm/idle.h @@ -14,6 +14,7 @@ void exit_idle(void); #else /* !CONFIG_X86_64 */ static inline void enter_idle(void) { } static inline void exit_idle(void) { } +static inline void __exit_idle(void) { } #endif /* CONFIG_X86_64 */ void amd_e400_remove_cpu(int cpu); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 14baf78d5a1f..29309c42b9e5 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -12,6 +12,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -23,6 +26,24 @@ #include #include #include +#include + +#ifdef CONFIG_X86_64 +static DEFINE_PER_CPU(unsigned char, is_idle); +static ATOMIC_NOTIFIER_HEAD(idle_notifier); + +void idle_notifier_register(struct notifier_block *n) +{ + atomic_notifier_chain_register(&idle_notifier, n); +} +EXPORT_SYMBOL_GPL(idle_notifier_register); + +void idle_notifier_unregister(struct notifier_block *n) +{ + atomic_notifier_chain_unregister(&idle_notifier, n); +} +EXPORT_SYMBOL_GPL(idle_notifier_unregister); +#endif struct kmem_cache *task_xstate_cachep; EXPORT_SYMBOL_GPL(task_xstate_cachep); @@ -371,6 +392,99 @@ static inline int hlt_use_halt(void) } #endif +#ifndef CONFIG_SMP +static inline void play_dead(void) +{ + BUG(); +} +#endif + +#ifdef CONFIG_X86_64 +void enter_idle(void) +{ + percpu_write(is_idle, 1); + atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); +} + +static void __exit_idle(void) +{ + if (x86_test_and_clear_bit_percpu(0, is_idle) == 0) + return; + atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL); +} + +/* Called from interrupts to signify idle end */ +void exit_idle(void) +{ + /* idle loop has pid 0 */ + if (current->pid) + return; + __exit_idle(); +} +#endif + +/* + * The idle thread. There's no useful work to be + * done, so just try to conserve power and have a + * low exit latency (ie sit in a loop waiting for + * somebody to say that they'd like to reschedule) + */ +void cpu_idle(void) +{ + /* + * If we're the non-boot CPU, nothing set the stack canary up + * for us. CPU0 already has it initialized but no harm in + * doing it again. This is a good place for updating it, as + * we wont ever return from this function (so the invalid + * canaries already on the stack wont ever trigger). + */ + boot_init_stack_canary(); + current_thread_info()->status |= TS_POLLING; + + while (1) { + tick_nohz_idle_enter(); + + while (!need_resched()) { + rmb(); + + if (cpu_is_offline(smp_processor_id())) + play_dead(); + + /* + * Idle routines should keep interrupts disabled + * from here on, until they go to idle. + * Otherwise, idle callbacks can misfire. + */ + local_touch_nmi(); + local_irq_disable(); + + enter_idle(); + + /* Don't trace irqs off for idle */ + stop_critical_timings(); + + /* enter_idle() needs rcu for notifiers */ + rcu_idle_enter(); + + if (cpuidle_idle_call()) + pm_idle(); + + rcu_idle_exit(); + start_critical_timings(); + + /* In many cases the interrupt that ended idle + has already called exit_idle. But some idle + loops can be woken up without interrupt. */ + __exit_idle(); + } + + tick_nohz_idle_exit(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); + } +} + /* * We use this if we don't have any better * idle routine.. diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 9d7d4842bfaf..ea207c245aa4 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -9,7 +9,6 @@ * This file handles the architecture-dependent parts of process handling.. */ -#include #include #include #include @@ -31,14 +30,12 @@ #include #include #include -#include #include #include #include #include #include #include -#include #include #include @@ -58,7 +55,6 @@ #include #include #include -#include asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); @@ -70,60 +66,6 @@ unsigned long thread_saved_pc(struct task_struct *tsk) return ((unsigned long *)tsk->thread.sp)[3]; } -#ifndef CONFIG_SMP -static inline void play_dead(void) -{ - BUG(); -} -#endif - -/* - * The idle thread. There's no useful work to be - * done, so just try to conserve power and have a - * low exit latency (ie sit in a loop waiting for - * somebody to say that they'd like to reschedule) - */ -void cpu_idle(void) -{ - int cpu = smp_processor_id(); - - /* - * If we're the non-boot CPU, nothing set the stack canary up - * for us. CPU0 already has it initialized but no harm in - * doing it again. This is a good place for updating it, as - * we wont ever return from this function (so the invalid - * canaries already on the stack wont ever trigger). - */ - boot_init_stack_canary(); - - current_thread_info()->status |= TS_POLLING; - - /* endless idle loop with no priority at all */ - while (1) { - tick_nohz_idle_enter(); - rcu_idle_enter(); - while (!need_resched()) { - - check_pgt_cache(); - rmb(); - - if (cpu_is_offline(cpu)) - play_dead(); - - local_touch_nmi(); - local_irq_disable(); - /* Don't trace irqs off for idle */ - stop_critical_timings(); - if (cpuidle_idle_call()) - pm_idle(); - start_critical_timings(); - } - rcu_idle_exit(); - tick_nohz_idle_exit(); - schedule_preempt_disabled(); - } -} - void __show_regs(struct pt_regs *regs, int all) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 292da13fc5aa..ce5e34f2beca 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -14,7 +14,6 @@ * This file handles the architecture-dependent parts of process handling.. */ -#include #include #include #include @@ -32,12 +31,10 @@ #include #include #include -#include #include #include #include #include -#include #include #include @@ -52,114 +49,10 @@ #include #include #include -#include asmlinkage extern void ret_from_fork(void); DEFINE_PER_CPU(unsigned long, old_rsp); -static DEFINE_PER_CPU(unsigned char, is_idle); - -static ATOMIC_NOTIFIER_HEAD(idle_notifier); - -void idle_notifier_register(struct notifier_block *n) -{ - atomic_notifier_chain_register(&idle_notifier, n); -} -EXPORT_SYMBOL_GPL(idle_notifier_register); - -void idle_notifier_unregister(struct notifier_block *n) -{ - atomic_notifier_chain_unregister(&idle_notifier, n); -} -EXPORT_SYMBOL_GPL(idle_notifier_unregister); - -void enter_idle(void) -{ - percpu_write(is_idle, 1); - atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); -} - -static void __exit_idle(void) -{ - if (x86_test_and_clear_bit_percpu(0, is_idle) == 0) - return; - atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL); -} - -/* Called from interrupts to signify idle end */ -void exit_idle(void) -{ - /* idle loop has pid 0 */ - if (current->pid) - return; - __exit_idle(); -} - -#ifndef CONFIG_SMP -static inline void play_dead(void) -{ - BUG(); -} -#endif - -/* - * The idle thread. There's no useful work to be - * done, so just try to conserve power and have a - * low exit latency (ie sit in a loop waiting for - * somebody to say that they'd like to reschedule) - */ -void cpu_idle(void) -{ - current_thread_info()->status |= TS_POLLING; - - /* - * If we're the non-boot CPU, nothing set the stack canary up - * for us. CPU0 already has it initialized but no harm in - * doing it again. This is a good place for updating it, as - * we wont ever return from this function (so the invalid - * canaries already on the stack wont ever trigger). - */ - boot_init_stack_canary(); - - /* endless idle loop with no priority at all */ - while (1) { - tick_nohz_idle_enter(); - while (!need_resched()) { - - rmb(); - - if (cpu_is_offline(smp_processor_id())) - play_dead(); - /* - * Idle routines should keep interrupts disabled - * from here on, until they go to idle. - * Otherwise, idle callbacks can misfire. - */ - local_touch_nmi(); - local_irq_disable(); - enter_idle(); - /* Don't trace irqs off for idle */ - stop_critical_timings(); - - /* enter_idle() needs rcu for notifiers */ - rcu_idle_enter(); - - if (cpuidle_idle_call()) - pm_idle(); - - rcu_idle_exit(); - start_critical_timings(); - - /* In many cases the interrupt that ended idle - has already called exit_idle. But some idle - loops can be woken up without interrupt. */ - __exit_idle(); - } - - tick_nohz_idle_exit(); - schedule_preempt_disabled(); - } -} /* Prints also some state that isn't saved in the pt_regs */ void __show_regs(struct pt_regs *regs, int all) -- cgit v1.2.3 From bc758133ed73d4b06952bec21da23e28e62bf3ba Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 26 Mar 2012 13:16:15 +0200 Subject: sched/x86/smp: Do not enable IRQs over calibrate_delay() We should not ever enable IRQs until we're fully set up. This opens up a window where interrupts can hit the cpu and interrupts can do wakeups, wakeups need state that isn't set-up yet, in particular this cpu isn't elegible to run tasks, so if any cpu-affine task that got created in CPU_UP_PREPARE manages to get a wakeup, its affinity mask will get broken and we'll run into lots of 'interesting' problems. Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Link: http://lkml.kernel.org/n/tip-yaezmlbriluh166tfkgni22m@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 58f78165d308..89571a0c4a49 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -219,14 +219,9 @@ static void __cpuinit smp_callin(void) * Update loops_per_jiffy in cpu_data. Previous call to * smp_store_cpu_info() stored a value that is close but not as * accurate as the value just calculated. - * - * Need to enable IRQs because it can take longer and then - * the NMI watchdog might kill us. */ - local_irq_enable(); calibrate_delay(); cpu_data(cpuid).loops_per_jiffy = loops_per_jiffy; - local_irq_disable(); pr_debug("Stack at about %p\n", &cpuid); /* -- cgit v1.2.3 From 136d249ef7dbf0fefa292082cc40be1ea864cbd6 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 21 Mar 2012 22:58:08 -0400 Subject: x86/ioapic: Add io_apic_ops driver layer to allow interception Xen dom0 needs to paravirtualize IO operations to the IO APIC, so add a io_apic_ops for it to intercept. Do this as ops structure because there's at least some chance that another paravirtualized environment may want to intercept these. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Konrad Rzeszutek Wilk Acked-by: Suresh Siddha Cc: jwboyer@redhat.com Cc: yinghai@kernel.org Link: http://lkml.kernel.org/r/1332385090-18056-2-git-send-email-konrad.wilk@oracle.com [ Made all the affected code easier on the eyes ] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/io_apic.h | 9 +++++++ arch/x86/kernel/apic/io_apic.c | 58 +++++++++++++++++++++++++++++++++++++----- 2 files changed, 60 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 690d1cc9a877..2c4943de5150 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -21,6 +21,15 @@ #define IO_APIC_REDIR_LEVEL_TRIGGER (1 << 15) #define IO_APIC_REDIR_MASKED (1 << 16) +struct io_apic_ops { + void (*init) (void); + unsigned int (*read) (unsigned int apic, unsigned int reg); + void (*write) (unsigned int apic, unsigned int reg, unsigned int value); + void (*modify)(unsigned int apic, unsigned int reg, unsigned int value); +}; + +void __init set_io_apic_ops(const struct io_apic_ops *); + /* * The structure of the IO-APIC: */ diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 2c428c5d7ca3..e88300d8e80a 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -64,9 +64,28 @@ #include #define __apicdebuginit(type) static type __init + #define for_each_irq_pin(entry, head) \ for (entry = head; entry; entry = entry->next) +static void __init __ioapic_init_mappings(void); + +static unsigned int __io_apic_read (unsigned int apic, unsigned int reg); +static void __io_apic_write (unsigned int apic, unsigned int reg, unsigned int val); +static void __io_apic_modify(unsigned int apic, unsigned int reg, unsigned int val); + +static struct io_apic_ops io_apic_ops = { + .init = __ioapic_init_mappings, + .read = __io_apic_read, + .write = __io_apic_write, + .modify = __io_apic_modify, +}; + +void __init set_io_apic_ops(const struct io_apic_ops *ops) +{ + io_apic_ops = *ops; +} + /* * Is the SiS APIC rmw bug present ? * -1 = don't know, 0 = no, 1 = yes @@ -294,6 +313,22 @@ static void free_irq_at(unsigned int at, struct irq_cfg *cfg) irq_free_desc(at); } +static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) +{ + return io_apic_ops.read(apic, reg); +} + +static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) +{ + io_apic_ops.write(apic, reg, value); +} + +static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) +{ + io_apic_ops.modify(apic, reg, value); +} + + struct io_apic { unsigned int index; unsigned int unused[3]; @@ -314,16 +349,17 @@ static inline void io_apic_eoi(unsigned int apic, unsigned int vector) writel(vector, &io_apic->eoi); } -static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) +static unsigned int __io_apic_read(unsigned int apic, unsigned int reg) { struct io_apic __iomem *io_apic = io_apic_base(apic); writel(reg, &io_apic->index); return readl(&io_apic->data); } -static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) +static void __io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) { struct io_apic __iomem *io_apic = io_apic_base(apic); + writel(reg, &io_apic->index); writel(value, &io_apic->data); } @@ -334,7 +370,7 @@ static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned i * * Older SiS APIC requires we rewrite the index register */ -static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) +static void __io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) { struct io_apic __iomem *io_apic = io_apic_base(apic); @@ -377,6 +413,7 @@ static struct IO_APIC_route_entry __ioapic_read_entry(int apic, int pin) eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); + return eu.entry; } @@ -384,9 +421,11 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) { union entry_union eu; unsigned long flags; + raw_spin_lock_irqsave(&ioapic_lock, flags); eu.entry = __ioapic_read_entry(apic, pin); raw_spin_unlock_irqrestore(&ioapic_lock, flags); + return eu.entry; } @@ -396,8 +435,7 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) * the interrupt, and we need to make sure the entry is fully populated * before that happens. */ -static void -__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) +static void __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) { union entry_union eu = {{0, 0}}; @@ -409,6 +447,7 @@ __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) { unsigned long flags; + raw_spin_lock_irqsave(&ioapic_lock, flags); __ioapic_write_entry(apic, pin, e); raw_spin_unlock_irqrestore(&ioapic_lock, flags); @@ -435,8 +474,7 @@ static void ioapic_mask_entry(int apic, int pin) * shared ISA-space IRQs, so we have to support them. We are super * fast in the common case, and fast for shared ISA-space IRQs. */ -static int -__add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) +static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) { struct irq_pin_list **last, *entry; @@ -521,6 +559,7 @@ static void io_apic_sync(struct irq_pin_list *entry) * a dummy read from the IO-APIC */ struct io_apic __iomem *io_apic; + io_apic = io_apic_base(entry->apic); readl(&io_apic->data); } @@ -3893,6 +3932,11 @@ static struct resource * __init ioapic_setup_resources(int nr_ioapics) } void __init ioapic_and_gsi_init(void) +{ + io_apic_ops.init(); +} + +static void __init __ioapic_init_mappings(void) { unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; struct resource *ioapic_res; -- cgit v1.2.3 From 8f0750f19789cf352d7e24a6cc50f2ab1b4f1372 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 24 Mar 2012 10:52:50 +0300 Subject: x86, tls: Off by one limit check These are used as offsets into an array of GDT_ENTRY_TLS_ENTRIES members so GDT_ENTRY_TLS_ENTRIES is one past the end of the array. Signed-off-by: Dan Carpenter Link: http://lkml.kernel.org/r/20120324075250.GA28258@elgon.mountain Cc: Signed-off-by: H. Peter Anvin --- arch/x86/kernel/tls.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index 6bb7b8579e70..bcfec2d23769 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -163,7 +163,7 @@ int regset_tls_get(struct task_struct *target, const struct user_regset *regset, { const struct desc_struct *tls; - if (pos > GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) || + if (pos >= GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) || (pos % sizeof(struct user_desc)) != 0 || (count % sizeof(struct user_desc)) != 0) return -EINVAL; @@ -198,7 +198,7 @@ int regset_tls_set(struct task_struct *target, const struct user_regset *regset, struct user_desc infobuf[GDT_ENTRY_TLS_ENTRIES]; const struct user_desc *info; - if (pos > GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) || + if (pos >= GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) || (pos % sizeof(struct user_desc)) != 0 || (count % sizeof(struct user_desc)) != 0) return -EINVAL; -- cgit v1.2.3 From f05e798ad4c09255f590f5b2c00a7ca6c172f983 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 28 Mar 2012 18:11:12 +0100 Subject: Disintegrate asm/system.h for X86 Disintegrate asm/system.h for X86. Signed-off-by: David Howells Acked-by: H. Peter Anvin cc: x86@kernel.org --- arch/x86/ia32/ia32_aout.c | 1 - arch/x86/include/asm/apic.h | 1 - arch/x86/include/asm/auxvec.h | 7 + arch/x86/include/asm/barrier.h | 116 +++++++ arch/x86/include/asm/bug.h | 4 + arch/x86/include/asm/cacheflush.h | 1 + arch/x86/include/asm/elf.h | 1 - arch/x86/include/asm/exec.h | 1 + arch/x86/include/asm/futex.h | 1 - arch/x86/include/asm/i387.h | 1 - arch/x86/include/asm/local.h | 1 - arch/x86/include/asm/mc146818rtc.h | 1 - arch/x86/include/asm/processor.h | 31 +- arch/x86/include/asm/segment.h | 58 +++- arch/x86/include/asm/special_insns.h | 199 ++++++++++++ arch/x86/include/asm/stackprotector.h | 1 - arch/x86/include/asm/switch_to.h | 129 ++++++++ arch/x86/include/asm/system.h | 527 +------------------------------ arch/x86/include/asm/tlbflush.h | 2 +- arch/x86/include/asm/virtext.h | 1 - arch/x86/kernel/acpi/cstate.c | 1 + arch/x86/kernel/apm_32.c | 1 - arch/x86/kernel/cpu/mcheck/p5.c | 1 - arch/x86/kernel/cpu/mcheck/therm_throt.c | 1 - arch/x86/kernel/cpu/mcheck/winchip.c | 1 - arch/x86/kernel/cpu/mtrr/generic.c | 1 - arch/x86/kernel/cpuid.c | 1 - arch/x86/kernel/i8259.c | 1 - arch/x86/kernel/irqinit.c | 1 - arch/x86/kernel/kgdb.c | 1 - arch/x86/kernel/ldt.c | 1 - arch/x86/kernel/machine_kexec_32.c | 1 - arch/x86/kernel/mca_32.c | 1 - arch/x86/kernel/module.c | 1 - arch/x86/kernel/msr.c | 1 - arch/x86/kernel/paravirt.c | 1 + arch/x86/kernel/pci-calgary_64.c | 1 - arch/x86/kernel/process.c | 1 - arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 2 +- arch/x86/kernel/ptrace.c | 1 - arch/x86/kernel/setup.c | 1 - arch/x86/kernel/tce_64.c | 1 + arch/x86/kernel/tls.c | 1 - arch/x86/kernel/traps.c | 1 - arch/x86/mm/init.c | 1 - arch/x86/mm/init_32.c | 1 - arch/x86/mm/init_64.c | 1 - arch/x86/mm/pgtable_32.c | 1 - arch/x86/power/hibernate_32.c | 1 - 50 files changed, 554 insertions(+), 562 deletions(-) create mode 100644 arch/x86/include/asm/barrier.h create mode 100644 arch/x86/include/asm/exec.h create mode 100644 arch/x86/include/asm/special_insns.h create mode 100644 arch/x86/include/asm/switch_to.h (limited to 'arch/x86/kernel') diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index 4c2e59a420b9..d511d951a052 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -26,7 +26,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index a9371c91718c..4b2caeefe1a2 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -11,7 +11,6 @@ #include #include #include -#include #include #define ARCH_APICTIMER_STOPS_ON_C3 1 diff --git a/arch/x86/include/asm/auxvec.h b/arch/x86/include/asm/auxvec.h index 1316b4c35425..77203ac352de 100644 --- a/arch/x86/include/asm/auxvec.h +++ b/arch/x86/include/asm/auxvec.h @@ -9,4 +9,11 @@ #endif #define AT_SYSINFO_EHDR 33 +/* entries in ARCH_DLINFO: */ +#if defined(CONFIG_IA32_EMULATION) || !defined(CONFIG_X86_64) +# define AT_VECTOR_SIZE_ARCH 2 +#else /* else it's non-compat x86-64 */ +# define AT_VECTOR_SIZE_ARCH 1 +#endif + #endif /* _ASM_X86_AUXVEC_H */ diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h new file mode 100644 index 000000000000..c6cd358a1eec --- /dev/null +++ b/arch/x86/include/asm/barrier.h @@ -0,0 +1,116 @@ +#ifndef _ASM_X86_BARRIER_H +#define _ASM_X86_BARRIER_H + +#include +#include + +/* + * Force strict CPU ordering. + * And yes, this is required on UP too when we're talking + * to devices. + */ + +#ifdef CONFIG_X86_32 +/* + * Some non-Intel clones support out of order store. wmb() ceases to be a + * nop for these. + */ +#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2) +#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2) +#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM) +#else +#define mb() asm volatile("mfence":::"memory") +#define rmb() asm volatile("lfence":::"memory") +#define wmb() asm volatile("sfence" ::: "memory") +#endif + +/** + * read_barrier_depends - Flush all pending reads that subsequents reads + * depend on. + * + * No data-dependent reads from memory-like regions are ever reordered + * over this barrier. All reads preceding this primitive are guaranteed + * to access memory (but not necessarily other CPUs' caches) before any + * reads following this primitive that depend on the data return by + * any of the preceding reads. This primitive is much lighter weight than + * rmb() on most CPUs, and is never heavier weight than is + * rmb(). + * + * These ordering constraints are respected by both the local CPU + * and the compiler. + * + * Ordering is not guaranteed by anything other than these primitives, + * not even by data dependencies. See the documentation for + * memory_barrier() for examples and URLs to more information. + * + * For example, the following code would force ordering (the initial + * value of "a" is zero, "b" is one, and "p" is "&a"): + * + * + * CPU 0 CPU 1 + * + * b = 2; + * memory_barrier(); + * p = &b; q = p; + * read_barrier_depends(); + * d = *q; + * + * + * because the read of "*q" depends on the read of "p" and these + * two reads are separated by a read_barrier_depends(). However, + * the following code, with the same initial values for "a" and "b": + * + * + * CPU 0 CPU 1 + * + * a = 2; + * memory_barrier(); + * b = 3; y = b; + * read_barrier_depends(); + * x = a; + * + * + * does not enforce ordering, since there is no data dependency between + * the read of "a" and the read of "b". Therefore, on some CPUs, such + * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb() + * in cases like this where there are no data dependencies. + **/ + +#define read_barrier_depends() do { } while (0) + +#ifdef CONFIG_SMP +#define smp_mb() mb() +#ifdef CONFIG_X86_PPRO_FENCE +# define smp_rmb() rmb() +#else +# define smp_rmb() barrier() +#endif +#ifdef CONFIG_X86_OOSTORE +# define smp_wmb() wmb() +#else +# define smp_wmb() barrier() +#endif +#define smp_read_barrier_depends() read_barrier_depends() +#define set_mb(var, value) do { (void)xchg(&var, value); } while (0) +#else +#define smp_mb() barrier() +#define smp_rmb() barrier() +#define smp_wmb() barrier() +#define smp_read_barrier_depends() do { } while (0) +#define set_mb(var, value) do { var = value; barrier(); } while (0) +#endif + +/* + * Stop RDTSC speculation. This is needed when you need to use RDTSC + * (or get_cycles or vread that possibly accesses the TSC) in a defined + * code region. + * + * (Could use an alternative three way for this if there was one.) + */ +static __always_inline void rdtsc_barrier(void) +{ + alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC); + alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); +} + +#endif /* _ASM_X86_BARRIER_H */ diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index f654d1bb17fb..11e1152222d0 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -36,4 +36,8 @@ do { \ #endif /* !CONFIG_BUG */ #include + + +extern void show_regs_common(void); + #endif /* _ASM_X86_BUG_H */ diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index 4e12668711e5..9863ee3747da 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -3,6 +3,7 @@ /* Caches aren't brain-dead on the intel. */ #include +#include #ifdef CONFIG_X86_PAT /* diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 5f962df30d0f..f27f79abe021 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -84,7 +84,6 @@ extern unsigned int vdso_enabled; (((x)->e_machine == EM_386) || ((x)->e_machine == EM_486)) #include -#include #ifdef CONFIG_X86_32 #include diff --git a/arch/x86/include/asm/exec.h b/arch/x86/include/asm/exec.h new file mode 100644 index 000000000000..54c2e1db274a --- /dev/null +++ b/arch/x86/include/asm/exec.h @@ -0,0 +1 @@ +/* define arch_align_stack() here */ diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h index d09bb03653f0..71ecbcba1a4e 100644 --- a/arch/x86/include/asm/futex.h +++ b/arch/x86/include/asm/futex.h @@ -9,7 +9,6 @@ #include #include #include -#include #define __futex_atomic_op1(insn, ret, oldval, uaddr, oparg) \ asm volatile("1:\t" insn "\n" \ diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 7ce0798b1b26..257d9cca214f 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -14,7 +14,6 @@ #include #include -#include struct pt_regs; struct user_i387_struct; diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h index 9cdae5d47e8f..c8bed0da434a 100644 --- a/arch/x86/include/asm/local.h +++ b/arch/x86/include/asm/local.h @@ -3,7 +3,6 @@ #include -#include #include #include diff --git a/arch/x86/include/asm/mc146818rtc.h b/arch/x86/include/asm/mc146818rtc.h index 0e8e85bb7c51..d354fb781c57 100644 --- a/arch/x86/include/asm/mc146818rtc.h +++ b/arch/x86/include/asm/mc146818rtc.h @@ -5,7 +5,6 @@ #define _ASM_X86_MC146818RTC_H #include -#include #include #include diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 95da14f7ee85..78e30ea492b2 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -14,13 +14,13 @@ struct mm_struct; #include #include #include -#include #include #include #include #include #include #include +#include #include #include @@ -29,6 +29,15 @@ struct mm_struct; #include #include #include +#include + +/* + * We handle most unaligned accesses in hardware. On the other hand + * unaligned DMA can be quite expensive on some Nehalem processors. + * + * Based on this we disable the IP header alignment in network drivers. + */ +#define NET_IP_ALIGN 0 #define HBP_NUM 4 /* @@ -1022,4 +1031,24 @@ extern bool cpu_has_amd_erratum(const int *); #define cpu_has_amd_erratum(x) (false) #endif /* CONFIG_CPU_SUP_AMD */ +#ifdef CONFIG_X86_32 +/* + * disable hlt during certain critical i/o operations + */ +#define HAVE_DISABLE_HLT +#endif + +void disable_hlt(void); +void enable_hlt(void); + +void cpu_idle_wait(void); + +extern unsigned long arch_align_stack(unsigned long sp); +extern void free_init_pages(char *what, unsigned long begin, unsigned long end); + +void default_idle(void); +bool set_pm_idle_to_default(void); + +void stop_this_cpu(void *dummy); + #endif /* _ASM_X86_PROCESSOR_H */ diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 5e641715c3fe..165466233ab0 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -212,7 +212,61 @@ #ifdef __KERNEL__ #ifndef __ASSEMBLY__ extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][10]; -#endif -#endif + +/* + * Load a segment. Fall back on loading the zero + * segment if something goes wrong.. + */ +#define loadsegment(seg, value) \ +do { \ + unsigned short __val = (value); \ + \ + asm volatile(" \n" \ + "1: movl %k0,%%" #seg " \n" \ + \ + ".section .fixup,\"ax\" \n" \ + "2: xorl %k0,%k0 \n" \ + " jmp 1b \n" \ + ".previous \n" \ + \ + _ASM_EXTABLE(1b, 2b) \ + \ + : "+r" (__val) : : "memory"); \ +} while (0) + +/* + * Save a segment register away + */ +#define savesegment(seg, value) \ + asm("mov %%" #seg ",%0":"=r" (value) : : "memory") + +/* + * x86_32 user gs accessors. + */ +#ifdef CONFIG_X86_32 +#ifdef CONFIG_X86_32_LAZY_GS +#define get_user_gs(regs) (u16)({unsigned long v; savesegment(gs, v); v;}) +#define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v)) +#define task_user_gs(tsk) ((tsk)->thread.gs) +#define lazy_save_gs(v) savesegment(gs, (v)) +#define lazy_load_gs(v) loadsegment(gs, (v)) +#else /* X86_32_LAZY_GS */ +#define get_user_gs(regs) (u16)((regs)->gs) +#define set_user_gs(regs, v) do { (regs)->gs = (v); } while (0) +#define task_user_gs(tsk) (task_pt_regs(tsk)->gs) +#define lazy_save_gs(v) do { } while (0) +#define lazy_load_gs(v) do { } while (0) +#endif /* X86_32_LAZY_GS */ +#endif /* X86_32 */ + +static inline unsigned long get_limit(unsigned long segment) +{ + unsigned long __limit; + asm("lsll %1,%0" : "=r" (__limit) : "r" (segment)); + return __limit + 1; +} + +#endif /* !__ASSEMBLY__ */ +#endif /* __KERNEL__ */ #endif /* _ASM_X86_SEGMENT_H */ diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h new file mode 100644 index 000000000000..41fc93a2e225 --- /dev/null +++ b/arch/x86/include/asm/special_insns.h @@ -0,0 +1,199 @@ +#ifndef _ASM_X86_SPECIAL_INSNS_H +#define _ASM_X86_SPECIAL_INSNS_H + + +#ifdef __KERNEL__ + +static inline void native_clts(void) +{ + asm volatile("clts"); +} + +/* + * Volatile isn't enough to prevent the compiler from reordering the + * read/write functions for the control registers and messing everything up. + * A memory clobber would solve the problem, but would prevent reordering of + * all loads stores around it, which can hurt performance. Solution is to + * use a variable and mimic reads and writes to it to enforce serialization + */ +static unsigned long __force_order; + +static inline unsigned long native_read_cr0(void) +{ + unsigned long val; + asm volatile("mov %%cr0,%0\n\t" : "=r" (val), "=m" (__force_order)); + return val; +} + +static inline void native_write_cr0(unsigned long val) +{ + asm volatile("mov %0,%%cr0": : "r" (val), "m" (__force_order)); +} + +static inline unsigned long native_read_cr2(void) +{ + unsigned long val; + asm volatile("mov %%cr2,%0\n\t" : "=r" (val), "=m" (__force_order)); + return val; +} + +static inline void native_write_cr2(unsigned long val) +{ + asm volatile("mov %0,%%cr2": : "r" (val), "m" (__force_order)); +} + +static inline unsigned long native_read_cr3(void) +{ + unsigned long val; + asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order)); + return val; +} + +static inline void native_write_cr3(unsigned long val) +{ + asm volatile("mov %0,%%cr3": : "r" (val), "m" (__force_order)); +} + +static inline unsigned long native_read_cr4(void) +{ + unsigned long val; + asm volatile("mov %%cr4,%0\n\t" : "=r" (val), "=m" (__force_order)); + return val; +} + +static inline unsigned long native_read_cr4_safe(void) +{ + unsigned long val; + /* This could fault if %cr4 does not exist. In x86_64, a cr4 always + * exists, so it will never fail. */ +#ifdef CONFIG_X86_32 + asm volatile("1: mov %%cr4, %0\n" + "2:\n" + _ASM_EXTABLE(1b, 2b) + : "=r" (val), "=m" (__force_order) : "0" (0)); +#else + val = native_read_cr4(); +#endif + return val; +} + +static inline void native_write_cr4(unsigned long val) +{ + asm volatile("mov %0,%%cr4": : "r" (val), "m" (__force_order)); +} + +#ifdef CONFIG_X86_64 +static inline unsigned long native_read_cr8(void) +{ + unsigned long cr8; + asm volatile("movq %%cr8,%0" : "=r" (cr8)); + return cr8; +} + +static inline void native_write_cr8(unsigned long val) +{ + asm volatile("movq %0,%%cr8" :: "r" (val) : "memory"); +} +#endif + +static inline void native_wbinvd(void) +{ + asm volatile("wbinvd": : :"memory"); +} + +extern void native_load_gs_index(unsigned); + +#ifdef CONFIG_PARAVIRT +#include +#else + +static inline unsigned long read_cr0(void) +{ + return native_read_cr0(); +} + +static inline void write_cr0(unsigned long x) +{ + native_write_cr0(x); +} + +static inline unsigned long read_cr2(void) +{ + return native_read_cr2(); +} + +static inline void write_cr2(unsigned long x) +{ + native_write_cr2(x); +} + +static inline unsigned long read_cr3(void) +{ + return native_read_cr3(); +} + +static inline void write_cr3(unsigned long x) +{ + native_write_cr3(x); +} + +static inline unsigned long read_cr4(void) +{ + return native_read_cr4(); +} + +static inline unsigned long read_cr4_safe(void) +{ + return native_read_cr4_safe(); +} + +static inline void write_cr4(unsigned long x) +{ + native_write_cr4(x); +} + +static inline void wbinvd(void) +{ + native_wbinvd(); +} + +#ifdef CONFIG_X86_64 + +static inline unsigned long read_cr8(void) +{ + return native_read_cr8(); +} + +static inline void write_cr8(unsigned long x) +{ + native_write_cr8(x); +} + +static inline void load_gs_index(unsigned selector) +{ + native_load_gs_index(selector); +} + +#endif + +/* Clear the 'TS' bit */ +static inline void clts(void) +{ + native_clts(); +} + +#endif/* CONFIG_PARAVIRT */ + +#define stts() write_cr0(read_cr0() | X86_CR0_TS) + +static inline void clflush(volatile void *__p) +{ + asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p)); +} + +#define nop() asm volatile ("nop") + + +#endif /* __KERNEL__ */ + +#endif /* _ASM_X86_SPECIAL_INSNS_H */ diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index 157517763565..b5d9533d2c38 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h @@ -38,7 +38,6 @@ #include #include #include -#include #include #include diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h new file mode 100644 index 000000000000..4ec45b3abba1 --- /dev/null +++ b/arch/x86/include/asm/switch_to.h @@ -0,0 +1,129 @@ +#ifndef _ASM_X86_SWITCH_TO_H +#define _ASM_X86_SWITCH_TO_H + +struct task_struct; /* one of the stranger aspects of C forward declarations */ +struct task_struct *__switch_to(struct task_struct *prev, + struct task_struct *next); +struct tss_struct; +void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, + struct tss_struct *tss); + +#ifdef CONFIG_X86_32 + +#ifdef CONFIG_CC_STACKPROTECTOR +#define __switch_canary \ + "movl %P[task_canary](%[next]), %%ebx\n\t" \ + "movl %%ebx, "__percpu_arg([stack_canary])"\n\t" +#define __switch_canary_oparam \ + , [stack_canary] "=m" (stack_canary.canary) +#define __switch_canary_iparam \ + , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) +#else /* CC_STACKPROTECTOR */ +#define __switch_canary +#define __switch_canary_oparam +#define __switch_canary_iparam +#endif /* CC_STACKPROTECTOR */ + +/* + * Saving eflags is important. It switches not only IOPL between tasks, + * it also protects other tasks from NT leaking through sysenter etc. + */ +#define switch_to(prev, next, last) \ +do { \ + /* \ + * Context-switching clobbers all registers, so we clobber \ + * them explicitly, via unused output variables. \ + * (EAX and EBP is not listed because EBP is saved/restored \ + * explicitly for wchan access and EAX is the return value of \ + * __switch_to()) \ + */ \ + unsigned long ebx, ecx, edx, esi, edi; \ + \ + asm volatile("pushfl\n\t" /* save flags */ \ + "pushl %%ebp\n\t" /* save EBP */ \ + "movl %%esp,%[prev_sp]\n\t" /* save ESP */ \ + "movl %[next_sp],%%esp\n\t" /* restore ESP */ \ + "movl $1f,%[prev_ip]\n\t" /* save EIP */ \ + "pushl %[next_ip]\n\t" /* restore EIP */ \ + __switch_canary \ + "jmp __switch_to\n" /* regparm call */ \ + "1:\t" \ + "popl %%ebp\n\t" /* restore EBP */ \ + "popfl\n" /* restore flags */ \ + \ + /* output parameters */ \ + : [prev_sp] "=m" (prev->thread.sp), \ + [prev_ip] "=m" (prev->thread.ip), \ + "=a" (last), \ + \ + /* clobbered output registers: */ \ + "=b" (ebx), "=c" (ecx), "=d" (edx), \ + "=S" (esi), "=D" (edi) \ + \ + __switch_canary_oparam \ + \ + /* input parameters: */ \ + : [next_sp] "m" (next->thread.sp), \ + [next_ip] "m" (next->thread.ip), \ + \ + /* regparm parameters for __switch_to(): */ \ + [prev] "a" (prev), \ + [next] "d" (next) \ + \ + __switch_canary_iparam \ + \ + : /* reloaded segment registers */ \ + "memory"); \ +} while (0) + +#else /* CONFIG_X86_32 */ + +/* frame pointer must be last for get_wchan */ +#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t" +#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t" + +#define __EXTRA_CLOBBER \ + , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \ + "r12", "r13", "r14", "r15" + +#ifdef CONFIG_CC_STACKPROTECTOR +#define __switch_canary \ + "movq %P[task_canary](%%rsi),%%r8\n\t" \ + "movq %%r8,"__percpu_arg([gs_canary])"\n\t" +#define __switch_canary_oparam \ + , [gs_canary] "=m" (irq_stack_union.stack_canary) +#define __switch_canary_iparam \ + , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) +#else /* CC_STACKPROTECTOR */ +#define __switch_canary +#define __switch_canary_oparam +#define __switch_canary_iparam +#endif /* CC_STACKPROTECTOR */ + +/* Save restore flags to clear handle leaking NT */ +#define switch_to(prev, next, last) \ + asm volatile(SAVE_CONTEXT \ + "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ + "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \ + "call __switch_to\n\t" \ + "movq "__percpu_arg([current_task])",%%rsi\n\t" \ + __switch_canary \ + "movq %P[thread_info](%%rsi),%%r8\n\t" \ + "movq %%rax,%%rdi\n\t" \ + "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \ + "jnz ret_from_fork\n\t" \ + RESTORE_CONTEXT \ + : "=a" (last) \ + __switch_canary_oparam \ + : [next] "S" (next), [prev] "D" (prev), \ + [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \ + [ti_flags] "i" (offsetof(struct thread_info, flags)), \ + [_tif_fork] "i" (_TIF_FORK), \ + [thread_info] "i" (offsetof(struct task_struct, stack)), \ + [current_task] "m" (current_task) \ + __switch_canary_iparam \ + : "memory", "cc" __EXTRA_CLOBBER) + +#endif /* CONFIG_X86_32 */ + +#endif /* _ASM_X86_SWITCH_TO_H */ diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h index 2d2f01ce6dcb..0d84f9e42fde 100644 --- a/arch/x86/include/asm/system.h +++ b/arch/x86/include/asm/system.h @@ -1,523 +1,6 @@ -#ifndef _ASM_X86_SYSTEM_H -#define _ASM_X86_SYSTEM_H - -#include -#include -#include +/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */ +#include #include -#include - -#include -#include - -/* entries in ARCH_DLINFO: */ -#if defined(CONFIG_IA32_EMULATION) || !defined(CONFIG_X86_64) -# define AT_VECTOR_SIZE_ARCH 2 -#else /* else it's non-compat x86-64 */ -# define AT_VECTOR_SIZE_ARCH 1 -#endif - -struct task_struct; /* one of the stranger aspects of C forward declarations */ -struct task_struct *__switch_to(struct task_struct *prev, - struct task_struct *next); -struct tss_struct; -void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, - struct tss_struct *tss); -extern void show_regs_common(void); - -#ifdef CONFIG_X86_32 - -#ifdef CONFIG_CC_STACKPROTECTOR -#define __switch_canary \ - "movl %P[task_canary](%[next]), %%ebx\n\t" \ - "movl %%ebx, "__percpu_arg([stack_canary])"\n\t" -#define __switch_canary_oparam \ - , [stack_canary] "=m" (stack_canary.canary) -#define __switch_canary_iparam \ - , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) -#else /* CC_STACKPROTECTOR */ -#define __switch_canary -#define __switch_canary_oparam -#define __switch_canary_iparam -#endif /* CC_STACKPROTECTOR */ - -/* - * Saving eflags is important. It switches not only IOPL between tasks, - * it also protects other tasks from NT leaking through sysenter etc. - */ -#define switch_to(prev, next, last) \ -do { \ - /* \ - * Context-switching clobbers all registers, so we clobber \ - * them explicitly, via unused output variables. \ - * (EAX and EBP is not listed because EBP is saved/restored \ - * explicitly for wchan access and EAX is the return value of \ - * __switch_to()) \ - */ \ - unsigned long ebx, ecx, edx, esi, edi; \ - \ - asm volatile("pushfl\n\t" /* save flags */ \ - "pushl %%ebp\n\t" /* save EBP */ \ - "movl %%esp,%[prev_sp]\n\t" /* save ESP */ \ - "movl %[next_sp],%%esp\n\t" /* restore ESP */ \ - "movl $1f,%[prev_ip]\n\t" /* save EIP */ \ - "pushl %[next_ip]\n\t" /* restore EIP */ \ - __switch_canary \ - "jmp __switch_to\n" /* regparm call */ \ - "1:\t" \ - "popl %%ebp\n\t" /* restore EBP */ \ - "popfl\n" /* restore flags */ \ - \ - /* output parameters */ \ - : [prev_sp] "=m" (prev->thread.sp), \ - [prev_ip] "=m" (prev->thread.ip), \ - "=a" (last), \ - \ - /* clobbered output registers: */ \ - "=b" (ebx), "=c" (ecx), "=d" (edx), \ - "=S" (esi), "=D" (edi) \ - \ - __switch_canary_oparam \ - \ - /* input parameters: */ \ - : [next_sp] "m" (next->thread.sp), \ - [next_ip] "m" (next->thread.ip), \ - \ - /* regparm parameters for __switch_to(): */ \ - [prev] "a" (prev), \ - [next] "d" (next) \ - \ - __switch_canary_iparam \ - \ - : /* reloaded segment registers */ \ - "memory"); \ -} while (0) - -/* - * disable hlt during certain critical i/o operations - */ -#define HAVE_DISABLE_HLT -#else - -/* frame pointer must be last for get_wchan */ -#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t" -#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t" - -#define __EXTRA_CLOBBER \ - , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \ - "r12", "r13", "r14", "r15" - -#ifdef CONFIG_CC_STACKPROTECTOR -#define __switch_canary \ - "movq %P[task_canary](%%rsi),%%r8\n\t" \ - "movq %%r8,"__percpu_arg([gs_canary])"\n\t" -#define __switch_canary_oparam \ - , [gs_canary] "=m" (irq_stack_union.stack_canary) -#define __switch_canary_iparam \ - , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) -#else /* CC_STACKPROTECTOR */ -#define __switch_canary -#define __switch_canary_oparam -#define __switch_canary_iparam -#endif /* CC_STACKPROTECTOR */ - -/* Save restore flags to clear handle leaking NT */ -#define switch_to(prev, next, last) \ - asm volatile(SAVE_CONTEXT \ - "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ - "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \ - "call __switch_to\n\t" \ - "movq "__percpu_arg([current_task])",%%rsi\n\t" \ - __switch_canary \ - "movq %P[thread_info](%%rsi),%%r8\n\t" \ - "movq %%rax,%%rdi\n\t" \ - "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \ - "jnz ret_from_fork\n\t" \ - RESTORE_CONTEXT \ - : "=a" (last) \ - __switch_canary_oparam \ - : [next] "S" (next), [prev] "D" (prev), \ - [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \ - [ti_flags] "i" (offsetof(struct thread_info, flags)), \ - [_tif_fork] "i" (_TIF_FORK), \ - [thread_info] "i" (offsetof(struct task_struct, stack)), \ - [current_task] "m" (current_task) \ - __switch_canary_iparam \ - : "memory", "cc" __EXTRA_CLOBBER) -#endif - -#ifdef __KERNEL__ - -extern void native_load_gs_index(unsigned); - -/* - * Load a segment. Fall back on loading the zero - * segment if something goes wrong.. - */ -#define loadsegment(seg, value) \ -do { \ - unsigned short __val = (value); \ - \ - asm volatile(" \n" \ - "1: movl %k0,%%" #seg " \n" \ - \ - ".section .fixup,\"ax\" \n" \ - "2: xorl %k0,%k0 \n" \ - " jmp 1b \n" \ - ".previous \n" \ - \ - _ASM_EXTABLE(1b, 2b) \ - \ - : "+r" (__val) : : "memory"); \ -} while (0) - -/* - * Save a segment register away - */ -#define savesegment(seg, value) \ - asm("mov %%" #seg ",%0":"=r" (value) : : "memory") - -/* - * x86_32 user gs accessors. - */ -#ifdef CONFIG_X86_32 -#ifdef CONFIG_X86_32_LAZY_GS -#define get_user_gs(regs) (u16)({unsigned long v; savesegment(gs, v); v;}) -#define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v)) -#define task_user_gs(tsk) ((tsk)->thread.gs) -#define lazy_save_gs(v) savesegment(gs, (v)) -#define lazy_load_gs(v) loadsegment(gs, (v)) -#else /* X86_32_LAZY_GS */ -#define get_user_gs(regs) (u16)((regs)->gs) -#define set_user_gs(regs, v) do { (regs)->gs = (v); } while (0) -#define task_user_gs(tsk) (task_pt_regs(tsk)->gs) -#define lazy_save_gs(v) do { } while (0) -#define lazy_load_gs(v) do { } while (0) -#endif /* X86_32_LAZY_GS */ -#endif /* X86_32 */ - -static inline unsigned long get_limit(unsigned long segment) -{ - unsigned long __limit; - asm("lsll %1,%0" : "=r" (__limit) : "r" (segment)); - return __limit + 1; -} - -static inline void native_clts(void) -{ - asm volatile("clts"); -} - -/* - * Volatile isn't enough to prevent the compiler from reordering the - * read/write functions for the control registers and messing everything up. - * A memory clobber would solve the problem, but would prevent reordering of - * all loads stores around it, which can hurt performance. Solution is to - * use a variable and mimic reads and writes to it to enforce serialization - */ -static unsigned long __force_order; - -static inline unsigned long native_read_cr0(void) -{ - unsigned long val; - asm volatile("mov %%cr0,%0\n\t" : "=r" (val), "=m" (__force_order)); - return val; -} - -static inline void native_write_cr0(unsigned long val) -{ - asm volatile("mov %0,%%cr0": : "r" (val), "m" (__force_order)); -} - -static inline unsigned long native_read_cr2(void) -{ - unsigned long val; - asm volatile("mov %%cr2,%0\n\t" : "=r" (val), "=m" (__force_order)); - return val; -} - -static inline void native_write_cr2(unsigned long val) -{ - asm volatile("mov %0,%%cr2": : "r" (val), "m" (__force_order)); -} - -static inline unsigned long native_read_cr3(void) -{ - unsigned long val; - asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order)); - return val; -} - -static inline void native_write_cr3(unsigned long val) -{ - asm volatile("mov %0,%%cr3": : "r" (val), "m" (__force_order)); -} - -static inline unsigned long native_read_cr4(void) -{ - unsigned long val; - asm volatile("mov %%cr4,%0\n\t" : "=r" (val), "=m" (__force_order)); - return val; -} - -static inline unsigned long native_read_cr4_safe(void) -{ - unsigned long val; - /* This could fault if %cr4 does not exist. In x86_64, a cr4 always - * exists, so it will never fail. */ -#ifdef CONFIG_X86_32 - asm volatile("1: mov %%cr4, %0\n" - "2:\n" - _ASM_EXTABLE(1b, 2b) - : "=r" (val), "=m" (__force_order) : "0" (0)); -#else - val = native_read_cr4(); -#endif - return val; -} - -static inline void native_write_cr4(unsigned long val) -{ - asm volatile("mov %0,%%cr4": : "r" (val), "m" (__force_order)); -} - -#ifdef CONFIG_X86_64 -static inline unsigned long native_read_cr8(void) -{ - unsigned long cr8; - asm volatile("movq %%cr8,%0" : "=r" (cr8)); - return cr8; -} - -static inline void native_write_cr8(unsigned long val) -{ - asm volatile("movq %0,%%cr8" :: "r" (val) : "memory"); -} -#endif - -static inline void native_wbinvd(void) -{ - asm volatile("wbinvd": : :"memory"); -} - -#ifdef CONFIG_PARAVIRT -#include -#else - -static inline unsigned long read_cr0(void) -{ - return native_read_cr0(); -} - -static inline void write_cr0(unsigned long x) -{ - native_write_cr0(x); -} - -static inline unsigned long read_cr2(void) -{ - return native_read_cr2(); -} - -static inline void write_cr2(unsigned long x) -{ - native_write_cr2(x); -} - -static inline unsigned long read_cr3(void) -{ - return native_read_cr3(); -} - -static inline void write_cr3(unsigned long x) -{ - native_write_cr3(x); -} - -static inline unsigned long read_cr4(void) -{ - return native_read_cr4(); -} - -static inline unsigned long read_cr4_safe(void) -{ - return native_read_cr4_safe(); -} - -static inline void write_cr4(unsigned long x) -{ - native_write_cr4(x); -} - -static inline void wbinvd(void) -{ - native_wbinvd(); -} - -#ifdef CONFIG_X86_64 - -static inline unsigned long read_cr8(void) -{ - return native_read_cr8(); -} - -static inline void write_cr8(unsigned long x) -{ - native_write_cr8(x); -} - -static inline void load_gs_index(unsigned selector) -{ - native_load_gs_index(selector); -} - -#endif - -/* Clear the 'TS' bit */ -static inline void clts(void) -{ - native_clts(); -} - -#endif/* CONFIG_PARAVIRT */ - -#define stts() write_cr0(read_cr0() | X86_CR0_TS) - -#endif /* __KERNEL__ */ - -static inline void clflush(volatile void *__p) -{ - asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p)); -} - -#define nop() asm volatile ("nop") - -void disable_hlt(void); -void enable_hlt(void); - -void cpu_idle_wait(void); - -extern unsigned long arch_align_stack(unsigned long sp); -extern void free_init_pages(char *what, unsigned long begin, unsigned long end); - -void default_idle(void); -bool set_pm_idle_to_default(void); - -void stop_this_cpu(void *dummy); - -/* - * Force strict CPU ordering. - * And yes, this is required on UP too when we're talking - * to devices. - */ -#ifdef CONFIG_X86_32 -/* - * Some non-Intel clones support out of order store. wmb() ceases to be a - * nop for these. - */ -#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2) -#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2) -#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM) -#else -#define mb() asm volatile("mfence":::"memory") -#define rmb() asm volatile("lfence":::"memory") -#define wmb() asm volatile("sfence" ::: "memory") -#endif - -/** - * read_barrier_depends - Flush all pending reads that subsequents reads - * depend on. - * - * No data-dependent reads from memory-like regions are ever reordered - * over this barrier. All reads preceding this primitive are guaranteed - * to access memory (but not necessarily other CPUs' caches) before any - * reads following this primitive that depend on the data return by - * any of the preceding reads. This primitive is much lighter weight than - * rmb() on most CPUs, and is never heavier weight than is - * rmb(). - * - * These ordering constraints are respected by both the local CPU - * and the compiler. - * - * Ordering is not guaranteed by anything other than these primitives, - * not even by data dependencies. See the documentation for - * memory_barrier() for examples and URLs to more information. - * - * For example, the following code would force ordering (the initial - * value of "a" is zero, "b" is one, and "p" is "&a"): - * - * - * CPU 0 CPU 1 - * - * b = 2; - * memory_barrier(); - * p = &b; q = p; - * read_barrier_depends(); - * d = *q; - * - * - * because the read of "*q" depends on the read of "p" and these - * two reads are separated by a read_barrier_depends(). However, - * the following code, with the same initial values for "a" and "b": - * - * - * CPU 0 CPU 1 - * - * a = 2; - * memory_barrier(); - * b = 3; y = b; - * read_barrier_depends(); - * x = a; - * - * - * does not enforce ordering, since there is no data dependency between - * the read of "a" and the read of "b". Therefore, on some CPUs, such - * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb() - * in cases like this where there are no data dependencies. - **/ - -#define read_barrier_depends() do { } while (0) - -#ifdef CONFIG_SMP -#define smp_mb() mb() -#ifdef CONFIG_X86_PPRO_FENCE -# define smp_rmb() rmb() -#else -# define smp_rmb() barrier() -#endif -#ifdef CONFIG_X86_OOSTORE -# define smp_wmb() wmb() -#else -# define smp_wmb() barrier() -#endif -#define smp_read_barrier_depends() read_barrier_depends() -#define set_mb(var, value) do { (void)xchg(&var, value); } while (0) -#else -#define smp_mb() barrier() -#define smp_rmb() barrier() -#define smp_wmb() barrier() -#define smp_read_barrier_depends() do { } while (0) -#define set_mb(var, value) do { var = value; barrier(); } while (0) -#endif - -/* - * Stop RDTSC speculation. This is needed when you need to use RDTSC - * (or get_cycles or vread that possibly accesses the TSC) in a defined - * code region. - * - * (Could use an alternative three way for this if there was one.) - */ -static __always_inline void rdtsc_barrier(void) -{ - alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC); - alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); -} - -/* - * We handle most unaligned accesses in hardware. On the other hand - * unaligned DMA can be quite expensive on some Nehalem processors. - * - * Based on this we disable the IP header alignment in network drivers. - */ -#define NET_IP_ALIGN 0 -#endif /* _ASM_X86_SYSTEM_H */ +#include +#include +#include diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 169be8938b96..c0e108e08079 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -5,7 +5,7 @@ #include #include -#include +#include #ifdef CONFIG_PARAVIRT #include diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h index e0f9aa16358b..5da71c27cc59 100644 --- a/arch/x86/include/asm/virtext.h +++ b/arch/x86/include/asm/virtext.h @@ -16,7 +16,6 @@ #define _ASM_X86_VIRTEX_H #include -#include #include #include diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index f50e7fb2a201..d2b7f27781bc 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -14,6 +14,7 @@ #include #include #include +#include /* * Initialize bm_flags based on the CPU cache properties diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 5d56931a15b3..459e78cbf61e 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -231,7 +231,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c index 5c0e6533d9bc..2d5454cd2c4f 100644 --- a/arch/x86/kernel/cpu/mcheck/p5.c +++ b/arch/x86/kernel/cpu/mcheck/p5.c @@ -9,7 +9,6 @@ #include #include -#include #include #include diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 67bb17a37a0a..47a1870279aa 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -25,7 +25,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c index 54060f565974..2d7998fb628c 100644 --- a/arch/x86/kernel/cpu/mcheck/winchip.c +++ b/arch/x86/kernel/cpu/mcheck/winchip.c @@ -8,7 +8,6 @@ #include #include -#include #include #include diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 97b26356e9ee..75772ae6c65f 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index a524353d93f2..39472dd2323f 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -43,7 +43,6 @@ #include #include -#include static struct class *cpuid_class; diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 610485223bdb..36d1853e91af 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -15,7 +15,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 313fb5cddbce..99b85b423bbf 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -16,7 +16,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index faba5771acad..4425a12ece43 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -46,7 +46,6 @@ #include #include -#include #include #include diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index ea697263b373..ebc987398923 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -15,7 +15,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index a3fa43ba5d3b..5b19e4d78b00 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include diff --git a/arch/x86/kernel/mca_32.c b/arch/x86/kernel/mca_32.c index 177183cbb6ae..7eb1e2b97827 100644 --- a/arch/x86/kernel/mca_32.c +++ b/arch/x86/kernel/mca_32.c @@ -43,7 +43,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 925179f871de..f21fd94ac897 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -26,7 +26,6 @@ #include #include -#include #include #include diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 96356762a51d..eb113693f043 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -40,7 +40,6 @@ #include #include -#include static struct class *msr_class; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index ada2f99388dd..2b26485f0c11 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -37,6 +37,7 @@ #include #include #include +#include /* nop stub */ void _paravirt_nop(void) diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 726494b58345..6ac5782f4d6b 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -42,7 +42,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 14baf78d5a1f..9b24f36eb55f 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 9d7d4842bfaf..aae4f4bbbe88 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -41,7 +41,6 @@ #include #include -#include #include #include #include @@ -59,6 +58,7 @@ #include #include #include +#include asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 292da13fc5aa..61270e8d428a 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -40,7 +40,6 @@ #include #include -#include #include #include #include @@ -53,6 +52,7 @@ #include #include #include +#include asmlinkage extern void ret_from_fork(void); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 78f05e438be5..8a634c887652 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -24,7 +24,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 88638883176a..8cbeb7209c3e 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -90,7 +90,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/kernel/tce_64.c b/arch/x86/kernel/tce_64.c index 9e540fee7009..ab40954e113e 100644 --- a/arch/x86/kernel/tce_64.c +++ b/arch/x86/kernel/tce_64.c @@ -34,6 +34,7 @@ #include #include #include +#include /* flush a tce at 'tceaddr' to main memory */ static inline void flush_tce(void* tceaddr) diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index 6bb7b8579e70..73920e4c6dc5 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -6,7 +6,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ec61d4c1b93b..860f126ca233 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -50,7 +50,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 6cabf6570d64..4f0cec7e4ffb 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 8663f6c47ccb..575d86f85ce4 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -35,7 +35,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 436a0309db33..fc18be0f6f29 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -35,7 +35,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index cac718499256..a69bcb8c7621 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -10,7 +10,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c index 3769079874d8..74202c1910cd 100644 --- a/arch/x86/power/hibernate_32.c +++ b/arch/x86/power/hibernate_32.c @@ -10,7 +10,6 @@ #include #include -#include #include #include #include -- cgit v1.2.3 From 8abc3122aa02567bfe626cd13f4d34853c9b1225 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 27 Mar 2012 20:04:02 +0200 Subject: x86/apic/amd: Be more verbose about LVT offset assignments Add information about LVT offset assignments to better debug firmware bugs related to this. See following examples. # dmesg | grep -i 'offset\|ibs' LVT offset 0 assigned for vector 0xf9 [Firmware Bug]: cpu 0, try to use APIC500 (LVT offset 0) for vector 0x10400, but the register is already in use for vector 0xf9 on another cpu [Firmware Bug]: cpu 0, IBS interrupt offset 0 not available (MSRC001103A=0x0000000000000100) Failed to setup IBS, -22 In this case the BIOS assigns both offsets for MCE (0xf9) and IBS (0x400) vectors to offset 0, which is why the second APIC setup (IBS) failed. With correct setup you get: # dmesg | grep -i 'offset\|ibs' LVT offset 0 assigned for vector 0xf9 LVT offset 1 assigned for vector 0x400 IBS: LVT offset 1 assigned perf: AMD IBS detected (0x00000007) oprofile: AMD IBS detected (0x00000007) Note: The vector includes also the message type to handle also NMIs (0x400). In the firmware bug message the format is the same as of the APIC500 register and includes the mask bit (bit 16) in addition. Signed-off-by: Robert Richter Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 2eec05b6d1b8..11544d8f1e97 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -383,20 +383,25 @@ static inline int eilvt_entry_is_changeable(unsigned int old, unsigned int new) static unsigned int reserve_eilvt_offset(int offset, unsigned int new) { - unsigned int rsvd; /* 0: uninitialized */ + unsigned int rsvd, vector; if (offset >= APIC_EILVT_NR_MAX) return ~0; - rsvd = atomic_read(&eilvt_offsets[offset]) & ~APIC_EILVT_MASKED; + rsvd = atomic_read(&eilvt_offsets[offset]); do { - if (rsvd && - !eilvt_entry_is_changeable(rsvd, new)) + vector = rsvd & ~APIC_EILVT_MASKED; /* 0: unassigned */ + if (vector && !eilvt_entry_is_changeable(vector, new)) /* may not change if vectors are different */ return rsvd; rsvd = atomic_cmpxchg(&eilvt_offsets[offset], rsvd, new); } while (rsvd != new); + rsvd &= ~APIC_EILVT_MASKED; + if (rsvd && rsvd != vector) + pr_info("LVT offset %d assigned for vector 0x%02x\n", + offset, rsvd); + return new; } -- cgit v1.2.3 From 09c71bfd8384278c42f56380365940508194cec0 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Wed, 28 Mar 2012 14:42:47 -0700 Subject: kdump x86: fix total mem size calculation for reservation crashkernel reservation need know the total memory size. Current get_total_mem simply use max_pfn - min_low_pfn. It is wrong because it will including memory holes in the middle. Especially for kvm guest with memory > 0xe0000000, there's below in qemu code: qemu split memory as below: if (ram_size >= 0xe0000000 ) { above_4g_mem_size = ram_size - 0xe0000000; below_4g_mem_size = 0xe0000000; } else { below_4g_mem_size = ram_size; } So for 4G mem guest, seabios will insert a 512M usable region beyond of 4G. Thus in above case max_pfn - min_low_pfn will be more than original memsize. Fixing this issue by using memblock_phys_mem_size() to get the total memsize. Signed-off-by: Dave Young Reviewed-by: WANG Cong Reviewed-by: Simon Horman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/setup.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 88638883176a..ab77aae4ad9b 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -509,15 +509,6 @@ static void __init memblock_x86_reserve_range_setup_data(void) #ifdef CONFIG_KEXEC -static inline unsigned long long get_total_mem(void) -{ - unsigned long long total; - - total = max_pfn - min_low_pfn; - - return total << PAGE_SHIFT; -} - /* * Keep the crash kernel below this limit. On 32 bits earlier kernels * would limit the kernel to the low 512 MiB due to mapping restrictions. @@ -536,7 +527,7 @@ static void __init reserve_crashkernel(void) unsigned long long crash_size, crash_base; int ret; - total_mem = get_total_mem(); + total_mem = memblock_phys_mem_size(); ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base); -- cgit v1.2.3