From 13503fa9137d9708d52214e9506c671dbf2fbdce Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Thu, 26 Mar 2009 17:39:20 +0900 Subject: x86, mce: Cleanup param parser - Fix the comment formatting. - The error path does not return 0, and printk lacks level and "\n". - Move __setup("nomce") next to mcheck_disable(). - Improve readability etc. [ Impact: cleanup ] Signed-off-by: Hidetoshi Seto Acked-by: Andi Kleen LKML-Reference: <49CB3F38.7090703@jp.fujitsu.com> Signed-off-by: Ingo Molnar Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 6fb0b359d2a5..77effb55afe7 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -839,25 +839,29 @@ static int __init mcheck_disable(char *str) mce_dont_init = 1; return 1; } +__setup("nomce", mcheck_disable); -/* mce=off disables machine check. - mce=TOLERANCELEVEL (number, see above) - mce=bootlog Log MCEs from before booting. Disabled by default on AMD. - mce=nobootlog Don't log MCEs from before booting. */ +/* + * mce=off disables machine check + * mce=TOLERANCELEVEL (number, see above) + * mce=bootlog Log MCEs from before booting. Disabled by default on AMD. + * mce=nobootlog Don't log MCEs from before booting. + */ static int __init mcheck_enable(char *str) { if (!strcmp(str, "off")) mce_dont_init = 1; - else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog")) - mce_bootlog = str[0] == 'b'; + else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) + mce_bootlog = (str[0] == 'b'); else if (isdigit(str[0])) get_option(&str, &tolerant); - else - printk("mce= argument %s ignored. Please use /sys", str); + else { + printk(KERN_INFO "mce= argument %s ignored. Please use /sys\n", + str); + return 0; + } return 1; } - -__setup("nomce", mcheck_disable); __setup("mce=", mcheck_enable); /* -- cgit v1.2.3 From e9eee03e99d519599eb615c3e251d5f6cc4be57d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 12:31:17 +0200 Subject: x86, mce: clean up mce_64.c This file has been modified many times along the years, by multiple authors, so the general style and structure has diverged in a number of areas making this file hard to read. So fix the coding style match that of the rest of the x86 arch code. [ Impact: cleanup ] Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 247 ++++++++++++++++++++++-------------- 1 file changed, 149 insertions(+), 98 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 77effb55afe7..1491246c4d69 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -1,46 +1,47 @@ /* * Machine check handler. + * * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. * Rest from unknown author(s). * 2004 Andi Kleen. Rewrote most of it. * Copyright 2008 Intel Corporation * Author: Andi Kleen */ - -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include #include +#include +#include +#include +#include #include -#include -#include #include -#include -#include -#include -#include -#include -#include -#include #include -#include -#include -#include +#include #include -#include +#include +#include +#include +#include +#include +#include + #include -#include -#include #include -#include #include +#include +#include +#include -#define MISC_MCELOG_MINOR 227 +#define MISC_MCELOG_MINOR 227 atomic_t mce_entry; -static int mce_dont_init; +static int mce_dont_init; /* * Tolerant levels: @@ -49,16 +50,16 @@ static int mce_dont_init; * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors * 3: never panic or SIGBUS, log all errors (for testing only) */ -static int tolerant = 1; -static int banks; -static u64 *bank; -static unsigned long notify_user; -static int rip_msr; -static int mce_bootlog = -1; -static atomic_t mce_events; +static int tolerant = 1; +static int banks; +static u64 *bank; +static unsigned long notify_user; +static int rip_msr; +static int mce_bootlog = -1; +static atomic_t mce_events; -static char trigger[128]; -static char *trigger_argv[2] = { trigger, NULL }; +static char trigger[128]; +static char *trigger_argv[2] = { trigger, NULL }; static DECLARE_WAIT_QUEUE_HEAD(mce_wait); @@ -89,19 +90,23 @@ static struct mce_log mcelog = { void mce_log(struct mce *mce) { unsigned next, entry; + atomic_inc(&mce_events); mce->finished = 0; wmb(); for (;;) { entry = rcu_dereference(mcelog.next); for (;;) { - /* When the buffer fills up discard new entries. Assume - that the earlier errors are the more interesting. */ + /* + * When the buffer fills up discard new entries. + * Assume that the earlier errors are the more + * interesting ones: + */ if (entry >= MCE_LOG_LEN) { set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags); return; } - /* Old left over entry. Skip. */ + /* Old left over entry. Skip: */ if (mcelog.entry[entry].finished) { entry++; continue; @@ -264,12 +269,12 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) * implies that most kernel services cannot be safely used. Don't even * think about putting a printk in there! */ -void do_machine_check(struct pt_regs * regs, long error_code) +void do_machine_check(struct pt_regs *regs, long error_code) { struct mce m, panicm; + int panicm_found = 0; u64 mcestart = 0; int i; - int panicm_found = 0; /* * If no_way_out gets set, there is no safe way to recover from this * MCE. If tolerant is cranked up, we'll try anyway. @@ -293,6 +298,7 @@ void do_machine_check(struct pt_regs * regs, long error_code) mce_setup(&m); rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); + /* if the restart IP is not valid, we're done for */ if (!(m.mcgstatus & MCG_STATUS_RIPV)) no_way_out = 1; @@ -356,23 +362,29 @@ void do_machine_check(struct pt_regs * regs, long error_code) mce_get_rip(&m, regs); mce_log(&m); - /* Did this bank cause the exception? */ - /* Assume that the bank with uncorrectable errors did it, - and that there is only a single one. */ - if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) { + /* + * Did this bank cause the exception? + * + * Assume that the bank with uncorrectable errors did it, + * and that there is only a single one: + */ + if ((m.status & MCI_STATUS_UC) && + (m.status & MCI_STATUS_EN)) { panicm = m; panicm_found = 1; } } - /* If we didn't find an uncorrectable error, pick - the last one (shouldn't happen, just being safe). */ + /* + * If we didn't find an uncorrectable error, pick + * the last one (shouldn't happen, just being safe). + */ if (!panicm_found) panicm = m; /* * If we have decided that we just CAN'T continue, and the user - * has not set tolerant to an insane level, give up and die. + * has not set tolerant to an insane level, give up and die. */ if (no_way_out && tolerant < 3) mce_panic("Machine check", &panicm, mcestart); @@ -451,10 +463,9 @@ void mce_log_therm_throt_event(__u64 status) * poller finds an MCE, poll 2x faster. When the poller finds no more * errors, poll 2x slower (up to check_interval seconds). */ - static int check_interval = 5 * 60; /* 5 minutes */ + static DEFINE_PER_CPU(int, next_interval); /* in jiffies */ -static void mcheck_timer(unsigned long); static DEFINE_PER_CPU(struct timer_list, mce_timer); static void mcheck_timer(unsigned long data) @@ -464,9 +475,10 @@ static void mcheck_timer(unsigned long data) WARN_ON(smp_processor_id() != data); - if (mce_available(¤t_cpu_data)) + if (mce_available(¤t_cpu_data)) { machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_poll_banks)); + } /* * Alert userspace if needed. If we logged an MCE, reduce the @@ -501,6 +513,7 @@ int mce_notify_user(void) static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); clear_thread_flag(TIF_MCE_NOTIFY); + if (test_and_clear_bit(0, ¬ify_user)) { wake_up_interruptible(&mce_wait); @@ -520,9 +533,10 @@ int mce_notify_user(void) return 0; } -/* see if the idle task needs to notify userspace */ +/* see if the idle task needs to notify userspace: */ static int -mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk) +mce_idle_callback(struct notifier_block *nfb, unsigned long action, + void *unused) { /* IDLE_END should be safe - interrupts are back on */ if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY)) @@ -532,7 +546,7 @@ mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk) } static struct notifier_block mce_idle_notifier = { - .notifier_call = mce_idle_callback, + .notifier_call = mce_idle_callback, }; static __init int periodic_mcheck_init(void) @@ -547,8 +561,8 @@ __initcall(periodic_mcheck_init); */ static int mce_cap_init(void) { - u64 cap; unsigned b; + u64 cap; rdmsrl(MSR_IA32_MCG_CAP, cap); b = cap & 0xff; @@ -578,9 +592,9 @@ static int mce_cap_init(void) static void mce_init(void *dummy) { + mce_banks_t all_banks; u64 cap; int i; - mce_banks_t all_banks; /* * Log the machine checks left over from the previous reset. @@ -605,14 +619,21 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) { /* This should be disabled by the BIOS, but isn't always */ if (c->x86_vendor == X86_VENDOR_AMD) { - if (c->x86 == 15 && banks > 4) - /* disable GART TBL walk error reporting, which trips off - incorrectly with the IOMMU & 3ware & Cerberus. */ + if (c->x86 == 15 && banks > 4) { + /* + * disable GART TBL walk error reporting, which + * trips off incorrectly with the IOMMU & 3ware + * & Cerberus: + */ clear_bit(10, (unsigned long *)&bank[4]); - if(c->x86 <= 17 && mce_bootlog < 0) - /* Lots of broken BIOS around that don't clear them - by default and leave crap in there. Don't log. */ + } + if (c->x86 <= 17 && mce_bootlog < 0) { + /* + * Lots of broken BIOS around that don't clear them + * by default and leave crap in there. Don't log: + */ mce_bootlog = 0; + } } } @@ -646,7 +667,7 @@ static void mce_init_timer(void) /* * Called for each booted CPU to set up machine checks. - * Must be called with preempt off. + * Must be called with preempt off: */ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) { @@ -669,8 +690,8 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) */ static DEFINE_SPINLOCK(mce_state_lock); -static int open_count; /* #times opened */ -static int open_exclu; /* already open exclusive? */ +static int open_count; /* #times opened */ +static int open_exclu; /* already open exclusive? */ static int mce_open(struct inode *inode, struct file *file) { @@ -680,6 +701,7 @@ static int mce_open(struct inode *inode, struct file *file) if (open_exclu || (open_count && (file->f_flags & O_EXCL))) { spin_unlock(&mce_state_lock); unlock_kernel(); + return -EBUSY; } @@ -712,13 +734,14 @@ static void collect_tscs(void *data) rdtscll(cpu_tsc[smp_processor_id()]); } +static DEFINE_MUTEX(mce_read_mutex); + static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off) { + char __user *buf = ubuf; unsigned long *cpu_tsc; - static DEFINE_MUTEX(mce_read_mutex); unsigned prev, next; - char __user *buf = ubuf; int i, err; cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL); @@ -732,6 +755,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { mutex_unlock(&mce_read_mutex); kfree(cpu_tsc); + return -EINVAL; } @@ -770,6 +794,7 @@ timeout: * synchronize. */ on_each_cpu(collect_tscs, cpu_tsc, 1); + for (i = next; i < MCE_LOG_LEN; i++) { if (mcelog.entry[i].finished && mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { @@ -782,6 +807,7 @@ timeout: } mutex_unlock(&mce_read_mutex); kfree(cpu_tsc); + return err ? -EFAULT : buf - ubuf; } @@ -799,6 +825,7 @@ static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; + switch (cmd) { case MCE_GET_RECORD_LEN: return put_user(sizeof(struct mce), p); @@ -810,6 +837,7 @@ static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg) do { flags = mcelog.flags; } while (cmpxchg(&mcelog.flags, flags, 0) != flags); + return put_user(flags, p); } default: @@ -818,11 +846,11 @@ static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg) } static const struct file_operations mce_chrdev_ops = { - .open = mce_open, - .release = mce_release, - .read = mce_read, - .poll = mce_poll, - .unlocked_ioctl = mce_ioctl, + .open = mce_open, + .release = mce_release, + .read = mce_read, + .poll = mce_poll, + .unlocked_ioctl = mce_ioctl, }; static struct miscdevice mce_log_device = { @@ -891,13 +919,16 @@ static int mce_shutdown(struct sys_device *dev) return mce_disable(); } -/* On resume clear all MCE state. Don't want to see leftovers from the BIOS. - Only one CPU is active at this time, the others get readded later using - CPU hotplug. */ +/* + * On resume clear all MCE state. Don't want to see leftovers from the BIOS. + * Only one CPU is active at this time, the others get re-added later using + * CPU hotplug: + */ static int mce_resume(struct sys_device *dev) { mce_init(NULL); mce_cpu_features(¤t_cpu_data); + return 0; } @@ -916,14 +947,16 @@ static void mce_restart(void) } static struct sysdev_class mce_sysclass = { - .suspend = mce_suspend, - .shutdown = mce_shutdown, - .resume = mce_resume, - .name = "machinecheck", + .suspend = mce_suspend, + .shutdown = mce_shutdown, + .resume = mce_resume, + .name = "machinecheck", }; DEFINE_PER_CPU(struct sys_device, device_mce); -void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinitdata; + +__cpuinitdata +void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); /* Why are there no generic functions for this? */ #define ACCESSOR(name, var, start) \ @@ -937,9 +970,12 @@ void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinit const char *buf, size_t siz) { \ char *end; \ unsigned long new = simple_strtoul(buf, &end, 0); \ - if (end == buf) return -EINVAL; \ + \ + if (end == buf) \ + return -EINVAL; \ var = new; \ start; \ + \ return end-buf; \ } \ static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); @@ -950,6 +986,7 @@ static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, char *buf) { u64 b = bank[attr - bank_attrs]; + return sprintf(buf, "%llx\n", b); } @@ -958,15 +995,18 @@ static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, { char *end; u64 new = simple_strtoull(buf, &end, 0); + if (end == buf) return -EINVAL; + bank[attr - bank_attrs] = new; mce_restart(); + return end-buf; } -static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr, - char *buf) +static ssize_t +show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf) { strcpy(buf, trigger); strcat(buf, "\n"); @@ -974,21 +1014,27 @@ static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr, } static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, - const char *buf,size_t siz) + const char *buf, size_t siz) { char *p; int len; + strncpy(trigger, buf, sizeof(trigger)); trigger[sizeof(trigger)-1] = 0; len = strlen(trigger); p = strchr(trigger, '\n'); - if (*p) *p = 0; + + if (*p) + *p = 0; + return len; } static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); -ACCESSOR(check_interval,check_interval,mce_restart()) + +ACCESSOR(check_interval, check_interval, mce_restart()) + static struct sysdev_attribute *mce_attributes[] = { &attr_tolerant.attr, &attr_check_interval, &attr_trigger, NULL @@ -996,7 +1042,7 @@ static struct sysdev_attribute *mce_attributes[] = { static cpumask_var_t mce_device_initialized; -/* Per cpu sysdev init. All of the cpus still share the same ctl bank */ +/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */ static __cpuinit int mce_create_device(unsigned int cpu) { int err; @@ -1006,15 +1052,15 @@ static __cpuinit int mce_create_device(unsigned int cpu) return -EIO; memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject)); - per_cpu(device_mce,cpu).id = cpu; - per_cpu(device_mce,cpu).cls = &mce_sysclass; + per_cpu(device_mce, cpu).id = cpu; + per_cpu(device_mce, cpu).cls = &mce_sysclass; - err = sysdev_register(&per_cpu(device_mce,cpu)); + err = sysdev_register(&per_cpu(device_mce, cpu)); if (err) return err; for (i = 0; mce_attributes[i]; i++) { - err = sysdev_create_file(&per_cpu(device_mce,cpu), + err = sysdev_create_file(&per_cpu(device_mce, cpu), mce_attributes[i]); if (err) goto error; @@ -1035,10 +1081,10 @@ error2: } error: while (--i >= 0) { - sysdev_remove_file(&per_cpu(device_mce,cpu), + sysdev_remove_file(&per_cpu(device_mce, cpu), mce_attributes[i]); } - sysdev_unregister(&per_cpu(device_mce,cpu)); + sysdev_unregister(&per_cpu(device_mce, cpu)); return err; } @@ -1051,12 +1097,12 @@ static __cpuinit void mce_remove_device(unsigned int cpu) return; for (i = 0; mce_attributes[i]; i++) - sysdev_remove_file(&per_cpu(device_mce,cpu), + sysdev_remove_file(&per_cpu(device_mce, cpu), mce_attributes[i]); for (i = 0; i < banks; i++) sysdev_remove_file(&per_cpu(device_mce, cpu), &bank_attrs[i]); - sysdev_unregister(&per_cpu(device_mce,cpu)); + sysdev_unregister(&per_cpu(device_mce, cpu)); cpumask_clear_cpu(cpu, mce_device_initialized); } @@ -1076,11 +1122,12 @@ static void mce_disable_cpu(void *h) static void mce_reenable_cpu(void *h) { - int i; unsigned long action = *(unsigned long *)h; + int i; if (!mce_available(¤t_cpu_data)) return; + if (!(action & CPU_TASKS_FROZEN)) cmci_reenable(); for (i = 0; i < banks; i++) @@ -1088,8 +1135,8 @@ static void mce_reenable_cpu(void *h) } /* Get notified when a cpu comes on/off. Be hotplug friendly. */ -static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static int __cpuinit +mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; struct timer_list *t = &per_cpu(mce_timer, cpu); @@ -1142,12 +1189,14 @@ static __init int mce_init_banks(void) for (i = 0; i < banks; i++) { struct sysdev_attribute *a = &bank_attrs[i]; - a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); + + a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); if (!a->attr.name) goto nomem; - a->attr.mode = 0644; - a->show = show_bank; - a->store = set_bank; + + a->attr.mode = 0644; + a->show = show_bank; + a->store = set_bank; } return 0; @@ -1156,6 +1205,7 @@ nomem: kfree(bank_attrs[i].attr.name); kfree(bank_attrs); bank_attrs = NULL; + return -ENOMEM; } @@ -1185,6 +1235,7 @@ static __init int mce_init_device(void) register_hotcpu_notifier(&mce_cpu_notifier); misc_register(&mce_log_device); + return err; } -- cgit v1.2.3 From 3b58dfd04bdfa52e717ead8f3c7622610eb7f950 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 12:31:21 +0200 Subject: x86, mce: clean up mce_32.c Make the coding style match that of the rest of the x86 arch code. [ Impact: cleanup ] Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_32.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c index 3552119b091d..05979e7eff12 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_32.c +++ b/arch/x86/kernel/cpu/mcheck/mce_32.c @@ -2,13 +2,12 @@ * mce.c - x86 Machine Check Exception Reporting * (c) 2002 Alan Cox , Dave Jones */ - -#include -#include +#include #include #include +#include +#include #include -#include #include #include @@ -17,18 +16,20 @@ #include "mce.h" int mce_disabled; -int nr_mce_banks; +int nr_mce_banks; EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ /* Handle unconfigured int18 (should never happen) */ static void unexpected_machine_check(struct pt_regs *regs, long error_code) { - printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", smp_processor_id()); + printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", + smp_processor_id()); } /* Call the installed machine check handler for this CPU setup. */ -void (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check; +void (*machine_check_vector)(struct pt_regs *, long error_code) = + unexpected_machine_check; /* This has to be run for each processor */ void mcheck_init(struct cpuinfo_x86 *c) -- cgit v1.2.3 From c5aaf0e0702513637278ca4e27a156caa9392817 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 12:31:18 +0200 Subject: x86, mce: clean up p4.c Make the coding style match that of the rest of the x86 arch code. [ Impact: cleanup ] Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/p4.c | 73 ++++++++++++++++++++++++----------------- 1 file changed, 43 insertions(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c index f53bdcbaf382..cb344aba479c 100644 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ b/arch/x86/kernel/cpu/mcheck/p4.c @@ -2,18 +2,17 @@ * P4 specific Machine Check Exception Reporting */ -#include -#include -#include #include +#include +#include +#include #include +#include #include #include -#include #include - -#include +#include #include "mce.h" @@ -36,6 +35,7 @@ static int mce_num_extended_msrs; #ifdef CONFIG_X86_MCE_P4THERMAL + static void unexpected_thermal_interrupt(struct pt_regs *regs) { printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", @@ -43,7 +43,7 @@ static void unexpected_thermal_interrupt(struct pt_regs *regs) add_taint(TAINT_MACHINE_CHECK); } -/* P4/Xeon Thermal transition interrupt handler */ +/* P4/Xeon Thermal transition interrupt handler: */ static void intel_thermal_interrupt(struct pt_regs *regs) { __u64 msr_val; @@ -54,8 +54,9 @@ static void intel_thermal_interrupt(struct pt_regs *regs) therm_throt_process(msr_val & 0x1); } -/* Thermal interrupt handler for this CPU setup */ -static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_thermal_interrupt; +/* Thermal interrupt handler for this CPU setup: */ +static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = + unexpected_thermal_interrupt; void smp_thermal_interrupt(struct pt_regs *regs) { @@ -65,67 +66,76 @@ void smp_thermal_interrupt(struct pt_regs *regs) irq_exit(); } -/* P4/Xeon Thermal regulation detect and init */ +/* P4/Xeon Thermal regulation detect and init: */ static void intel_init_thermal(struct cpuinfo_x86 *c) { - u32 l, h; unsigned int cpu = smp_processor_id(); + u32 l, h; - /* Thermal monitoring */ + /* Thermal monitoring: */ if (!cpu_has(c, X86_FEATURE_ACPI)) return; /* -ENODEV */ - /* Clock modulation */ + /* Clock modulation: */ if (!cpu_has(c, X86_FEATURE_ACC)) return; /* -ENODEV */ - /* first check if its enabled already, in which case there might + /* + * First check if its enabled already, in which case there might * be some SMM goo which handles it, so we can't even put a handler - * since it might be delivered via SMI already -zwanem. + * since it might be delivered via SMI already: */ rdmsr(MSR_IA32_MISC_ENABLE, l, h); h = apic_read(APIC_LVTTHMR); if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { - printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n", - cpu); + printk(KERN_DEBUG + "CPU%d: Thermal monitoring handled by SMI\n", cpu); + return; /* -EBUSY */ } - /* check whether a vector already exists, temporarily masked? */ + /* Check whether a vector already exists, temporarily masked? */ if (h & APIC_VECTOR_MASK) { - printk(KERN_DEBUG "CPU%d: Thermal LVT vector (%#x) already " - "installed\n", - cpu, (h & APIC_VECTOR_MASK)); + printk(KERN_DEBUG + "CPU%d: Thermal LVT vector (%#x) already installed\n", + cpu, (h & APIC_VECTOR_MASK)); + return; /* -EBUSY */ } - /* The temperature transition interrupt handler setup */ - h = THERMAL_APIC_VECTOR; /* our delivery vector */ - h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */ + /* + * The temperature transition interrupt handler setup: + */ + + /* Our delivery vector: */ + h = THERMAL_APIC_VECTOR; + + /* We'll mask the thermal vector in the lapic till we're ready: */ + h |= APIC_DM_FIXED | APIC_LVT_MASKED; apic_write(APIC_LVTTHMR, h); rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h); - /* ok we're good to go... */ + /* Ok, we're good to go... */ vendor_thermal_interrupt = intel_thermal_interrupt; rdmsr(MSR_IA32_MISC_ENABLE, l, h); wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); + /* Unmask the thermal vector: */ l = apic_read(APIC_LVTTHMR); apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); + printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu); /* enable thermal throttle processing */ atomic_set(&therm_throt_en, 1); - return; } #endif /* CONFIG_X86_MCE_P4THERMAL */ - /* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */ -static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) +static void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) { u32 h; @@ -143,9 +153,9 @@ static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) static void intel_machine_check(struct pt_regs *regs, long error_code) { - int recover = 1; u32 alow, ahigh, high, low; u32 mcgstl, mcgsth; + int recover = 1; int i; rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); @@ -157,7 +167,9 @@ static void intel_machine_check(struct pt_regs *regs, long error_code) if (mce_num_extended_msrs > 0) { struct intel_mce_extended_msrs dbg; + intel_get_extended_msrs(&dbg); + printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n" "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n" "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n", @@ -171,6 +183,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code) if (high & (1<<31)) { char misc[20]; char addr[24]; + misc[0] = addr[0] = '\0'; if (high & (1<<29)) recover |= 1; @@ -196,6 +209,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code) panic("Unable to continue"); printk(KERN_EMERG "Attempting to continue.\n"); + /* * Do not clear the MSR_IA32_MCi_STATUS if the error is not * recoverable/continuable.This will allow BIOS to look at the MSRs @@ -217,7 +231,6 @@ static void intel_machine_check(struct pt_regs *regs, long error_code) wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); } - void intel_p4_mcheck_init(struct cpuinfo_x86 *c) { u32 l, h; -- cgit v1.2.3 From ed8bc7ed9a2ad875617b24d2ba09e49ee886638c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 12:31:21 +0200 Subject: x86, mce: clean up p5.c Make the coding style match that of the rest of the x86 arch code. [ Impact: cleanup ] Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/p5.c | 43 +++++++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c index c9f77ea69edc..8812f5441830 100644 --- a/arch/x86/kernel/cpu/mcheck/p5.c +++ b/arch/x86/kernel/cpu/mcheck/p5.c @@ -2,11 +2,10 @@ * P5 specific Machine Check Exception Reporting * (C) Copyright 2002 Alan Cox */ - -#include -#include -#include #include +#include +#include +#include #include #include @@ -15,39 +14,53 @@ #include "mce.h" -/* Machine check handler for Pentium class Intel */ +/* Machine check handler for Pentium class Intel CPUs: */ static void pentium_machine_check(struct pt_regs *regs, long error_code) { u32 loaddr, hi, lotype; + rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi); - printk(KERN_EMERG "CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n", smp_processor_id(), loaddr, lotype); - if (lotype&(1<<5)) - printk(KERN_EMERG "CPU#%d: Possible thermal failure (CPU on fire ?).\n", smp_processor_id()); + + printk(KERN_EMERG + "CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n", + smp_processor_id(), loaddr, lotype); + + if (lotype & (1<<5)) { + printk(KERN_EMERG + "CPU#%d: Possible thermal failure (CPU on fire ?).\n", + smp_processor_id()); + } + add_taint(TAINT_MACHINE_CHECK); } -/* Set up machine check reporting for processors with Intel style MCE */ +/* Set up machine check reporting for processors with Intel style MCE: */ void intel_p5_mcheck_init(struct cpuinfo_x86 *c) { u32 l, h; - /*Check for MCE support */ + /* Check for MCE support: */ if (!cpu_has(c, X86_FEATURE_MCE)) return; - /* Default P5 to off as its often misconnected */ + /* Default P5 to off as its often misconnected: */ if (mce_disabled != -1) return; + machine_check_vector = pentium_machine_check; + /* Make sure the vector pointer is visible before we enable MCEs: */ wmb(); - /* Read registers before enabling */ + /* Read registers before enabling: */ rdmsr(MSR_IA32_P5_MC_ADDR, l, h); rdmsr(MSR_IA32_P5_MC_TYPE, l, h); - printk(KERN_INFO "Intel old style machine check architecture supported.\n"); + printk(KERN_INFO + "Intel old style machine check architecture supported.\n"); - /* Enable MCE */ + /* Enable MCE: */ set_in_cr4(X86_CR4_MCE); - printk(KERN_INFO "Intel old style machine check reporting enabled on CPU#%d.\n", smp_processor_id()); + printk(KERN_INFO + "Intel old style machine check reporting enabled on CPU#%d.\n", + smp_processor_id()); } -- cgit v1.2.3 From ea2566ff80e096eeecc0918fb5d5a4612d8f62ef Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 12:31:19 +0200 Subject: x86, mce: clean up p6.c Make the coding style match that of the rest of the x86 arch code. [ Impact: cleanup ] Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/p6.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c index 2ac52d7b434b..43c24e667457 100644 --- a/arch/x86/kernel/cpu/mcheck/p6.c +++ b/arch/x86/kernel/cpu/mcheck/p6.c @@ -2,11 +2,10 @@ * P6 specific Machine Check Exception Reporting * (C) Copyright 2002 Alan Cox */ - -#include -#include -#include #include +#include +#include +#include #include #include @@ -18,9 +17,9 @@ /* Machine Check Handler For PII/PIII */ static void intel_machine_check(struct pt_regs *regs, long error_code) { - int recover = 1; u32 alow, ahigh, high, low; u32 mcgstl, mcgsth; + int recover = 1; int i; rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); @@ -35,12 +34,16 @@ static void intel_machine_check(struct pt_regs *regs, long error_code) if (high & (1<<31)) { char misc[20]; char addr[24]; - misc[0] = addr[0] = '\0'; + + misc[0] = '\0'; + addr[0] = '\0'; + if (high & (1<<29)) recover |= 1; if (high & (1<<25)) recover |= 2; high &= ~(1<<31); + if (high & (1<<27)) { rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); snprintf(misc, 20, "[%08x%08x]", ahigh, alow); @@ -49,6 +52,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code) rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); snprintf(addr, 24, " at %08x%08x", ahigh, alow); } + printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", smp_processor_id(), i, high, low, misc, addr); } @@ -63,16 +67,17 @@ static void intel_machine_check(struct pt_regs *regs, long error_code) /* * Do not clear the MSR_IA32_MCi_STATUS if the error is not * recoverable/continuable.This will allow BIOS to look at the MSRs - * for errors if the OS could not log the error. + * for errors if the OS could not log the error: */ for (i = 0; i < nr_mce_banks; i++) { unsigned int msr; + msr = MSR_IA32_MC0_STATUS+i*4; rdmsr(msr, low, high); if (high & (1<<31)) { - /* Clear it */ + /* Clear it: */ wrmsr(msr, 0UL, 0UL); - /* Serialize */ + /* Serialize: */ wmb(); add_taint(TAINT_MACHINE_CHECK); } @@ -81,7 +86,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code) wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); } -/* Set up machine check reporting for processors with Intel style MCE */ +/* Set up machine check reporting for processors with Intel style MCE: */ void intel_p6_mcheck_init(struct cpuinfo_x86 *c) { u32 l, h; @@ -97,6 +102,7 @@ void intel_p6_mcheck_init(struct cpuinfo_x86 *c) /* Ok machine check is available */ machine_check_vector = intel_machine_check; + /* Make sure the vector pointer is visible before we enable MCEs: */ wmb(); printk(KERN_INFO "Intel machine check architecture supported.\n"); -- cgit v1.2.3 From efee4ca80980f97b60e91e3322c3342f19623eff Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 12:31:20 +0200 Subject: x86, mce: clean up k7.c Make the coding style match that of the rest of the x86 arch code. [ Impact: cleanup ] Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/k7.c | 42 ++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c index dd3af6e7b39a..89e510424152 100644 --- a/arch/x86/kernel/cpu/mcheck/k7.c +++ b/arch/x86/kernel/cpu/mcheck/k7.c @@ -2,11 +2,10 @@ * Athlon specific Machine Check Exception Reporting * (C) Copyright 2002 Dave Jones */ - -#include -#include -#include #include +#include +#include +#include #include #include @@ -15,12 +14,12 @@ #include "mce.h" -/* Machine Check Handler For AMD Athlon/Duron */ +/* Machine Check Handler For AMD Athlon/Duron: */ static void k7_machine_check(struct pt_regs *regs, long error_code) { - int recover = 1; u32 alow, ahigh, high, low; u32 mcgstl, mcgsth; + int recover = 1; int i; rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); @@ -32,15 +31,19 @@ static void k7_machine_check(struct pt_regs *regs, long error_code) for (i = 1; i < nr_mce_banks; i++) { rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); - if (high&(1<<31)) { + if (high & (1<<31)) { char misc[20]; char addr[24]; - misc[0] = addr[0] = '\0'; + + misc[0] = '\0'; + addr[0] = '\0'; + if (high & (1<<29)) recover |= 1; if (high & (1<<25)) recover |= 2; high &= ~(1<<31); + if (high & (1<<27)) { rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); snprintf(misc, 20, "[%08x%08x]", ahigh, alow); @@ -49,27 +52,31 @@ static void k7_machine_check(struct pt_regs *regs, long error_code) rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); snprintf(addr, 24, " at %08x%08x", ahigh, alow); } + printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", smp_processor_id(), i, high, low, misc, addr); - /* Clear it */ + + /* Clear it: */ wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); - /* Serialize */ + /* Serialize: */ wmb(); add_taint(TAINT_MACHINE_CHECK); } } - if (recover&2) + if (recover & 2) panic("CPU context corrupt"); - if (recover&1) + if (recover & 1) panic("Unable to continue"); + printk(KERN_EMERG "Attempting to continue.\n"); + mcgstl &= ~(1<<2); wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); } -/* AMD K7 machine check is Intel like */ +/* AMD K7 machine check is Intel like: */ void amd_mcheck_init(struct cpuinfo_x86 *c) { u32 l, h; @@ -79,21 +86,26 @@ void amd_mcheck_init(struct cpuinfo_x86 *c) return; machine_check_vector = k7_machine_check; + /* Make sure the vector pointer is visible before we enable MCEs: */ wmb(); printk(KERN_INFO "Intel machine check architecture supported.\n"); + rdmsr(MSR_IA32_MCG_CAP, l, h); if (l & (1<<8)) /* Control register present ? */ wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); nr_mce_banks = l & 0xff; - /* Clear status for MC index 0 separately, we don't touch CTL, - * as some K7 Athlons cause spurious MCEs when its enabled. */ + /* + * Clear status for MC index 0 separately, we don't touch CTL, + * as some K7 Athlons cause spurious MCEs when its enabled: + */ if (boot_cpu_data.x86 == 6) { wrmsr(MSR_IA32_MC0_STATUS, 0x0, 0x0); i = 1; } else i = 0; + for (; i < nr_mce_banks; i++) { wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); -- cgit v1.2.3 From 91425084f74c0ad087b3fb6bdad79a825f952720 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 12:31:22 +0200 Subject: x86, mce: clean up winchip.c Make the coding style match that of the rest of the x86 arch code. [ Impact: cleanup ] Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/winchip.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c index 2a043d89811d..81b02487090b 100644 --- a/arch/x86/kernel/cpu/mcheck/winchip.c +++ b/arch/x86/kernel/cpu/mcheck/winchip.c @@ -2,11 +2,10 @@ * IDT Winchip specific Machine Check Exception Reporting * (C) Copyright 2002 Alan Cox */ - -#include -#include -#include #include +#include +#include +#include #include #include @@ -14,7 +13,7 @@ #include "mce.h" -/* Machine check handler for WinChip C6 */ +/* Machine check handler for WinChip C6: */ static void winchip_machine_check(struct pt_regs *regs, long error_code) { printk(KERN_EMERG "CPU0: Machine Check Exception.\n"); @@ -25,12 +24,18 @@ static void winchip_machine_check(struct pt_regs *regs, long error_code) void winchip_mcheck_init(struct cpuinfo_x86 *c) { u32 lo, hi; + machine_check_vector = winchip_machine_check; + /* Make sure the vector pointer is visible before we enable MCEs: */ wmb(); + rdmsr(MSR_IDT_FCR1, lo, hi); lo |= (1<<2); /* Enable EIERRINT (int 18 MCE) */ lo &= ~(1<<4); /* Enable MCE */ wrmsr(MSR_IDT_FCR1, lo, hi); + set_in_cr4(X86_CR4_MCE); - printk(KERN_INFO "Winchip machine check reporting enabled on CPU#0.\n"); + + printk(KERN_INFO + "Winchip machine check reporting enabled on CPU#0.\n"); } -- cgit v1.2.3 From bdbfbdd5e8f0efb9bfef2e597f8ac673c36317ab Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 12:31:20 +0200 Subject: x86, mce: clean up non-fatal.c Make the coding style match that of the rest of the x86 arch code. [ Impact: cleanup ] Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/non-fatal.c | 57 +++++++++++++++++----------------- 1 file changed, 29 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c index a74af128efc9..70b710420f74 100644 --- a/arch/x86/kernel/cpu/mcheck/non-fatal.c +++ b/arch/x86/kernel/cpu/mcheck/non-fatal.c @@ -6,15 +6,14 @@ * This file contains routines to check for non-fatal MCEs every 15s * */ - -#include -#include -#include -#include -#include #include -#include +#include +#include +#include #include +#include +#include +#include #include #include @@ -22,9 +21,9 @@ #include "mce.h" -static int firstbank; +static int firstbank; -#define MCE_RATE 15*HZ /* timer rate is 15s */ +#define MCE_RATE (15*HZ) /* timer rate is 15s */ static void mce_checkregs(void *info) { @@ -34,23 +33,24 @@ static void mce_checkregs(void *info) for (i = firstbank; i < nr_mce_banks; i++) { rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); - if (high & (1<<31)) { - printk(KERN_INFO "MCE: The hardware reports a non " - "fatal, correctable incident occurred on " - "CPU %d.\n", + if (!(high & (1<<31))) + continue; + + printk(KERN_INFO "MCE: The hardware reports a non fatal, " + "correctable incident occurred on CPU %d.\n", smp_processor_id()); - printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low); - - /* - * Scrub the error so we don't pick it up in MCE_RATE - * seconds time. - */ - wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); - - /* Serialize */ - wmb(); - add_taint(TAINT_MACHINE_CHECK); - } + + printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low); + + /* + * Scrub the error so we don't pick it up in MCE_RATE + * seconds time: + */ + wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); + + /* Serialize: */ + wmb(); + add_taint(TAINT_MACHINE_CHECK); } } @@ -77,16 +77,17 @@ static int __init init_nonfatal_mce_checker(void) /* Some Athlons misbehave when we frob bank 0 */ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && - boot_cpu_data.x86 == 6) - firstbank = 1; + boot_cpu_data.x86 == 6) + firstbank = 1; else - firstbank = 0; + firstbank = 0; /* * Check for non-fatal errors every MCE_RATE s */ schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); printk(KERN_INFO "Machine check exception polling timer started.\n"); + return 0; } module_init(init_nonfatal_mce_checker); -- cgit v1.2.3 From cb6f3c155b0afabc48667efb9e7b1ce92ccfcab4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 12:31:19 +0200 Subject: x86, mce: clean up therm_throt.c Make the coding style match that of the rest of the x86 arch code. [ Impact: cleanup ] Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/therm_throt.c | 74 +++++++++++++++++--------------- 1 file changed, 39 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index d5ae2243f0b9..a2b5d7ddb19f 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -1,7 +1,7 @@ /* - * * Thermal throttle event support code (such as syslog messaging and rate * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c). + * * This allows consistent reporting of CPU thermal throttle events. * * Maintains a counter in /sys that keeps track of the number of thermal @@ -13,43 +13,44 @@ * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c. * Inspired by Ross Biro's and Al Borchers' counter code. */ - +#include +#include #include #include #include -#include -#include -#include + #include +#include /* How long to wait between reporting thermal events */ -#define CHECK_INTERVAL (300 * HZ) +#define CHECK_INTERVAL (300 * HZ) static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES; static DEFINE_PER_CPU(unsigned long, thermal_throttle_count); -atomic_t therm_throt_en = ATOMIC_INIT(0); + +atomic_t therm_throt_en = ATOMIC_INIT(0); #ifdef CONFIG_SYSFS -#define define_therm_throt_sysdev_one_ro(_name) \ - static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) - -#define define_therm_throt_sysdev_show_func(name) \ -static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ - struct sysdev_attribute *attr, \ - char *buf) \ -{ \ - unsigned int cpu = dev->id; \ - ssize_t ret; \ - \ - preempt_disable(); /* CPU hotplug */ \ - if (cpu_online(cpu)) \ - ret = sprintf(buf, "%lu\n", \ - per_cpu(thermal_throttle_##name, cpu)); \ - else \ - ret = 0; \ - preempt_enable(); \ - \ - return ret; \ +#define define_therm_throt_sysdev_one_ro(_name) \ + static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) + +#define define_therm_throt_sysdev_show_func(name) \ +static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ + struct sysdev_attribute *attr, \ + char *buf) \ +{ \ + unsigned int cpu = dev->id; \ + ssize_t ret; \ + \ + preempt_disable(); /* CPU hotplug */ \ + if (cpu_online(cpu)) \ + ret = sprintf(buf, "%lu\n", \ + per_cpu(thermal_throttle_##name, cpu)); \ + else \ + ret = 0; \ + preempt_enable(); \ + \ + return ret; \ } define_therm_throt_sysdev_show_func(count); @@ -61,8 +62,8 @@ static struct attribute *thermal_throttle_attrs[] = { }; static struct attribute_group thermal_throttle_attr_group = { - .attrs = thermal_throttle_attrs, - .name = "thermal_throttle" + .attrs = thermal_throttle_attrs, + .name = "thermal_throttle" }; #endif /* CONFIG_SYSFS */ @@ -110,10 +111,11 @@ int therm_throt_process(int curr) } #ifdef CONFIG_SYSFS -/* Add/Remove thermal_throttle interface for CPU device */ +/* Add/Remove thermal_throttle interface for CPU device: */ static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev) { - return sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group); + return sysfs_create_group(&sys_dev->kobj, + &thermal_throttle_attr_group); } static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) @@ -121,19 +123,21 @@ static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group); } -/* Mutex protecting device creation against CPU hotplug */ +/* Mutex protecting device creation against CPU hotplug: */ static DEFINE_MUTEX(therm_cpu_lock); /* Get notified when a cpu comes on/off. Be hotplug friendly. */ -static __cpuinit int thermal_throttle_cpu_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) +static __cpuinit int +thermal_throttle_cpu_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) { unsigned int cpu = (unsigned long)hcpu; struct sys_device *sys_dev; int err = 0; sys_dev = get_cpu_sysdev(cpu); + switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: -- cgit v1.2.3 From 1cb2a8e1767ab60370ecce90654c0f281c602d95 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 12:31:18 +0200 Subject: x86, mce: clean up mce_amd_64.c Make the coding style match that of the rest of the x86 arch code. [ Impact: cleanup ] Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 188 +++++++++++++++++--------------- 1 file changed, 103 insertions(+), 85 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 56dde9c4bc96..4d90ec3eb51d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -13,22 +13,22 @@ * * All MC4_MISCi registers are shared between multi-cores */ - -#include -#include -#include #include -#include #include -#include -#include +#include #include +#include +#include #include +#include +#include +#include + +#include #include +#include #include #include -#include -#include #define PFX "mce_threshold: " #define VERSION "version 1.1.1" @@ -48,26 +48,26 @@ #define MCG_XBLK_ADDR 0xC0000400 struct threshold_block { - unsigned int block; - unsigned int bank; - unsigned int cpu; - u32 address; - u16 interrupt_enable; - u16 threshold_limit; - struct kobject kobj; - struct list_head miscj; + unsigned int block; + unsigned int bank; + unsigned int cpu; + u32 address; + u16 interrupt_enable; + u16 threshold_limit; + struct kobject kobj; + struct list_head miscj; }; /* defaults used early on boot */ static struct threshold_block threshold_defaults = { - .interrupt_enable = 0, - .threshold_limit = THRESHOLD_MAX, + .interrupt_enable = 0, + .threshold_limit = THRESHOLD_MAX, }; struct threshold_bank { - struct kobject *kobj; - struct threshold_block *blocks; - cpumask_var_t cpus; + struct kobject *kobj; + struct threshold_block *blocks; + cpumask_var_t cpus; }; static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); @@ -86,9 +86,9 @@ static void amd_threshold_interrupt(void); */ struct thresh_restart { - struct threshold_block *b; - int reset; - u16 old_limit; + struct threshold_block *b; + int reset; + u16 old_limit; }; /* must be called with correct cpu affinity */ @@ -110,6 +110,7 @@ static void threshold_restart_bank(void *_tr) } else if (tr->old_limit) { /* change limit w/o reset */ int new_count = (mci_misc_hi & THRESHOLD_MAX) + (tr->old_limit - tr->b->threshold_limit); + mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) | (new_count & THRESHOLD_MAX); } @@ -125,11 +126,11 @@ static void threshold_restart_bank(void *_tr) /* cpu init entry point, called from mce.c with preempt off */ void mce_amd_feature_init(struct cpuinfo_x86 *c) { - unsigned int bank, block; unsigned int cpu = smp_processor_id(); - u8 lvt_off; u32 low = 0, high = 0, address = 0; + unsigned int bank, block; struct thresh_restart tr; + u8 lvt_off; for (bank = 0; bank < NR_BANKS; ++bank) { for (block = 0; block < NR_BLOCKS; ++block) { @@ -140,8 +141,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) if (!address) break; address += MCG_XBLK_ADDR; - } - else + } else ++address; if (rdmsr_safe(address, &low, &high)) @@ -193,9 +193,9 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) */ static void amd_threshold_interrupt(void) { + u32 low = 0, high = 0, address = 0; unsigned int bank, block; struct mce m; - u32 low = 0, high = 0, address = 0; mce_setup(&m); @@ -204,16 +204,16 @@ static void amd_threshold_interrupt(void) if (!(per_cpu(bank_map, m.cpu) & (1 << bank))) continue; for (block = 0; block < NR_BLOCKS; ++block) { - if (block == 0) + if (block == 0) { address = MSR_IA32_MC0_MISC + bank * 4; - else if (block == 1) { + } else if (block == 1) { address = (low & MASK_BLKPTR_LO) >> 21; if (!address) break; address += MCG_XBLK_ADDR; - } - else + } else { ++address; + } if (rdmsr_safe(address, &low, &high)) break; @@ -229,8 +229,10 @@ static void amd_threshold_interrupt(void) (high & MASK_LOCKED_HI)) continue; - /* Log the machine check that caused the threshold - event. */ + /* + * Log the machine check that caused the threshold + * event. + */ machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_poll_banks)); @@ -254,48 +256,56 @@ static void amd_threshold_interrupt(void) struct threshold_attr { struct attribute attr; - ssize_t(*show) (struct threshold_block *, char *); - ssize_t(*store) (struct threshold_block *, const char *, size_t count); + ssize_t (*show) (struct threshold_block *, char *); + ssize_t (*store) (struct threshold_block *, const char *, size_t count); }; -#define SHOW_FIELDS(name) \ -static ssize_t show_ ## name(struct threshold_block * b, char *buf) \ -{ \ - return sprintf(buf, "%lx\n", (unsigned long) b->name); \ +#define SHOW_FIELDS(name) \ +static ssize_t show_ ## name(struct threshold_block *b, char *buf) \ +{ \ + return sprintf(buf, "%lx\n", (unsigned long) b->name); \ } SHOW_FIELDS(interrupt_enable) SHOW_FIELDS(threshold_limit) -static ssize_t store_interrupt_enable(struct threshold_block *b, - const char *buf, size_t count) +static ssize_t +store_interrupt_enable(struct threshold_block *b, const char *buf, size_t count) { - char *end; struct thresh_restart tr; - unsigned long new = simple_strtoul(buf, &end, 0); + unsigned long new; + char *end; + + new = simple_strtoul(buf, &end, 0); if (end == buf) return -EINVAL; + b->interrupt_enable = !!new; - tr.b = b; - tr.reset = 0; - tr.old_limit = 0; + tr.b = b; + tr.reset = 0; + tr.old_limit = 0; + smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); return end - buf; } -static ssize_t store_threshold_limit(struct threshold_block *b, - const char *buf, size_t count) +static ssize_t +store_threshold_limit(struct threshold_block *b, const char *buf, size_t count) { - char *end; struct thresh_restart tr; - unsigned long new = simple_strtoul(buf, &end, 0); + unsigned long new; + char *end; + + new = simple_strtoul(buf, &end, 0); if (end == buf) return -EINVAL; + if (new > THRESHOLD_MAX) new = THRESHOLD_MAX; if (new < 1) new = 1; + tr.old_limit = b->threshold_limit; b->threshold_limit = new; tr.b = b; @@ -307,8 +317,8 @@ static ssize_t store_threshold_limit(struct threshold_block *b, } struct threshold_block_cross_cpu { - struct threshold_block *tb; - long retval; + struct threshold_block *tb; + long retval; }; static void local_error_count_handler(void *_tbcc) @@ -338,15 +348,16 @@ static ssize_t store_error_count(struct threshold_block *b, return 1; } -#define THRESHOLD_ATTR(_name,_mode,_show,_store) { \ - .attr = {.name = __stringify(_name), .mode = _mode }, \ - .show = _show, \ - .store = _store, \ +#define THRESHOLD_ATTR(_name, _mode, _show, _store) \ +{ \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .show = _show, \ + .store = _store, \ }; -#define RW_ATTR(name) \ -static struct threshold_attr name = \ - THRESHOLD_ATTR(name, 0644, show_## name, store_## name) +#define RW_ATTR(name) \ +static struct threshold_attr name = \ + THRESHOLD_ATTR(name, 0644, show_## name, store_## name) RW_ATTR(interrupt_enable); RW_ATTR(threshold_limit); @@ -359,15 +370,17 @@ static struct attribute *default_attrs[] = { NULL }; -#define to_block(k) container_of(k, struct threshold_block, kobj) -#define to_attr(a) container_of(a, struct threshold_attr, attr) +#define to_block(k) container_of(k, struct threshold_block, kobj) +#define to_attr(a) container_of(a, struct threshold_attr, attr) static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) { struct threshold_block *b = to_block(kobj); struct threshold_attr *a = to_attr(attr); ssize_t ret; + ret = a->show ? a->show(b, buf) : -EIO; + return ret; } @@ -377,18 +390,20 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr, struct threshold_block *b = to_block(kobj); struct threshold_attr *a = to_attr(attr); ssize_t ret; + ret = a->store ? a->store(b, buf, count) : -EIO; + return ret; } static struct sysfs_ops threshold_ops = { - .show = show, - .store = store, + .show = show, + .store = store, }; static struct kobj_type threshold_ktype = { - .sysfs_ops = &threshold_ops, - .default_attrs = default_attrs, + .sysfs_ops = &threshold_ops, + .default_attrs = default_attrs, }; static __cpuinit int allocate_threshold_blocks(unsigned int cpu, @@ -396,9 +411,9 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu, unsigned int block, u32 address) { - int err; - u32 low, high; struct threshold_block *b = NULL; + u32 low, high; + int err; if ((bank >= NR_BANKS) || (block >= NR_BLOCKS)) return 0; @@ -421,20 +436,21 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu, if (!b) return -ENOMEM; - b->block = block; - b->bank = bank; - b->cpu = cpu; - b->address = address; - b->interrupt_enable = 0; - b->threshold_limit = THRESHOLD_MAX; + b->block = block; + b->bank = bank; + b->cpu = cpu; + b->address = address; + b->interrupt_enable = 0; + b->threshold_limit = THRESHOLD_MAX; INIT_LIST_HEAD(&b->miscj); - if (per_cpu(threshold_banks, cpu)[bank]->blocks) + if (per_cpu(threshold_banks, cpu)[bank]->blocks) { list_add(&b->miscj, &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj); - else + } else { per_cpu(threshold_banks, cpu)[bank]->blocks = b; + } err = kobject_init_and_add(&b->kobj, &threshold_ktype, per_cpu(threshold_banks, cpu)[bank]->kobj, @@ -447,8 +463,9 @@ recurse: if (!address) return 0; address += MCG_XBLK_ADDR; - } else + } else { ++address; + } err = allocate_threshold_blocks(cpu, bank, ++block, address); if (err) @@ -507,6 +524,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) cpumask_copy(b->cpus, cpu_core_mask(cpu)); per_cpu(threshold_banks, cpu)[bank] = b; + goto out; } #endif @@ -605,15 +623,13 @@ static void deallocate_threshold_block(unsigned int cpu, static void threshold_remove_bank(unsigned int cpu, int bank) { - int i = 0; struct threshold_bank *b; char name[32]; + int i = 0; b = per_cpu(threshold_banks, cpu)[bank]; - if (!b) return; - if (!b->blocks) goto free_out; @@ -624,6 +640,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank) if (shared_bank[bank] && b->blocks->cpu != cpu) { sysfs_remove_link(&per_cpu(device_mce, cpu).kobj, name); per_cpu(threshold_banks, cpu)[bank] = NULL; + return; } #endif @@ -659,8 +676,8 @@ static void threshold_remove_device(unsigned int cpu) } /* get notified when a cpu comes on/off */ -static void __cpuinit amd_64_threshold_cpu_callback(unsigned long action, - unsigned int cpu) +static void __cpuinit +amd_64_threshold_cpu_callback(unsigned long action, unsigned int cpu) { if (cpu >= NR_CPUS) return; @@ -686,11 +703,12 @@ static __init int threshold_init_device(void) /* to hit CPUs online before the notifier is up */ for_each_online_cpu(lcpu) { int err = threshold_create_device(lcpu); + if (err) return err; } threshold_cpu_callback = amd_64_threshold_cpu_callback; + return 0; } - device_initcall(threshold_init_device); -- cgit v1.2.3 From 6cc6f3ebd19fea722c19630af5ad68af7f51d493 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Apr 2009 12:31:23 +0200 Subject: x86, mce: unify Intel thermal init, prepare Prepare for unification, make two intel_init_thermal equal. [ Impact: cleanup ] Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 35 +++++++++++++----------- arch/x86/kernel/cpu/mcheck/p4.c | 44 ++++++++++++++----------------- 2 files changed, 40 insertions(+), 39 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index cef3ee30744b..b85d0c107c84 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -34,21 +34,22 @@ asmlinkage void smp_thermal_interrupt(void) irq_exit(); } +static inline void intel_set_thermal_handler(void) { } + static void intel_init_thermal(struct cpuinfo_x86 *c) { - u32 l, h; - int tm2 = 0; unsigned int cpu = smp_processor_id(); + int tm2 = 0; + u32 l, h; - if (!cpu_has(c, X86_FEATURE_ACPI)) - return; - - if (!cpu_has(c, X86_FEATURE_ACC)) + /* Thermal monitoring depends on ACPI and clock modulation*/ + if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) return; - /* first check if TM1 is already enabled by the BIOS, in which - * case there might be some SMM goo which handles it, so we can't even - * put a handler since it might be delivered via SMI already. + /* + * First check if its enabled already, in which case there might + * be some SMM goo which handles it, so we can't even put a handler + * since it might be delivered via SMI already: */ rdmsr(MSR_IA32_MISC_ENABLE, l, h); h = apic_read(APIC_LVTTHMR); @@ -61,31 +62,35 @@ static void intel_init_thermal(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2)) tm2 = 1; + /* Check whether a vector already exists */ if (h & APIC_VECTOR_MASK) { printk(KERN_DEBUG - "CPU%d: Thermal LVT vector (%#x) already " - "installed\n", cpu, (h & APIC_VECTOR_MASK)); + "CPU%d: Thermal LVT vector (%#x) already installed\n", + cpu, (h & APIC_VECTOR_MASK)); return; } - h = THERMAL_APIC_VECTOR; - h |= (APIC_DM_FIXED | APIC_LVT_MASKED); + /* We'll mask the thermal vector in the lapic till we're ready: */ + h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; apic_write(APIC_LVTTHMR, h); rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h); + intel_set_thermal_handler(); + rdmsr(MSR_IA32_MISC_ENABLE, l, h); wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); + /* Unmask the thermal vector: */ l = apic_read(APIC_LVTTHMR); apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); + printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", - cpu, tm2 ? "TM2" : "TM1"); + cpu, tm2 ? "TM2" : "TM1"); /* enable thermal throttle processing */ atomic_set(&therm_throt_en, 1); - return; } /* diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c index cb344aba479c..f70753a443bb 100644 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ b/arch/x86/kernel/cpu/mcheck/p4.c @@ -66,19 +66,21 @@ void smp_thermal_interrupt(struct pt_regs *regs) irq_exit(); } +static void intel_set_thermal_handler(void) +{ + vendor_thermal_interrupt = intel_thermal_interrupt; +} + /* P4/Xeon Thermal regulation detect and init: */ static void intel_init_thermal(struct cpuinfo_x86 *c) { unsigned int cpu = smp_processor_id(); + int tm2 = 0; u32 l, h; - /* Thermal monitoring: */ - if (!cpu_has(c, X86_FEATURE_ACPI)) - return; /* -ENODEV */ - - /* Clock modulation: */ - if (!cpu_has(c, X86_FEATURE_ACC)) - return; /* -ENODEV */ + /* Thermal monitoring depends on ACPI and clock modulation*/ + if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) + return; /* * First check if its enabled already, in which case there might @@ -90,35 +92,28 @@ static void intel_init_thermal(struct cpuinfo_x86 *c) if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n", cpu); - - return; /* -EBUSY */ + return; } - /* Check whether a vector already exists, temporarily masked? */ + if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2)) + tm2 = 1; + + /* Check whether a vector already exists */ if (h & APIC_VECTOR_MASK) { printk(KERN_DEBUG "CPU%d: Thermal LVT vector (%#x) already installed\n", cpu, (h & APIC_VECTOR_MASK)); - - return; /* -EBUSY */ + return; } - /* - * The temperature transition interrupt handler setup: - */ - - /* Our delivery vector: */ - h = THERMAL_APIC_VECTOR; - /* We'll mask the thermal vector in the lapic till we're ready: */ - h |= APIC_DM_FIXED | APIC_LVT_MASKED; + h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; apic_write(APIC_LVTTHMR, h); rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); - wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h); + wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h); - /* Ok, we're good to go... */ - vendor_thermal_interrupt = intel_thermal_interrupt; + intel_set_thermal_handler(); rdmsr(MSR_IA32_MISC_ENABLE, l, h); wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); @@ -127,7 +122,8 @@ static void intel_init_thermal(struct cpuinfo_x86 *c) l = apic_read(APIC_LVTTHMR); apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); - printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu); + printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", + cpu, tm2 ? "TM2" : "TM1"); /* enable thermal throttle processing */ atomic_set(&therm_throt_en, 1); -- cgit v1.2.3 From a65d086235208a3b3546e209d2210048549099b2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Apr 2009 12:31:23 +0200 Subject: x86, mce: unify Intel thermal init Mechanic unification. No change in code. [ Impact: cleanup, 32-bit / 64-bit unification ] Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/Makefile | 3 +- arch/x86/kernel/cpu/mcheck/mce.h | 11 +++++ arch/x86/kernel/cpu/mcheck/mce_intel.c | 73 +++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 61 +------------------------- arch/x86/kernel/cpu/mcheck/p4.c | 59 +------------------------ 5 files changed, 89 insertions(+), 118 deletions(-) create mode 100644 arch/x86/kernel/cpu/mcheck/mce_intel.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index b2f89829bbe8..6def76942bf2 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -1,7 +1,8 @@ obj-y = mce_$(BITS).o therm_throt.o obj-$(CONFIG_X86_32) += k7.o p4.o p5.o p6.o winchip.o -obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o +obj-$(CONFIG_X86_MCE_P4THERMAL) += mce_intel.o +obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o mce_intel.o obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o diff --git a/arch/x86/kernel/cpu/mcheck/mce.h b/arch/x86/kernel/cpu/mcheck/mce.h index ae9f628838f1..2d1a54bdadfc 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.h +++ b/arch/x86/kernel/cpu/mcheck/mce.h @@ -1,6 +1,8 @@ #include #include +#ifdef CONFIG_X86_32 + void amd_mcheck_init(struct cpuinfo_x86 *c); void intel_p4_mcheck_init(struct cpuinfo_x86 *c); void intel_p5_mcheck_init(struct cpuinfo_x86 *c); @@ -12,3 +14,12 @@ extern void (*machine_check_vector)(struct pt_regs *, long error_code); extern int nr_mce_banks; +void intel_set_thermal_handler(void); + +#else + +static inline void intel_set_thermal_handler(void) { } + +#endif + +void intel_init_thermal(struct cpuinfo_x86 *c); diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c new file mode 100644 index 000000000000..bad3cbb0e566 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -0,0 +1,73 @@ +/* + * Common code for Intel machine checks + */ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "mce.h" + +void intel_init_thermal(struct cpuinfo_x86 *c) +{ + unsigned int cpu = smp_processor_id(); + int tm2 = 0; + u32 l, h; + + /* Thermal monitoring depends on ACPI and clock modulation*/ + if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) + return; + + /* + * First check if its enabled already, in which case there might + * be some SMM goo which handles it, so we can't even put a handler + * since it might be delivered via SMI already: + */ + rdmsr(MSR_IA32_MISC_ENABLE, l, h); + h = apic_read(APIC_LVTTHMR); + if ((l & (1 << 3)) && (h & APIC_DM_SMI)) { + printk(KERN_DEBUG + "CPU%d: Thermal monitoring handled by SMI\n", cpu); + return; + } + + if (cpu_has(c, X86_FEATURE_TM2) && (l & (1 << 13))) + tm2 = 1; + + /* Check whether a vector already exists */ + if (h & APIC_VECTOR_MASK) { + printk(KERN_DEBUG + "CPU%d: Thermal LVT vector (%#x) already installed\n", + cpu, (h & APIC_VECTOR_MASK)); + return; + } + + /* We'll mask the thermal vector in the lapic till we're ready: */ + h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; + apic_write(APIC_LVTTHMR, h); + + rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); + wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h); + + intel_set_thermal_handler(); + + rdmsr(MSR_IA32_MISC_ENABLE, l, h); + wrmsr(MSR_IA32_MISC_ENABLE, l | (1 << 3), h); + + /* Unmask the thermal vector: */ + l = apic_read(APIC_LVTTHMR); + apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); + + printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", + cpu, tm2 ? "TM2" : "TM1"); + + /* enable thermal throttle processing */ + atomic_set(&therm_throt_en, 1); +} diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index b85d0c107c84..38f9632306fa 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -17,6 +17,8 @@ #include #include +#include "mce.h" + asmlinkage void smp_thermal_interrupt(void) { __u64 msr_val; @@ -34,65 +36,6 @@ asmlinkage void smp_thermal_interrupt(void) irq_exit(); } -static inline void intel_set_thermal_handler(void) { } - -static void intel_init_thermal(struct cpuinfo_x86 *c) -{ - unsigned int cpu = smp_processor_id(); - int tm2 = 0; - u32 l, h; - - /* Thermal monitoring depends on ACPI and clock modulation*/ - if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) - return; - - /* - * First check if its enabled already, in which case there might - * be some SMM goo which handles it, so we can't even put a handler - * since it might be delivered via SMI already: - */ - rdmsr(MSR_IA32_MISC_ENABLE, l, h); - h = apic_read(APIC_LVTTHMR); - if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { - printk(KERN_DEBUG - "CPU%d: Thermal monitoring handled by SMI\n", cpu); - return; - } - - if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2)) - tm2 = 1; - - /* Check whether a vector already exists */ - if (h & APIC_VECTOR_MASK) { - printk(KERN_DEBUG - "CPU%d: Thermal LVT vector (%#x) already installed\n", - cpu, (h & APIC_VECTOR_MASK)); - return; - } - - /* We'll mask the thermal vector in the lapic till we're ready: */ - h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; - apic_write(APIC_LVTTHMR, h); - - rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); - wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h); - - intel_set_thermal_handler(); - - rdmsr(MSR_IA32_MISC_ENABLE, l, h); - wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); - - /* Unmask the thermal vector: */ - l = apic_read(APIC_LVTTHMR); - apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); - - printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", - cpu, tm2 ? "TM2" : "TM1"); - - /* enable thermal throttle processing */ - atomic_set(&therm_throt_en, 1); -} - /* * Support for Intel Correct Machine Check Interrupts. This allows * the CPU to raise an interrupt when a corrected machine check happened. diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c index f70753a443bb..f979ffea330b 100644 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ b/arch/x86/kernel/cpu/mcheck/p4.c @@ -66,68 +66,11 @@ void smp_thermal_interrupt(struct pt_regs *regs) irq_exit(); } -static void intel_set_thermal_handler(void) +void intel_set_thermal_handler(void) { vendor_thermal_interrupt = intel_thermal_interrupt; } -/* P4/Xeon Thermal regulation detect and init: */ -static void intel_init_thermal(struct cpuinfo_x86 *c) -{ - unsigned int cpu = smp_processor_id(); - int tm2 = 0; - u32 l, h; - - /* Thermal monitoring depends on ACPI and clock modulation*/ - if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) - return; - - /* - * First check if its enabled already, in which case there might - * be some SMM goo which handles it, so we can't even put a handler - * since it might be delivered via SMI already: - */ - rdmsr(MSR_IA32_MISC_ENABLE, l, h); - h = apic_read(APIC_LVTTHMR); - if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { - printk(KERN_DEBUG - "CPU%d: Thermal monitoring handled by SMI\n", cpu); - return; - } - - if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2)) - tm2 = 1; - - /* Check whether a vector already exists */ - if (h & APIC_VECTOR_MASK) { - printk(KERN_DEBUG - "CPU%d: Thermal LVT vector (%#x) already installed\n", - cpu, (h & APIC_VECTOR_MASK)); - return; - } - - /* We'll mask the thermal vector in the lapic till we're ready: */ - h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; - apic_write(APIC_LVTTHMR, h); - - rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); - wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h); - - intel_set_thermal_handler(); - - rdmsr(MSR_IA32_MISC_ENABLE, l, h); - wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); - - /* Unmask the thermal vector: */ - l = apic_read(APIC_LVTTHMR); - apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); - - printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", - cpu, tm2 ? "TM2" : "TM1"); - - /* enable thermal throttle processing */ - atomic_set(&therm_throt_en, 1); -} #endif /* CONFIG_X86_MCE_P4THERMAL */ /* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */ -- cgit v1.2.3 From 06b851d98266b812b2fa23d007cdf53f41194bbb Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 12:31:25 +0200 Subject: x86, mce: unify, prepare 64bit in mce.h Prepare mce.h for unification, so that it will build on 32-bit x86 kernels too. [ Impact: cleanup ] Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 4f8c199584e7..8488210b866f 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -1,8 +1,6 @@ #ifndef _ASM_X86_MCE_H #define _ASM_X86_MCE_H -#ifdef __x86_64__ - #include #include @@ -10,21 +8,21 @@ * Machine Check support for x86 */ -#define MCG_CTL_P (1UL<<8) /* MCG_CAP register available */ +#define MCG_CTL_P (1ULL<<8) /* MCG_CAP register available */ #define MCG_EXT_P (1ULL<<9) /* Extended registers available */ #define MCG_CMCI_P (1ULL<<10) /* CMCI supported */ -#define MCG_STATUS_RIPV (1UL<<0) /* restart ip valid */ -#define MCG_STATUS_EIPV (1UL<<1) /* ip points to correct instruction */ -#define MCG_STATUS_MCIP (1UL<<2) /* machine check in progress */ +#define MCG_STATUS_RIPV (1ULL<<0) /* restart ip valid */ +#define MCG_STATUS_EIPV (1ULL<<1) /* ip points to correct instruction */ +#define MCG_STATUS_MCIP (1ULL<<2) /* machine check in progress */ -#define MCI_STATUS_VAL (1UL<<63) /* valid error */ -#define MCI_STATUS_OVER (1UL<<62) /* previous errors lost */ -#define MCI_STATUS_UC (1UL<<61) /* uncorrected error */ -#define MCI_STATUS_EN (1UL<<60) /* error enabled */ -#define MCI_STATUS_MISCV (1UL<<59) /* misc error reg. valid */ -#define MCI_STATUS_ADDRV (1UL<<58) /* addr reg. valid */ -#define MCI_STATUS_PCC (1UL<<57) /* processor context corrupt */ +#define MCI_STATUS_VAL (1ULL<<63) /* valid error */ +#define MCI_STATUS_OVER (1ULL<<62) /* previous errors lost */ +#define MCI_STATUS_UC (1ULL<<61) /* uncorrected error */ +#define MCI_STATUS_EN (1ULL<<60) /* error enabled */ +#define MCI_STATUS_MISCV (1ULL<<59) /* misc error reg. valid */ +#define MCI_STATUS_ADDRV (1ULL<<58) /* addr reg. valid */ +#define MCI_STATUS_PCC (1ULL<<57) /* processor context corrupt */ /* Fields are zero when not available */ struct mce { @@ -82,13 +80,11 @@ struct mce_log { #define K8_MCE_THRESHOLD_BANK_5 (MCE_THRESHOLD_BASE + 5 * 9) #define K8_MCE_THRESHOLD_DRAM_ECC (MCE_THRESHOLD_BANK_4 + 0) -#endif /* __x86_64__ */ - #ifdef __KERNEL__ #ifdef CONFIG_X86_32 extern int mce_disabled; -#else /* CONFIG_X86_32 */ +#endif #include @@ -143,8 +139,6 @@ extern void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); extern int mce_notify_user(void); -#endif /* !CONFIG_X86_32 */ - #ifdef CONFIG_X86_MCE extern void mcheck_init(struct cpuinfo_x86 *c); #else -- cgit v1.2.3 From a988d334ae8213c0e0e62327222f6e5e6e52bcf1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 12:31:25 +0200 Subject: x86, mce: unify, prepare codes Move current 32-bit mce_32.c code into mce_64.c. [ Remove unused artifact stop/restart_mce pointed by Andi Kleen ] Signed-off-by: Ingo Molnar Cc: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 65 +++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 1491246c4d69..ce48ae75e1dc 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -1240,3 +1240,68 @@ static __init int mce_init_device(void) } device_initcall(mce_init_device); + +#ifdef CONFIG_X86_32 + +int mce_disabled; + +int nr_mce_banks; +EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ + +/* Handle unconfigured int18 (should never happen) */ +static void unexpected_machine_check(struct pt_regs *regs, long error_code) +{ + printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", + smp_processor_id()); +} + +/* Call the installed machine check handler for this CPU setup. */ +void (*machine_check_vector)(struct pt_regs *, long error_code) = + unexpected_machine_check; + +/* This has to be run for each processor */ +void mcheck_init(struct cpuinfo_x86 *c) +{ + if (mce_disabled == 1) + return; + + switch (c->x86_vendor) { + case X86_VENDOR_AMD: + amd_mcheck_init(c); + break; + + case X86_VENDOR_INTEL: + if (c->x86 == 5) + intel_p5_mcheck_init(c); + if (c->x86 == 6) + intel_p6_mcheck_init(c); + if (c->x86 == 15) + intel_p4_mcheck_init(c); + break; + + case X86_VENDOR_CENTAUR: + if (c->x86 == 5) + winchip_mcheck_init(c); + break; + + default: + break; + } +} + +static int __init mcheck_disable(char *str) +{ + mce_disabled = 1; + return 1; +} + +static int __init mcheck_enable(char *str) +{ + mce_disabled = -1; + return 1; +} + +__setup("nomce", mcheck_disable); +__setup("mce", mcheck_enable); + +#endif /* CONFIG_X86_32 */ -- cgit v1.2.3 From 711c2e481c9d1113650d09de10f61ee88ab56fda Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 12:31:26 +0200 Subject: x86, mce: unify, prepare for 32-bit v2 Prepare the 64-bit mce_64.c code side to be built on 32-bit. [ includes ifdef relocation by Andi Kleen ] Signed-off-by: Ingo Molnar Cc: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.h | 4 ++-- arch/x86/kernel/cpu/mcheck/mce_64.c | 6 +++++- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.h b/arch/x86/kernel/cpu/mcheck/mce.h index 2d1a54bdadfc..cd6cffcc2de0 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.h +++ b/arch/x86/kernel/cpu/mcheck/mce.h @@ -1,14 +1,14 @@ #include #include -#ifdef CONFIG_X86_32 - void amd_mcheck_init(struct cpuinfo_x86 *c); void intel_p4_mcheck_init(struct cpuinfo_x86 *c); void intel_p5_mcheck_init(struct cpuinfo_x86 *c); void intel_p6_mcheck_init(struct cpuinfo_x86 *c); void winchip_mcheck_init(struct cpuinfo_x86 *c); +#ifdef CONFIG_X86_32 + /* Call the installed machine check handler for this CPU setup. */ extern void (*machine_check_vector)(struct pt_regs *, long error_code); diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index ce48ae75e1dc..2e2c3d2e9586 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -37,6 +37,10 @@ #include #include +#include "mce.h" + +#ifdef CONFIG_X86_64 + #define MISC_MCELOG_MINOR 227 atomic_t mce_entry; @@ -1241,7 +1245,7 @@ static __init int mce_init_device(void) device_initcall(mce_init_device); -#ifdef CONFIG_X86_32 +#else /* CONFIG_X86_32: */ int mce_disabled; -- cgit v1.2.3 From dba3725d44f5dfb5711fd509fca10b5b828c43b7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 12:31:26 +0200 Subject: x86, mce: unify move mce_64.c => mce.c and glue it up in the Makefile. Remove mce_32.c Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/Makefile | 2 +- arch/x86/kernel/cpu/mcheck/mce.c | 1311 +++++++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/mcheck/mce_32.c | 77 -- arch/x86/kernel/cpu/mcheck/mce_64.c | 1311 ----------------------------------- 4 files changed, 1312 insertions(+), 1389 deletions(-) create mode 100644 arch/x86/kernel/cpu/mcheck/mce.c delete mode 100644 arch/x86/kernel/cpu/mcheck/mce_32.c delete mode 100644 arch/x86/kernel/cpu/mcheck/mce_64.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index 6def76942bf2..55f01b39a105 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -1,4 +1,4 @@ -obj-y = mce_$(BITS).o therm_throt.o +obj-y = mce.o therm_throt.o obj-$(CONFIG_X86_32) += k7.o p4.o p5.o p6.o winchip.o obj-$(CONFIG_X86_MCE_P4THERMAL) += mce_intel.o diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c new file mode 100644 index 000000000000..2e2c3d2e9586 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -0,0 +1,1311 @@ +/* + * Machine check handler. + * + * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. + * Rest from unknown author(s). + * 2004 Andi Kleen. Rewrote most of it. + * Copyright 2008 Intel Corporation + * Author: Andi Kleen + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "mce.h" + +#ifdef CONFIG_X86_64 + +#define MISC_MCELOG_MINOR 227 + +atomic_t mce_entry; + +static int mce_dont_init; + +/* + * Tolerant levels: + * 0: always panic on uncorrected errors, log corrected errors + * 1: panic or SIGBUS on uncorrected errors, log corrected errors + * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors + * 3: never panic or SIGBUS, log all errors (for testing only) + */ +static int tolerant = 1; +static int banks; +static u64 *bank; +static unsigned long notify_user; +static int rip_msr; +static int mce_bootlog = -1; +static atomic_t mce_events; + +static char trigger[128]; +static char *trigger_argv[2] = { trigger, NULL }; + +static DECLARE_WAIT_QUEUE_HEAD(mce_wait); + +/* MCA banks polled by the period polling timer for corrected events */ +DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { + [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL +}; + +/* Do initial initialization of a struct mce */ +void mce_setup(struct mce *m) +{ + memset(m, 0, sizeof(struct mce)); + m->cpu = smp_processor_id(); + rdtscll(m->tsc); +} + +/* + * Lockless MCE logging infrastructure. + * This avoids deadlocks on printk locks without having to break locks. Also + * separate MCEs from kernel messages to avoid bogus bug reports. + */ + +static struct mce_log mcelog = { + MCE_LOG_SIGNATURE, + MCE_LOG_LEN, +}; + +void mce_log(struct mce *mce) +{ + unsigned next, entry; + + atomic_inc(&mce_events); + mce->finished = 0; + wmb(); + for (;;) { + entry = rcu_dereference(mcelog.next); + for (;;) { + /* + * When the buffer fills up discard new entries. + * Assume that the earlier errors are the more + * interesting ones: + */ + if (entry >= MCE_LOG_LEN) { + set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags); + return; + } + /* Old left over entry. Skip: */ + if (mcelog.entry[entry].finished) { + entry++; + continue; + } + break; + } + smp_rmb(); + next = entry + 1; + if (cmpxchg(&mcelog.next, entry, next) == entry) + break; + } + memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); + wmb(); + mcelog.entry[entry].finished = 1; + wmb(); + + set_bit(0, ¬ify_user); +} + +static void print_mce(struct mce *m) +{ + printk(KERN_EMERG "\n" + KERN_EMERG "HARDWARE ERROR\n" + KERN_EMERG + "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", + m->cpu, m->mcgstatus, m->bank, m->status); + if (m->ip) { + printk(KERN_EMERG "RIP%s %02x:<%016Lx> ", + !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", + m->cs, m->ip); + if (m->cs == __KERNEL_CS) + print_symbol("{%s}", m->ip); + printk("\n"); + } + printk(KERN_EMERG "TSC %llx ", m->tsc); + if (m->addr) + printk("ADDR %llx ", m->addr); + if (m->misc) + printk("MISC %llx ", m->misc); + printk("\n"); + printk(KERN_EMERG "This is not a software problem!\n"); + printk(KERN_EMERG "Run through mcelog --ascii to decode " + "and contact your hardware vendor\n"); +} + +static void mce_panic(char *msg, struct mce *backup, unsigned long start) +{ + int i; + + oops_begin(); + for (i = 0; i < MCE_LOG_LEN; i++) { + unsigned long tsc = mcelog.entry[i].tsc; + + if (time_before(tsc, start)) + continue; + print_mce(&mcelog.entry[i]); + if (backup && mcelog.entry[i].tsc == backup->tsc) + backup = NULL; + } + if (backup) + print_mce(backup); + panic(msg); +} + +int mce_available(struct cpuinfo_x86 *c) +{ + if (mce_dont_init) + return 0; + return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); +} + +static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) +{ + if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) { + m->ip = regs->ip; + m->cs = regs->cs; + } else { + m->ip = 0; + m->cs = 0; + } + if (rip_msr) { + /* Assume the RIP in the MSR is exact. Is this true? */ + m->mcgstatus |= MCG_STATUS_EIPV; + rdmsrl(rip_msr, m->ip); + m->cs = 0; + } +} + +/* + * Poll for corrected events or events that happened before reset. + * Those are just logged through /dev/mcelog. + * + * This is executed in standard interrupt context. + */ +void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) +{ + struct mce m; + int i; + + mce_setup(&m); + + rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); + for (i = 0; i < banks; i++) { + if (!bank[i] || !test_bit(i, *b)) + continue; + + m.misc = 0; + m.addr = 0; + m.bank = i; + m.tsc = 0; + + barrier(); + rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); + if (!(m.status & MCI_STATUS_VAL)) + continue; + + /* + * Uncorrected events are handled by the exception handler + * when it is enabled. But when the exception is disabled log + * everything. + * + * TBD do the same check for MCI_STATUS_EN here? + */ + if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC)) + continue; + + if (m.status & MCI_STATUS_MISCV) + rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); + if (m.status & MCI_STATUS_ADDRV) + rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); + + if (!(flags & MCP_TIMESTAMP)) + m.tsc = 0; + /* + * Don't get the IP here because it's unlikely to + * have anything to do with the actual error location. + */ + if (!(flags & MCP_DONTLOG)) { + mce_log(&m); + add_taint(TAINT_MACHINE_CHECK); + } + + /* + * Clear state for this bank. + */ + wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + } + + /* + * Don't clear MCG_STATUS here because it's only defined for + * exceptions. + */ +} + +/* + * The actual machine check handler. This only handles real + * exceptions when something got corrupted coming in through int 18. + * + * This is executed in NMI context not subject to normal locking rules. This + * implies that most kernel services cannot be safely used. Don't even + * think about putting a printk in there! + */ +void do_machine_check(struct pt_regs *regs, long error_code) +{ + struct mce m, panicm; + int panicm_found = 0; + u64 mcestart = 0; + int i; + /* + * If no_way_out gets set, there is no safe way to recover from this + * MCE. If tolerant is cranked up, we'll try anyway. + */ + int no_way_out = 0; + /* + * If kill_it gets set, there might be a way to recover from this + * error. + */ + int kill_it = 0; + DECLARE_BITMAP(toclear, MAX_NR_BANKS); + + atomic_inc(&mce_entry); + + if (notify_die(DIE_NMI, "machine check", regs, error_code, + 18, SIGKILL) == NOTIFY_STOP) + goto out2; + if (!banks) + goto out2; + + mce_setup(&m); + + rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); + + /* if the restart IP is not valid, we're done for */ + if (!(m.mcgstatus & MCG_STATUS_RIPV)) + no_way_out = 1; + + rdtscll(mcestart); + barrier(); + + for (i = 0; i < banks; i++) { + __clear_bit(i, toclear); + if (!bank[i]) + continue; + + m.misc = 0; + m.addr = 0; + m.bank = i; + + rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); + if ((m.status & MCI_STATUS_VAL) == 0) + continue; + + /* + * Non uncorrected errors are handled by machine_check_poll + * Leave them alone. + */ + if ((m.status & MCI_STATUS_UC) == 0) + continue; + + /* + * Set taint even when machine check was not enabled. + */ + add_taint(TAINT_MACHINE_CHECK); + + __set_bit(i, toclear); + + if (m.status & MCI_STATUS_EN) { + /* if PCC was set, there's no way out */ + no_way_out |= !!(m.status & MCI_STATUS_PCC); + /* + * If this error was uncorrectable and there was + * an overflow, we're in trouble. If no overflow, + * we might get away with just killing a task. + */ + if (m.status & MCI_STATUS_UC) { + if (tolerant < 1 || m.status & MCI_STATUS_OVER) + no_way_out = 1; + kill_it = 1; + } + } else { + /* + * Machine check event was not enabled. Clear, but + * ignore. + */ + continue; + } + + if (m.status & MCI_STATUS_MISCV) + rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); + if (m.status & MCI_STATUS_ADDRV) + rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); + + mce_get_rip(&m, regs); + mce_log(&m); + + /* + * Did this bank cause the exception? + * + * Assume that the bank with uncorrectable errors did it, + * and that there is only a single one: + */ + if ((m.status & MCI_STATUS_UC) && + (m.status & MCI_STATUS_EN)) { + panicm = m; + panicm_found = 1; + } + } + + /* + * If we didn't find an uncorrectable error, pick + * the last one (shouldn't happen, just being safe). + */ + if (!panicm_found) + panicm = m; + + /* + * If we have decided that we just CAN'T continue, and the user + * has not set tolerant to an insane level, give up and die. + */ + if (no_way_out && tolerant < 3) + mce_panic("Machine check", &panicm, mcestart); + + /* + * If the error seems to be unrecoverable, something should be + * done. Try to kill as little as possible. If we can kill just + * one task, do that. If the user has set the tolerance very + * high, don't try to do anything at all. + */ + if (kill_it && tolerant < 3) { + int user_space = 0; + + /* + * If the EIPV bit is set, it means the saved IP is the + * instruction which caused the MCE. + */ + if (m.mcgstatus & MCG_STATUS_EIPV) + user_space = panicm.ip && (panicm.cs & 3); + + /* + * If we know that the error was in user space, send a + * SIGBUS. Otherwise, panic if tolerance is low. + * + * force_sig() takes an awful lot of locks and has a slight + * risk of deadlocking. + */ + if (user_space) { + force_sig(SIGBUS, current); + } else if (panic_on_oops || tolerant < 2) { + mce_panic("Uncorrected machine check", + &panicm, mcestart); + } + } + + /* notify userspace ASAP */ + set_thread_flag(TIF_MCE_NOTIFY); + + /* the last thing we do is clear state */ + for (i = 0; i < banks; i++) { + if (test_bit(i, toclear)) + wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + } + wrmsrl(MSR_IA32_MCG_STATUS, 0); + out2: + atomic_dec(&mce_entry); +} + +#ifdef CONFIG_X86_MCE_INTEL +/*** + * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog + * @cpu: The CPU on which the event occurred. + * @status: Event status information + * + * This function should be called by the thermal interrupt after the + * event has been processed and the decision was made to log the event + * further. + * + * The status parameter will be saved to the 'status' field of 'struct mce' + * and historically has been the register value of the + * MSR_IA32_THERMAL_STATUS (Intel) msr. + */ +void mce_log_therm_throt_event(__u64 status) +{ + struct mce m; + + mce_setup(&m); + m.bank = MCE_THERMAL_BANK; + m.status = status; + mce_log(&m); +} +#endif /* CONFIG_X86_MCE_INTEL */ + +/* + * Periodic polling timer for "silent" machine check errors. If the + * poller finds an MCE, poll 2x faster. When the poller finds no more + * errors, poll 2x slower (up to check_interval seconds). + */ +static int check_interval = 5 * 60; /* 5 minutes */ + +static DEFINE_PER_CPU(int, next_interval); /* in jiffies */ +static DEFINE_PER_CPU(struct timer_list, mce_timer); + +static void mcheck_timer(unsigned long data) +{ + struct timer_list *t = &per_cpu(mce_timer, data); + int *n; + + WARN_ON(smp_processor_id() != data); + + if (mce_available(¤t_cpu_data)) { + machine_check_poll(MCP_TIMESTAMP, + &__get_cpu_var(mce_poll_banks)); + } + + /* + * Alert userspace if needed. If we logged an MCE, reduce the + * polling interval, otherwise increase the polling interval. + */ + n = &__get_cpu_var(next_interval); + if (mce_notify_user()) { + *n = max(*n/2, HZ/100); + } else { + *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); + } + + t->expires = jiffies + *n; + add_timer(t); +} + +static void mce_do_trigger(struct work_struct *work) +{ + call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT); +} + +static DECLARE_WORK(mce_trigger_work, mce_do_trigger); + +/* + * Notify the user(s) about new machine check events. + * Can be called from interrupt context, but not from machine check/NMI + * context. + */ +int mce_notify_user(void) +{ + /* Not more than two messages every minute */ + static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); + + clear_thread_flag(TIF_MCE_NOTIFY); + + if (test_and_clear_bit(0, ¬ify_user)) { + wake_up_interruptible(&mce_wait); + + /* + * There is no risk of missing notifications because + * work_pending is always cleared before the function is + * executed. + */ + if (trigger[0] && !work_pending(&mce_trigger_work)) + schedule_work(&mce_trigger_work); + + if (__ratelimit(&ratelimit)) + printk(KERN_INFO "Machine check events logged\n"); + + return 1; + } + return 0; +} + +/* see if the idle task needs to notify userspace: */ +static int +mce_idle_callback(struct notifier_block *nfb, unsigned long action, + void *unused) +{ + /* IDLE_END should be safe - interrupts are back on */ + if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY)) + mce_notify_user(); + + return NOTIFY_OK; +} + +static struct notifier_block mce_idle_notifier = { + .notifier_call = mce_idle_callback, +}; + +static __init int periodic_mcheck_init(void) +{ + idle_notifier_register(&mce_idle_notifier); + return 0; +} +__initcall(periodic_mcheck_init); + +/* + * Initialize Machine Checks for a CPU. + */ +static int mce_cap_init(void) +{ + unsigned b; + u64 cap; + + rdmsrl(MSR_IA32_MCG_CAP, cap); + b = cap & 0xff; + if (b > MAX_NR_BANKS) { + printk(KERN_WARNING + "MCE: Using only %u machine check banks out of %u\n", + MAX_NR_BANKS, b); + b = MAX_NR_BANKS; + } + + /* Don't support asymmetric configurations today */ + WARN_ON(banks != 0 && b != banks); + banks = b; + if (!bank) { + bank = kmalloc(banks * sizeof(u64), GFP_KERNEL); + if (!bank) + return -ENOMEM; + memset(bank, 0xff, banks * sizeof(u64)); + } + + /* Use accurate RIP reporting if available. */ + if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) + rip_msr = MSR_IA32_MCG_EIP; + + return 0; +} + +static void mce_init(void *dummy) +{ + mce_banks_t all_banks; + u64 cap; + int i; + + /* + * Log the machine checks left over from the previous reset. + */ + bitmap_fill(all_banks, MAX_NR_BANKS); + machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks); + + set_in_cr4(X86_CR4_MCE); + + rdmsrl(MSR_IA32_MCG_CAP, cap); + if (cap & MCG_CTL_P) + wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); + + for (i = 0; i < banks; i++) { + wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); + wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + } +} + +/* Add per CPU specific workarounds here */ +static void mce_cpu_quirks(struct cpuinfo_x86 *c) +{ + /* This should be disabled by the BIOS, but isn't always */ + if (c->x86_vendor == X86_VENDOR_AMD) { + if (c->x86 == 15 && banks > 4) { + /* + * disable GART TBL walk error reporting, which + * trips off incorrectly with the IOMMU & 3ware + * & Cerberus: + */ + clear_bit(10, (unsigned long *)&bank[4]); + } + if (c->x86 <= 17 && mce_bootlog < 0) { + /* + * Lots of broken BIOS around that don't clear them + * by default and leave crap in there. Don't log: + */ + mce_bootlog = 0; + } + } + +} + +static void mce_cpu_features(struct cpuinfo_x86 *c) +{ + switch (c->x86_vendor) { + case X86_VENDOR_INTEL: + mce_intel_feature_init(c); + break; + case X86_VENDOR_AMD: + mce_amd_feature_init(c); + break; + default: + break; + } +} + +static void mce_init_timer(void) +{ + struct timer_list *t = &__get_cpu_var(mce_timer); + int *n = &__get_cpu_var(next_interval); + + *n = check_interval * HZ; + if (!*n) + return; + setup_timer(t, mcheck_timer, smp_processor_id()); + t->expires = round_jiffies(jiffies + *n); + add_timer(t); +} + +/* + * Called for each booted CPU to set up machine checks. + * Must be called with preempt off: + */ +void __cpuinit mcheck_init(struct cpuinfo_x86 *c) +{ + if (!mce_available(c)) + return; + + if (mce_cap_init() < 0) { + mce_dont_init = 1; + return; + } + mce_cpu_quirks(c); + + mce_init(NULL); + mce_cpu_features(c); + mce_init_timer(); +} + +/* + * Character device to read and clear the MCE log. + */ + +static DEFINE_SPINLOCK(mce_state_lock); +static int open_count; /* #times opened */ +static int open_exclu; /* already open exclusive? */ + +static int mce_open(struct inode *inode, struct file *file) +{ + lock_kernel(); + spin_lock(&mce_state_lock); + + if (open_exclu || (open_count && (file->f_flags & O_EXCL))) { + spin_unlock(&mce_state_lock); + unlock_kernel(); + + return -EBUSY; + } + + if (file->f_flags & O_EXCL) + open_exclu = 1; + open_count++; + + spin_unlock(&mce_state_lock); + unlock_kernel(); + + return nonseekable_open(inode, file); +} + +static int mce_release(struct inode *inode, struct file *file) +{ + spin_lock(&mce_state_lock); + + open_count--; + open_exclu = 0; + + spin_unlock(&mce_state_lock); + + return 0; +} + +static void collect_tscs(void *data) +{ + unsigned long *cpu_tsc = (unsigned long *)data; + + rdtscll(cpu_tsc[smp_processor_id()]); +} + +static DEFINE_MUTEX(mce_read_mutex); + +static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, + loff_t *off) +{ + char __user *buf = ubuf; + unsigned long *cpu_tsc; + unsigned prev, next; + int i, err; + + cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL); + if (!cpu_tsc) + return -ENOMEM; + + mutex_lock(&mce_read_mutex); + next = rcu_dereference(mcelog.next); + + /* Only supports full reads right now */ + if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { + mutex_unlock(&mce_read_mutex); + kfree(cpu_tsc); + + return -EINVAL; + } + + err = 0; + prev = 0; + do { + for (i = prev; i < next; i++) { + unsigned long start = jiffies; + + while (!mcelog.entry[i].finished) { + if (time_after_eq(jiffies, start + 2)) { + memset(mcelog.entry + i, 0, + sizeof(struct mce)); + goto timeout; + } + cpu_relax(); + } + smp_rmb(); + err |= copy_to_user(buf, mcelog.entry + i, + sizeof(struct mce)); + buf += sizeof(struct mce); +timeout: + ; + } + + memset(mcelog.entry + prev, 0, + (next - prev) * sizeof(struct mce)); + prev = next; + next = cmpxchg(&mcelog.next, prev, 0); + } while (next != prev); + + synchronize_sched(); + + /* + * Collect entries that were still getting written before the + * synchronize. + */ + on_each_cpu(collect_tscs, cpu_tsc, 1); + + for (i = next; i < MCE_LOG_LEN; i++) { + if (mcelog.entry[i].finished && + mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { + err |= copy_to_user(buf, mcelog.entry+i, + sizeof(struct mce)); + smp_rmb(); + buf += sizeof(struct mce); + memset(&mcelog.entry[i], 0, sizeof(struct mce)); + } + } + mutex_unlock(&mce_read_mutex); + kfree(cpu_tsc); + + return err ? -EFAULT : buf - ubuf; +} + +static unsigned int mce_poll(struct file *file, poll_table *wait) +{ + poll_wait(file, &mce_wait, wait); + if (rcu_dereference(mcelog.next)) + return POLLIN | POLLRDNORM; + return 0; +} + +static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg) +{ + int __user *p = (int __user *)arg; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + switch (cmd) { + case MCE_GET_RECORD_LEN: + return put_user(sizeof(struct mce), p); + case MCE_GET_LOG_LEN: + return put_user(MCE_LOG_LEN, p); + case MCE_GETCLEAR_FLAGS: { + unsigned flags; + + do { + flags = mcelog.flags; + } while (cmpxchg(&mcelog.flags, flags, 0) != flags); + + return put_user(flags, p); + } + default: + return -ENOTTY; + } +} + +static const struct file_operations mce_chrdev_ops = { + .open = mce_open, + .release = mce_release, + .read = mce_read, + .poll = mce_poll, + .unlocked_ioctl = mce_ioctl, +}; + +static struct miscdevice mce_log_device = { + MISC_MCELOG_MINOR, + "mcelog", + &mce_chrdev_ops, +}; + +/* + * Old style boot options parsing. Only for compatibility. + */ +static int __init mcheck_disable(char *str) +{ + mce_dont_init = 1; + return 1; +} +__setup("nomce", mcheck_disable); + +/* + * mce=off disables machine check + * mce=TOLERANCELEVEL (number, see above) + * mce=bootlog Log MCEs from before booting. Disabled by default on AMD. + * mce=nobootlog Don't log MCEs from before booting. + */ +static int __init mcheck_enable(char *str) +{ + if (!strcmp(str, "off")) + mce_dont_init = 1; + else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) + mce_bootlog = (str[0] == 'b'); + else if (isdigit(str[0])) + get_option(&str, &tolerant); + else { + printk(KERN_INFO "mce= argument %s ignored. Please use /sys\n", + str); + return 0; + } + return 1; +} +__setup("mce=", mcheck_enable); + +/* + * Sysfs support + */ + +/* + * Disable machine checks on suspend and shutdown. We can't really handle + * them later. + */ +static int mce_disable(void) +{ + int i; + + for (i = 0; i < banks; i++) + wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); + return 0; +} + +static int mce_suspend(struct sys_device *dev, pm_message_t state) +{ + return mce_disable(); +} + +static int mce_shutdown(struct sys_device *dev) +{ + return mce_disable(); +} + +/* + * On resume clear all MCE state. Don't want to see leftovers from the BIOS. + * Only one CPU is active at this time, the others get re-added later using + * CPU hotplug: + */ +static int mce_resume(struct sys_device *dev) +{ + mce_init(NULL); + mce_cpu_features(¤t_cpu_data); + + return 0; +} + +static void mce_cpu_restart(void *data) +{ + del_timer_sync(&__get_cpu_var(mce_timer)); + if (mce_available(¤t_cpu_data)) + mce_init(NULL); + mce_init_timer(); +} + +/* Reinit MCEs after user configuration changes */ +static void mce_restart(void) +{ + on_each_cpu(mce_cpu_restart, NULL, 1); +} + +static struct sysdev_class mce_sysclass = { + .suspend = mce_suspend, + .shutdown = mce_shutdown, + .resume = mce_resume, + .name = "machinecheck", +}; + +DEFINE_PER_CPU(struct sys_device, device_mce); + +__cpuinitdata +void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); + +/* Why are there no generic functions for this? */ +#define ACCESSOR(name, var, start) \ + static ssize_t show_ ## name(struct sys_device *s, \ + struct sysdev_attribute *attr, \ + char *buf) { \ + return sprintf(buf, "%lx\n", (unsigned long)var); \ + } \ + static ssize_t set_ ## name(struct sys_device *s, \ + struct sysdev_attribute *attr, \ + const char *buf, size_t siz) { \ + char *end; \ + unsigned long new = simple_strtoul(buf, &end, 0); \ + \ + if (end == buf) \ + return -EINVAL; \ + var = new; \ + start; \ + \ + return end-buf; \ + } \ + static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); + +static struct sysdev_attribute *bank_attrs; + +static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, + char *buf) +{ + u64 b = bank[attr - bank_attrs]; + + return sprintf(buf, "%llx\n", b); +} + +static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, + const char *buf, size_t siz) +{ + char *end; + u64 new = simple_strtoull(buf, &end, 0); + + if (end == buf) + return -EINVAL; + + bank[attr - bank_attrs] = new; + mce_restart(); + + return end-buf; +} + +static ssize_t +show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf) +{ + strcpy(buf, trigger); + strcat(buf, "\n"); + return strlen(trigger) + 1; +} + +static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, + const char *buf, size_t siz) +{ + char *p; + int len; + + strncpy(trigger, buf, sizeof(trigger)); + trigger[sizeof(trigger)-1] = 0; + len = strlen(trigger); + p = strchr(trigger, '\n'); + + if (*p) + *p = 0; + + return len; +} + +static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); +static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); + +ACCESSOR(check_interval, check_interval, mce_restart()) + +static struct sysdev_attribute *mce_attributes[] = { + &attr_tolerant.attr, &attr_check_interval, &attr_trigger, + NULL +}; + +static cpumask_var_t mce_device_initialized; + +/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */ +static __cpuinit int mce_create_device(unsigned int cpu) +{ + int err; + int i; + + if (!mce_available(&boot_cpu_data)) + return -EIO; + + memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject)); + per_cpu(device_mce, cpu).id = cpu; + per_cpu(device_mce, cpu).cls = &mce_sysclass; + + err = sysdev_register(&per_cpu(device_mce, cpu)); + if (err) + return err; + + for (i = 0; mce_attributes[i]; i++) { + err = sysdev_create_file(&per_cpu(device_mce, cpu), + mce_attributes[i]); + if (err) + goto error; + } + for (i = 0; i < banks; i++) { + err = sysdev_create_file(&per_cpu(device_mce, cpu), + &bank_attrs[i]); + if (err) + goto error2; + } + cpumask_set_cpu(cpu, mce_device_initialized); + + return 0; +error2: + while (--i >= 0) { + sysdev_remove_file(&per_cpu(device_mce, cpu), + &bank_attrs[i]); + } +error: + while (--i >= 0) { + sysdev_remove_file(&per_cpu(device_mce, cpu), + mce_attributes[i]); + } + sysdev_unregister(&per_cpu(device_mce, cpu)); + + return err; +} + +static __cpuinit void mce_remove_device(unsigned int cpu) +{ + int i; + + if (!cpumask_test_cpu(cpu, mce_device_initialized)) + return; + + for (i = 0; mce_attributes[i]; i++) + sysdev_remove_file(&per_cpu(device_mce, cpu), + mce_attributes[i]); + for (i = 0; i < banks; i++) + sysdev_remove_file(&per_cpu(device_mce, cpu), + &bank_attrs[i]); + sysdev_unregister(&per_cpu(device_mce, cpu)); + cpumask_clear_cpu(cpu, mce_device_initialized); +} + +/* Make sure there are no machine checks on offlined CPUs. */ +static void mce_disable_cpu(void *h) +{ + int i; + unsigned long action = *(unsigned long *)h; + + if (!mce_available(¤t_cpu_data)) + return; + if (!(action & CPU_TASKS_FROZEN)) + cmci_clear(); + for (i = 0; i < banks; i++) + wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); +} + +static void mce_reenable_cpu(void *h) +{ + unsigned long action = *(unsigned long *)h; + int i; + + if (!mce_available(¤t_cpu_data)) + return; + + if (!(action & CPU_TASKS_FROZEN)) + cmci_reenable(); + for (i = 0; i < banks; i++) + wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); +} + +/* Get notified when a cpu comes on/off. Be hotplug friendly. */ +static int __cpuinit +mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + struct timer_list *t = &per_cpu(mce_timer, cpu); + + switch (action) { + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + mce_create_device(cpu); + if (threshold_cpu_callback) + threshold_cpu_callback(action, cpu); + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + if (threshold_cpu_callback) + threshold_cpu_callback(action, cpu); + mce_remove_device(cpu); + break; + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + del_timer_sync(t); + smp_call_function_single(cpu, mce_disable_cpu, &action, 1); + break; + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: + t->expires = round_jiffies(jiffies + + __get_cpu_var(next_interval)); + add_timer_on(t, cpu); + smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); + break; + case CPU_POST_DEAD: + /* intentionally ignoring frozen here */ + cmci_rediscover(cpu); + break; + } + return NOTIFY_OK; +} + +static struct notifier_block mce_cpu_notifier __cpuinitdata = { + .notifier_call = mce_cpu_callback, +}; + +static __init int mce_init_banks(void) +{ + int i; + + bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks, + GFP_KERNEL); + if (!bank_attrs) + return -ENOMEM; + + for (i = 0; i < banks; i++) { + struct sysdev_attribute *a = &bank_attrs[i]; + + a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); + if (!a->attr.name) + goto nomem; + + a->attr.mode = 0644; + a->show = show_bank; + a->store = set_bank; + } + return 0; + +nomem: + while (--i >= 0) + kfree(bank_attrs[i].attr.name); + kfree(bank_attrs); + bank_attrs = NULL; + + return -ENOMEM; +} + +static __init int mce_init_device(void) +{ + int err; + int i = 0; + + if (!mce_available(&boot_cpu_data)) + return -EIO; + + alloc_cpumask_var(&mce_device_initialized, GFP_KERNEL); + + err = mce_init_banks(); + if (err) + return err; + + err = sysdev_class_register(&mce_sysclass); + if (err) + return err; + + for_each_online_cpu(i) { + err = mce_create_device(i); + if (err) + return err; + } + + register_hotcpu_notifier(&mce_cpu_notifier); + misc_register(&mce_log_device); + + return err; +} + +device_initcall(mce_init_device); + +#else /* CONFIG_X86_32: */ + +int mce_disabled; + +int nr_mce_banks; +EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ + +/* Handle unconfigured int18 (should never happen) */ +static void unexpected_machine_check(struct pt_regs *regs, long error_code) +{ + printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", + smp_processor_id()); +} + +/* Call the installed machine check handler for this CPU setup. */ +void (*machine_check_vector)(struct pt_regs *, long error_code) = + unexpected_machine_check; + +/* This has to be run for each processor */ +void mcheck_init(struct cpuinfo_x86 *c) +{ + if (mce_disabled == 1) + return; + + switch (c->x86_vendor) { + case X86_VENDOR_AMD: + amd_mcheck_init(c); + break; + + case X86_VENDOR_INTEL: + if (c->x86 == 5) + intel_p5_mcheck_init(c); + if (c->x86 == 6) + intel_p6_mcheck_init(c); + if (c->x86 == 15) + intel_p4_mcheck_init(c); + break; + + case X86_VENDOR_CENTAUR: + if (c->x86 == 5) + winchip_mcheck_init(c); + break; + + default: + break; + } +} + +static int __init mcheck_disable(char *str) +{ + mce_disabled = 1; + return 1; +} + +static int __init mcheck_enable(char *str) +{ + mce_disabled = -1; + return 1; +} + +__setup("nomce", mcheck_disable); +__setup("mce", mcheck_enable); + +#endif /* CONFIG_X86_32 */ diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c deleted file mode 100644 index 05979e7eff12..000000000000 --- a/arch/x86/kernel/cpu/mcheck/mce_32.c +++ /dev/null @@ -1,77 +0,0 @@ -/* - * mce.c - x86 Machine Check Exception Reporting - * (c) 2002 Alan Cox , Dave Jones - */ -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "mce.h" - -int mce_disabled; - -int nr_mce_banks; -EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ - -/* Handle unconfigured int18 (should never happen) */ -static void unexpected_machine_check(struct pt_regs *regs, long error_code) -{ - printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", - smp_processor_id()); -} - -/* Call the installed machine check handler for this CPU setup. */ -void (*machine_check_vector)(struct pt_regs *, long error_code) = - unexpected_machine_check; - -/* This has to be run for each processor */ -void mcheck_init(struct cpuinfo_x86 *c) -{ - if (mce_disabled == 1) - return; - - switch (c->x86_vendor) { - case X86_VENDOR_AMD: - amd_mcheck_init(c); - break; - - case X86_VENDOR_INTEL: - if (c->x86 == 5) - intel_p5_mcheck_init(c); - if (c->x86 == 6) - intel_p6_mcheck_init(c); - if (c->x86 == 15) - intel_p4_mcheck_init(c); - break; - - case X86_VENDOR_CENTAUR: - if (c->x86 == 5) - winchip_mcheck_init(c); - break; - - default: - break; - } -} - -static int __init mcheck_disable(char *str) -{ - mce_disabled = 1; - return 1; -} - -static int __init mcheck_enable(char *str) -{ - mce_disabled = -1; - return 1; -} - -__setup("nomce", mcheck_disable); -__setup("mce", mcheck_enable); diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c deleted file mode 100644 index 2e2c3d2e9586..000000000000 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ /dev/null @@ -1,1311 +0,0 @@ -/* - * Machine check handler. - * - * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. - * Rest from unknown author(s). - * 2004 Andi Kleen. Rewrote most of it. - * Copyright 2008 Intel Corporation - * Author: Andi Kleen - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include "mce.h" - -#ifdef CONFIG_X86_64 - -#define MISC_MCELOG_MINOR 227 - -atomic_t mce_entry; - -static int mce_dont_init; - -/* - * Tolerant levels: - * 0: always panic on uncorrected errors, log corrected errors - * 1: panic or SIGBUS on uncorrected errors, log corrected errors - * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors - * 3: never panic or SIGBUS, log all errors (for testing only) - */ -static int tolerant = 1; -static int banks; -static u64 *bank; -static unsigned long notify_user; -static int rip_msr; -static int mce_bootlog = -1; -static atomic_t mce_events; - -static char trigger[128]; -static char *trigger_argv[2] = { trigger, NULL }; - -static DECLARE_WAIT_QUEUE_HEAD(mce_wait); - -/* MCA banks polled by the period polling timer for corrected events */ -DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { - [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL -}; - -/* Do initial initialization of a struct mce */ -void mce_setup(struct mce *m) -{ - memset(m, 0, sizeof(struct mce)); - m->cpu = smp_processor_id(); - rdtscll(m->tsc); -} - -/* - * Lockless MCE logging infrastructure. - * This avoids deadlocks on printk locks without having to break locks. Also - * separate MCEs from kernel messages to avoid bogus bug reports. - */ - -static struct mce_log mcelog = { - MCE_LOG_SIGNATURE, - MCE_LOG_LEN, -}; - -void mce_log(struct mce *mce) -{ - unsigned next, entry; - - atomic_inc(&mce_events); - mce->finished = 0; - wmb(); - for (;;) { - entry = rcu_dereference(mcelog.next); - for (;;) { - /* - * When the buffer fills up discard new entries. - * Assume that the earlier errors are the more - * interesting ones: - */ - if (entry >= MCE_LOG_LEN) { - set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags); - return; - } - /* Old left over entry. Skip: */ - if (mcelog.entry[entry].finished) { - entry++; - continue; - } - break; - } - smp_rmb(); - next = entry + 1; - if (cmpxchg(&mcelog.next, entry, next) == entry) - break; - } - memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); - wmb(); - mcelog.entry[entry].finished = 1; - wmb(); - - set_bit(0, ¬ify_user); -} - -static void print_mce(struct mce *m) -{ - printk(KERN_EMERG "\n" - KERN_EMERG "HARDWARE ERROR\n" - KERN_EMERG - "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", - m->cpu, m->mcgstatus, m->bank, m->status); - if (m->ip) { - printk(KERN_EMERG "RIP%s %02x:<%016Lx> ", - !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", - m->cs, m->ip); - if (m->cs == __KERNEL_CS) - print_symbol("{%s}", m->ip); - printk("\n"); - } - printk(KERN_EMERG "TSC %llx ", m->tsc); - if (m->addr) - printk("ADDR %llx ", m->addr); - if (m->misc) - printk("MISC %llx ", m->misc); - printk("\n"); - printk(KERN_EMERG "This is not a software problem!\n"); - printk(KERN_EMERG "Run through mcelog --ascii to decode " - "and contact your hardware vendor\n"); -} - -static void mce_panic(char *msg, struct mce *backup, unsigned long start) -{ - int i; - - oops_begin(); - for (i = 0; i < MCE_LOG_LEN; i++) { - unsigned long tsc = mcelog.entry[i].tsc; - - if (time_before(tsc, start)) - continue; - print_mce(&mcelog.entry[i]); - if (backup && mcelog.entry[i].tsc == backup->tsc) - backup = NULL; - } - if (backup) - print_mce(backup); - panic(msg); -} - -int mce_available(struct cpuinfo_x86 *c) -{ - if (mce_dont_init) - return 0; - return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); -} - -static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) -{ - if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) { - m->ip = regs->ip; - m->cs = regs->cs; - } else { - m->ip = 0; - m->cs = 0; - } - if (rip_msr) { - /* Assume the RIP in the MSR is exact. Is this true? */ - m->mcgstatus |= MCG_STATUS_EIPV; - rdmsrl(rip_msr, m->ip); - m->cs = 0; - } -} - -/* - * Poll for corrected events or events that happened before reset. - * Those are just logged through /dev/mcelog. - * - * This is executed in standard interrupt context. - */ -void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) -{ - struct mce m; - int i; - - mce_setup(&m); - - rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); - for (i = 0; i < banks; i++) { - if (!bank[i] || !test_bit(i, *b)) - continue; - - m.misc = 0; - m.addr = 0; - m.bank = i; - m.tsc = 0; - - barrier(); - rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); - if (!(m.status & MCI_STATUS_VAL)) - continue; - - /* - * Uncorrected events are handled by the exception handler - * when it is enabled. But when the exception is disabled log - * everything. - * - * TBD do the same check for MCI_STATUS_EN here? - */ - if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC)) - continue; - - if (m.status & MCI_STATUS_MISCV) - rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); - if (m.status & MCI_STATUS_ADDRV) - rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); - - if (!(flags & MCP_TIMESTAMP)) - m.tsc = 0; - /* - * Don't get the IP here because it's unlikely to - * have anything to do with the actual error location. - */ - if (!(flags & MCP_DONTLOG)) { - mce_log(&m); - add_taint(TAINT_MACHINE_CHECK); - } - - /* - * Clear state for this bank. - */ - wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); - } - - /* - * Don't clear MCG_STATUS here because it's only defined for - * exceptions. - */ -} - -/* - * The actual machine check handler. This only handles real - * exceptions when something got corrupted coming in through int 18. - * - * This is executed in NMI context not subject to normal locking rules. This - * implies that most kernel services cannot be safely used. Don't even - * think about putting a printk in there! - */ -void do_machine_check(struct pt_regs *regs, long error_code) -{ - struct mce m, panicm; - int panicm_found = 0; - u64 mcestart = 0; - int i; - /* - * If no_way_out gets set, there is no safe way to recover from this - * MCE. If tolerant is cranked up, we'll try anyway. - */ - int no_way_out = 0; - /* - * If kill_it gets set, there might be a way to recover from this - * error. - */ - int kill_it = 0; - DECLARE_BITMAP(toclear, MAX_NR_BANKS); - - atomic_inc(&mce_entry); - - if (notify_die(DIE_NMI, "machine check", regs, error_code, - 18, SIGKILL) == NOTIFY_STOP) - goto out2; - if (!banks) - goto out2; - - mce_setup(&m); - - rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); - - /* if the restart IP is not valid, we're done for */ - if (!(m.mcgstatus & MCG_STATUS_RIPV)) - no_way_out = 1; - - rdtscll(mcestart); - barrier(); - - for (i = 0; i < banks; i++) { - __clear_bit(i, toclear); - if (!bank[i]) - continue; - - m.misc = 0; - m.addr = 0; - m.bank = i; - - rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); - if ((m.status & MCI_STATUS_VAL) == 0) - continue; - - /* - * Non uncorrected errors are handled by machine_check_poll - * Leave them alone. - */ - if ((m.status & MCI_STATUS_UC) == 0) - continue; - - /* - * Set taint even when machine check was not enabled. - */ - add_taint(TAINT_MACHINE_CHECK); - - __set_bit(i, toclear); - - if (m.status & MCI_STATUS_EN) { - /* if PCC was set, there's no way out */ - no_way_out |= !!(m.status & MCI_STATUS_PCC); - /* - * If this error was uncorrectable and there was - * an overflow, we're in trouble. If no overflow, - * we might get away with just killing a task. - */ - if (m.status & MCI_STATUS_UC) { - if (tolerant < 1 || m.status & MCI_STATUS_OVER) - no_way_out = 1; - kill_it = 1; - } - } else { - /* - * Machine check event was not enabled. Clear, but - * ignore. - */ - continue; - } - - if (m.status & MCI_STATUS_MISCV) - rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); - if (m.status & MCI_STATUS_ADDRV) - rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); - - mce_get_rip(&m, regs); - mce_log(&m); - - /* - * Did this bank cause the exception? - * - * Assume that the bank with uncorrectable errors did it, - * and that there is only a single one: - */ - if ((m.status & MCI_STATUS_UC) && - (m.status & MCI_STATUS_EN)) { - panicm = m; - panicm_found = 1; - } - } - - /* - * If we didn't find an uncorrectable error, pick - * the last one (shouldn't happen, just being safe). - */ - if (!panicm_found) - panicm = m; - - /* - * If we have decided that we just CAN'T continue, and the user - * has not set tolerant to an insane level, give up and die. - */ - if (no_way_out && tolerant < 3) - mce_panic("Machine check", &panicm, mcestart); - - /* - * If the error seems to be unrecoverable, something should be - * done. Try to kill as little as possible. If we can kill just - * one task, do that. If the user has set the tolerance very - * high, don't try to do anything at all. - */ - if (kill_it && tolerant < 3) { - int user_space = 0; - - /* - * If the EIPV bit is set, it means the saved IP is the - * instruction which caused the MCE. - */ - if (m.mcgstatus & MCG_STATUS_EIPV) - user_space = panicm.ip && (panicm.cs & 3); - - /* - * If we know that the error was in user space, send a - * SIGBUS. Otherwise, panic if tolerance is low. - * - * force_sig() takes an awful lot of locks and has a slight - * risk of deadlocking. - */ - if (user_space) { - force_sig(SIGBUS, current); - } else if (panic_on_oops || tolerant < 2) { - mce_panic("Uncorrected machine check", - &panicm, mcestart); - } - } - - /* notify userspace ASAP */ - set_thread_flag(TIF_MCE_NOTIFY); - - /* the last thing we do is clear state */ - for (i = 0; i < banks; i++) { - if (test_bit(i, toclear)) - wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); - } - wrmsrl(MSR_IA32_MCG_STATUS, 0); - out2: - atomic_dec(&mce_entry); -} - -#ifdef CONFIG_X86_MCE_INTEL -/*** - * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog - * @cpu: The CPU on which the event occurred. - * @status: Event status information - * - * This function should be called by the thermal interrupt after the - * event has been processed and the decision was made to log the event - * further. - * - * The status parameter will be saved to the 'status' field of 'struct mce' - * and historically has been the register value of the - * MSR_IA32_THERMAL_STATUS (Intel) msr. - */ -void mce_log_therm_throt_event(__u64 status) -{ - struct mce m; - - mce_setup(&m); - m.bank = MCE_THERMAL_BANK; - m.status = status; - mce_log(&m); -} -#endif /* CONFIG_X86_MCE_INTEL */ - -/* - * Periodic polling timer for "silent" machine check errors. If the - * poller finds an MCE, poll 2x faster. When the poller finds no more - * errors, poll 2x slower (up to check_interval seconds). - */ -static int check_interval = 5 * 60; /* 5 minutes */ - -static DEFINE_PER_CPU(int, next_interval); /* in jiffies */ -static DEFINE_PER_CPU(struct timer_list, mce_timer); - -static void mcheck_timer(unsigned long data) -{ - struct timer_list *t = &per_cpu(mce_timer, data); - int *n; - - WARN_ON(smp_processor_id() != data); - - if (mce_available(¤t_cpu_data)) { - machine_check_poll(MCP_TIMESTAMP, - &__get_cpu_var(mce_poll_banks)); - } - - /* - * Alert userspace if needed. If we logged an MCE, reduce the - * polling interval, otherwise increase the polling interval. - */ - n = &__get_cpu_var(next_interval); - if (mce_notify_user()) { - *n = max(*n/2, HZ/100); - } else { - *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); - } - - t->expires = jiffies + *n; - add_timer(t); -} - -static void mce_do_trigger(struct work_struct *work) -{ - call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT); -} - -static DECLARE_WORK(mce_trigger_work, mce_do_trigger); - -/* - * Notify the user(s) about new machine check events. - * Can be called from interrupt context, but not from machine check/NMI - * context. - */ -int mce_notify_user(void) -{ - /* Not more than two messages every minute */ - static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); - - clear_thread_flag(TIF_MCE_NOTIFY); - - if (test_and_clear_bit(0, ¬ify_user)) { - wake_up_interruptible(&mce_wait); - - /* - * There is no risk of missing notifications because - * work_pending is always cleared before the function is - * executed. - */ - if (trigger[0] && !work_pending(&mce_trigger_work)) - schedule_work(&mce_trigger_work); - - if (__ratelimit(&ratelimit)) - printk(KERN_INFO "Machine check events logged\n"); - - return 1; - } - return 0; -} - -/* see if the idle task needs to notify userspace: */ -static int -mce_idle_callback(struct notifier_block *nfb, unsigned long action, - void *unused) -{ - /* IDLE_END should be safe - interrupts are back on */ - if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY)) - mce_notify_user(); - - return NOTIFY_OK; -} - -static struct notifier_block mce_idle_notifier = { - .notifier_call = mce_idle_callback, -}; - -static __init int periodic_mcheck_init(void) -{ - idle_notifier_register(&mce_idle_notifier); - return 0; -} -__initcall(periodic_mcheck_init); - -/* - * Initialize Machine Checks for a CPU. - */ -static int mce_cap_init(void) -{ - unsigned b; - u64 cap; - - rdmsrl(MSR_IA32_MCG_CAP, cap); - b = cap & 0xff; - if (b > MAX_NR_BANKS) { - printk(KERN_WARNING - "MCE: Using only %u machine check banks out of %u\n", - MAX_NR_BANKS, b); - b = MAX_NR_BANKS; - } - - /* Don't support asymmetric configurations today */ - WARN_ON(banks != 0 && b != banks); - banks = b; - if (!bank) { - bank = kmalloc(banks * sizeof(u64), GFP_KERNEL); - if (!bank) - return -ENOMEM; - memset(bank, 0xff, banks * sizeof(u64)); - } - - /* Use accurate RIP reporting if available. */ - if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) - rip_msr = MSR_IA32_MCG_EIP; - - return 0; -} - -static void mce_init(void *dummy) -{ - mce_banks_t all_banks; - u64 cap; - int i; - - /* - * Log the machine checks left over from the previous reset. - */ - bitmap_fill(all_banks, MAX_NR_BANKS); - machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks); - - set_in_cr4(X86_CR4_MCE); - - rdmsrl(MSR_IA32_MCG_CAP, cap); - if (cap & MCG_CTL_P) - wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); - - for (i = 0; i < banks; i++) { - wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); - wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); - } -} - -/* Add per CPU specific workarounds here */ -static void mce_cpu_quirks(struct cpuinfo_x86 *c) -{ - /* This should be disabled by the BIOS, but isn't always */ - if (c->x86_vendor == X86_VENDOR_AMD) { - if (c->x86 == 15 && banks > 4) { - /* - * disable GART TBL walk error reporting, which - * trips off incorrectly with the IOMMU & 3ware - * & Cerberus: - */ - clear_bit(10, (unsigned long *)&bank[4]); - } - if (c->x86 <= 17 && mce_bootlog < 0) { - /* - * Lots of broken BIOS around that don't clear them - * by default and leave crap in there. Don't log: - */ - mce_bootlog = 0; - } - } - -} - -static void mce_cpu_features(struct cpuinfo_x86 *c) -{ - switch (c->x86_vendor) { - case X86_VENDOR_INTEL: - mce_intel_feature_init(c); - break; - case X86_VENDOR_AMD: - mce_amd_feature_init(c); - break; - default: - break; - } -} - -static void mce_init_timer(void) -{ - struct timer_list *t = &__get_cpu_var(mce_timer); - int *n = &__get_cpu_var(next_interval); - - *n = check_interval * HZ; - if (!*n) - return; - setup_timer(t, mcheck_timer, smp_processor_id()); - t->expires = round_jiffies(jiffies + *n); - add_timer(t); -} - -/* - * Called for each booted CPU to set up machine checks. - * Must be called with preempt off: - */ -void __cpuinit mcheck_init(struct cpuinfo_x86 *c) -{ - if (!mce_available(c)) - return; - - if (mce_cap_init() < 0) { - mce_dont_init = 1; - return; - } - mce_cpu_quirks(c); - - mce_init(NULL); - mce_cpu_features(c); - mce_init_timer(); -} - -/* - * Character device to read and clear the MCE log. - */ - -static DEFINE_SPINLOCK(mce_state_lock); -static int open_count; /* #times opened */ -static int open_exclu; /* already open exclusive? */ - -static int mce_open(struct inode *inode, struct file *file) -{ - lock_kernel(); - spin_lock(&mce_state_lock); - - if (open_exclu || (open_count && (file->f_flags & O_EXCL))) { - spin_unlock(&mce_state_lock); - unlock_kernel(); - - return -EBUSY; - } - - if (file->f_flags & O_EXCL) - open_exclu = 1; - open_count++; - - spin_unlock(&mce_state_lock); - unlock_kernel(); - - return nonseekable_open(inode, file); -} - -static int mce_release(struct inode *inode, struct file *file) -{ - spin_lock(&mce_state_lock); - - open_count--; - open_exclu = 0; - - spin_unlock(&mce_state_lock); - - return 0; -} - -static void collect_tscs(void *data) -{ - unsigned long *cpu_tsc = (unsigned long *)data; - - rdtscll(cpu_tsc[smp_processor_id()]); -} - -static DEFINE_MUTEX(mce_read_mutex); - -static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, - loff_t *off) -{ - char __user *buf = ubuf; - unsigned long *cpu_tsc; - unsigned prev, next; - int i, err; - - cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL); - if (!cpu_tsc) - return -ENOMEM; - - mutex_lock(&mce_read_mutex); - next = rcu_dereference(mcelog.next); - - /* Only supports full reads right now */ - if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { - mutex_unlock(&mce_read_mutex); - kfree(cpu_tsc); - - return -EINVAL; - } - - err = 0; - prev = 0; - do { - for (i = prev; i < next; i++) { - unsigned long start = jiffies; - - while (!mcelog.entry[i].finished) { - if (time_after_eq(jiffies, start + 2)) { - memset(mcelog.entry + i, 0, - sizeof(struct mce)); - goto timeout; - } - cpu_relax(); - } - smp_rmb(); - err |= copy_to_user(buf, mcelog.entry + i, - sizeof(struct mce)); - buf += sizeof(struct mce); -timeout: - ; - } - - memset(mcelog.entry + prev, 0, - (next - prev) * sizeof(struct mce)); - prev = next; - next = cmpxchg(&mcelog.next, prev, 0); - } while (next != prev); - - synchronize_sched(); - - /* - * Collect entries that were still getting written before the - * synchronize. - */ - on_each_cpu(collect_tscs, cpu_tsc, 1); - - for (i = next; i < MCE_LOG_LEN; i++) { - if (mcelog.entry[i].finished && - mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { - err |= copy_to_user(buf, mcelog.entry+i, - sizeof(struct mce)); - smp_rmb(); - buf += sizeof(struct mce); - memset(&mcelog.entry[i], 0, sizeof(struct mce)); - } - } - mutex_unlock(&mce_read_mutex); - kfree(cpu_tsc); - - return err ? -EFAULT : buf - ubuf; -} - -static unsigned int mce_poll(struct file *file, poll_table *wait) -{ - poll_wait(file, &mce_wait, wait); - if (rcu_dereference(mcelog.next)) - return POLLIN | POLLRDNORM; - return 0; -} - -static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg) -{ - int __user *p = (int __user *)arg; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - switch (cmd) { - case MCE_GET_RECORD_LEN: - return put_user(sizeof(struct mce), p); - case MCE_GET_LOG_LEN: - return put_user(MCE_LOG_LEN, p); - case MCE_GETCLEAR_FLAGS: { - unsigned flags; - - do { - flags = mcelog.flags; - } while (cmpxchg(&mcelog.flags, flags, 0) != flags); - - return put_user(flags, p); - } - default: - return -ENOTTY; - } -} - -static const struct file_operations mce_chrdev_ops = { - .open = mce_open, - .release = mce_release, - .read = mce_read, - .poll = mce_poll, - .unlocked_ioctl = mce_ioctl, -}; - -static struct miscdevice mce_log_device = { - MISC_MCELOG_MINOR, - "mcelog", - &mce_chrdev_ops, -}; - -/* - * Old style boot options parsing. Only for compatibility. - */ -static int __init mcheck_disable(char *str) -{ - mce_dont_init = 1; - return 1; -} -__setup("nomce", mcheck_disable); - -/* - * mce=off disables machine check - * mce=TOLERANCELEVEL (number, see above) - * mce=bootlog Log MCEs from before booting. Disabled by default on AMD. - * mce=nobootlog Don't log MCEs from before booting. - */ -static int __init mcheck_enable(char *str) -{ - if (!strcmp(str, "off")) - mce_dont_init = 1; - else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) - mce_bootlog = (str[0] == 'b'); - else if (isdigit(str[0])) - get_option(&str, &tolerant); - else { - printk(KERN_INFO "mce= argument %s ignored. Please use /sys\n", - str); - return 0; - } - return 1; -} -__setup("mce=", mcheck_enable); - -/* - * Sysfs support - */ - -/* - * Disable machine checks on suspend and shutdown. We can't really handle - * them later. - */ -static int mce_disable(void) -{ - int i; - - for (i = 0; i < banks; i++) - wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); - return 0; -} - -static int mce_suspend(struct sys_device *dev, pm_message_t state) -{ - return mce_disable(); -} - -static int mce_shutdown(struct sys_device *dev) -{ - return mce_disable(); -} - -/* - * On resume clear all MCE state. Don't want to see leftovers from the BIOS. - * Only one CPU is active at this time, the others get re-added later using - * CPU hotplug: - */ -static int mce_resume(struct sys_device *dev) -{ - mce_init(NULL); - mce_cpu_features(¤t_cpu_data); - - return 0; -} - -static void mce_cpu_restart(void *data) -{ - del_timer_sync(&__get_cpu_var(mce_timer)); - if (mce_available(¤t_cpu_data)) - mce_init(NULL); - mce_init_timer(); -} - -/* Reinit MCEs after user configuration changes */ -static void mce_restart(void) -{ - on_each_cpu(mce_cpu_restart, NULL, 1); -} - -static struct sysdev_class mce_sysclass = { - .suspend = mce_suspend, - .shutdown = mce_shutdown, - .resume = mce_resume, - .name = "machinecheck", -}; - -DEFINE_PER_CPU(struct sys_device, device_mce); - -__cpuinitdata -void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); - -/* Why are there no generic functions for this? */ -#define ACCESSOR(name, var, start) \ - static ssize_t show_ ## name(struct sys_device *s, \ - struct sysdev_attribute *attr, \ - char *buf) { \ - return sprintf(buf, "%lx\n", (unsigned long)var); \ - } \ - static ssize_t set_ ## name(struct sys_device *s, \ - struct sysdev_attribute *attr, \ - const char *buf, size_t siz) { \ - char *end; \ - unsigned long new = simple_strtoul(buf, &end, 0); \ - \ - if (end == buf) \ - return -EINVAL; \ - var = new; \ - start; \ - \ - return end-buf; \ - } \ - static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); - -static struct sysdev_attribute *bank_attrs; - -static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, - char *buf) -{ - u64 b = bank[attr - bank_attrs]; - - return sprintf(buf, "%llx\n", b); -} - -static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, - const char *buf, size_t siz) -{ - char *end; - u64 new = simple_strtoull(buf, &end, 0); - - if (end == buf) - return -EINVAL; - - bank[attr - bank_attrs] = new; - mce_restart(); - - return end-buf; -} - -static ssize_t -show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf) -{ - strcpy(buf, trigger); - strcat(buf, "\n"); - return strlen(trigger) + 1; -} - -static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, - const char *buf, size_t siz) -{ - char *p; - int len; - - strncpy(trigger, buf, sizeof(trigger)); - trigger[sizeof(trigger)-1] = 0; - len = strlen(trigger); - p = strchr(trigger, '\n'); - - if (*p) - *p = 0; - - return len; -} - -static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); -static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); - -ACCESSOR(check_interval, check_interval, mce_restart()) - -static struct sysdev_attribute *mce_attributes[] = { - &attr_tolerant.attr, &attr_check_interval, &attr_trigger, - NULL -}; - -static cpumask_var_t mce_device_initialized; - -/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */ -static __cpuinit int mce_create_device(unsigned int cpu) -{ - int err; - int i; - - if (!mce_available(&boot_cpu_data)) - return -EIO; - - memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject)); - per_cpu(device_mce, cpu).id = cpu; - per_cpu(device_mce, cpu).cls = &mce_sysclass; - - err = sysdev_register(&per_cpu(device_mce, cpu)); - if (err) - return err; - - for (i = 0; mce_attributes[i]; i++) { - err = sysdev_create_file(&per_cpu(device_mce, cpu), - mce_attributes[i]); - if (err) - goto error; - } - for (i = 0; i < banks; i++) { - err = sysdev_create_file(&per_cpu(device_mce, cpu), - &bank_attrs[i]); - if (err) - goto error2; - } - cpumask_set_cpu(cpu, mce_device_initialized); - - return 0; -error2: - while (--i >= 0) { - sysdev_remove_file(&per_cpu(device_mce, cpu), - &bank_attrs[i]); - } -error: - while (--i >= 0) { - sysdev_remove_file(&per_cpu(device_mce, cpu), - mce_attributes[i]); - } - sysdev_unregister(&per_cpu(device_mce, cpu)); - - return err; -} - -static __cpuinit void mce_remove_device(unsigned int cpu) -{ - int i; - - if (!cpumask_test_cpu(cpu, mce_device_initialized)) - return; - - for (i = 0; mce_attributes[i]; i++) - sysdev_remove_file(&per_cpu(device_mce, cpu), - mce_attributes[i]); - for (i = 0; i < banks; i++) - sysdev_remove_file(&per_cpu(device_mce, cpu), - &bank_attrs[i]); - sysdev_unregister(&per_cpu(device_mce, cpu)); - cpumask_clear_cpu(cpu, mce_device_initialized); -} - -/* Make sure there are no machine checks on offlined CPUs. */ -static void mce_disable_cpu(void *h) -{ - int i; - unsigned long action = *(unsigned long *)h; - - if (!mce_available(¤t_cpu_data)) - return; - if (!(action & CPU_TASKS_FROZEN)) - cmci_clear(); - for (i = 0; i < banks; i++) - wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); -} - -static void mce_reenable_cpu(void *h) -{ - unsigned long action = *(unsigned long *)h; - int i; - - if (!mce_available(¤t_cpu_data)) - return; - - if (!(action & CPU_TASKS_FROZEN)) - cmci_reenable(); - for (i = 0; i < banks; i++) - wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); -} - -/* Get notified when a cpu comes on/off. Be hotplug friendly. */ -static int __cpuinit -mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) -{ - unsigned int cpu = (unsigned long)hcpu; - struct timer_list *t = &per_cpu(mce_timer, cpu); - - switch (action) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - mce_create_device(cpu); - if (threshold_cpu_callback) - threshold_cpu_callback(action, cpu); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - if (threshold_cpu_callback) - threshold_cpu_callback(action, cpu); - mce_remove_device(cpu); - break; - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - del_timer_sync(t); - smp_call_function_single(cpu, mce_disable_cpu, &action, 1); - break; - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - t->expires = round_jiffies(jiffies + - __get_cpu_var(next_interval)); - add_timer_on(t, cpu); - smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); - break; - case CPU_POST_DEAD: - /* intentionally ignoring frozen here */ - cmci_rediscover(cpu); - break; - } - return NOTIFY_OK; -} - -static struct notifier_block mce_cpu_notifier __cpuinitdata = { - .notifier_call = mce_cpu_callback, -}; - -static __init int mce_init_banks(void) -{ - int i; - - bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks, - GFP_KERNEL); - if (!bank_attrs) - return -ENOMEM; - - for (i = 0; i < banks; i++) { - struct sysdev_attribute *a = &bank_attrs[i]; - - a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); - if (!a->attr.name) - goto nomem; - - a->attr.mode = 0644; - a->show = show_bank; - a->store = set_bank; - } - return 0; - -nomem: - while (--i >= 0) - kfree(bank_attrs[i].attr.name); - kfree(bank_attrs); - bank_attrs = NULL; - - return -ENOMEM; -} - -static __init int mce_init_device(void) -{ - int err; - int i = 0; - - if (!mce_available(&boot_cpu_data)) - return -EIO; - - alloc_cpumask_var(&mce_device_initialized, GFP_KERNEL); - - err = mce_init_banks(); - if (err) - return err; - - err = sysdev_class_register(&mce_sysclass); - if (err) - return err; - - for_each_online_cpu(i) { - err = mce_create_device(i); - if (err) - return err; - } - - register_hotcpu_notifier(&mce_cpu_notifier); - misc_register(&mce_log_device); - - return err; -} - -device_initcall(mce_init_device); - -#else /* CONFIG_X86_32: */ - -int mce_disabled; - -int nr_mce_banks; -EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ - -/* Handle unconfigured int18 (should never happen) */ -static void unexpected_machine_check(struct pt_regs *regs, long error_code) -{ - printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", - smp_processor_id()); -} - -/* Call the installed machine check handler for this CPU setup. */ -void (*machine_check_vector)(struct pt_regs *, long error_code) = - unexpected_machine_check; - -/* This has to be run for each processor */ -void mcheck_init(struct cpuinfo_x86 *c) -{ - if (mce_disabled == 1) - return; - - switch (c->x86_vendor) { - case X86_VENDOR_AMD: - amd_mcheck_init(c); - break; - - case X86_VENDOR_INTEL: - if (c->x86 == 5) - intel_p5_mcheck_init(c); - if (c->x86 == 6) - intel_p6_mcheck_init(c); - if (c->x86 == 15) - intel_p4_mcheck_init(c); - break; - - case X86_VENDOR_CENTAUR: - if (c->x86 == 5) - winchip_mcheck_init(c); - break; - - default: - break; - } -} - -static int __init mcheck_disable(char *str) -{ - mce_disabled = 1; - return 1; -} - -static int __init mcheck_enable(char *str) -{ - mce_disabled = -1; - return 1; -} - -__setup("nomce", mcheck_disable); -__setup("mce", mcheck_enable); - -#endif /* CONFIG_X86_32 */ -- cgit v1.2.3 From cb491fca55e5282f0a95ef39c55352e00d6ca75e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 12:31:17 +0200 Subject: x86, mce: Rename sysfs variables Shorten variable names. This also compacts the code a bit. device_mce => mce_dev mce_device_initialized => mce_dev_initialized mce_attribute => mce_attrs [ Impact: cleanup ] Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 2 +- arch/x86/kernel/cpu/mcheck/mce.c | 58 +++++++++++++++------------------ arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 10 +++--- 3 files changed, 33 insertions(+), 37 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 8488210b866f..b9972a6bc2a1 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -90,7 +90,7 @@ extern int mce_disabled; void mce_setup(struct mce *m); void mce_log(struct mce *m); -DECLARE_PER_CPU(struct sys_device, device_mce); +DECLARE_PER_CPU(struct sys_device, mce_dev); extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); /* diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 2e2c3d2e9586..ba8dd41a10dc 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -957,7 +957,7 @@ static struct sysdev_class mce_sysclass = { .name = "machinecheck", }; -DEFINE_PER_CPU(struct sys_device, device_mce); +DEFINE_PER_CPU(struct sys_device, mce_dev); __cpuinitdata void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); @@ -1039,12 +1039,12 @@ static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); ACCESSOR(check_interval, check_interval, mce_restart()) -static struct sysdev_attribute *mce_attributes[] = { +static struct sysdev_attribute *mce_attrs[] = { &attr_tolerant.attr, &attr_check_interval, &attr_trigger, NULL }; -static cpumask_var_t mce_device_initialized; +static cpumask_var_t mce_dev_initialized; /* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */ static __cpuinit int mce_create_device(unsigned int cpu) @@ -1055,40 +1055,36 @@ static __cpuinit int mce_create_device(unsigned int cpu) if (!mce_available(&boot_cpu_data)) return -EIO; - memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject)); - per_cpu(device_mce, cpu).id = cpu; - per_cpu(device_mce, cpu).cls = &mce_sysclass; + memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject)); + per_cpu(mce_dev, cpu).id = cpu; + per_cpu(mce_dev, cpu).cls = &mce_sysclass; - err = sysdev_register(&per_cpu(device_mce, cpu)); + err = sysdev_register(&per_cpu(mce_dev, cpu)); if (err) return err; - for (i = 0; mce_attributes[i]; i++) { - err = sysdev_create_file(&per_cpu(device_mce, cpu), - mce_attributes[i]); + for (i = 0; mce_attrs[i]; i++) { + err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); if (err) goto error; } for (i = 0; i < banks; i++) { - err = sysdev_create_file(&per_cpu(device_mce, cpu), + err = sysdev_create_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]); if (err) goto error2; } - cpumask_set_cpu(cpu, mce_device_initialized); + cpumask_set_cpu(cpu, mce_dev_initialized); return 0; error2: - while (--i >= 0) { - sysdev_remove_file(&per_cpu(device_mce, cpu), - &bank_attrs[i]); - } + while (--i >= 0) + sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]); error: - while (--i >= 0) { - sysdev_remove_file(&per_cpu(device_mce, cpu), - mce_attributes[i]); - } - sysdev_unregister(&per_cpu(device_mce, cpu)); + while (--i >= 0) + sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); + + sysdev_unregister(&per_cpu(mce_dev, cpu)); return err; } @@ -1097,24 +1093,24 @@ static __cpuinit void mce_remove_device(unsigned int cpu) { int i; - if (!cpumask_test_cpu(cpu, mce_device_initialized)) + if (!cpumask_test_cpu(cpu, mce_dev_initialized)) return; - for (i = 0; mce_attributes[i]; i++) - sysdev_remove_file(&per_cpu(device_mce, cpu), - mce_attributes[i]); + for (i = 0; mce_attrs[i]; i++) + sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); + for (i = 0; i < banks; i++) - sysdev_remove_file(&per_cpu(device_mce, cpu), - &bank_attrs[i]); - sysdev_unregister(&per_cpu(device_mce, cpu)); - cpumask_clear_cpu(cpu, mce_device_initialized); + sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]); + + sysdev_unregister(&per_cpu(mce_dev, cpu)); + cpumask_clear_cpu(cpu, mce_dev_initialized); } /* Make sure there are no machine checks on offlined CPUs. */ static void mce_disable_cpu(void *h) { - int i; unsigned long action = *(unsigned long *)h; + int i; if (!mce_available(¤t_cpu_data)) return; @@ -1221,7 +1217,7 @@ static __init int mce_init_device(void) if (!mce_available(&boot_cpu_data)) return -EIO; - alloc_cpumask_var(&mce_device_initialized, GFP_KERNEL); + alloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL); err = mce_init_banks(); if (err) diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 4d90ec3eb51d..083f270251fa 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -517,7 +517,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) if (!b) goto out; - err = sysfs_create_link(&per_cpu(device_mce, cpu).kobj, + err = sysfs_create_link(&per_cpu(mce_dev, cpu).kobj, b->kobj, name); if (err) goto out; @@ -540,7 +540,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) goto out; } - b->kobj = kobject_create_and_add(name, &per_cpu(device_mce, cpu).kobj); + b->kobj = kobject_create_and_add(name, &per_cpu(mce_dev, cpu).kobj); if (!b->kobj) goto out_free; @@ -560,7 +560,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) if (i == cpu) continue; - err = sysfs_create_link(&per_cpu(device_mce, i).kobj, + err = sysfs_create_link(&per_cpu(mce_dev, i).kobj, b->kobj, name); if (err) goto out; @@ -638,7 +638,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank) #ifdef CONFIG_SMP /* sibling symlink */ if (shared_bank[bank] && b->blocks->cpu != cpu) { - sysfs_remove_link(&per_cpu(device_mce, cpu).kobj, name); + sysfs_remove_link(&per_cpu(mce_dev, cpu).kobj, name); per_cpu(threshold_banks, cpu)[bank] = NULL; return; @@ -650,7 +650,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank) if (i == cpu) continue; - sysfs_remove_link(&per_cpu(device_mce, i).kobj, name); + sysfs_remove_link(&per_cpu(mce_dev, i).kobj, name); per_cpu(threshold_banks, i)[bank] = NULL; } -- cgit v1.2.3 From b659294b779565c60f5e12ef505328e2b974eb62 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 12:31:27 +0200 Subject: x86, mce: print number of MCE banks The number of MCE banks supported by a CPU is a useful number to know, so print it out during CPU initialization. [ Impact: add printout ] Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index ba8dd41a10dc..49c74222359d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -570,6 +570,8 @@ static int mce_cap_init(void) rdmsrl(MSR_IA32_MCG_CAP, cap); b = cap & 0xff; + printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b); + if (b > MAX_NR_BANKS) { printk(KERN_WARNING "MCE: Using only %u machine check banks out of %u\n", @@ -1287,6 +1289,7 @@ void mcheck_init(struct cpuinfo_x86 *c) default: break; } + printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks); } static int __init mcheck_disable(char *str) -- cgit v1.2.3 From ba2d0f2b0c56d7174a0208f7c463271f39040728 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Apr 2009 12:31:24 +0200 Subject: x86, mce: Cleanup symbols in intel thermal codes Decode magic constants and turn them into symbols. [ Cleanup to use symbols already exists - HS ] [ Impact: cleanup ] Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/msr-index.h | 7 +++++++ arch/x86/kernel/cpu/mcheck/mce_intel.c | 9 +++++---- arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 2 +- arch/x86/kernel/cpu/mcheck/p4.c | 2 +- 4 files changed, 14 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index ec41fc16c167..c86404695083 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -208,7 +208,14 @@ #define MSR_IA32_THERM_CONTROL 0x0000019a #define MSR_IA32_THERM_INTERRUPT 0x0000019b + +#define THERM_INT_LOW_ENABLE (1 << 0) +#define THERM_INT_HIGH_ENABLE (1 << 1) + #define MSR_IA32_THERM_STATUS 0x0000019c + +#define THERM_STATUS_PROCHOT (1 << 0) + #define MSR_IA32_MISC_ENABLE 0x000001a0 /* MISC_ENABLE bits: architectural */ diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index bad3cbb0e566..2b011d2d8579 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -32,13 +32,13 @@ void intel_init_thermal(struct cpuinfo_x86 *c) */ rdmsr(MSR_IA32_MISC_ENABLE, l, h); h = apic_read(APIC_LVTTHMR); - if ((l & (1 << 3)) && (h & APIC_DM_SMI)) { + if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n", cpu); return; } - if (cpu_has(c, X86_FEATURE_TM2) && (l & (1 << 13))) + if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2)) tm2 = 1; /* Check whether a vector already exists */ @@ -54,12 +54,13 @@ void intel_init_thermal(struct cpuinfo_x86 *c) apic_write(APIC_LVTTHMR, h); rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); - wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h); + wrmsr(MSR_IA32_THERM_INTERRUPT, + l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); intel_set_thermal_handler(); rdmsr(MSR_IA32_MISC_ENABLE, l, h); - wrmsr(MSR_IA32_MISC_ENABLE, l | (1 << 3), h); + wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); /* Unmask the thermal vector: */ l = apic_read(APIC_LVTTHMR); diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index 38f9632306fa..13abafcb72e4 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -29,7 +29,7 @@ asmlinkage void smp_thermal_interrupt(void) irq_enter(); rdmsrl(MSR_IA32_THERM_STATUS, msr_val); - if (therm_throt_process(msr_val & 1)) + if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT)) mce_log_therm_throt_event(msr_val); inc_irq_stat(irq_thermal_count); diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c index f979ffea330b..82cee108a2d3 100644 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ b/arch/x86/kernel/cpu/mcheck/p4.c @@ -51,7 +51,7 @@ static void intel_thermal_interrupt(struct pt_regs *regs) ack_APIC_irq(); rdmsrl(MSR_IA32_THERM_STATUS, msr_val); - therm_throt_process(msr_val & 0x1); + therm_throt_process(msr_val & THERM_STATUS_PROCHOT); } /* Thermal interrupt handler for this CPU setup: */ -- cgit v1.2.3 From 01c6680a547a3ee8dd170c269ea8e037b3191b71 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Apr 2009 12:31:24 +0200 Subject: x86, mce: Cleanup MCG definitions Decode more magic constants and turn them into symbols. [ Sort definitions bitwise, introduce MCG_EXT_CNT - HS ] Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 10 +++++++--- arch/x86/kernel/cpu/mcheck/mce.c | 5 +++-- 2 files changed, 10 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index b9972a6bc2a1..94aedaf6327f 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -8,9 +8,13 @@ * Machine Check support for x86 */ -#define MCG_CTL_P (1ULL<<8) /* MCG_CAP register available */ -#define MCG_EXT_P (1ULL<<9) /* Extended registers available */ -#define MCG_CMCI_P (1ULL<<10) /* CMCI supported */ +#define MCG_BANKCNT_MASK 0xff /* Number of Banks */ +#define MCG_CTL_P (1ULL<<8) /* MCG_CAP register available */ +#define MCG_EXT_P (1ULL<<9) /* Extended registers available */ +#define MCG_CMCI_P (1ULL<<10) /* CMCI supported */ +#define MCG_EXT_CNT_MASK 0xff0000 /* Number of Extended registers */ +#define MCG_EXT_CNT_SHIFT 16 +#define MCG_EXT_CNT(c) (((c) & MCG_EXT_CNT_MASK) >> MCG_EXT_CNT_SHIFT) #define MCG_STATUS_RIPV (1ULL<<0) /* restart ip valid */ #define MCG_STATUS_EIPV (1ULL<<1) /* ip points to correct instruction */ diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 49c74222359d..147333627416 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -569,7 +569,8 @@ static int mce_cap_init(void) u64 cap; rdmsrl(MSR_IA32_MCG_CAP, cap); - b = cap & 0xff; + + b = cap & MCG_BANKCNT_MASK; printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b); if (b > MAX_NR_BANKS) { @@ -590,7 +591,7 @@ static int mce_cap_init(void) } /* Use accurate RIP reporting if available. */ - if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) + if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) rip_msr = MSR_IA32_MCG_EIP; return 0; -- cgit v1.2.3 From 3cde5c8c839bf46a7be799ed0e1d0b4780aaf794 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 27 Apr 2009 18:01:31 +0200 Subject: x86, mce: initial steps to make 64bit mce code 32bit clean Replace unsigned long with u64s if they need to contain 64bit values. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 147333627416..cd1313b47506 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -156,15 +156,15 @@ static void print_mce(struct mce *m) "and contact your hardware vendor\n"); } -static void mce_panic(char *msg, struct mce *backup, unsigned long start) +static void mce_panic(char *msg, struct mce *backup, u64 start) { int i; oops_begin(); for (i = 0; i < MCE_LOG_LEN; i++) { - unsigned long tsc = mcelog.entry[i].tsc; + u64 tsc = mcelog.entry[i].tsc; - if (time_before(tsc, start)) + if ((s64)(tsc - start) < 0) continue; print_mce(&mcelog.entry[i]); if (backup && mcelog.entry[i].tsc == backup->tsc) @@ -970,13 +970,13 @@ void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); static ssize_t show_ ## name(struct sys_device *s, \ struct sysdev_attribute *attr, \ char *buf) { \ - return sprintf(buf, "%lx\n", (unsigned long)var); \ + return sprintf(buf, "%Lx\n", (u64)var); \ } \ static ssize_t set_ ## name(struct sys_device *s, \ struct sysdev_attribute *attr, \ const char *buf, size_t siz) { \ char *end; \ - unsigned long new = simple_strtoul(buf, &end, 0); \ + u64 new = simple_strtoull(buf, &end, 0); \ \ if (end == buf) \ return -EINVAL; \ -- cgit v1.2.3 From 06b7a7a5ec917761969444fee967c43868a76468 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 27 Apr 2009 18:37:43 +0200 Subject: x86, mce: implement the PPro bank 0 quirk in the 64bit machine check code Quoting the comment: * SDM documents that on family 6 bank 0 should not be written * because it aliases to another special BIOS controlled * register. * But it's not aliased anymore on model 0x1a+ * Don't ignore bank 0 completely because there could be a valid * event later, merely don't write CTL0. This is mostly a port on the 32bit code, except that 32bit always didn't write it and didn't have the 0x1a heuristic. I checked with the CPU designers that the quirk is not required starting with this model. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 40 ++++++++++++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index cd1313b47506..1dcd3be03328 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -65,6 +65,8 @@ static atomic_t mce_events; static char trigger[128]; static char *trigger_argv[2] = { trigger, NULL }; +static unsigned long dont_init_banks; + static DECLARE_WAIT_QUEUE_HEAD(mce_wait); /* MCA banks polled by the period polling timer for corrected events */ @@ -72,6 +74,11 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL }; +static inline int skip_bank_init(int i) +{ + return i < BITS_PER_LONG && test_bit(i, &dont_init_banks); +} + /* Do initial initialization of a struct mce */ void mce_setup(struct mce *m) { @@ -616,6 +623,8 @@ static void mce_init(void *dummy) wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); for (i = 0; i < banks; i++) { + if (skip_bank_init(i)) + continue; wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); } @@ -643,6 +652,19 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) } } + if (c->x86_vendor == X86_VENDOR_INTEL) { + /* + * SDM documents that on family 6 bank 0 should not be written + * because it aliases to another special BIOS controlled + * register. + * But it's not aliased anymore on model 0x1a+ + * Don't ignore bank 0 completely because there could be a + * valid event later, merely don't write CTL0. + */ + + if (c->x86 == 6 && c->x86_model < 0x1A) + __set_bit(0, &dont_init_banks); + } } static void mce_cpu_features(struct cpuinfo_x86 *c) @@ -911,8 +933,10 @@ static int mce_disable(void) { int i; - for (i = 0; i < banks; i++) - wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); + for (i = 0; i < banks; i++) { + if (!skip_bank_init(i)) + wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); + } return 0; } @@ -1119,8 +1143,10 @@ static void mce_disable_cpu(void *h) return; if (!(action & CPU_TASKS_FROZEN)) cmci_clear(); - for (i = 0; i < banks; i++) - wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); + for (i = 0; i < banks; i++) { + if (!skip_bank_init(i)) + wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); + } } static void mce_reenable_cpu(void *h) @@ -1133,8 +1159,10 @@ static void mce_reenable_cpu(void *h) if (!(action & CPU_TASKS_FROZEN)) cmci_reenable(); - for (i = 0; i < banks; i++) - wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); + for (i = 0; i < banks; i++) { + if (!skip_bank_init(i)) + wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); + } } /* Get notified when a cpu comes on/off. Be hotplug friendly. */ -- cgit v1.2.3 From 2e6f694fde0a7158590e121962ca2e3c06633528 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 27 Apr 2009 18:42:48 +0200 Subject: x86, mce: port K7 bank 0 quirk to 64bit mce code Various K7 have broken bank 0s. Don't enable it by default Port from the 32bit code. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 1dcd3be03328..1336280edcc2 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -650,6 +650,12 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) */ mce_bootlog = 0; } + /* + * Various K7s with broken bank 0 around. Always disable + * by default. + */ + if (c->x86 == 6) + bank[0] = 0; } if (c->x86_vendor == X86_VENDOR_INTEL) { -- cgit v1.2.3 From 5d7279268b654d1f8ac43b0eb6cd9598d9cf55fd Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 27 Apr 2009 19:25:48 +0200 Subject: x86, mce: use a call vector to call the 64bit mce handler Allows to call different machine check handlers from the low level machine check entry vector. This is needed for later when it will be used for 32bit too. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 23 ++++++++++++----------- arch/x86/kernel/cpu/mcheck/mce.h | 3 ++- arch/x86/kernel/entry_64.S | 2 +- 3 files changed, 15 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 1336280edcc2..d99318b470d8 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -39,6 +39,16 @@ #include "mce.h" +/* Handle unconfigured int18 (should never happen) */ +static void unexpected_machine_check(struct pt_regs *regs, long error_code) +{ + printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", + smp_processor_id()); +} + +/* Call the installed machine check handler for this CPU setup. */ +void (*machine_check_vector)(struct pt_regs *, long error_code) = + unexpected_machine_check; #ifdef CONFIG_X86_64 #define MISC_MCELOG_MINOR 227 @@ -715,6 +725,8 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) } mce_cpu_quirks(c); + machine_check_vector = do_machine_check; + mce_init(NULL); mce_cpu_features(c); mce_init_timer(); @@ -1285,17 +1297,6 @@ int mce_disabled; int nr_mce_banks; EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ -/* Handle unconfigured int18 (should never happen) */ -static void unexpected_machine_check(struct pt_regs *regs, long error_code) -{ - printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", - smp_processor_id()); -} - -/* Call the installed machine check handler for this CPU setup. */ -void (*machine_check_vector)(struct pt_regs *, long error_code) = - unexpected_machine_check; - /* This has to be run for each processor */ void mcheck_init(struct cpuinfo_x86 *c) { diff --git a/arch/x86/kernel/cpu/mcheck/mce.h b/arch/x86/kernel/cpu/mcheck/mce.h index cd6cffcc2de0..966ae3c5cb11 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.h +++ b/arch/x86/kernel/cpu/mcheck/mce.h @@ -7,11 +7,12 @@ void intel_p5_mcheck_init(struct cpuinfo_x86 *c); void intel_p6_mcheck_init(struct cpuinfo_x86 *c); void winchip_mcheck_init(struct cpuinfo_x86 *c); -#ifdef CONFIG_X86_32 /* Call the installed machine check handler for this CPU setup. */ extern void (*machine_check_vector)(struct pt_regs *, long error_code); +#ifdef CONFIG_X86_32 + extern int nr_mce_banks; void intel_set_thermal_handler(void); diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 38946c6e8433..63276c45bffa 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1382,7 +1382,7 @@ paranoiderrorentry stack_segment do_stack_segment errorentry general_protection do_general_protection errorentry page_fault do_page_fault #ifdef CONFIG_X86_MCE -paranoidzeroentry machine_check do_machine_check +paranoidzeroentry machine_check *machine_check_vector(%rip) #endif /* -- cgit v1.2.3 From 04b2b1a4df6cd0fdaa598f3c623a19c2d93cb48a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 28 Apr 2009 22:50:19 +0200 Subject: x86, mce: rename 64bit mce_dont_init to mce_disabled Give it the same name as on 32bit. This makes further merging easier. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 2 -- arch/x86/kernel/cpu/mcheck/mce.c | 15 +++++++-------- 2 files changed, 7 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 94aedaf6327f..c3c7ee701753 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -86,9 +86,7 @@ struct mce_log { #ifdef __KERNEL__ -#ifdef CONFIG_X86_32 extern int mce_disabled; -#endif #include diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index d99318b470d8..6ab477060f56 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -49,14 +49,15 @@ static void unexpected_machine_check(struct pt_regs *regs, long error_code) /* Call the installed machine check handler for this CPU setup. */ void (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check; + +int mce_disabled; + #ifdef CONFIG_X86_64 #define MISC_MCELOG_MINOR 227 atomic_t mce_entry; -static int mce_dont_init; - /* * Tolerant levels: * 0: always panic on uncorrected errors, log corrected errors @@ -194,7 +195,7 @@ static void mce_panic(char *msg, struct mce *backup, u64 start) int mce_available(struct cpuinfo_x86 *c) { - if (mce_dont_init) + if (mce_disabled) return 0; return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); } @@ -720,7 +721,7 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) return; if (mce_cap_init() < 0) { - mce_dont_init = 1; + mce_disabled = 1; return; } mce_cpu_quirks(c); @@ -911,7 +912,7 @@ static struct miscdevice mce_log_device = { */ static int __init mcheck_disable(char *str) { - mce_dont_init = 1; + mce_disabled = 1; return 1; } __setup("nomce", mcheck_disable); @@ -925,7 +926,7 @@ __setup("nomce", mcheck_disable); static int __init mcheck_enable(char *str) { if (!strcmp(str, "off")) - mce_dont_init = 1; + mce_disabled = 1; else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) mce_bootlog = (str[0] == 'b'); else if (isdigit(str[0])) @@ -1292,8 +1293,6 @@ device_initcall(mce_init_device); #else /* CONFIG_X86_32: */ -int mce_disabled; - int nr_mce_banks; EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ -- cgit v1.2.3 From d7c3c9a609563868d8a70e220399d06a25aba095 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 28 Apr 2009 23:07:25 +0200 Subject: x86, mce: move mce_disabled option into common 32bit/64bit code It's the same function, so let's share it. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 6ab477060f56..5395200dc9d9 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -907,16 +907,6 @@ static struct miscdevice mce_log_device = { &mce_chrdev_ops, }; -/* - * Old style boot options parsing. Only for compatibility. - */ -static int __init mcheck_disable(char *str) -{ - mce_disabled = 1; - return 1; -} -__setup("nomce", mcheck_disable); - /* * mce=off disables machine check * mce=TOLERANCELEVEL (number, see above) @@ -1327,19 +1317,22 @@ void mcheck_init(struct cpuinfo_x86 *c) printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks); } -static int __init mcheck_disable(char *str) -{ - mce_disabled = 1; - return 1; -} - static int __init mcheck_enable(char *str) { mce_disabled = -1; return 1; } -__setup("nomce", mcheck_disable); __setup("mce", mcheck_enable); -#endif /* CONFIG_X86_32 */ +#endif /* CONFIG_X86_OLD_MCE */ + +/* + * Old style boot options parsing. Only for compatibility. + */ +static int __init mcheck_disable(char *str) +{ + mce_disabled = 1; + return 1; +} +__setup("nomce", mcheck_disable); -- cgit v1.2.3 From 8e97aef5f43ec715f394bc15015ff263b80c3ad6 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 28 Apr 2009 14:23:18 +0200 Subject: x86, mce: remove machine check handler idle notify on 64bit i386 has no idle notifiers, but the 64bit machine check code uses them to wake up mcelog from a fatal machine check exception. For corrected machine checks found by the poller or threshold interrupts going through an idle notifier is not needed because the wake_up can is just done directly and doesn't need the idle notifier. It is only needed for logging exceptions. To be honest I never liked the idle notifier even though I signed off on it. On closer investigation the code actually turned out to be nearly. Right now machine check exceptions on x86 are always unrecoverable (lead to panic due to PCC), which means we never execute the idle notifier path. The only exception is the somewhat weird tolerant==3 case, which ignores PCC. I'll fix this in a future patch in a much cleaner way. So remove the "mcelog wakeup through idle notifier" code from 64bit. This allows to compile the 64bit machine check handler on 32bit which doesn't have idle notifiers. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 23 ----------------------- 1 file changed, 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 5395200dc9d9..7562c1f674f3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -555,29 +555,6 @@ int mce_notify_user(void) return 0; } -/* see if the idle task needs to notify userspace: */ -static int -mce_idle_callback(struct notifier_block *nfb, unsigned long action, - void *unused) -{ - /* IDLE_END should be safe - interrupts are back on */ - if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY)) - mce_notify_user(); - - return NOTIFY_OK; -} - -static struct notifier_block mce_idle_notifier = { - .notifier_call = mce_idle_callback, -}; - -static __init int periodic_mcheck_init(void) -{ - idle_notifier_register(&mce_idle_notifier); - return 0; -} -__initcall(periodic_mcheck_init); - /* * Initialize Machine Checks for a CPU. */ -- cgit v1.2.3 From d896a940ef4f12a0a6bc432853b249dcfbacabf0 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 28 Apr 2009 14:25:18 +0200 Subject: x86, mce: remove oops_begin() use in 64bit machine check First 32bit doesn't have oops_begin, so it's a barrier of using this code on 32bit. On closer examination it turns out oops_begin is not a good idea in a machine check panic anyways. All oops_begin does it so check for recursive/parallel oopses and implement the "wait on oops" heuristic. But there's actually no good reason to lock machine checks against oopses or prevent them from recursion. Also "wait on oops" does not really make sense for a machine check too. Replace it with a manual bust_spinlocks/console_verbose. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 7562c1f674f3..f4d6841d2bdf 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -178,7 +178,8 @@ static void mce_panic(char *msg, struct mce *backup, u64 start) { int i; - oops_begin(); + bust_spinlocks(1); + console_verbose(); for (i = 0; i < MCE_LOG_LEN; i++) { u64 tsc = mcelog.entry[i].tsc; -- cgit v1.2.3 From 4efc0670baf4b14bc95502e54a83ccf639146125 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 28 Apr 2009 19:07:31 +0200 Subject: x86, mce: use 64bit machine check code on 32bit The 64bit machine check code is in many ways much better than the 32bit machine check code: it is more specification compliant, is cleaner, only has a single code base versus one per CPU, has better infrastructure for recovery, has a cleaner way to communicate with user space etc. etc. Use the 64bit code for 32bit too. This is the second attempt to do this. There was one a couple of years ago to unify this code for 32bit and 64bit. Back then this ran into some trouble with K7s and was reverted. I believe this time the K7 problems (and some others) are addressed. I went over the old handlers and was very careful to retain all quirks. But of course this needs a lot of testing on old systems. On newer 64bit capable systems I don't expect much problems because they have been already tested with the 64bit kernel. I made this a CONFIG for now that still allows to select the old machine check code. This is mostly to make testing easier, if someone runs into a problem we can ask them to try with the CONFIG switched. The new code is default y for more coverage. Once there is confidence the 64bit code works well on older hardware too the CONFIG_X86_OLD_MCE and the associated code can be easily removed. This causes a behaviour change for 32bit installations. They now have to install the mcelog package to be able to log corrected machine checks. The 64bit machine check code only handles CPUs which support the standard Intel machine check architecture described in the IA32 SDM. The 32bit code has special support for some older CPUs which have non standard machine check architectures, in particular WinChip C3 and Intel P5. I made those a separate CONFIG option and kept them for now. The WinChip variant could be probably removed without too much pain, it doesn't really do anything interesting. P5 is also disabled by default (like it was before) because many motherboards have it miswired, but according to Alan Cox a few embedded setups use that one. Forward ported/heavily changed version of old patch, original patch included review/fixes from Thomas Gleixner, Bert Wesarg. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 33 +++++++++++++++++++++++++++++++-- arch/x86/include/asm/entry_arch.h | 2 +- arch/x86/kernel/apic/apic.c | 4 ++-- arch/x86/kernel/apic/nmi.c | 2 +- arch/x86/kernel/cpu/mcheck/Makefile | 3 ++- arch/x86/kernel/cpu/mcheck/mce.c | 32 ++++++++++++++++++++++++++++---- arch/x86/kernel/cpu/mcheck/mce.h | 18 +++++++++++++++--- arch/x86/kernel/cpu/mcheck/p5.c | 5 +++++ arch/x86/kernel/irq.c | 4 ++-- arch/x86/kernel/irqinit_32.c | 2 +- arch/x86/kernel/signal.c | 4 ++-- arch/x86/kernel/traps.c | 4 ++-- 12 files changed, 92 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a6efe0a2e9ae..c1c5ccd1937f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -789,6 +789,22 @@ config X86_MCE to disable it. MCE support simply ignores non-MCE processors like the 386 and 486, so nearly everyone can say Y here. +config X86_OLD_MCE + depends on X86_32 && X86_MCE + bool "Use legacy machine check code (will go away)" + default n + select X86_ANCIENT_MCE + ---help--- + Use the old i386 machine check code. This is merely intended for + testing in a transition period. Try this if you run into any machine + check related software problems, but report the problem to + linux-kernel. When in doubt say no. + +config X86_NEW_MCE + depends on X86_MCE + bool + default y if (!X86_OLD_MCE && X86_32) || X86_64 + config X86_MCE_INTEL def_bool y prompt "Intel MCE features" @@ -805,6 +821,15 @@ config X86_MCE_AMD Additional support for AMD specific MCE features such as the DRAM Error Threshold. +config X86_ANCIENT_MCE + def_bool n + depends on X86_32 + prompt "Support for old Pentium 5 / WinChip machine checks" + ---help--- + Include support for machine check handling on old Pentium 5 or WinChip + systems. These typically need to be enabled explicitely on the command + line. + config X86_MCE_THRESHOLD depends on X86_MCE_AMD || X86_MCE_INTEL bool @@ -812,7 +837,7 @@ config X86_MCE_THRESHOLD config X86_MCE_NONFATAL tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4" - depends on X86_32 && X86_MCE + depends on X86_OLD_MCE ---help--- Enabling this feature starts a timer that triggers every 5 seconds which will look at the machine check registers to see if anything happened. @@ -825,11 +850,15 @@ config X86_MCE_NONFATAL config X86_MCE_P4THERMAL bool "check for P4 thermal throttling interrupt." - depends on X86_32 && X86_MCE && (X86_UP_APIC || SMP) + depends on X86_OLD_MCE && X86_MCE && (X86_UP_APIC || SMP) ---help--- Enabling this feature will cause a message to be printed when the P4 enters thermal throttling. +config X86_THERMAL_VECTOR + def_bool y + depends on X86_MCE_P4THERMAL || X86_MCE_INTEL + config VM86 bool "Enable VM86 support" if EMBEDDED default y diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index c2e6bedaf258..486c9e946f5c 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -52,7 +52,7 @@ BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) BUILD_INTERRUPT(perf_counter_interrupt, LOCAL_PERF_VECTOR) #endif -#ifdef CONFIG_X86_MCE_P4THERMAL +#ifdef CONFIG_X86_THERMAL_VECTOR BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR) #endif diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index f2870920f246..ad532289ef2e 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -843,7 +843,7 @@ void clear_local_APIC(void) } /* lets not touch this if we didn't frob it */ -#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) +#ifdef CONFIG_X86_THERMAL_VECTOR if (maxlvt >= 5) { v = apic_read(APIC_LVTTHMR); apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); @@ -1962,7 +1962,7 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state) apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); apic_pm_state.apic_tmict = apic_read(APIC_TMICT); apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); -#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) +#ifdef CONFIG_X86_THERMAL_VECTOR if (maxlvt >= 5) apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); #endif diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index ce4fbfa315a1..c4762276c17e 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -66,7 +66,7 @@ static inline unsigned int get_nmi_count(int cpu) static inline int mce_in_progress(void) { -#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) +#if defined(CONFIG_X86_NEW_MCE) return atomic_read(&mce_entry) > 0; #endif return 0; diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index 55f01b39a105..5f8b09425d39 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -1,6 +1,7 @@ obj-y = mce.o therm_throt.o -obj-$(CONFIG_X86_32) += k7.o p4.o p5.o p6.o winchip.o +obj-$(CONFIG_X86_OLD_MCE) += k7.o p4.o p6.o +obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o obj-$(CONFIG_X86_MCE_P4THERMAL) += mce_intel.o obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o mce_intel.o obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index f4d6841d2bdf..e193de44ef19 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -52,7 +52,7 @@ void (*machine_check_vector)(struct pt_regs *, long error_code) = int mce_disabled; -#ifdef CONFIG_X86_64 +#ifdef CONFIG_X86_NEW_MCE #define MISC_MCELOG_MINOR 227 @@ -662,6 +662,21 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) } } +static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) +{ + if (c->x86 != 5) + return; + switch (c->x86_vendor) { + case X86_VENDOR_INTEL: + if (mce_p5_enabled()) + intel_p5_mcheck_init(c); + break; + case X86_VENDOR_CENTAUR: + winchip_mcheck_init(c); + break; + } +} + static void mce_cpu_features(struct cpuinfo_x86 *c) { switch (c->x86_vendor) { @@ -695,6 +710,11 @@ static void mce_init_timer(void) */ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) { + if (mce_disabled) + return; + + mce_ancient_init(c); + if (!mce_available(c)) return; @@ -893,6 +913,10 @@ static struct miscdevice mce_log_device = { */ static int __init mcheck_enable(char *str) { + if (*str == 0) + enable_p5_mce(); + if (*str == '=') + str++; if (!strcmp(str, "off")) mce_disabled = 1; else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) @@ -900,13 +924,13 @@ static int __init mcheck_enable(char *str) else if (isdigit(str[0])) get_option(&str, &tolerant); else { - printk(KERN_INFO "mce= argument %s ignored. Please use /sys\n", + printk(KERN_INFO "mce argument %s ignored. Please use /sys\n", str); return 0; } return 1; } -__setup("mce=", mcheck_enable); +__setup("mce", mcheck_enable); /* * Sysfs support @@ -1259,7 +1283,7 @@ static __init int mce_init_device(void) device_initcall(mce_init_device); -#else /* CONFIG_X86_32: */ +#else /* CONFIG_X86_OLD_MCE: */ int nr_mce_banks; EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ diff --git a/arch/x86/kernel/cpu/mcheck/mce.h b/arch/x86/kernel/cpu/mcheck/mce.h index 966ae3c5cb11..84a552b458c8 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.h +++ b/arch/x86/kernel/cpu/mcheck/mce.h @@ -1,17 +1,29 @@ #include #include +#ifdef CONFIG_X86_OLD_MCE void amd_mcheck_init(struct cpuinfo_x86 *c); void intel_p4_mcheck_init(struct cpuinfo_x86 *c); -void intel_p5_mcheck_init(struct cpuinfo_x86 *c); void intel_p6_mcheck_init(struct cpuinfo_x86 *c); -void winchip_mcheck_init(struct cpuinfo_x86 *c); +#endif +#ifdef CONFIG_X86_ANCIENT_MCE +void intel_p5_mcheck_init(struct cpuinfo_x86 *c); +void winchip_mcheck_init(struct cpuinfo_x86 *c); +extern int mce_p5_enable; +static inline int mce_p5_enabled(void) { return mce_p5_enable; } +static inline void enable_p5_mce(void) { mce_p5_enable = 1; } +#else +static inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {} +static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {} +static inline int mce_p5_enabled(void) { return 0; } +static inline void enable_p5_mce(void) { } +#endif /* Call the installed machine check handler for this CPU setup. */ extern void (*machine_check_vector)(struct pt_regs *, long error_code); -#ifdef CONFIG_X86_32 +#ifdef CONFIG_X86_OLD_MCE extern int nr_mce_banks; diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c index 8812f5441830..015f481ab1b0 100644 --- a/arch/x86/kernel/cpu/mcheck/p5.c +++ b/arch/x86/kernel/cpu/mcheck/p5.c @@ -14,6 +14,9 @@ #include "mce.h" +/* By default disabled */ +int mce_p5_enable; + /* Machine check handler for Pentium class Intel CPUs: */ static void pentium_machine_check(struct pt_regs *regs, long error_code) { @@ -44,9 +47,11 @@ void intel_p5_mcheck_init(struct cpuinfo_x86 *c) if (!cpu_has(c, X86_FEATURE_MCE)) return; +#ifdef CONFIG_X86_OLD_MCE /* Default P5 to off as its often misconnected: */ if (mce_disabled != -1) return; +#endif machine_check_vector = pentium_machine_check; /* Make sure the vector pointer is visible before we enable MCEs: */ diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index c3fe010d74c8..35eddc9ec99e 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -89,7 +89,7 @@ static int show_other_interrupts(struct seq_file *p, int prec) for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); seq_printf(p, " Thermal event interrupts\n"); -# ifdef CONFIG_X86_64 +# ifdef CONFIG_X86_MCE_THRESHOLD seq_printf(p, "%*s: ", prec, "THR"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); @@ -176,7 +176,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu) #endif #ifdef CONFIG_X86_MCE sum += irq_stats(cpu)->irq_thermal_count; -# ifdef CONFIG_X86_64 +# ifdef CONFIG_X86_MCE_THRESHOLD sum += irq_stats(cpu)->irq_threshold_count; #endif #endif diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 368b0a8836f9..98846e03211e 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -181,7 +181,7 @@ void __init native_init_IRQ(void) alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); #endif -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL) +#ifdef CONFIG_X86_THERMAL_VECTOR /* thermal monitor LVT interrupt */ alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); #endif diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 14425166b8e3..d0851e3f77eb 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -25,11 +25,11 @@ #include #include #include +#include #ifdef CONFIG_X86_64 #include #include -#include #endif /* CONFIG_X86_64 */ #include @@ -857,7 +857,7 @@ static void do_signal(struct pt_regs *regs) void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { -#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) +#ifdef CONFIG_X86_NEW_MCE /* notify userspace of pending MCEs */ if (thread_info_flags & _TIF_MCE_NOTIFY) mce_notify_user(); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a1d288327ff0..ad771f15bddd 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -798,7 +798,8 @@ unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) return new_kesp; } -#else +#endif + asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) { } @@ -806,7 +807,6 @@ asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) { } -#endif /* * 'math_state_restore()' saves the current math information in the -- cgit v1.2.3 From 7856f6cce4a8cda8c1f94b99605c07d16b8d8dec Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 28 Apr 2009 23:32:56 +0200 Subject: x86, mce: enable MCE_INTEL for 32bit new MCE Enable the 64bit MCE_INTEL code (CMCI, thermal interrupts) for 32bit NEW_MCE. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 2 +- arch/x86/include/asm/entry_arch.h | 4 ++++ arch/x86/include/asm/hardirq.h | 2 +- arch/x86/include/asm/irq_vectors.h | 5 +++-- arch/x86/kernel/cpu/mcheck/threshold.c | 2 +- arch/x86/kernel/entry_64.S | 2 +- arch/x86/kernel/irqinit_32.c | 4 ++++ arch/x86/kernel/traps.c | 2 +- 8 files changed, 16 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c1c5ccd1937f..e1c9f77f69ec 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -808,7 +808,7 @@ config X86_NEW_MCE config X86_MCE_INTEL def_bool y prompt "Intel MCE features" - depends on X86_64 && X86_MCE && X86_LOCAL_APIC + depends on X86_NEW_MCE && X86_LOCAL_APIC ---help--- Additional support for intel specific MCE features such as the thermal monitor. diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index 486c9e946f5c..b2eb9c066843 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -56,4 +56,8 @@ BUILD_INTERRUPT(perf_counter_interrupt, LOCAL_PERF_VECTOR) BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR) #endif +#ifdef CONFIG_X86_MCE_THRESHOLD +BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR) +#endif + #endif diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 37555e52f980..922ee7c29693 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -20,7 +20,7 @@ typedef struct { #endif #ifdef CONFIG_X86_MCE unsigned int irq_thermal_count; -# ifdef CONFIG_X86_64 +# ifdef CONFIG_X86_MCE_THRESHOLD unsigned int irq_threshold_count; # endif #endif diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 3cbd79bbb47c..451e24d18050 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -87,10 +87,11 @@ #define CALL_FUNCTION_SINGLE_VECTOR 0xfb #define THERMAL_APIC_VECTOR 0xfa +#define THRESHOLD_APIC_VECTOR 0xf9 + #ifdef CONFIG_X86_32 -/* 0xf8 - 0xf9 : free */ +/* 0xf9 : free */ #else -# define THRESHOLD_APIC_VECTOR 0xf9 # define UV_BAU_MESSAGE 0xf8 #endif diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c index 23ee9e730f78..d746df2909c9 100644 --- a/arch/x86/kernel/cpu/mcheck/threshold.c +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -17,7 +17,7 @@ static void default_threshold_interrupt(void) void (*mce_threshold_vector)(void) = default_threshold_interrupt; -asmlinkage void mce_threshold_interrupt(void) +asmlinkage void smp_threshold_interrupt(void) { exit_idle(); irq_enter(); diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 63276c45bffa..a31a7f29cffe 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1007,7 +1007,7 @@ apicinterrupt INVALIDATE_TLB_VECTOR_START+7 \ #endif apicinterrupt THRESHOLD_APIC_VECTOR \ - threshold_interrupt mce_threshold_interrupt + threshold_interrupt smp_threshold_interrupt apicinterrupt THERMAL_APIC_VECTOR \ thermal_interrupt smp_thermal_interrupt diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 98846e03211e..2512ad93dabf 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -186,6 +186,10 @@ void __init native_init_IRQ(void) alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); #endif +#ifdef CONFIG_X86_MCE_THRESHOLD + alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); +#endif + if (!acpi_ioapic) setup_irq(2, &irq2); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ad771f15bddd..0d358c884b35 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -804,7 +804,7 @@ asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) { } -asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) +asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void) { } -- cgit v1.2.3 From de5619dfef76ddb403eb7c6de39c0130166c5dc3 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 28 Apr 2009 23:34:40 +0200 Subject: x86, mce: enable MCE_AMD for 32bit NEW_MCE That's very easy using the infrastructure enabled earlier for MCE_INTEL Untested. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e1c9f77f69ec..a148e7ac0d83 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -816,7 +816,7 @@ config X86_MCE_INTEL config X86_MCE_AMD def_bool y prompt "AMD MCE features" - depends on X86_64 && X86_MCE && X86_LOCAL_APIC + depends on X86_NEW_MCE && X86_LOCAL_APIC ---help--- Additional support for AMD specific MCE features such as the DRAM Error Threshold. -- cgit v1.2.3 From 5f8c1a54cab6f449fe04d42d0661bc796fa4e73e Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 29 Apr 2009 19:29:12 +0200 Subject: x86, mce: add MSR read wrappers for easier error injection This will be used by future patches to allow machine check error injection. Right now it's a nop, except for adding some wrappers around the MSR reads. This is early in the sequence to avoid too many conflicts. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index e193de44ef19..e755c95674dc 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -194,6 +194,19 @@ static void mce_panic(char *msg, struct mce *backup, u64 start) panic(msg); } +/* MSR access wrappers used for error injection */ +static u64 mce_rdmsrl(u32 msr) +{ + u64 v; + rdmsrl(msr, v); + return v; +} + +static void mce_wrmsrl(u32 msr, u64 v) +{ + wrmsrl(msr, v); +} + int mce_available(struct cpuinfo_x86 *c) { if (mce_disabled) @@ -213,7 +226,7 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) if (rip_msr) { /* Assume the RIP in the MSR is exact. Is this true? */ m->mcgstatus |= MCG_STATUS_EIPV; - rdmsrl(rip_msr, m->ip); + m->ip = mce_rdmsrl(rip_msr); m->cs = 0; } } @@ -231,7 +244,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) mce_setup(&m); - rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); + m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); for (i = 0; i < banks; i++) { if (!bank[i] || !test_bit(i, *b)) continue; @@ -242,7 +255,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) m.tsc = 0; barrier(); - rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); + m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); if (!(m.status & MCI_STATUS_VAL)) continue; @@ -257,9 +270,9 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) continue; if (m.status & MCI_STATUS_MISCV) - rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); + m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); if (m.status & MCI_STATUS_ADDRV) - rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); + m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); if (!(flags & MCP_TIMESTAMP)) m.tsc = 0; @@ -275,7 +288,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) /* * Clear state for this bank. */ - wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); } /* @@ -320,7 +333,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) mce_setup(&m); - rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); + m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); /* if the restart IP is not valid, we're done for */ if (!(m.mcgstatus & MCG_STATUS_RIPV)) @@ -338,7 +351,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) m.addr = 0; m.bank = i; - rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); + m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); if ((m.status & MCI_STATUS_VAL) == 0) continue; @@ -378,9 +391,9 @@ void do_machine_check(struct pt_regs *regs, long error_code) } if (m.status & MCI_STATUS_MISCV) - rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); + m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); if (m.status & MCI_STATUS_ADDRV) - rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); + m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); mce_get_rip(&m, regs); mce_log(&m); @@ -449,9 +462,9 @@ void do_machine_check(struct pt_regs *regs, long error_code) /* the last thing we do is clear state */ for (i = 0; i < banks; i++) { if (test_bit(i, toclear)) - wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); } - wrmsrl(MSR_IA32_MCG_STATUS, 0); + mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); out2: atomic_dec(&mce_entry); } -- cgit v1.2.3 From ea149b36c7f511d17dd89fee734cb09778a91fa0 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 29 Apr 2009 19:31:00 +0200 Subject: x86, mce: add basic error injection infrastructure Allow user programs to write mce records into /dev/mcelog. When they do that a fake machine check is triggered to test the machine check code. This uses the MCE MSR wrappers added earlier. The implementation is straight forward. There is a struct mce record per CPU and the MCE MSR accesses get data from there if there is valid data injected there. This allows to test the machine check code relatively realistically because only the lowest layer of hardware access is intercepted. The test suite and injector are available at git://git.kernel.org/pub/scm/utils/cpu/mce/mce-test.git git://git.kernel.org/pub/scm/utils/cpu/mce/mce-inject.git Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 8 ++ arch/x86/include/asm/mce.h | 3 + arch/x86/kernel/cpu/mcheck/Makefile | 1 + arch/x86/kernel/cpu/mcheck/mce-inject.c | 126 ++++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/mcheck/mce.c | 39 +++++++++- 5 files changed, 176 insertions(+), 1 deletion(-) create mode 100644 arch/x86/kernel/cpu/mcheck/mce-inject.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a148e7ac0d83..e25b6358fbe3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -835,6 +835,14 @@ config X86_MCE_THRESHOLD bool default y +config X86_MCE_INJECT + depends on X86_NEW_MCE + tristate "Machine check injector support" + ---help--- + Provide support for injecting machine checks for testing purposes. + If you don't know what a machine check is and you don't do kernel + QA it is safe to say n. + config X86_MCE_NONFATAL tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4" depends on X86_OLD_MCE diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index c3c7ee701753..e7d2372301ef 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -141,6 +141,9 @@ extern void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); extern int mce_notify_user(void); +DECLARE_PER_CPU(struct mce, injectm); +extern struct file_operations mce_chrdev_ops; + #ifdef CONFIG_X86_MCE extern void mcheck_init(struct cpuinfo_x86 *c); #else diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index 5f8b09425d39..60ee182c6c52 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -7,3 +7,4 @@ obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o mce_intel.o obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o +obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c new file mode 100644 index 000000000000..58afac4b5df5 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -0,0 +1,126 @@ +/* + * Machine check injection support. + * Copyright 2008 Intel Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + * + * Authors: + * Andi Kleen + * Ying Huang + */ +#include +#include +#include +#include +#include +#include +#include +#include + +/* Update fake mce registers on current CPU. */ +static void inject_mce(struct mce *m) +{ + struct mce *i = &per_cpu(injectm, m->cpu); + + /* Make sure noone reads partially written injectm */ + i->finished = 0; + mb(); + m->finished = 0; + /* First set the fields after finished */ + i->cpu = m->cpu; + mb(); + /* Now write record in order, finished last (except above) */ + memcpy(i, m, sizeof(struct mce)); + /* Finally activate it */ + mb(); + i->finished = 1; +} + +struct delayed_mce { + struct timer_list timer; + struct mce m; +}; + +/* Inject mce on current CPU */ +static void raise_mce(unsigned long data) +{ + struct delayed_mce *dm = (struct delayed_mce *)data; + struct mce *m = &dm->m; + int cpu = m->cpu; + + inject_mce(m); + if (m->status & MCI_STATUS_UC) { + struct pt_regs regs; + memset(®s, 0, sizeof(struct pt_regs)); + regs.ip = m->ip; + regs.cs = m->cs; + printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu); + do_machine_check(®s, 0); + printk(KERN_INFO "MCE exception done on CPU %d\n", cpu); + } else { + mce_banks_t b; + memset(&b, 0xff, sizeof(mce_banks_t)); + printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu); + machine_check_poll(0, &b); + mce_notify_user(); + printk(KERN_INFO "Finished machine check poll on CPU %d\n", + cpu); + } + kfree(dm); +} + +/* Error injection interface */ +static ssize_t mce_write(struct file *filp, const char __user *ubuf, + size_t usize, loff_t *off) +{ + struct delayed_mce *dm; + struct mce m; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + /* + * There are some cases where real MSR reads could slip + * through. + */ + if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA)) + return -EIO; + + if ((unsigned long)usize > sizeof(struct mce)) + usize = sizeof(struct mce); + if (copy_from_user(&m, ubuf, usize)) + return -EFAULT; + + if (m.cpu >= NR_CPUS || !cpu_online(m.cpu)) + return -EINVAL; + + dm = kmalloc(sizeof(struct delayed_mce), GFP_KERNEL); + if (!dm) + return -ENOMEM; + + /* + * Need to give user space some time to set everything up, + * so do it a jiffie or two later everywhere. + * Should we use a hrtimer here for better synchronization? + */ + memcpy(&dm->m, &m, sizeof(struct mce)); + setup_timer(&dm->timer, raise_mce, (unsigned long)dm); + dm->timer.expires = jiffies + 2; + add_timer_on(&dm->timer, m.cpu); + return usize; +} + +static int inject_init(void) +{ + printk(KERN_INFO "Machine check injector initialized\n"); + mce_chrdev_ops.write = mce_write; + return 0; +} + +module_init(inject_init); +/* Cannot tolerate unloading currently because we cannot + * guarantee all openers of mce_chrdev will get a reference to us. + */ +MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index e755c95674dc..fe216bd10f43 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -98,6 +98,9 @@ void mce_setup(struct mce *m) rdtscll(m->tsc); } +DEFINE_PER_CPU(struct mce, injectm); +EXPORT_PER_CPU_SYMBOL_GPL(injectm); + /* * Lockless MCE logging infrastructure. * This avoids deadlocks on printk locks without having to break locks. Also @@ -194,16 +197,46 @@ static void mce_panic(char *msg, struct mce *backup, u64 start) panic(msg); } +/* Support code for software error injection */ + +static int msr_to_offset(u32 msr) +{ + unsigned bank = __get_cpu_var(injectm.bank); + if (msr == rip_msr) + return offsetof(struct mce, ip); + if (msr == MSR_IA32_MC0_STATUS + bank*4) + return offsetof(struct mce, status); + if (msr == MSR_IA32_MC0_ADDR + bank*4) + return offsetof(struct mce, addr); + if (msr == MSR_IA32_MC0_MISC + bank*4) + return offsetof(struct mce, misc); + if (msr == MSR_IA32_MCG_STATUS) + return offsetof(struct mce, mcgstatus); + return -1; +} + /* MSR access wrappers used for error injection */ static u64 mce_rdmsrl(u32 msr) { u64 v; + if (__get_cpu_var(injectm).finished) { + int offset = msr_to_offset(msr); + if (offset < 0) + return 0; + return *(u64 *)((char *)&__get_cpu_var(injectm) + offset); + } rdmsrl(msr, v); return v; } static void mce_wrmsrl(u32 msr, u64 v) { + if (__get_cpu_var(injectm).finished) { + int offset = msr_to_offset(msr); + if (offset >= 0) + *(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v; + return; + } wrmsrl(msr, v); } @@ -296,6 +329,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) * exceptions. */ } +EXPORT_SYMBOL_GPL(machine_check_poll); /* * The actual machine check handler. This only handles real @@ -468,6 +502,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) out2: atomic_dec(&mce_entry); } +EXPORT_SYMBOL_GPL(do_machine_check); #ifdef CONFIG_X86_MCE_INTEL /*** @@ -568,6 +603,7 @@ int mce_notify_user(void) } return 0; } +EXPORT_SYMBOL_GPL(mce_notify_user); /* * Initialize Machine Checks for a CPU. @@ -904,13 +940,14 @@ static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg) } } -static const struct file_operations mce_chrdev_ops = { +struct file_operations mce_chrdev_ops = { .open = mce_open, .release = mce_release, .read = mce_read, .poll = mce_poll, .unlocked_ioctl = mce_ioctl, }; +EXPORT_SYMBOL_GPL(mce_chrdev_ops); static struct miscdevice mce_log_device = { MISC_MCELOG_MINOR, -- cgit v1.2.3 From a1ff41bfc1bb7a6d19cf958f89a9b539678781e5 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 25 May 2009 22:16:14 -0700 Subject: x86, mce: add comment about mce_chrdev_ops being writable Add a comment explaining that mce_chrdev_ops is intentionally writable. [ Impact: comment only ] Signed-off-by: H. Peter Anvin Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index fe216bd10f43..156cdf6d9181 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -940,6 +940,7 @@ static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg) } } +/* Modified in mce-inject.c, so not static or const */ struct file_operations mce_chrdev_ops = { .open = mce_open, .release = mce_release, -- cgit v1.2.3 From 5706001aacba5d3db5f224ca135e5e91a30be39c Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 25 May 2009 22:18:17 -0700 Subject: x86, mce: fix comment style in mce-inject.c Fix style of winged comment in mce-inject.c. [ Impact: comment only ] Signed-off-by: H. Peter Anvin Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce-inject.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 58afac4b5df5..673c72855022 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -120,7 +120,8 @@ static int inject_init(void) } module_init(inject_init); -/* Cannot tolerate unloading currently because we cannot +/* + * Cannot tolerate unloading currently because we cannot * guarantee all openers of mce_chrdev will get a reference to us. */ MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 88921be30296e126896ee4d30758f989d1c4ddfb Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:51 +0200 Subject: x86, mce: synchronize core after machine check handling The example code in the IA32 SDM recommends to synchronize the CPU after machine check handling. So do that here. [ Impact: Spec compliance ] Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 156cdf6d9181..495c96808668 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -328,6 +328,8 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) * Don't clear MCG_STATUS here because it's only defined for * exceptions. */ + + sync_core(); } EXPORT_SYMBOL_GPL(machine_check_poll); @@ -501,6 +503,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); out2: atomic_dec(&mce_entry); + sync_core(); } EXPORT_SYMBOL_GPL(do_machine_check); -- cgit v1.2.3 From b56f642d2bf8c1f7c6499c1e55b23311a33cc796 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:52 +0200 Subject: x86, mce: use extended sysattrs for the check_interval attribute. Instead of using own callbacks use the generic ones provided by the sysdev later. This finally allows to get rid of the ugly ACCESSOR macros. Should also save some text size. [ Impact: cleanup ] Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 39 +++++++++++++++------------------------ 1 file changed, 15 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 495c96808668..3cac9da7ce24 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1054,28 +1054,6 @@ DEFINE_PER_CPU(struct sys_device, mce_dev); __cpuinitdata void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); -/* Why are there no generic functions for this? */ -#define ACCESSOR(name, var, start) \ - static ssize_t show_ ## name(struct sys_device *s, \ - struct sysdev_attribute *attr, \ - char *buf) { \ - return sprintf(buf, "%Lx\n", (u64)var); \ - } \ - static ssize_t set_ ## name(struct sys_device *s, \ - struct sysdev_attribute *attr, \ - const char *buf, size_t siz) { \ - char *end; \ - u64 new = simple_strtoull(buf, &end, 0); \ - \ - if (end == buf) \ - return -EINVAL; \ - var = new; \ - start; \ - \ - return end-buf; \ - } \ - static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); - static struct sysdev_attribute *bank_attrs; static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, @@ -1126,13 +1104,26 @@ static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, return len; } +static ssize_t store_int_with_restart(struct sys_device *s, + struct sysdev_attribute *attr, + const char *buf, size_t size) +{ + ssize_t ret = sysdev_store_int(s, attr, buf, size); + mce_restart(); + return ret; +} + static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); -ACCESSOR(check_interval, check_interval, mce_restart()) +static struct sysdev_ext_attribute attr_check_interval = { + _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int, + store_int_with_restart), + &check_interval +}; static struct sysdev_attribute *mce_attrs[] = { - &attr_tolerant.attr, &attr_check_interval, &attr_trigger, + &attr_tolerant.attr, &attr_check_interval.attr, &attr_trigger, NULL }; -- cgit v1.2.3 From fc016a49c2d92f2efbe22c1fb66eb7a5d2a06ed1 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:53 +0200 Subject: x86, mce: remove unused mce_events variable Remove unused mce_events static variable. [ Impact: cleanup ] Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 3cac9da7ce24..69aad7e96a53 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -71,7 +71,6 @@ static u64 *bank; static unsigned long notify_user; static int rip_msr; static int mce_bootlog = -1; -static atomic_t mce_events; static char trigger[128]; static char *trigger_argv[2] = { trigger, NULL }; @@ -116,7 +115,6 @@ void mce_log(struct mce *mce) { unsigned next, entry; - atomic_inc(&mce_events); mce->finished = 0; wmb(); for (;;) { -- cgit v1.2.3 From 8be9110569aec1f65d86b08aef7ec49659137bf9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 27 May 2009 21:56:53 +0200 Subject: x86, mce: remove mce_init unused argument Remove unused mce_init argument. [ Impact: cleanup ] Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 69aad7e96a53..20c7e7c669d2 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -643,7 +643,7 @@ static int mce_cap_init(void) return 0; } -static void mce_init(void *dummy) +static void mce_init(void) { mce_banks_t all_banks; u64 cap; @@ -776,7 +776,7 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) machine_check_vector = do_machine_check; - mce_init(NULL); + mce_init(); mce_cpu_features(c); mce_init_timer(); } @@ -1020,7 +1020,7 @@ static int mce_shutdown(struct sys_device *dev) */ static int mce_resume(struct sys_device *dev) { - mce_init(NULL); + mce_init(); mce_cpu_features(¤t_cpu_data); return 0; @@ -1030,7 +1030,7 @@ static void mce_cpu_restart(void *data) { del_timer_sync(&__get_cpu_var(mce_timer)); if (mce_available(¤t_cpu_data)) - mce_init(NULL); + mce_init(); mce_init_timer(); } -- cgit v1.2.3 From 32561696c23028596f24b353d98f2e23b58f91f7 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:53 +0200 Subject: x86, mce: rename and align out2 label There's only a single out path in do_machine_check now, so rename the label from out2 to out. Also align it at the first column. [ Impact: minor cleanup, no functional changes ] Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 20c7e7c669d2..18d505d80221 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -361,9 +361,9 @@ void do_machine_check(struct pt_regs *regs, long error_code) if (notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL) == NOTIFY_STOP) - goto out2; + goto out; if (!banks) - goto out2; + goto out; mce_setup(&m); @@ -499,7 +499,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); } mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); - out2: +out: atomic_dec(&mce_entry); sync_core(); } -- cgit v1.2.3 From b170204ddb7844ffff62d2d537b20c0eeb97725e Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:54 +0200 Subject: x86, mce: drop BKL in mce_open BKL is not needed for anything in mce_open because it has an own spinlock. Remove it. [ Impact: cleanup ] Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 18d505d80221..8ab28368bb92 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include @@ -791,12 +790,10 @@ static int open_exclu; /* already open exclusive? */ static int mce_open(struct inode *inode, struct file *file) { - lock_kernel(); spin_lock(&mce_state_lock); if (open_exclu || (open_count && (file->f_flags & O_EXCL))) { spin_unlock(&mce_state_lock); - unlock_kernel(); return -EBUSY; } @@ -806,7 +803,6 @@ static int mce_open(struct inode *inode, struct file *file) open_count++; spin_unlock(&mce_state_lock); - unlock_kernel(); return nonseekable_open(inode, file); } -- cgit v1.2.3 From 9319cec8c185e84fc5281afb6ac5d4c47a234841 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Tue, 14 Apr 2009 17:26:30 +0900 Subject: x86, mce: use strict_strtoull Use strict_strtoull instead of simple_strtoull. Signed-off-by: Hidetoshi Seto Cc: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 9 ++++----- arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 16 ++++++---------- 2 files changed, 10 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 8ab28368bb92..4375ffb5459f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1059,18 +1059,17 @@ static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, } static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, - const char *buf, size_t siz) + const char *buf, size_t size) { - char *end; - u64 new = simple_strtoull(buf, &end, 0); + u64 new; - if (end == buf) + if (strict_strtoull(buf, 0, &new) < 0) return -EINVAL; bank[attr - bank_attrs] = new; mce_restart(); - return end-buf; + return size; } static ssize_t diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 083f270251fa..0c563432e25c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -269,14 +269,12 @@ SHOW_FIELDS(interrupt_enable) SHOW_FIELDS(threshold_limit) static ssize_t -store_interrupt_enable(struct threshold_block *b, const char *buf, size_t count) +store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size) { struct thresh_restart tr; unsigned long new; - char *end; - new = simple_strtoul(buf, &end, 0); - if (end == buf) + if (strict_strtoul(buf, 0, &new) < 0) return -EINVAL; b->interrupt_enable = !!new; @@ -287,18 +285,16 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t count) smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); - return end - buf; + return size; } static ssize_t -store_threshold_limit(struct threshold_block *b, const char *buf, size_t count) +store_threshold_limit(struct threshold_block *b, const char *buf, size_t size) { struct thresh_restart tr; unsigned long new; - char *end; - new = simple_strtoul(buf, &end, 0); - if (end == buf) + if (strict_strtoul(buf, 0, &new) < 0) return -EINVAL; if (new > THRESHOLD_MAX) @@ -313,7 +309,7 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t count) smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); - return end - buf; + return size; } struct threshold_block_cross_cpu { -- cgit v1.2.3 From cc3aec52ab8e013984270a79d1aa51f691d239b0 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Thu, 30 Apr 2009 15:58:22 +0900 Subject: x86, mce: trivial clean up for therm_throt.c This patch removes following checkpatch warning: WARNING: Use #include instead of +#include Signed-off-by: Hidetoshi Seto Cc: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/therm_throt.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index a2b5d7ddb19f..7b1ae2e20ba5 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -20,7 +20,6 @@ #include #include -#include /* How long to wait between reporting thermal events */ #define CHECK_INTERVAL (300 * HZ) -- cgit v1.2.3 From 14a02530e2239f753a0f3f089847e723adbdaa47 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Thu, 30 Apr 2009 16:04:51 +0900 Subject: x86, mce: trivial clean up for mce.c This fixs following checkpatch warnings: WARNING: Use #include instead of +#include WARNING: Use #include instead of +#include WARNING: line over 80 characters + set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags); WARNING: braces {} are not necessary for any arm of this statement + if (mce_notify_user()) { [...] + } else { [...] Signed-off-by: Hidetoshi Seto Cc: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 4375ffb5459f..1d0aa9c4e15b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -27,14 +28,13 @@ #include #include #include +#include #include #include -#include #include #include #include -#include #include "mce.h" @@ -125,7 +125,8 @@ void mce_log(struct mce *mce) * interesting ones: */ if (entry >= MCE_LOG_LEN) { - set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags); + set_bit(MCE_OVERFLOW, + (unsigned long *)&mcelog.flags); return; } /* Old left over entry. Skip: */ @@ -556,11 +557,10 @@ static void mcheck_timer(unsigned long data) * polling interval, otherwise increase the polling interval. */ n = &__get_cpu_var(next_interval); - if (mce_notify_user()) { + if (mce_notify_user()) *n = max(*n/2, HZ/100); - } else { + else *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); - } t->expires = jiffies + *n; add_timer(t); -- cgit v1.2.3 From 34fa1967aa0827776e37feb5666df0327575a0f2 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Wed, 8 Apr 2009 12:31:18 +0200 Subject: x86, mce: trivial clean up for mce_amd_64.c Fix for followings: WARNING: Use #include instead of +#include ERROR: Macros with multiple statements should be enclosed in a do - while loop +#define THRESHOLD_ATTR(_name, _mode, _show, _store) \ +{ \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .show = _show, \ + .store = _store, \ +}; WARNING: usage of NR_CPUS is often wrong - consider using cpu_possible(), num_possible_cpus(), for_each_possible_cpu(), etc + if (cpu >= NR_CPUS) Signed-off-by: Hidetoshi Seto Cc: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 0c563432e25c..ddae21620bda 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -24,7 +25,6 @@ #include #include -#include #include #include #include @@ -344,17 +344,13 @@ static ssize_t store_error_count(struct threshold_block *b, return 1; } -#define THRESHOLD_ATTR(_name, _mode, _show, _store) \ -{ \ - .attr = {.name = __stringify(_name), .mode = _mode }, \ - .show = _show, \ - .store = _store, \ +#define RW_ATTR(val) \ +static struct threshold_attr val = { \ + .attr = {.name = __stringify(val), .mode = 0644 }, \ + .show = show_## val, \ + .store = store_## val, \ }; -#define RW_ATTR(name) \ -static struct threshold_attr name = \ - THRESHOLD_ATTR(name, 0644, show_## name, store_## name) - RW_ATTR(interrupt_enable); RW_ATTR(threshold_limit); RW_ATTR(error_count); @@ -675,9 +671,6 @@ static void threshold_remove_device(unsigned int cpu) static void __cpuinit amd_64_threshold_cpu_callback(unsigned long action, unsigned int cpu) { - if (cpu >= NR_CPUS) - return; - switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: -- cgit v1.2.3 From 61a021a0700c22ee527d73d92f9acb109ff478f8 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Tue, 14 Apr 2009 17:09:04 +0900 Subject: x86, mce: trivial clean up for mce_intel_64.c Fix for: WARNING: space prohibited between function name and open parenthesis '(' + for_each_online_cpu (cpu) { Signed-off-by: Hidetoshi Seto Cc: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index 13abafcb72e4..eff3740501a3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -196,7 +196,7 @@ void cmci_rediscover(int dying) return; cpumask_copy(old, ¤t->cpus_allowed); - for_each_online_cpu (cpu) { + for_each_online_cpu(cpu) { if (cpu == dying) continue; if (set_cpus_allowed_ptr(current, cpumask_of(cpu))) -- cgit v1.2.3 From 98a9c8c3ba13dfc3df8e6d2a126d2fa4e4621e9c Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Thu, 28 May 2009 11:41:01 +0900 Subject: x86, mce: trivial clean up for mce-inject.c Fix for: WARNING: Use #include instead of +#include WARNING: usage of NR_CPUS is often wrong - consider using cpu_possible(), num_possible_cpus(), for_each_possible_cpu(), etc + if (m.cpu >= NR_CPUS || !cpu_online(m.cpu)) ERROR: trailing whitespace +/* $ Signed-off-by: Hidetoshi Seto Cc: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce-inject.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 673c72855022..7b3a5428396a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -11,13 +11,13 @@ * Andi Kleen * Ying Huang */ +#include #include #include #include #include #include #include -#include #include /* Update fake mce registers on current CPU. */ @@ -93,7 +93,7 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf, if (copy_from_user(&m, ubuf, usize)) return -EFAULT; - if (m.cpu >= NR_CPUS || !cpu_online(m.cpu)) + if (m.cpu >= num_possible_cpus() || !cpu_online(m.cpu)) return -EINVAL; dm = kmalloc(sizeof(struct delayed_mce), GFP_KERNEL); -- cgit v1.2.3 From eb2a6ab729ac40a553797703a5a5dba3a74de004 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 28 Apr 2009 23:32:56 +0200 Subject: x86: trivial clean up for irq_vectors.h Fix a wrong comment. Signed-off-by: Hidetoshi Seto Cc: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/irq_vectors.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 451e24d18050..233006c4e365 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -86,11 +86,10 @@ #define CALL_FUNCTION_VECTOR 0xfc #define CALL_FUNCTION_SINGLE_VECTOR 0xfb #define THERMAL_APIC_VECTOR 0xfa - #define THRESHOLD_APIC_VECTOR 0xf9 #ifdef CONFIG_X86_32 -/* 0xf9 : free */ +/* 0xf8 : free */ #else # define UV_BAU_MESSAGE 0xf8 #endif -- cgit v1.2.3 From cd13adcc823aa421efa4efd995fa7004a58cf38d Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Wed, 27 May 2009 16:57:31 +0900 Subject: x86: trivial clean up for arch/x86/Kconfig Use tab. Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e25b6358fbe3..8c0fff0860bd 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -822,13 +822,13 @@ config X86_MCE_AMD the DRAM Error Threshold. config X86_ANCIENT_MCE - def_bool n - depends on X86_32 - prompt "Support for old Pentium 5 / WinChip machine checks" - ---help--- - Include support for machine check handling on old Pentium 5 or WinChip - systems. These typically need to be enabled explicitely on the command - line. + def_bool n + depends on X86_32 + prompt "Support for old Pentium 5 / WinChip machine checks" + ---help--- + Include support for machine check handling on old Pentium 5 or WinChip + systems. These typically need to be enabled explicitely on the command + line. config X86_MCE_THRESHOLD depends on X86_MCE_AMD || X86_MCE_INTEL -- cgit v1.2.3 From 38736072d45488fd45f076388b6570419bbbc682 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 28 May 2009 10:05:33 -0700 Subject: x86, mce: drop "extern" from function prototypes in asm/mce.h Function prototypes don't need to be prefixed by "extern". [ Impact: cleanup ] Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index e7d2372301ef..ac6e0303bf21 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -121,13 +121,13 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c); static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } #endif -extern int mce_available(struct cpuinfo_x86 *c); +int mce_available(struct cpuinfo_x86 *c); void mce_log_therm_throt_event(__u64 status); extern atomic_t mce_entry; -extern void do_machine_check(struct pt_regs *, long); +void do_machine_check(struct pt_regs *, long); typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS); DECLARE_PER_CPU(mce_banks_t, mce_poll_banks); @@ -137,15 +137,15 @@ enum mcp_flags { MCP_UC = (1 << 1), /* log uncorrected errors */ MCP_DONTLOG = (1 << 2), /* only clear, don't log */ }; -extern void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); +void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); -extern int mce_notify_user(void); +int mce_notify_user(void); DECLARE_PER_CPU(struct mce, injectm); extern struct file_operations mce_chrdev_ops; #ifdef CONFIG_X86_MCE -extern void mcheck_init(struct cpuinfo_x86 *c); +void mcheck_init(struct cpuinfo_x86 *c); #else #define mcheck_init(c) do { } while (0) #endif -- cgit v1.2.3 From 150c7e85526e80474b87004f4b420e8834fdeb43 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Sun, 29 Mar 2009 15:39:02 +0800 Subject: crypto: fpu - Add template for blkcipher touching FPU Blkcipher touching FPU need to be enclosed by kernel_fpu_begin() and kernel_fpu_end(). If they are invoked in cipher algorithm implementation, they will be invoked for each block, so that performance will be hurt, because they are "slow" operations. This patch implements "fpu" template, which makes these operations to be invoked for each request. Signed-off-by: Huang Ying Signed-off-by: Herbert Xu --- arch/x86/crypto/Makefile | 2 + arch/x86/crypto/fpu.c | 166 +++++++++++++++++++++++++++++++++++++++++++++++ crypto/Kconfig | 5 ++ 3 files changed, 173 insertions(+) create mode 100644 arch/x86/crypto/fpu.c (limited to 'arch/x86') diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index ebe7deedd5b4..cfb0010fa940 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -2,6 +2,8 @@ # Arch-specific CryptoAPI modules. # +obj-$(CONFIG_CRYPTO_FPU) += fpu.o + obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o diff --git a/arch/x86/crypto/fpu.c b/arch/x86/crypto/fpu.c new file mode 100644 index 000000000000..5f9781a3815f --- /dev/null +++ b/arch/x86/crypto/fpu.c @@ -0,0 +1,166 @@ +/* + * FPU: Wrapper for blkcipher touching fpu + * + * Copyright (c) Intel Corp. + * Author: Huang Ying + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + */ + +#include +#include +#include +#include +#include +#include + +struct crypto_fpu_ctx { + struct crypto_blkcipher *child; +}; + +static int crypto_fpu_setkey(struct crypto_tfm *parent, const u8 *key, + unsigned int keylen) +{ + struct crypto_fpu_ctx *ctx = crypto_tfm_ctx(parent); + struct crypto_blkcipher *child = ctx->child; + int err; + + crypto_blkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK); + crypto_blkcipher_set_flags(child, crypto_tfm_get_flags(parent) & + CRYPTO_TFM_REQ_MASK); + err = crypto_blkcipher_setkey(child, key, keylen); + crypto_tfm_set_flags(parent, crypto_blkcipher_get_flags(child) & + CRYPTO_TFM_RES_MASK); + return err; +} + +static int crypto_fpu_encrypt(struct blkcipher_desc *desc_in, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) +{ + int err; + struct crypto_fpu_ctx *ctx = crypto_blkcipher_ctx(desc_in->tfm); + struct crypto_blkcipher *child = ctx->child; + struct blkcipher_desc desc = { + .tfm = child, + .info = desc_in->info, + .flags = desc_in->flags, + }; + + kernel_fpu_begin(); + err = crypto_blkcipher_crt(desc.tfm)->encrypt(&desc, dst, src, nbytes); + kernel_fpu_end(); + return err; +} + +static int crypto_fpu_decrypt(struct blkcipher_desc *desc_in, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) +{ + int err; + struct crypto_fpu_ctx *ctx = crypto_blkcipher_ctx(desc_in->tfm); + struct crypto_blkcipher *child = ctx->child; + struct blkcipher_desc desc = { + .tfm = child, + .info = desc_in->info, + .flags = desc_in->flags, + }; + + kernel_fpu_begin(); + err = crypto_blkcipher_crt(desc.tfm)->decrypt(&desc, dst, src, nbytes); + kernel_fpu_end(); + return err; +} + +static int crypto_fpu_init_tfm(struct crypto_tfm *tfm) +{ + struct crypto_instance *inst = crypto_tfm_alg_instance(tfm); + struct crypto_spawn *spawn = crypto_instance_ctx(inst); + struct crypto_fpu_ctx *ctx = crypto_tfm_ctx(tfm); + struct crypto_blkcipher *cipher; + + cipher = crypto_spawn_blkcipher(spawn); + if (IS_ERR(cipher)) + return PTR_ERR(cipher); + + ctx->child = cipher; + return 0; +} + +static void crypto_fpu_exit_tfm(struct crypto_tfm *tfm) +{ + struct crypto_fpu_ctx *ctx = crypto_tfm_ctx(tfm); + crypto_free_blkcipher(ctx->child); +} + +static struct crypto_instance *crypto_fpu_alloc(struct rtattr **tb) +{ + struct crypto_instance *inst; + struct crypto_alg *alg; + int err; + + err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_BLKCIPHER); + if (err) + return ERR_PTR(err); + + alg = crypto_get_attr_alg(tb, CRYPTO_ALG_TYPE_BLKCIPHER, + CRYPTO_ALG_TYPE_MASK); + if (IS_ERR(alg)) + return ERR_CAST(alg); + + inst = crypto_alloc_instance("fpu", alg); + if (IS_ERR(inst)) + goto out_put_alg; + + inst->alg.cra_flags = alg->cra_flags; + inst->alg.cra_priority = alg->cra_priority; + inst->alg.cra_blocksize = alg->cra_blocksize; + inst->alg.cra_alignmask = alg->cra_alignmask; + inst->alg.cra_type = alg->cra_type; + inst->alg.cra_blkcipher.ivsize = alg->cra_blkcipher.ivsize; + inst->alg.cra_blkcipher.min_keysize = alg->cra_blkcipher.min_keysize; + inst->alg.cra_blkcipher.max_keysize = alg->cra_blkcipher.max_keysize; + inst->alg.cra_ctxsize = sizeof(struct crypto_fpu_ctx); + inst->alg.cra_init = crypto_fpu_init_tfm; + inst->alg.cra_exit = crypto_fpu_exit_tfm; + inst->alg.cra_blkcipher.setkey = crypto_fpu_setkey; + inst->alg.cra_blkcipher.encrypt = crypto_fpu_encrypt; + inst->alg.cra_blkcipher.decrypt = crypto_fpu_decrypt; + +out_put_alg: + crypto_mod_put(alg); + return inst; +} + +static void crypto_fpu_free(struct crypto_instance *inst) +{ + crypto_drop_spawn(crypto_instance_ctx(inst)); + kfree(inst); +} + +static struct crypto_template crypto_fpu_tmpl = { + .name = "fpu", + .alloc = crypto_fpu_alloc, + .free = crypto_fpu_free, + .module = THIS_MODULE, +}; + +static int __init crypto_fpu_module_init(void) +{ + return crypto_register_template(&crypto_fpu_tmpl); +} + +static void __exit crypto_fpu_module_exit(void) +{ + crypto_unregister_template(&crypto_fpu_tmpl); +} + +module_init(crypto_fpu_module_init); +module_exit(crypto_fpu_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("FPU block cipher wrapper"); diff --git a/crypto/Kconfig b/crypto/Kconfig index 74d0e622a515..66ff22a36ed9 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -241,6 +241,11 @@ config CRYPTO_XTS key size 256, 384 or 512 bits. This implementation currently can't handle a sectorsize which is not a multiple of 16 bytes. +config CRYPTO_FPU + tristate + select CRYPTO_BLKCIPHER + select CRYPTO_MANAGER + comment "Hash modes" config CRYPTO_HMAC -- cgit v1.2.3 From 2cf4ac8beb9dc50a315a6155b7b70e754d511958 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Sun, 29 Mar 2009 15:41:20 +0800 Subject: crypto: aes-ni - Add support for more modes Because kernel_fpu_begin() and kernel_fpu_end() operations are too slow, the performance gain of general mode implementation + aes-aesni is almost all compensated. The AES-NI support for more modes are implemented as follow: - Add a new AES algorithm implementation named __aes-aesni without kernel_fpu_begin/end() - Use fpu((AES)) to provide kenrel_fpu_begin/end() invoking - Add (AES) ablkcipher, which uses cryptd(fpu((AES))) to defer cryption to cryptd context in soft_irq context. Now the ctr, lrw, pcbc and xts support are added. Performance testing based on dm-crypt shows that cryption time can be reduced to 50% of general mode implementation + aes-aesni implementation. Signed-off-by: Huang Ying Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_glue.c | 267 ++++++++++++++++++++++++++++++++++++- crypto/Kconfig | 5 + 2 files changed, 271 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 02af0af65497..4e663398f77f 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -21,6 +21,22 @@ #include #include +#if defined(CONFIG_CRYPTO_CTR) || defined(CONFIG_CRYPTO_CTR_MODULE) +#define HAS_CTR +#endif + +#if defined(CONFIG_CRYPTO_LRW) || defined(CONFIG_CRYPTO_LRW_MODULE) +#define HAS_LRW +#endif + +#if defined(CONFIG_CRYPTO_PCBC) || defined(CONFIG_CRYPTO_PCBC_MODULE) +#define HAS_PCBC +#endif + +#if defined(CONFIG_CRYPTO_XTS) || defined(CONFIG_CRYPTO_XTS_MODULE) +#define HAS_XTS +#endif + struct async_aes_ctx { struct cryptd_ablkcipher *cryptd_tfm; }; @@ -137,6 +153,41 @@ static struct crypto_alg aesni_alg = { } }; +static void __aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) +{ + struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm)); + + aesni_enc(ctx, dst, src); +} + +static void __aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) +{ + struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm)); + + aesni_dec(ctx, dst, src); +} + +static struct crypto_alg __aesni_alg = { + .cra_name = "__aes-aesni", + .cra_driver_name = "__driver-aes-aesni", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_CIPHER, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct crypto_aes_ctx)+AESNI_ALIGN-1, + .cra_alignmask = 0, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(__aesni_alg.cra_list), + .cra_u = { + .cipher = { + .cia_min_keysize = AES_MIN_KEY_SIZE, + .cia_max_keysize = AES_MAX_KEY_SIZE, + .cia_setkey = aes_set_key, + .cia_encrypt = __aes_encrypt, + .cia_decrypt = __aes_decrypt + } + } +}; + static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) @@ -277,8 +328,16 @@ static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key, unsigned int key_len) { struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm); + struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base; + int err; - return crypto_ablkcipher_setkey(&ctx->cryptd_tfm->base, key, key_len); + crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK); + crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm) + & CRYPTO_TFM_REQ_MASK); + err = crypto_ablkcipher_setkey(child, key, key_len); + crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child) + & CRYPTO_TFM_RES_MASK); + return err; } static int ablk_encrypt(struct ablkcipher_request *req) @@ -411,6 +470,163 @@ static struct crypto_alg ablk_cbc_alg = { }, }; +#ifdef HAS_CTR +static int ablk_ctr_init(struct crypto_tfm *tfm) +{ + struct cryptd_ablkcipher *cryptd_tfm; + + cryptd_tfm = cryptd_alloc_ablkcipher("fpu(ctr(__driver-aes-aesni))", + 0, 0); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); + ablk_init_common(tfm, cryptd_tfm); + return 0; +} + +static struct crypto_alg ablk_ctr_alg = { + .cra_name = "ctr(aes)", + .cra_driver_name = "ctr-aes-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct async_aes_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(ablk_ctr_alg.cra_list), + .cra_init = ablk_ctr_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + .geniv = "chainiv", + }, + }, +}; +#endif + +#ifdef HAS_LRW +static int ablk_lrw_init(struct crypto_tfm *tfm) +{ + struct cryptd_ablkcipher *cryptd_tfm; + + cryptd_tfm = cryptd_alloc_ablkcipher("fpu(lrw(__driver-aes-aesni))", + 0, 0); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); + ablk_init_common(tfm, cryptd_tfm); + return 0; +} + +static struct crypto_alg ablk_lrw_alg = { + .cra_name = "lrw(aes)", + .cra_driver_name = "lrw-aes-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_aes_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(ablk_lrw_alg.cra_list), + .cra_init = ablk_lrw_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = AES_MIN_KEY_SIZE + AES_BLOCK_SIZE, + .max_keysize = AES_MAX_KEY_SIZE + AES_BLOCK_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +}; +#endif + +#ifdef HAS_PCBC +static int ablk_pcbc_init(struct crypto_tfm *tfm) +{ + struct cryptd_ablkcipher *cryptd_tfm; + + cryptd_tfm = cryptd_alloc_ablkcipher("fpu(pcbc(__driver-aes-aesni))", + 0, 0); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); + ablk_init_common(tfm, cryptd_tfm); + return 0; +} + +static struct crypto_alg ablk_pcbc_alg = { + .cra_name = "pcbc(aes)", + .cra_driver_name = "pcbc-aes-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_aes_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(ablk_pcbc_alg.cra_list), + .cra_init = ablk_pcbc_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +}; +#endif + +#ifdef HAS_XTS +static int ablk_xts_init(struct crypto_tfm *tfm) +{ + struct cryptd_ablkcipher *cryptd_tfm; + + cryptd_tfm = cryptd_alloc_ablkcipher("fpu(xts(__driver-aes-aesni))", + 0, 0); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); + ablk_init_common(tfm, cryptd_tfm); + return 0; +} + +static struct crypto_alg ablk_xts_alg = { + .cra_name = "xts(aes)", + .cra_driver_name = "xts-aes-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_aes_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(ablk_xts_alg.cra_list), + .cra_init = ablk_xts_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = 2 * AES_MIN_KEY_SIZE, + .max_keysize = 2 * AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +}; +#endif + static int __init aesni_init(void) { int err; @@ -421,6 +637,8 @@ static int __init aesni_init(void) } if ((err = crypto_register_alg(&aesni_alg))) goto aes_err; + if ((err = crypto_register_alg(&__aesni_alg))) + goto __aes_err; if ((err = crypto_register_alg(&blk_ecb_alg))) goto blk_ecb_err; if ((err = crypto_register_alg(&blk_cbc_alg))) @@ -429,9 +647,41 @@ static int __init aesni_init(void) goto ablk_ecb_err; if ((err = crypto_register_alg(&ablk_cbc_alg))) goto ablk_cbc_err; +#ifdef HAS_CTR + if ((err = crypto_register_alg(&ablk_ctr_alg))) + goto ablk_ctr_err; +#endif +#ifdef HAS_LRW + if ((err = crypto_register_alg(&ablk_lrw_alg))) + goto ablk_lrw_err; +#endif +#ifdef HAS_PCBC + if ((err = crypto_register_alg(&ablk_pcbc_alg))) + goto ablk_pcbc_err; +#endif +#ifdef HAS_XTS + if ((err = crypto_register_alg(&ablk_xts_alg))) + goto ablk_xts_err; +#endif return err; +#ifdef HAS_XTS +ablk_xts_err: +#endif +#ifdef HAS_PCBC + crypto_unregister_alg(&ablk_pcbc_alg); +ablk_pcbc_err: +#endif +#ifdef HAS_LRW + crypto_unregister_alg(&ablk_lrw_alg); +ablk_lrw_err: +#endif +#ifdef HAS_CTR + crypto_unregister_alg(&ablk_ctr_alg); +ablk_ctr_err: +#endif + crypto_unregister_alg(&ablk_cbc_alg); ablk_cbc_err: crypto_unregister_alg(&ablk_ecb_alg); ablk_ecb_err: @@ -439,6 +689,8 @@ ablk_ecb_err: blk_cbc_err: crypto_unregister_alg(&blk_ecb_alg); blk_ecb_err: + crypto_unregister_alg(&__aesni_alg); +__aes_err: crypto_unregister_alg(&aesni_alg); aes_err: return err; @@ -446,10 +698,23 @@ aes_err: static void __exit aesni_exit(void) { +#ifdef HAS_XTS + crypto_unregister_alg(&ablk_xts_alg); +#endif +#ifdef HAS_PCBC + crypto_unregister_alg(&ablk_pcbc_alg); +#endif +#ifdef HAS_LRW + crypto_unregister_alg(&ablk_lrw_alg); +#endif +#ifdef HAS_CTR + crypto_unregister_alg(&ablk_ctr_alg); +#endif crypto_unregister_alg(&ablk_cbc_alg); crypto_unregister_alg(&ablk_ecb_alg); crypto_unregister_alg(&blk_cbc_alg); crypto_unregister_alg(&blk_ecb_alg); + crypto_unregister_alg(&__aesni_alg); crypto_unregister_alg(&aesni_alg); } diff --git a/crypto/Kconfig b/crypto/Kconfig index 66ff22a36ed9..4dfdd03e708f 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -491,6 +491,7 @@ config CRYPTO_AES_NI_INTEL select CRYPTO_AES_X86_64 select CRYPTO_CRYPTD select CRYPTO_ALGAPI + select CRYPTO_FPU help Use Intel AES-NI instructions for AES algorithm. @@ -510,6 +511,10 @@ config CRYPTO_AES_NI_INTEL See for more information. + In addition to AES cipher algorithm support, the + acceleration for some popular block cipher mode is supported + too, including ECB, CBC, CTR, LRW, PCBC, XTS. + config CRYPTO_ANUBIS tristate "Anubis cipher algorithm" select CRYPTO_ALGAPI -- cgit v1.2.3 From 01ca79f1411eae2a45352709c838b946b1af9fbd Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:52 +0200 Subject: x86, mce: add machine check exception count in /proc/interrupts Useful for debugging, but it's also good general policy to have a counter for all special interrupts there. This makes it easier to diagnose where a CPU is spending its time. [ Impact: feature, debugging tool ] Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 3 +++ arch/x86/kernel/cpu/mcheck/mce.c | 4 ++++ arch/x86/kernel/irq.c | 10 ++++++++++ 3 files changed, 17 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index ac6e0303bf21..1156dae295ac 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -89,6 +89,7 @@ struct mce_log { extern int mce_disabled; #include +#include void mce_setup(struct mce *m); void mce_log(struct mce *m); @@ -123,6 +124,8 @@ static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } int mce_available(struct cpuinfo_x86 *c); +DECLARE_PER_CPU(unsigned, mce_exception_count); + void mce_log_therm_throt_event(__u64 status); extern atomic_t mce_entry; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 1d0aa9c4e15b..287268d21836 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -57,6 +57,8 @@ int mce_disabled; atomic_t mce_entry; +DEFINE_PER_CPU(unsigned, mce_exception_count); + /* * Tolerant levels: * 0: always panic on uncorrected errors, log corrected errors @@ -359,6 +361,8 @@ void do_machine_check(struct pt_regs *regs, long error_code) atomic_inc(&mce_entry); + __get_cpu_var(mce_exception_count)++; + if (notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL) == NOTIFY_STOP) goto out; diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index a05660bf0299..05fc635c28c0 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -12,6 +12,7 @@ #include #include #include +#include atomic_t irq_err_count; @@ -93,6 +94,12 @@ static int show_other_interrupts(struct seq_file *p, int prec) seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); seq_printf(p, " Threshold APIC interrupts\n"); # endif +#endif +#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64) + seq_printf(p, "%*s: ", prec, "MCE"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(mce_exception_count, j)); + seq_printf(p, " Machine check exceptions\n"); #endif seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); #if defined(CONFIG_X86_IO_APIC) @@ -161,6 +168,9 @@ u64 arch_irq_stat_cpu(unsigned int cpu) { u64 sum = irq_stats(cpu)->__nmi_count; +#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64) + sum += per_cpu(mce_exception_count, cpu); +#endif #ifdef CONFIG_X86_LOCAL_APIC sum += irq_stats(cpu)->apic_timer_irqs; sum += irq_stats(cpu)->irq_spurious_count; -- cgit v1.2.3 From ca84f69697da0f004135e45b63ca560b6bd3554e Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:57 +0200 Subject: x86, mce: add MCE poll count to /proc/interrupts Keep a count of the machine check polls (or CMCI events) in /proc/interrupts. Andi needs this for debugging, but it's also useful in general to see what's going in by the kernel. Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 1 + arch/x86/kernel/cpu/mcheck/mce.c | 4 ++++ arch/x86/kernel/irq.c | 4 ++++ 3 files changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 1156dae295ac..63abf3b19432 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -125,6 +125,7 @@ static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } int mce_available(struct cpuinfo_x86 *c); DECLARE_PER_CPU(unsigned, mce_exception_count); +DECLARE_PER_CPU(unsigned, mce_poll_count); void mce_log_therm_throt_event(__u64 status); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 287268d21836..784f6ae9d6f4 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -264,6 +264,8 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) } } +DEFINE_PER_CPU(unsigned, mce_poll_count); + /* * Poll for corrected events or events that happened before reset. * Those are just logged through /dev/mcelog. @@ -275,6 +277,8 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) struct mce m; int i; + __get_cpu_var(mce_poll_count)++; + mce_setup(&m); m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 05fc635c28c0..eff46b5de62f 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -100,6 +100,10 @@ static int show_other_interrupts(struct seq_file *p, int prec) for_each_online_cpu(j) seq_printf(p, "%10u ", per_cpu(mce_exception_count, j)); seq_printf(p, " Machine check exceptions\n"); + seq_printf(p, "%*s: ", prec, "MCP"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(mce_poll_count, j)); + seq_printf(p, " Machine check polls\n"); #endif seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); #if defined(CONFIG_X86_IO_APIC) -- cgit v1.2.3 From f6fb0ac0869500323c78fa21992fe1933af61e91 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:55 +0200 Subject: x86, mce: store record length into memory struct mce anchor This makes it easier for tools who want to extract the mcelog out of crash images or memory dumps to adapt to changing struct mce size. The length field replaces padding, so it's fully compatible. Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 2 +- arch/x86/kernel/cpu/mcheck/mce.c | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 63abf3b19432..0a61946d4396 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -59,7 +59,7 @@ struct mce_log { unsigned len; /* = MCE_LOG_LEN */ unsigned next; unsigned flags; - unsigned pad0; + unsigned recordlen; /* length of struct mce */ struct mce entry[MCE_LOG_LEN]; }; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 784f6ae9d6f4..3db047e7a0fb 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -108,8 +108,9 @@ EXPORT_PER_CPU_SYMBOL_GPL(injectm); */ static struct mce_log mcelog = { - MCE_LOG_SIGNATURE, - MCE_LOG_LEN, + .signature = MCE_LOG_SIGNATURE, + .len = MCE_LOG_LEN, + .recordlen = sizeof(struct mce), }; void mce_log(struct mce *mce) -- cgit v1.2.3 From d620c67fb92aa11736112f9a03e31d8e3079c57a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:56 +0200 Subject: x86, mce: support more than 256 CPUs in struct mce The old struct mce had a limitation to 256 CPUs. But x86 Linux supports more than that now with x2apic. Add a new field extcpu to report the extended number. Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 4 ++-- arch/x86/kernel/cpu/mcheck/mce-inject.c | 10 +++++----- arch/x86/kernel/cpu/mcheck/mce.c | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 0a61946d4396..b4a04b60b740 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -40,9 +40,9 @@ struct mce { __u64 res2; /* dito. */ __u8 cs; /* code segment */ __u8 bank; /* machine check bank */ - __u8 cpu; /* cpu that raised the error */ + __u8 cpu; /* cpu number; obsolete; use extcpu now */ __u8 finished; /* entry is valid */ - __u32 pad; + __u32 extcpu; /* linux cpu number that detected the error */ }; /* diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 7b3a5428396a..7d858fb4ce67 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -23,14 +23,14 @@ /* Update fake mce registers on current CPU. */ static void inject_mce(struct mce *m) { - struct mce *i = &per_cpu(injectm, m->cpu); + struct mce *i = &per_cpu(injectm, m->extcpu); /* Make sure noone reads partially written injectm */ i->finished = 0; mb(); m->finished = 0; /* First set the fields after finished */ - i->cpu = m->cpu; + i->extcpu = m->extcpu; mb(); /* Now write record in order, finished last (except above) */ memcpy(i, m, sizeof(struct mce)); @@ -49,7 +49,7 @@ static void raise_mce(unsigned long data) { struct delayed_mce *dm = (struct delayed_mce *)data; struct mce *m = &dm->m; - int cpu = m->cpu; + int cpu = m->extcpu; inject_mce(m); if (m->status & MCI_STATUS_UC) { @@ -93,7 +93,7 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf, if (copy_from_user(&m, ubuf, usize)) return -EFAULT; - if (m.cpu >= num_possible_cpus() || !cpu_online(m.cpu)) + if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu)) return -EINVAL; dm = kmalloc(sizeof(struct delayed_mce), GFP_KERNEL); @@ -108,7 +108,7 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf, memcpy(&dm->m, &m, sizeof(struct mce)); setup_timer(&dm->timer, raise_mce, (unsigned long)dm); dm->timer.expires = jiffies + 2; - add_timer_on(&dm->timer, m.cpu); + add_timer_on(&dm->timer, m.extcpu); return usize; } diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 3db047e7a0fb..2c4dd6c422c3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -94,7 +94,7 @@ static inline int skip_bank_init(int i) void mce_setup(struct mce *m) { memset(m, 0, sizeof(struct mce)); - m->cpu = smp_processor_id(); + m->cpu = m->extcpu = smp_processor_id(); rdtscll(m->tsc); } @@ -158,7 +158,7 @@ static void print_mce(struct mce *m) KERN_EMERG "HARDWARE ERROR\n" KERN_EMERG "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", - m->cpu, m->mcgstatus, m->bank, m->status); + m->extcpu, m->mcgstatus, m->bank, m->status); if (m->ip) { printk(KERN_EMERG "RIP%s %02x:<%016Lx> ", !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", -- cgit v1.2.3 From 8ee08347c1e8b5680b3b3ce081e42e97bcaa1abe Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:56 +0200 Subject: x86, mce: extend struct mce user interface with more information. Experience has shown that struct mce which is used to pass an machine check to the user space daemon currently a few limitations. Also some data which is useful to print at panic level is also missing. This patch addresses most of them. The same information is also printed out together with mce panic. struct mce can be painlessly extended in a compatible way, the mcelog user space code just ignores additional fields with a warning. - It doesn't provide a wall time timestamp. There have been a few complaints about that. Fix that by adding a 64bit time_t - It doesn't provide the exact CPU identification. This makes it awkward for mcelog to decode the event correctly, especially when there are variations in the supported MCE codes on different CPU models or when mcelog is running on a different host after a panic. Previously the administrator had to specify the correct CPU when mcelog ran on a different host, but with the more variation in machine checks now it's better to auto detect that. It's also useful for more detailed analysis of CPU events. Pass CPUID 1.EAX and the cpu vendor (as encoded in processor.h) instead. - Socket ID and initial APIC ID are useful to report because they allow to identify the failing CPU in some (not all) cases. This is also especially useful for the panic situation. This addresses one of the complaints from Thomas Gleixner earlier. - The MCG capabilities MSR needs to be reported for some advanced error processing in mcelog Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 10 ++++++++-- arch/x86/kernel/cpu/mcheck/mce.c | 12 ++++++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index b4a04b60b740..ba1f8890cf51 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -36,13 +36,19 @@ struct mce { __u64 mcgstatus; __u64 ip; __u64 tsc; /* cpu time stamp counter */ - __u64 res1; /* for future extension */ - __u64 res2; /* dito. */ + __u64 time; /* wall time_t when error was detected */ + __u8 cpuvendor; /* cpu vendor as encoded in system.h */ + __u8 pad1; + __u16 pad2; + __u32 cpuid; /* CPUID 1 EAX */ __u8 cs; /* code segment */ __u8 bank; /* machine check bank */ __u8 cpu; /* cpu number; obsolete; use extcpu now */ __u8 finished; /* entry is valid */ __u32 extcpu; /* linux cpu number that detected the error */ + __u32 socketid; /* CPU socket ID */ + __u32 apicid; /* CPU initial apic ID */ + __u64 mcgcap; /* MCGCAP MSR: machine check capabilities of CPU */ }; /* diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 2c4dd6c422c3..ba68449c22a1 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -96,6 +96,15 @@ void mce_setup(struct mce *m) memset(m, 0, sizeof(struct mce)); m->cpu = m->extcpu = smp_processor_id(); rdtscll(m->tsc); + /* We hope get_seconds stays lockless */ + m->time = get_seconds(); + m->cpuvendor = boot_cpu_data.x86_vendor; + m->cpuid = cpuid_eax(1); +#ifdef CONFIG_SMP + m->socketid = cpu_data(m->extcpu).phys_proc_id; +#endif + m->apicid = cpu_data(m->extcpu).initial_apicid; + rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap); } DEFINE_PER_CPU(struct mce, injectm); @@ -173,6 +182,9 @@ static void print_mce(struct mce *m) if (m->misc) printk("MISC %llx ", m->misc); printk("\n"); + printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", + m->cpuvendor, m->cpuid, m->time, m->socketid, + m->apicid); printk(KERN_EMERG "This is not a software problem!\n"); printk(KERN_EMERG "Run through mcelog --ascii to decode " "and contact your hardware vendor\n"); -- cgit v1.2.3 From de8a84d85ad8bb46d01d72ebc57030b95075603c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:53 +0200 Subject: x86, mce: log corrected errors when panicing Normally the machine check handler ignores corrected errors and leaves them to machine_check_poll(). But when panicing mcp won't run, so log all errors. Note: this can still miss some cases until the "early no way out" patch later is applied too. Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index ba68449c22a1..86806e52fc40 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -412,9 +412,9 @@ void do_machine_check(struct pt_regs *regs, long error_code) /* * Non uncorrected errors are handled by machine_check_poll - * Leave them alone. + * Leave them alone, unless this panics. */ - if ((m.status & MCI_STATUS_UC) == 0) + if ((m.status & MCI_STATUS_UC) == 0 && !no_way_out) continue; /* -- cgit v1.2.3 From a0189c70e5f17f4253dd7bc575c97469900e23d6 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:54 +0200 Subject: x86, mce: remove TSC print heuristic Previously mce_panic used a simple heuristic to avoid printing old so far unreported machine check events on a mce panic. This worked by comparing the TSC value at the start of the machine check handler with the event time stamp and only printing newer ones. This has a couple of issues, in particular on systems where the TSC is not fully synchronized between CPUs it could lose events or print old ones. It is also problematic with full system synchronization as it is added by the next patch. Remove the TSC heuristic and instead replace it with a simple heuristic to print corrected errors first and after that uncorrected errors and finally the worst machine check as determined by the machine check handler. This simplifies the code because there is no need to pass the original TSC value around. Contains fixes from Ying Huang [ Impact: bug fix, cleanup ] Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Cc: Ying Huang Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 86806e52fc40..6773610061d8 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -158,6 +158,7 @@ void mce_log(struct mce *mce) mcelog.entry[entry].finished = 1; wmb(); + mce->finished = 1; set_bit(0, ¬ify_user); } @@ -190,23 +191,29 @@ static void print_mce(struct mce *m) "and contact your hardware vendor\n"); } -static void mce_panic(char *msg, struct mce *backup, u64 start) +static void mce_panic(char *msg, struct mce *final) { int i; bust_spinlocks(1); console_verbose(); + /* First print corrected ones that are still unlogged */ for (i = 0; i < MCE_LOG_LEN; i++) { - u64 tsc = mcelog.entry[i].tsc; - - if ((s64)(tsc - start) < 0) + struct mce *m = &mcelog.entry[i]; + if ((m->status & MCI_STATUS_VAL) && + !(m->status & MCI_STATUS_UC)) + print_mce(m); + } + /* Now print uncorrected but with the final one last */ + for (i = 0; i < MCE_LOG_LEN; i++) { + struct mce *m = &mcelog.entry[i]; + if (!(m->status & MCI_STATUS_VAL)) continue; - print_mce(&mcelog.entry[i]); - if (backup && mcelog.entry[i].tsc == backup->tsc) - backup = NULL; + if (!final || memcmp(m, final, sizeof(struct mce))) + print_mce(m); } - if (backup) - print_mce(backup); + if (final) + print_mce(final); panic(msg); } @@ -362,7 +369,6 @@ void do_machine_check(struct pt_regs *regs, long error_code) { struct mce m, panicm; int panicm_found = 0; - u64 mcestart = 0; int i; /* * If no_way_out gets set, there is no safe way to recover from this @@ -394,7 +400,6 @@ void do_machine_check(struct pt_regs *regs, long error_code) if (!(m.mcgstatus & MCG_STATUS_RIPV)) no_way_out = 1; - rdtscll(mcestart); barrier(); for (i = 0; i < banks; i++) { @@ -478,7 +483,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) * has not set tolerant to an insane level, give up and die. */ if (no_way_out && tolerant < 3) - mce_panic("Machine check", &panicm, mcestart); + mce_panic("Machine check", &panicm); /* * If the error seems to be unrecoverable, something should be @@ -506,8 +511,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) if (user_space) { force_sig(SIGBUS, current); } else if (panic_on_oops || tolerant < 2) { - mce_panic("Uncorrected machine check", - &panicm, mcestart); + mce_panic("Uncorrected machine check", &panicm); } } -- cgit v1.2.3 From 817f32d02a52dd7f5941534e0699883691e918df Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:54 +0200 Subject: x86, mce: add table driven machine check grading The machine check grading (as in deciding what should be done for a given register value) has to be done multiple times soon and it's also getting more complicated. So it makes sense to consolidate it into a single function. To get smaller and more straight forward and possibly more extensible code I opted towards a new table driven method. The various rules are put into a table when is then executed by a very simple interpreter. The grading engine is in a new file mce-severity.c. I also added a private include file mce-internal.h, because mce.h is already a bit too cluttered. This is dead code right now, but will be used in followon patches. Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/Makefile | 1 + arch/x86/kernel/cpu/mcheck/mce-internal.h | 10 +++++ arch/x86/kernel/cpu/mcheck/mce-severity.c | 61 +++++++++++++++++++++++++++++++ 3 files changed, 72 insertions(+) create mode 100644 arch/x86/kernel/cpu/mcheck/mce-internal.h create mode 100644 arch/x86/kernel/cpu/mcheck/mce-severity.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index 60ee182c6c52..45004faf67ea 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -1,5 +1,6 @@ obj-y = mce.o therm_throt.o +obj-$(CONFIG_X86_NEW_MCE) += mce-severity.o obj-$(CONFIG_X86_OLD_MCE) += k7.o p4.o p6.o obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o obj-$(CONFIG_X86_MCE_P4THERMAL) += mce_intel.o diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h new file mode 100644 index 000000000000..f126b4ae7a25 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -0,0 +1,10 @@ +#include + +enum severity_level { + MCE_NO_SEVERITY, + MCE_SOME_SEVERITY, + MCE_UC_SEVERITY, + MCE_PANIC_SEVERITY, +}; + +int mce_severity(struct mce *a, int tolerant, char **msg); diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c new file mode 100644 index 000000000000..c189e89a89ae --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -0,0 +1,61 @@ +/* + * MCE grading rules. + * Copyright 2008, 2009 Intel Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + * + * Author: Andi Kleen + */ +#include +#include + +#include "mce-internal.h" + +/* + * Grade an mce by severity. In general the most severe ones are processed + * first. Since there are quite a lot of combinations test the bits in a + * table-driven way. The rules are simply processed in order, first + * match wins. + */ + +static struct severity { + u64 mask; + u64 result; + unsigned char sev; + unsigned char mcgmask; + unsigned char mcgres; + char *msg; +} severities[] = { +#define SEV(s) .sev = MCE_ ## s ## _SEVERITY +#define BITCLR(x, s, m, r...) { .mask = x, .result = 0, SEV(s), .msg = m, ## r } +#define BITSET(x, s, m, r...) { .mask = x, .result = x, SEV(s), .msg = m, ## r } +#define MCGMASK(x, res, s, m, r...) \ + { .mcgmask = x, .mcgres = res, SEV(s), .msg = m, ## r } + BITCLR(MCI_STATUS_VAL, NO, "Invalid"), + BITCLR(MCI_STATUS_EN, NO, "Not enabled"), + BITSET(MCI_STATUS_PCC, PANIC, "Processor context corrupt"), + MCGMASK(MCG_STATUS_RIPV, 0, PANIC, "No restart IP"), + BITSET(MCI_STATUS_UC|MCI_STATUS_OVER, PANIC, "Overflowed uncorrected"), + BITSET(MCI_STATUS_UC, UC, "Uncorrected"), + BITSET(0, SOME, "No match") /* always matches. keep at end */ +}; + +int mce_severity(struct mce *a, int tolerant, char **msg) +{ + struct severity *s; + for (s = severities;; s++) { + if ((a->status & s->mask) != s->result) + continue; + if ((a->mcgstatus & s->mcgmask) != s->mcgres) + continue; + if (s->sev > MCE_NO_SEVERITY && (a->status & MCI_STATUS_UC) && + tolerant < 1) + return MCE_PANIC_SEVERITY; + if (msg) + *msg = s->msg; + return s->sev; + } +} -- cgit v1.2.3 From bd19a5e6b73df276e1ccedf9059e9ee70c372d7d Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:55 +0200 Subject: x86, mce: check early in exception handler if panic is needed The exception handler should behave differently if the exception is fatal versus one that can be returned from. In the first case it should never clear any registers because these need to be preserved for logging after the next boot. Otherwise it should clear them on each CPU step by step so that other CPUs sharing the same bank don't see duplicate events. Otherwise we risk reporting events multiple times on any CPUs which have shared machine check banks, which is a common problem on Intel Nehalem which has both SMT (two CPU threads sharing banks) and shared machine check banks in the uncore. Determine early in a special pass if any event requires a panic. This uses the mce_severity() function added earlier. This is needed for the next patch. Also fixes a problem together with an earlier patch that corrected events weren't logged on a fatal MCE. [ Impact: Feature ] Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 38 +++++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 6773610061d8..5031814ac943 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -36,6 +36,7 @@ #include #include +#include "mce-internal.h" #include "mce.h" /* Handle unconfigured int18 (should never happen) */ @@ -191,7 +192,7 @@ static void print_mce(struct mce *m) "and contact your hardware vendor\n"); } -static void mce_panic(char *msg, struct mce *final) +static void mce_panic(char *msg, struct mce *final, char *exp) { int i; @@ -214,6 +215,8 @@ static void mce_panic(char *msg, struct mce *final) } if (final) print_mce(final); + if (exp) + printk(KERN_EMERG "Machine check: %s\n", exp); panic(msg); } @@ -357,6 +360,22 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) } EXPORT_SYMBOL_GPL(machine_check_poll); +/* + * Do a quick check if any of the events requires a panic. + * This decides if we keep the events around or clear them. + */ +static int mce_no_way_out(struct mce *m, char **msg) +{ + int i; + + for (i = 0; i < banks; i++) { + m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); + if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY) + return 1; + } + return 0; +} + /* * The actual machine check handler. This only handles real * exceptions when something got corrupted coming in through int 18. @@ -381,6 +400,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) */ int kill_it = 0; DECLARE_BITMAP(toclear, MAX_NR_BANKS); + char *msg = "Unknown"; atomic_inc(&mce_entry); @@ -395,10 +415,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) mce_setup(&m); m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); - - /* if the restart IP is not valid, we're done for */ - if (!(m.mcgstatus & MCG_STATUS_RIPV)) - no_way_out = 1; + no_way_out = mce_no_way_out(&m, &msg); barrier(); @@ -430,18 +447,13 @@ void do_machine_check(struct pt_regs *regs, long error_code) __set_bit(i, toclear); if (m.status & MCI_STATUS_EN) { - /* if PCC was set, there's no way out */ - no_way_out |= !!(m.status & MCI_STATUS_PCC); /* * If this error was uncorrectable and there was * an overflow, we're in trouble. If no overflow, * we might get away with just killing a task. */ - if (m.status & MCI_STATUS_UC) { - if (tolerant < 1 || m.status & MCI_STATUS_OVER) - no_way_out = 1; + if (m.status & MCI_STATUS_UC) kill_it = 1; - } } else { /* * Machine check event was not enabled. Clear, but @@ -483,7 +495,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) * has not set tolerant to an insane level, give up and die. */ if (no_way_out && tolerant < 3) - mce_panic("Machine check", &panicm); + mce_panic("Machine check", &panicm, msg); /* * If the error seems to be unrecoverable, something should be @@ -511,7 +523,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) if (user_space) { force_sig(SIGBUS, current); } else if (panic_on_oops || tolerant < 2) { - mce_panic("Uncorrected machine check", &panicm); + mce_panic("Uncorrected machine check", &panicm, msg); } } -- cgit v1.2.3 From ccc3c3192ae78dd56dcdf5353fd1a9ef5f9a3e2b Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:54 +0200 Subject: x86, mce: implement bootstrapping for machine check wakeups Machine checks support waking up the mcelog daemon quickly. The original wake up code for this was pretty ugly, relying on a idle notifier and a special process flag. The reason it did it this way is that the machine check handler is not subject to normal interrupt locking rules so it's not safe to call wake_up(). Instead it set a process flag and then either did the wakeup in the syscall return or in the idle notifier. This patch adds a new "bootstraping" method as replacement. The idea is that the handler checks if it's in a state where it is unsafe to call wake_up(). If it's safe it calls it directly. When it's not safe -- that is it interrupted in a critical section with interrupts disables -- it uses a new "self IPI" to trigger an IPI to its own CPU. This can be done safely because IPI triggers are atomic with some care. The IPI is raised once the interrupts are reenabled and can then safely call wake_up(). When APICs are disabled the event is just queued and will be picked up eventually by the next polling timer. I think that's a reasonable compromise, since it should only happen quite rarely. Contains fixes from Ying Huang. [ solve conflict on irqinit, make it work on 32bit (entry_arch.h) - HS ] Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/entry_arch.h | 4 +++ arch/x86/include/asm/hw_irq.h | 1 + arch/x86/include/asm/irq_vectors.h | 5 ++++ arch/x86/kernel/cpu/mcheck/mce.c | 54 ++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/entry_64.S | 5 ++++ arch/x86/kernel/irqinit.c | 3 +++ 6 files changed, 72 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index b2eb9c066843..4cdcf5a3c96b 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -60,4 +60,8 @@ BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR) BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR) #endif +#ifdef CONFIG_X86_NEW_MCE +BUILD_INTERRUPT(mce_self_interrupt,MCE_SELF_VECTOR) +#endif + #endif diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index a7d14bbae110..4e59197e29ba 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -32,6 +32,7 @@ extern void error_interrupt(void); extern void spurious_interrupt(void); extern void thermal_interrupt(void); extern void reschedule_interrupt(void); +extern void mce_self_interrupt(void); extern void invalidate_interrupt(void); extern void invalidate_interrupt0(void); diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 8c46b851296a..68f7cf84a333 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -117,6 +117,11 @@ */ #define GENERIC_INTERRUPT_VECTOR 0xed +/* + * Self IPI vector for machine checks + */ +#define MCE_SELF_VECTOR 0xeb + /* * First APIC vector available to drivers: (vectors 0x30-0xee) we * start at 0x31(0x41) to spread out vectors evenly between priority diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 5031814ac943..121781627858 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -32,7 +33,10 @@ #include #include +#include +#include #include +#include #include #include @@ -287,6 +291,54 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) } } +#ifdef CONFIG_X86_LOCAL_APIC +/* + * Called after interrupts have been reenabled again + * when a MCE happened during an interrupts off region + * in the kernel. + */ +asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs) +{ + ack_APIC_irq(); + exit_idle(); + irq_enter(); + mce_notify_user(); + irq_exit(); +} +#endif + +static void mce_report_event(struct pt_regs *regs) +{ + if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) { + mce_notify_user(); + return; + } + +#ifdef CONFIG_X86_LOCAL_APIC + /* + * Without APIC do not notify. The event will be picked + * up eventually. + */ + if (!cpu_has_apic) + return; + + /* + * When interrupts are disabled we cannot use + * kernel services safely. Trigger an self interrupt + * through the APIC to instead do the notification + * after interrupts are reenabled again. + */ + apic->send_IPI_self(MCE_SELF_VECTOR); + + /* + * Wait for idle afterwards again so that we don't leave the + * APIC in a non idle state because the normal APIC writes + * cannot exclude us. + */ + apic_wait_icr_idle(); +#endif +} + DEFINE_PER_CPU(unsigned, mce_poll_count); /* @@ -530,6 +582,8 @@ void do_machine_check(struct pt_regs *regs, long error_code) /* notify userspace ASAP */ set_thread_flag(TIF_MCE_NOTIFY); + mce_report_event(regs); + /* the last thing we do is clear state */ for (i = 0; i < banks; i++) { if (test_bit(i, toclear)) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index a31a7f29cffe..711c130a8411 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1011,6 +1011,11 @@ apicinterrupt THRESHOLD_APIC_VECTOR \ apicinterrupt THERMAL_APIC_VECTOR \ thermal_interrupt smp_thermal_interrupt +#ifdef CONFIG_X86_MCE +apicinterrupt MCE_SELF_VECTOR \ + mce_self_interrupt smp_mce_self_interrupt +#endif + #ifdef CONFIG_SMP apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \ call_function_single_interrupt smp_call_function_single_interrupt diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index aab3d277766c..441f6ec6e9d4 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -187,6 +187,9 @@ static void __init apic_intr_init(void) #ifdef CONFIG_X86_THRESHOLD alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); #endif +#if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC) + alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt); +#endif #if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) /* self generated IPI for local APIC timer */ -- cgit v1.2.3 From f94b61c2c9fdcc90773c49df9ccf9ede3ad0d7db Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:55 +0200 Subject: x86, mce: implement panic synchronization In some circumstances multiple CPUs can enter mce_panic() in parallel. This gives quite confused output because they will all dump the same machine check buffer. The other problem is that they would all panic in parallel, but not process each other's shutdown IPIs because interrupts are disabled. Detect this situation early on in mce_panic(). On the first CPU entering will do the panic, the others will just wait to be killed. For paranoia reasons in case the other CPU dies during the MCE I added a 5 seconds timeout. If it expires each CPU will panic on its own again. Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 121781627858..421020f1d7db 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -196,10 +196,32 @@ static void print_mce(struct mce *m) "and contact your hardware vendor\n"); } +#define PANIC_TIMEOUT 5 /* 5 seconds */ + +static atomic_t mce_paniced; + +/* Panic in progress. Enable interrupts and wait for final IPI */ +static void wait_for_panic(void) +{ + long timeout = PANIC_TIMEOUT*USEC_PER_SEC; + preempt_disable(); + local_irq_enable(); + while (timeout-- > 0) + udelay(1); + panic("Panicing machine check CPU died"); +} + static void mce_panic(char *msg, struct mce *final, char *exp) { int i; + /* + * Make sure only one CPU runs in machine check panic + */ + if (atomic_add_return(1, &mce_paniced) > 1) + wait_for_panic(); + barrier(); + bust_spinlocks(1); console_verbose(); /* First print corrected ones that are still unlogged */ -- cgit v1.2.3 From 3c0797925f4ef9d55a32059d2af61a9c262e639d Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:55 +0200 Subject: x86, mce: switch x86 machine check handler to Monarch election. On Intel platforms machine check exceptions are always broadcast to all CPUs. This patch makes the machine check handler synchronize all these machine checks, elect a Monarch to handle the event and collect the worst event from all CPUs and then process it first. This has some advantages: - When there is a truly data corrupting error the system panics as quickly as possible. This improves containment of corrupted data and makes sure the corrupted data never hits stable storage. - The panics are synchronized and do not reenter the panic code on multiple CPUs (which currently does not handle this well). - All the errors are reported. Currently it often happens that another CPU happens to do the panic first, but reports useless information (empty machine check) because the real error happened on another CPU which came in later. This is a big advantage on Nehalem where the 8 threads per CPU lead to often the wrong CPU winning the race and dumping useless information on a machine check. The problem also occurs in a less severe form on older CPUs. - The system can detect when no CPUs detected a machine check and shut down the system. This can happen when one CPU is so badly hung that that it cannot process a machine check anymore or when some external agent wants to stop the system by asserting the machine check pin. This follows Intel hardware recommendations. - This matches the recommended error model by the CPU designers. - The events can be output in true severity order - When a panic happens on another CPU it makes sure to be actually be able to process the stop IPI by enabling interrupts. The code is extremly careful to handle timeouts while waiting for other CPUs. It can't rely on the normal timing mechanisms (jiffies, ktime_get) because of its asynchronous/lockless nature, so it uses own timeouts using ndelay() and a "SPINUNIT" The timeout is configurable. By default it waits for upto one second for the other CPUs. This can be also disabled. From some informal testing AMD systems do not see to broadcast machine checks, so right now it's always disabled by default on non Intel CPUs or also on very old Intel systems. Includes fixes from Ying Huang Fixed a "ecception" in a comment (H.Seto) Moved global_nwo reset later based on suggestion from H.Seto v2: Avoid duplicate messages [ Impact: feature, fixes long standing problems. ] Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- Documentation/x86/x86_64/boot-options.txt | 6 +- Documentation/x86/x86_64/machinecheck | 4 + arch/x86/kernel/cpu/mcheck/mce.c | 360 +++++++++++++++++++++++++++--- 3 files changed, 340 insertions(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt index 63fca718256e..0ee5e3b212f3 100644 --- a/Documentation/x86/x86_64/boot-options.txt +++ b/Documentation/x86/x86_64/boot-options.txt @@ -15,13 +15,17 @@ Machine check in a reboot. On Intel systems it is enabled by default. mce=nobootlog Disable boot machine check logging. - mce=tolerancelevel (number) + mce=tolerancelevel[,monarchtimeout] (number,number) + tolerance levels: 0: always panic on uncorrected errors, log corrected errors 1: panic or SIGBUS on uncorrected errors, log corrected errors 2: SIGBUS or log uncorrected errors, log corrected errors 3: never panic or SIGBUS, log all errors (for testing only) Default is 1 Can be also set using sysfs which is preferable. + monarchtimeout: + Sets the time in us to wait for other CPUs on machine checks. 0 + to disable. nomce (for compatibility with i386): same as mce=off diff --git a/Documentation/x86/x86_64/machinecheck b/Documentation/x86/x86_64/machinecheck index a4fdb25446e0..b1fb30273286 100644 --- a/Documentation/x86/x86_64/machinecheck +++ b/Documentation/x86/x86_64/machinecheck @@ -69,6 +69,10 @@ trigger Program to run when a machine check event is detected. This is an alternative to running mcelog regularly from cron and allows to detect events faster. +monarch_timeout + How long to wait for the other CPUs to machine check too on a + exception. 0 to disable waiting for other CPUs. + Unit: us TBD document entries for AMD threshold interrupt configuration diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 421020f1d7db..ba431893e31d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -28,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -60,6 +62,8 @@ int mce_disabled; #define MISC_MCELOG_MINOR 227 +#define SPINUNIT 100 /* 100ns */ + atomic_t mce_entry; DEFINE_PER_CPU(unsigned, mce_exception_count); @@ -77,6 +81,7 @@ static u64 *bank; static unsigned long notify_user; static int rip_msr; static int mce_bootlog = -1; +static int monarch_timeout = -1; static char trigger[128]; static char *trigger_argv[2] = { trigger, NULL }; @@ -84,6 +89,9 @@ static char *trigger_argv[2] = { trigger, NULL }; static unsigned long dont_init_banks; static DECLARE_WAIT_QUEUE_HEAD(mce_wait); +static DEFINE_PER_CPU(struct mce, mces_seen); +static int cpu_missing; + /* MCA banks polled by the period polling timer for corrected events */ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { @@ -241,6 +249,8 @@ static void mce_panic(char *msg, struct mce *final, char *exp) } if (final) print_mce(final); + if (cpu_missing) + printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n"); if (exp) printk(KERN_EMERG "Machine check: %s\n", exp); panic(msg); @@ -450,6 +460,264 @@ static int mce_no_way_out(struct mce *m, char **msg) return 0; } +/* + * Variable to establish order between CPUs while scanning. + * Each CPU spins initially until executing is equal its number. + */ +static atomic_t mce_executing; + +/* + * Defines order of CPUs on entry. First CPU becomes Monarch. + */ +static atomic_t mce_callin; + +/* + * Check if a timeout waiting for other CPUs happened. + */ +static int mce_timed_out(u64 *t) +{ + /* + * The others already did panic for some reason. + * Bail out like in a timeout. + * rmb() to tell the compiler that system_state + * might have been modified by someone else. + */ + rmb(); + if (atomic_read(&mce_paniced)) + wait_for_panic(); + if (!monarch_timeout) + goto out; + if ((s64)*t < SPINUNIT) { + /* CHECKME: Make panic default for 1 too? */ + if (tolerant < 1) + mce_panic("Timeout synchronizing machine check over CPUs", + NULL, NULL); + cpu_missing = 1; + return 1; + } + *t -= SPINUNIT; +out: + touch_nmi_watchdog(); + return 0; +} + +/* + * The Monarch's reign. The Monarch is the CPU who entered + * the machine check handler first. It waits for the others to + * raise the exception too and then grades them. When any + * error is fatal panic. Only then let the others continue. + * + * The other CPUs entering the MCE handler will be controlled by the + * Monarch. They are called Subjects. + * + * This way we prevent any potential data corruption in a unrecoverable case + * and also makes sure always all CPU's errors are examined. + * + * Also this detects the case of an machine check event coming from outer + * space (not detected by any CPUs) In this case some external agent wants + * us to shut down, so panic too. + * + * The other CPUs might still decide to panic if the handler happens + * in a unrecoverable place, but in this case the system is in a semi-stable + * state and won't corrupt anything by itself. It's ok to let the others + * continue for a bit first. + * + * All the spin loops have timeouts; when a timeout happens a CPU + * typically elects itself to be Monarch. + */ +static void mce_reign(void) +{ + int cpu; + struct mce *m = NULL; + int global_worst = 0; + char *msg = NULL; + char *nmsg = NULL; + + /* + * This CPU is the Monarch and the other CPUs have run + * through their handlers. + * Grade the severity of the errors of all the CPUs. + */ + for_each_possible_cpu(cpu) { + int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant, + &nmsg); + if (severity > global_worst) { + msg = nmsg; + global_worst = severity; + m = &per_cpu(mces_seen, cpu); + } + } + + /* + * Cannot recover? Panic here then. + * This dumps all the mces in the log buffer and stops the + * other CPUs. + */ + if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3) + mce_panic("Fatal machine check", m, msg); + + /* + * For UC somewhere we let the CPU who detects it handle it. + * Also must let continue the others, otherwise the handling + * CPU could deadlock on a lock. + */ + + /* + * No machine check event found. Must be some external + * source or one CPU is hung. Panic. + */ + if (!m && tolerant < 3) + mce_panic("Machine check from unknown source", NULL, NULL); + + /* + * Now clear all the mces_seen so that they don't reappear on + * the next mce. + */ + for_each_possible_cpu(cpu) + memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce)); +} + +static atomic_t global_nwo; + +/* + * Start of Monarch synchronization. This waits until all CPUs have + * entered the exception handler and then determines if any of them + * saw a fatal event that requires panic. Then it executes them + * in the entry order. + * TBD double check parallel CPU hotunplug + */ +static int mce_start(int no_way_out, int *order) +{ + int nwo; + int cpus = num_online_cpus(); + u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; + + if (!timeout) { + *order = -1; + return no_way_out; + } + + atomic_add(no_way_out, &global_nwo); + + /* + * Wait for everyone. + */ + while (atomic_read(&mce_callin) != cpus) { + if (mce_timed_out(&timeout)) { + atomic_set(&global_nwo, 0); + *order = -1; + return no_way_out; + } + ndelay(SPINUNIT); + } + + /* + * Cache the global no_way_out state. + */ + nwo = atomic_read(&global_nwo); + + /* + * Monarch starts executing now, the others wait. + */ + if (*order == 1) { + atomic_set(&mce_executing, 1); + return nwo; + } + + /* + * Now start the scanning loop one by one + * in the original callin order. + * This way when there are any shared banks it will + * be only seen by one CPU before cleared, avoiding duplicates. + */ + while (atomic_read(&mce_executing) < *order) { + if (mce_timed_out(&timeout)) { + atomic_set(&global_nwo, 0); + *order = -1; + return no_way_out; + } + ndelay(SPINUNIT); + } + return nwo; +} + +/* + * Synchronize between CPUs after main scanning loop. + * This invokes the bulk of the Monarch processing. + */ +static int mce_end(int order) +{ + int ret = -1; + u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; + + if (!timeout) + goto reset; + if (order < 0) + goto reset; + + /* + * Allow others to run. + */ + atomic_inc(&mce_executing); + + if (order == 1) { + /* CHECKME: Can this race with a parallel hotplug? */ + int cpus = num_online_cpus(); + + /* + * Monarch: Wait for everyone to go through their scanning + * loops. + */ + while (atomic_read(&mce_executing) <= cpus) { + if (mce_timed_out(&timeout)) + goto reset; + ndelay(SPINUNIT); + } + + mce_reign(); + barrier(); + ret = 0; + } else { + /* + * Subject: Wait for Monarch to finish. + */ + while (atomic_read(&mce_executing) != 0) { + if (mce_timed_out(&timeout)) + goto reset; + ndelay(SPINUNIT); + } + + /* + * Don't reset anything. That's done by the Monarch. + */ + return 0; + } + + /* + * Reset all global state. + */ +reset: + atomic_set(&global_nwo, 0); + atomic_set(&mce_callin, 0); + barrier(); + + /* + * Let others run again. + */ + atomic_set(&mce_executing, 0); + return ret; +} + +static void mce_clear_state(unsigned long *toclear) +{ + int i; + + for (i = 0; i < banks; i++) { + if (test_bit(i, toclear)) + mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + } +} + /* * The actual machine check handler. This only handles real * exceptions when something got corrupted coming in through int 18. @@ -457,12 +725,23 @@ static int mce_no_way_out(struct mce *m, char **msg) * This is executed in NMI context not subject to normal locking rules. This * implies that most kernel services cannot be safely used. Don't even * think about putting a printk in there! + * + * On Intel systems this is entered on all CPUs in parallel through + * MCE broadcast. However some CPUs might be broken beyond repair, + * so be always careful when synchronizing with others. */ void do_machine_check(struct pt_regs *regs, long error_code) { - struct mce m, panicm; - int panicm_found = 0; + struct mce m, *final; int i; + int worst = 0; + int severity; + /* + * Establish sequential order between the CPUs entering the machine + * check handler. + */ + int order; + /* * If no_way_out gets set, there is no safe way to recover from this * MCE. If tolerant is cranked up, we'll try anyway. @@ -486,13 +765,23 @@ void do_machine_check(struct pt_regs *regs, long error_code) if (!banks) goto out; + order = atomic_add_return(1, &mce_callin); mce_setup(&m); m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); no_way_out = mce_no_way_out(&m, &msg); + final = &__get_cpu_var(mces_seen); + *final = m; + barrier(); + /* + * Go through all the banks in exclusion of the other CPUs. + * This way we don't report duplicated events on shared banks + * because the first one to see it will clear it. + */ + no_way_out = mce_start(no_way_out, &order); for (i = 0; i < banks; i++) { __clear_bit(i, toclear); if (!bank[i]) @@ -544,32 +833,32 @@ void do_machine_check(struct pt_regs *regs, long error_code) mce_get_rip(&m, regs); mce_log(&m); - /* - * Did this bank cause the exception? - * - * Assume that the bank with uncorrectable errors did it, - * and that there is only a single one: - */ - if ((m.status & MCI_STATUS_UC) && - (m.status & MCI_STATUS_EN)) { - panicm = m; - panicm_found = 1; + severity = mce_severity(&m, tolerant, NULL); + if (severity > worst) { + *final = m; + worst = severity; } } + if (!no_way_out) + mce_clear_state(toclear); + /* - * If we didn't find an uncorrectable error, pick - * the last one (shouldn't happen, just being safe). + * Do most of the synchronization with other CPUs. + * When there's any problem use only local no_way_out state. */ - if (!panicm_found) - panicm = m; + if (mce_end(order) < 0) + no_way_out = worst >= MCE_PANIC_SEVERITY; /* * If we have decided that we just CAN'T continue, and the user * has not set tolerant to an insane level, give up and die. + * + * This is mainly used in the case when the system doesn't + * support MCE broadcasting or it has been disabled. */ if (no_way_out && tolerant < 3) - mce_panic("Machine check", &panicm, msg); + mce_panic("Machine check", final, msg); /* * If the error seems to be unrecoverable, something should be @@ -585,7 +874,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) * instruction which caused the MCE. */ if (m.mcgstatus & MCG_STATUS_EIPV) - user_space = panicm.ip && (panicm.cs & 3); + user_space = final->ip && (final->cs & 3); /* * If we know that the error was in user space, send a @@ -597,20 +886,15 @@ void do_machine_check(struct pt_regs *regs, long error_code) if (user_space) { force_sig(SIGBUS, current); } else if (panic_on_oops || tolerant < 2) { - mce_panic("Uncorrected machine check", &panicm, msg); + mce_panic("Uncorrected machine check", final, msg); } } /* notify userspace ASAP */ set_thread_flag(TIF_MCE_NOTIFY); - mce_report_event(regs); - - /* the last thing we do is clear state */ - for (i = 0; i < banks; i++) { - if (test_bit(i, toclear)) - mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); - } + if (worst > 0) + mce_report_event(regs); mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); out: atomic_dec(&mce_entry); @@ -821,7 +1105,17 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) if (c->x86 == 6 && c->x86_model < 0x1A) __set_bit(0, &dont_init_banks); + + /* + * All newer Intel systems support MCE broadcasting. Enable + * synchronization with a one second timeout. + */ + if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && + monarch_timeout < 0) + monarch_timeout = USEC_PER_SEC; } + if (monarch_timeout < 0) + monarch_timeout = 0; } static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) @@ -1068,7 +1362,9 @@ static struct miscdevice mce_log_device = { /* * mce=off disables machine check - * mce=TOLERANCELEVEL (number, see above) + * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above) + * monarchtimeout is how long to wait for other CPUs on machine + * check, or 0 to not wait * mce=bootlog Log MCEs from before booting. Disabled by default on AMD. * mce=nobootlog Don't log MCEs from before booting. */ @@ -1082,9 +1378,13 @@ static int __init mcheck_enable(char *str) mce_disabled = 1; else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) mce_bootlog = (str[0] == 'b'); - else if (isdigit(str[0])) + else if (isdigit(str[0])) { get_option(&str, &tolerant); - else { + if (*str == ',') { + ++str; + get_option(&str, &monarch_timeout); + } + } else { printk(KERN_INFO "mce argument %s ignored. Please use /sys\n", str); return 0; @@ -1221,6 +1521,7 @@ static ssize_t store_int_with_restart(struct sys_device *s, static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); +static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout); static struct sysdev_ext_attribute attr_check_interval = { _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int, @@ -1230,6 +1531,7 @@ static struct sysdev_ext_attribute attr_check_interval = { static struct sysdev_attribute *mce_attrs[] = { &attr_tolerant.attr, &attr_check_interval.attr, &attr_trigger, + &attr_monarch_timeout.attr, NULL }; -- cgit v1.2.3 From ac9603754dc7e286e62ae4f1067958d5b0075f99 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:58 +0200 Subject: x86, mce: make non Monarch panic message "Fatal machine check" too ... instead of "Machine check". This is for consistency with the Monarch panic message. Based on a report from Ying Huang. v2: But add a descriptive postfix so that the test suite can distingush. Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index ba431893e31d..d5cb0b4c17ff 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -554,7 +554,7 @@ static void mce_reign(void) * other CPUs. */ if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3) - mce_panic("Fatal machine check", m, msg); + mce_panic("Fatal Machine check", m, msg); /* * For UC somewhere we let the CPU who detects it handle it. @@ -858,7 +858,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) * support MCE broadcasting or it has been disabled. */ if (no_way_out && tolerant < 3) - mce_panic("Machine check", final, msg); + mce_panic("Fatal machine check on current CPU", final, msg); /* * If the error seems to be unrecoverable, something should be -- cgit v1.2.3 From 1b2797dcc9f0ad89bc382ace26c6baafbc7e33c2 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 27 May 2009 21:56:51 +0200 Subject: x86, mce: improve mce_get_rip Assume IP on the stack is valid when either EIPV or RIPV are set. This influences whether the machine check exception handler decides to return or panic. This fixes a test case in the mce-test suite and is more compliant to the specification. This currently only makes a difference in a artificial testing scenario with the mce-test test suite. Also in addition do not force the EIPV to be valid with the exact register MSRs, and keep in trust the CS value on stack even if MSR is available. [AK: combination of patches from Huang Ying and Hidetoshi Seto, with new description by me] [add some description, no code changed - HS] Signed-off-by: Huang Ying Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index d5cb0b4c17ff..a7dc369a9974 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -306,21 +306,22 @@ int mce_available(struct cpuinfo_x86 *c) return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); } +/* + * Get the address of the instruction at the time of the machine check + * error. + */ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) { - if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) { + + if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) { m->ip = regs->ip; m->cs = regs->cs; } else { m->ip = 0; m->cs = 0; } - if (rip_msr) { - /* Assume the RIP in the MSR is exact. Is this true? */ - m->mcgstatus |= MCG_STATUS_EIPV; + if (rip_msr) m->ip = mce_rdmsrl(rip_msr); - m->cs = 0; - } } #ifdef CONFIG_X86_LOCAL_APIC -- cgit v1.2.3 From 29b0f591d678838435fbb3e15ef20266f1a9e01d Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:56 +0200 Subject: x86, mce: default to panic timeout for machine checks Fatal machine checks can be logged to disk after boot, but only if the system did a warm reboot. That's unfortunately difficult with the default panic behaviour, which waits forever and the admin has to press the power button because modern systems usually miss a reset button. This clears the machine checks in the registers and make it impossible to log them. This patch changes the default for machine check panic to always reboot after 30s. Then the mce can be successfully logged after reboot. I believe this will improve machine check experience for any system running the X server. This is dependent on successfull boot logging of MCEs. This currently only works on Intel systems, on AMD there are quite a lot of systems around which leave junk in the machine check registers after boot, so it's disabled here. These systems will continue to default to endless waiting panic. v2: Only force panic timeout when it's shorter (H.Seto) v3: Only force timeout when there is no timeout (based on comment H.Seto) [ Fix changelog - HS ] Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index a7dc369a9974..79d243145b8f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -82,6 +82,7 @@ static unsigned long notify_user; static int rip_msr; static int mce_bootlog = -1; static int monarch_timeout = -1; +static int mce_panic_timeout; static char trigger[128]; static char *trigger_argv[2] = { trigger, NULL }; @@ -216,6 +217,8 @@ static void wait_for_panic(void) local_irq_enable(); while (timeout-- > 0) udelay(1); + if (panic_timeout == 0) + panic_timeout = mce_panic_timeout; panic("Panicing machine check CPU died"); } @@ -253,6 +256,8 @@ static void mce_panic(char *msg, struct mce *final, char *exp) printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n"); if (exp) printk(KERN_EMERG "Machine check: %s\n", exp); + if (panic_timeout == 0) + panic_timeout = mce_panic_timeout; panic(msg); } @@ -1117,6 +1122,8 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) } if (monarch_timeout < 0) monarch_timeout = 0; + if (mce_bootlog != 0) + mce_panic_timeout = 30; } static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) -- cgit v1.2.3 From 86503560e48153aba539ff117450d31ab2ef76d7 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:58 +0200 Subject: x86, mce: print header/footer only once for multiple MCEs When multiple MCEs are printed print the "HARDWARE ERROR" header and "This is not a software error" footer only once. This makes the output much more compact with many CPUs. Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 79d243145b8f..ff9c732989de 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -176,11 +176,13 @@ void mce_log(struct mce *mce) set_bit(0, ¬ify_user); } -static void print_mce(struct mce *m) +static void print_mce(struct mce *m, int *first) { - printk(KERN_EMERG "\n" - KERN_EMERG "HARDWARE ERROR\n" - KERN_EMERG + if (*first) { + printk(KERN_EMERG "\n" KERN_EMERG "HARDWARE ERROR\n"); + *first = 0; + } + printk(KERN_EMERG "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", m->extcpu, m->mcgstatus, m->bank, m->status); if (m->ip) { @@ -200,9 +202,12 @@ static void print_mce(struct mce *m) printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid); - printk(KERN_EMERG "This is not a software problem!\n"); - printk(KERN_EMERG "Run through mcelog --ascii to decode " - "and contact your hardware vendor\n"); +} + +static void print_mce_tail(void) +{ + printk(KERN_EMERG "This is not a software problem!\n" + KERN_EMERG "Run through mcelog --ascii to decode and contact your hardware vendor\n"); } #define PANIC_TIMEOUT 5 /* 5 seconds */ @@ -225,6 +230,7 @@ static void wait_for_panic(void) static void mce_panic(char *msg, struct mce *final, char *exp) { int i; + int first = 1; /* * Make sure only one CPU runs in machine check panic @@ -240,7 +246,7 @@ static void mce_panic(char *msg, struct mce *final, char *exp) struct mce *m = &mcelog.entry[i]; if ((m->status & MCI_STATUS_VAL) && !(m->status & MCI_STATUS_UC)) - print_mce(m); + print_mce(m, &first); } /* Now print uncorrected but with the final one last */ for (i = 0; i < MCE_LOG_LEN; i++) { @@ -248,12 +254,13 @@ static void mce_panic(char *msg, struct mce *final, char *exp) if (!(m->status & MCI_STATUS_VAL)) continue; if (!final || memcmp(m, final, sizeof(struct mce))) - print_mce(m); + print_mce(m, &first); } if (final) - print_mce(final); + print_mce(final, &first); if (cpu_missing) printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n"); + print_mce_tail(); if (exp) printk(KERN_EMERG "Machine check: %s\n", exp); if (panic_timeout == 0) -- cgit v1.2.3 From ed7290d0ee8f81aa78bfe816f01b012f208cafc5 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:57 +0200 Subject: x86, mce: implement new status bits The x86 architecture recently added some new machine check status bits: S(ignalled) and AR (Action-Required). Signalled allows to check if a specific event caused an exception or was just logged through CMCI. AR allows the kernel to decide if an event needs immediate action or can be delayed or ignored. Implement support for these new status bits. mce_severity() uses the new bits to grade the machine check correctly and decide what to do. The exception handler uses AR to decide to kill or not. The S bit is used to separate events between the poll/CMCI handler and the exception handler. Classical UC always leads to panic. That was true before anyways because the existing CPUs always passed a PCC with it. Also corrects the rules whether to kill in user or kernel context and how to handle missing RIPV. The machine check handler largely uses the mce-severity grading engine now instead of making its own decisions. This means the logic is centralized in one place. This is useful because it has to be evaluated multiple times. v2: Some rule fixes; Add AO events Fix RIPV, RIPV|EIPV order (Ying Huang) Fix UCNA with AR=1 message (Ying Huang) Add comment about panicing in m_c_p. Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 10 ++++ arch/x86/kernel/cpu/mcheck/mce-internal.h | 5 ++ arch/x86/kernel/cpu/mcheck/mce-severity.c | 82 ++++++++++++++++++++++++++++-- arch/x86/kernel/cpu/mcheck/mce.c | 84 ++++++++++++++++--------------- 4 files changed, 137 insertions(+), 44 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index ba1f8890cf51..afd3cdf6f8ad 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -15,6 +15,7 @@ #define MCG_EXT_CNT_MASK 0xff0000 /* Number of Extended registers */ #define MCG_EXT_CNT_SHIFT 16 #define MCG_EXT_CNT(c) (((c) & MCG_EXT_CNT_MASK) >> MCG_EXT_CNT_SHIFT) +#define MCG_SER_P (1ULL<<24) /* MCA recovery/new status bits */ #define MCG_STATUS_RIPV (1ULL<<0) /* restart ip valid */ #define MCG_STATUS_EIPV (1ULL<<1) /* ip points to correct instruction */ @@ -27,6 +28,15 @@ #define MCI_STATUS_MISCV (1ULL<<59) /* misc error reg. valid */ #define MCI_STATUS_ADDRV (1ULL<<58) /* addr reg. valid */ #define MCI_STATUS_PCC (1ULL<<57) /* processor context corrupt */ +#define MCI_STATUS_S (1ULL<<56) /* Signaled machine check */ +#define MCI_STATUS_AR (1ULL<<55) /* Action required */ + +/* MISC register defines */ +#define MCM_ADDR_SEGOFF 0 /* segment offset */ +#define MCM_ADDR_LINEAR 1 /* linear address */ +#define MCM_ADDR_PHYS 2 /* physical address */ +#define MCM_ADDR_MEM 3 /* memory address */ +#define MCM_ADDR_GENERIC 7 /* generic */ /* Fields are zero when not available */ struct mce { diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index f126b4ae7a25..54dcb8ff12e5 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -2,9 +2,14 @@ enum severity_level { MCE_NO_SEVERITY, + MCE_KEEP_SEVERITY, MCE_SOME_SEVERITY, + MCE_AO_SEVERITY, MCE_UC_SEVERITY, + MCE_AR_SEVERITY, MCE_PANIC_SEVERITY, }; int mce_severity(struct mce *a, int tolerant, char **msg); + +extern int mce_ser; diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index c189e89a89ae..4f4d2caf4043 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -19,43 +19,117 @@ * first. Since there are quite a lot of combinations test the bits in a * table-driven way. The rules are simply processed in order, first * match wins. + * + * Note this is only used for machine check exceptions, the corrected + * errors use much simpler rules. The exceptions still check for the corrected + * errors, but only to leave them alone for the CMCI handler (except for + * panic situations) */ +enum context { IN_KERNEL = 1, IN_USER = 2 }; +enum ser { SER_REQUIRED = 1, NO_SER = 2 }; + static struct severity { u64 mask; u64 result; unsigned char sev; unsigned char mcgmask; unsigned char mcgres; + unsigned char ser; + unsigned char context; char *msg; } severities[] = { +#define KERNEL .context = IN_KERNEL +#define USER .context = IN_USER +#define SER .ser = SER_REQUIRED +#define NOSER .ser = NO_SER #define SEV(s) .sev = MCE_ ## s ## _SEVERITY #define BITCLR(x, s, m, r...) { .mask = x, .result = 0, SEV(s), .msg = m, ## r } #define BITSET(x, s, m, r...) { .mask = x, .result = x, SEV(s), .msg = m, ## r } #define MCGMASK(x, res, s, m, r...) \ { .mcgmask = x, .mcgres = res, SEV(s), .msg = m, ## r } +#define MASK(x, y, s, m, r...) \ + { .mask = x, .result = y, SEV(s), .msg = m, ## r } +#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S) +#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR) +#define MCACOD 0xffff + BITCLR(MCI_STATUS_VAL, NO, "Invalid"), BITCLR(MCI_STATUS_EN, NO, "Not enabled"), BITSET(MCI_STATUS_PCC, PANIC, "Processor context corrupt"), - MCGMASK(MCG_STATUS_RIPV, 0, PANIC, "No restart IP"), + /* When MCIP is not set something is very confused */ + MCGMASK(MCG_STATUS_MCIP, 0, PANIC, "MCIP not set in MCA handler"), + /* Neither return not error IP -- no chance to recover -> PANIC */ + MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0, PANIC, + "Neither restart nor error IP"), + MCGMASK(MCG_STATUS_RIPV, 0, PANIC, "In kernel and no restart IP", + KERNEL), + BITCLR(MCI_STATUS_UC, KEEP, "Corrected error", NOSER), + MASK(MCI_STATUS_OVER|MCI_STATUS_UC|MCI_STATUS_EN, MCI_STATUS_UC, SOME, + "Spurious not enabled", SER), + + /* ignore OVER for UCNA */ + MASK(MCI_UC_SAR, MCI_STATUS_UC, KEEP, + "Uncorrected no action required", SER), + MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR, PANIC, + "Illegal combination (UCNA with AR=1)", SER), + MASK(MCI_STATUS_S, 0, KEEP, "Non signalled machine check", SER), + + /* AR add known MCACODs here */ + MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_SAR, PANIC, + "Action required with lost events", SER), + MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_SAR, PANIC, + "Action required; unknown MCACOD", SER), + + /* known AO MCACODs: */ + MASK(MCI_UC_SAR|MCI_STATUS_OVER|0xfff0, MCI_UC_S|0xc0, AO, + "Action optional: memory scrubbing error", SER), + MASK(MCI_UC_SAR|MCI_STATUS_OVER|MCACOD, MCI_UC_S|0x17a, AO, + "Action optional: last level cache writeback error", SER), + + MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S, SOME, + "Action optional unknown MCACOD", SER), + MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S|MCI_STATUS_OVER, SOME, + "Action optional with lost events", SER), BITSET(MCI_STATUS_UC|MCI_STATUS_OVER, PANIC, "Overflowed uncorrected"), BITSET(MCI_STATUS_UC, UC, "Uncorrected"), BITSET(0, SOME, "No match") /* always matches. keep at end */ }; +/* + * If the EIPV bit is set, it means the saved IP is the + * instruction which caused the MCE. + */ +static int error_context(struct mce *m) +{ + if (m->mcgstatus & MCG_STATUS_EIPV) + return (m->ip && (m->cs & 3) == 3) ? IN_USER : IN_KERNEL; + /* Unknown, assume kernel */ + return IN_KERNEL; +} + int mce_severity(struct mce *a, int tolerant, char **msg) { + enum context ctx = error_context(a); struct severity *s; + for (s = severities;; s++) { if ((a->status & s->mask) != s->result) continue; if ((a->mcgstatus & s->mcgmask) != s->mcgres) continue; - if (s->sev > MCE_NO_SEVERITY && (a->status & MCI_STATUS_UC) && - tolerant < 1) - return MCE_PANIC_SEVERITY; + if (s->ser == SER_REQUIRED && !mce_ser) + continue; + if (s->ser == NO_SER && mce_ser) + continue; + if (s->context && ctx != s->context) + continue; if (msg) *msg = s->msg; + if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) { + if (panic_on_oops || tolerant < 1) + return MCE_PANIC_SEVERITY; + } return s->sev; } } diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index ff9c732989de..f051a7807ab4 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -83,6 +83,7 @@ static int rip_msr; static int mce_bootlog = -1; static int monarch_timeout = -1; static int mce_panic_timeout; +int mce_ser; static char trigger[128]; static char *trigger_argv[2] = { trigger, NULL }; @@ -391,6 +392,15 @@ DEFINE_PER_CPU(unsigned, mce_poll_count); * Those are just logged through /dev/mcelog. * * This is executed in standard interrupt context. + * + * Note: spec recommends to panic for fatal unsignalled + * errors here. However this would be quite problematic -- + * we would need to reimplement the Monarch handling and + * it would mess up the exclusion between exception handler + * and poll hander -- * so we skip this for now. + * These cases should not happen anyways, or only when the CPU + * is already totally * confused. In this case it's likely it will + * not fully execute the machine check handler either. */ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) { @@ -417,13 +427,13 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) continue; /* - * Uncorrected events are handled by the exception handler - * when it is enabled. But when the exception is disabled log - * everything. + * Uncorrected or signalled events are handled by the exception + * handler when it is enabled, so don't process those here. * * TBD do the same check for MCI_STATUS_EN here? */ - if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC)) + if (!(flags & MCP_UC) && + (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC))) continue; if (m.status & MCI_STATUS_MISCV) @@ -789,6 +799,12 @@ void do_machine_check(struct pt_regs *regs, long error_code) barrier(); + /* + * When no restart IP must always kill or panic. + */ + if (!(m.mcgstatus & MCG_STATUS_RIPV)) + kill_it = 1; + /* * Go through all the banks in exclusion of the other CPUs. * This way we don't report duplicated events on shared banks @@ -809,10 +825,11 @@ void do_machine_check(struct pt_regs *regs, long error_code) continue; /* - * Non uncorrected errors are handled by machine_check_poll - * Leave them alone, unless this panics. + * Non uncorrected or non signaled errors are handled by + * machine_check_poll. Leave them alone, unless this panics. */ - if ((m.status & MCI_STATUS_UC) == 0 && !no_way_out) + if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) && + !no_way_out) continue; /* @@ -820,17 +837,16 @@ void do_machine_check(struct pt_regs *regs, long error_code) */ add_taint(TAINT_MACHINE_CHECK); - __set_bit(i, toclear); + severity = mce_severity(&m, tolerant, NULL); - if (m.status & MCI_STATUS_EN) { - /* - * If this error was uncorrectable and there was - * an overflow, we're in trouble. If no overflow, - * we might get away with just killing a task. - */ - if (m.status & MCI_STATUS_UC) - kill_it = 1; - } else { + /* + * When machine check was for corrected handler don't touch, + * unless we're panicing. + */ + if (severity == MCE_KEEP_SEVERITY && !no_way_out) + continue; + __set_bit(i, toclear); + if (severity == MCE_NO_SEVERITY) { /* * Machine check event was not enabled. Clear, but * ignore. @@ -838,6 +854,12 @@ void do_machine_check(struct pt_regs *regs, long error_code) continue; } + /* + * Kill on action required. + */ + if (severity == MCE_AR_SEVERITY) + kill_it = 1; + if (m.status & MCI_STATUS_MISCV) m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); if (m.status & MCI_STATUS_ADDRV) @@ -846,7 +868,6 @@ void do_machine_check(struct pt_regs *regs, long error_code) mce_get_rip(&m, regs); mce_log(&m); - severity = mce_severity(&m, tolerant, NULL); if (severity > worst) { *final = m; worst = severity; @@ -879,29 +900,9 @@ void do_machine_check(struct pt_regs *regs, long error_code) * one task, do that. If the user has set the tolerance very * high, don't try to do anything at all. */ - if (kill_it && tolerant < 3) { - int user_space = 0; - - /* - * If the EIPV bit is set, it means the saved IP is the - * instruction which caused the MCE. - */ - if (m.mcgstatus & MCG_STATUS_EIPV) - user_space = final->ip && (final->cs & 3); - /* - * If we know that the error was in user space, send a - * SIGBUS. Otherwise, panic if tolerance is low. - * - * force_sig() takes an awful lot of locks and has a slight - * risk of deadlocking. - */ - if (user_space) { - force_sig(SIGBUS, current); - } else if (panic_on_oops || tolerant < 2) { - mce_panic("Uncorrected machine check", final, msg); - } - } + if (kill_it && tolerant < 3) + force_sig(SIGBUS, current); /* notify userspace ASAP */ set_thread_flag(TIF_MCE_NOTIFY); @@ -1049,6 +1050,9 @@ static int mce_cap_init(void) if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) rip_msr = MSR_IA32_MCG_EIP; + if (cap & MCG_SER_P) + mce_ser = 1; + return 0; } -- cgit v1.2.3 From 4611a6fa4b37cf6b8b6066ed0d605c994c62a1a0 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 27 May 2009 21:56:57 +0200 Subject: x86, mce: export MCE severities coverage via debugfs The MCE severity judgement code is data-driven, so code coverage tools such as gcov can not be used for measuring coverage. Instead a dedicated coverage mechanism is implemented. The kernel keeps track of rules executed and reports them in debugfs. This is useful for increasing coverage of the mce-test testsuite. Right now it's unconditionally enabled because it's very little code. Signed-off-by: Huang Ying Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce-severity.c | 83 +++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 4f4d2caf4043..ff0807f97056 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -10,6 +10,9 @@ * Author: Andi Kleen */ #include +#include +#include +#include #include #include "mce-internal.h" @@ -37,6 +40,7 @@ static struct severity { unsigned char mcgres; unsigned char ser; unsigned char context; + unsigned char covered; char *msg; } severities[] = { #define KERNEL .context = IN_KERNEL @@ -126,6 +130,7 @@ int mce_severity(struct mce *a, int tolerant, char **msg) continue; if (msg) *msg = s->msg; + s->covered = 1; if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) { if (panic_on_oops || tolerant < 1) return MCE_PANIC_SEVERITY; @@ -133,3 +138,81 @@ int mce_severity(struct mce *a, int tolerant, char **msg) return s->sev; } } + +static void *s_start(struct seq_file *f, loff_t *pos) +{ + if (*pos >= ARRAY_SIZE(severities)) + return NULL; + return &severities[*pos]; +} + +static void *s_next(struct seq_file *f, void *data, loff_t *pos) +{ + if (++(*pos) >= ARRAY_SIZE(severities)) + return NULL; + return &severities[*pos]; +} + +static void s_stop(struct seq_file *f, void *data) +{ +} + +static int s_show(struct seq_file *f, void *data) +{ + struct severity *ser = data; + seq_printf(f, "%d\t%s\n", ser->covered, ser->msg); + return 0; +} + +static const struct seq_operations severities_seq_ops = { + .start = s_start, + .next = s_next, + .stop = s_stop, + .show = s_show, +}; + +static int severities_coverage_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &severities_seq_ops); +} + +static ssize_t severities_coverage_write(struct file *file, + const char __user *ubuf, + size_t count, loff_t *ppos) +{ + int i; + for (i = 0; i < ARRAY_SIZE(severities); i++) + severities[i].covered = 0; + return count; +} + +static const struct file_operations severities_coverage_fops = { + .open = severities_coverage_open, + .release = seq_release, + .read = seq_read, + .write = severities_coverage_write, +}; + +static int __init severities_debugfs_init(void) +{ + struct dentry *dmce = NULL, *fseverities_coverage = NULL; + + dmce = debugfs_create_dir("mce", NULL); + if (dmce == NULL) + goto err_out; + fseverities_coverage = debugfs_create_file("severities-coverage", + 0444, dmce, NULL, + &severities_coverage_fops); + if (fseverities_coverage == NULL) + goto err_out; + + return 0; + +err_out: + if (fseverities_coverage) + debugfs_remove(fseverities_coverage); + if (dmce) + debugfs_remove(dmce); + return -ENOMEM; +} +late_initcall(severities_debugfs_init); -- cgit v1.2.3 From 4ef702c10b5df18ab04921fc252c26421d4d6c75 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:52 +0200 Subject: x86: fix panic with interrupts off (needed for MCE) For some time each panic() called with interrupts disabled triggered the !irqs_disabled() WARN_ON in smp_call_function(), producing ugly backtraces and confusing users. This is a common situation with machine checks for example which tend to call panic with interrupts disabled, but will also hit in other situations e.g. panic during early boot. In fact it means that panic cannot be called in many circumstances, which would be bad. This all started with the new fancy queued smp_call_function, which is then used by the shutdown path to shut down the other CPUs. On closer examination it turned out that the fancy RCU smp_call_function() does lots of things not suitable in a panic situation anyways, like allocating memory and relying on complex system state. I originally tried to patch this over by checking for panic there, but it was quite complicated and the original patch was also not very popular. This also didn't fix some of the underlying complexity problems. The new code in post 2.6.29 tries to patch around this by checking for oops_in_progress, but that is not enough to make this fully safe and I don't think that's a real solution because panic has to be reliable. So instead use an own vector to reboot. This makes the reboot code extremly straight forward, which is definitely a big plus in a panic situation where it is important to avoid relying on too much kernel state. The new simple code is also safe to be called from interupts off region because it is very very simple. There can be situations where it is important that panic is reliable. For example on a fatal machine check the panic is needed to get the system up again and running as quickly as possible. So it's important that panic is reliable and all function it calls simple. This is why I came up with this simple vector scheme. It's very hard to beat in simplicity. Vectors are not particularly precious anymore since all big systems are using per CPU vectors. Another possibility would have been to use an NMI similar to kdump, but there is still the problem that NMIs don't work reliably on some systems due to BIOS issues. NMIs would have been able to stop CPUs running with interrupts off too. In the sake of universal reliability I opted for using a non NMI vector for now. I put the reboot vector into the highest priority bucket of the APIC vectors and moved the 64bit UV_BAU message down instead into the next lower priority. [ Impact: bug fix, fixes an old regression ] Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/entry_arch.h | 1 + arch/x86/include/asm/hw_irq.h | 1 + arch/x86/include/asm/irq_vectors.h | 9 +++------ arch/x86/kernel/entry_64.S | 2 ++ arch/x86/kernel/irqinit.c | 3 +++ arch/x86/kernel/smp.c | 28 +++++++++++++++++++++++++++- 6 files changed, 37 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index 4cdcf5a3c96b..69f886805ecb 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -14,6 +14,7 @@ BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR) BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR) BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR) BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR) +BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR) BUILD_INTERRUPT3(invalidate_interrupt0,INVALIDATE_TLB_VECTOR_START+0, smp_invalidate_interrupt) diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 4e59197e29ba..1c8f28a63058 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -45,6 +45,7 @@ extern void invalidate_interrupt6(void); extern void invalidate_interrupt7(void); extern void irq_move_cleanup_interrupt(void); +extern void reboot_interrupt(void); extern void threshold_interrupt(void); extern void call_function_interrupt(void); diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 68f7cf84a333..28477e4f2d49 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -88,12 +88,7 @@ #define CALL_FUNCTION_SINGLE_VECTOR 0xfb #define THERMAL_APIC_VECTOR 0xfa #define THRESHOLD_APIC_VECTOR 0xf9 - -#ifdef CONFIG_X86_32 -/* 0xf8 : free */ -#else -# define UV_BAU_MESSAGE 0xf8 -#endif +#define REBOOT_VECTOR 0xf8 /* f0-f7 used for spreading out TLB flushes: */ #define INVALIDATE_TLB_VECTOR_END 0xf7 @@ -117,6 +112,8 @@ */ #define GENERIC_INTERRUPT_VECTOR 0xed +#define UV_BAU_MESSAGE 0xec + /* * Self IPI vector for machine checks */ diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 711c130a8411..4234b1235652 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -976,6 +976,8 @@ END(\sym) #ifdef CONFIG_SMP apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \ irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt +apicinterrupt REBOOT_VECTOR \ + reboot_interrupt smp_reboot_interrupt #endif #ifdef CONFIG_X86_UV diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 441f6ec6e9d4..4a69ec55be3d 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -173,6 +173,9 @@ static void __init smp_intr_init(void) /* Low priority IPI to cleanup after moving an irq */ set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); + + /* IPI used for rebooting/stopping */ + alloc_intr_gate(REBOOT_VECTOR, reboot_interrupt); #endif #endif /* CONFIG_SMP */ } diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index f6db48c405b8..bf1831aa14fa 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -150,14 +150,40 @@ void native_send_call_func_ipi(const struct cpumask *mask) * this function calls the 'stop' function on all other CPUs in the system. */ +asmlinkage void smp_reboot_interrupt(void) +{ + ack_APIC_irq(); + irq_enter(); + stop_this_cpu(NULL); + irq_exit(); +} + static void native_smp_send_stop(void) { unsigned long flags; + unsigned long wait; if (reboot_force) return; - smp_call_function(stop_this_cpu, NULL, 0); + /* + * Use an own vector here because smp_call_function + * does lots of things not suitable in a panic situation. + * On most systems we could also use an NMI here, + * but there are a few systems around where NMI + * is problematic so stay with an non NMI for now + * (this implies we cannot stop CPUs spinning with irq off + * currently) + */ + if (num_online_cpus() > 1) { + apic->send_IPI_allbutself(REBOOT_VECTOR); + + /* Don't wait longer than a second */ + wait = USEC_PER_SEC; + while (num_online_cpus() > 1 && wait--) + udelay(1); + } + local_irq_save(flags); disable_local_APIC(); local_irq_restore(flags); -- cgit v1.2.3 From 9ff36ee9668ff41ec3274597c730524645929b0f Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:58 +0200 Subject: x86, mce: rename mce_notify_user to mce_notify_irq Rename the mce_notify_user function to mce_notify_irq. The next patch will split the wakeup handling of interrupt context and of process context and it's better to give it a clearer name for this. Contains a fix from Ying Huang [ Impact: cleanup ] Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Cc: Huang Ying Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 2 +- arch/x86/kernel/cpu/mcheck/mce-inject.c | 2 +- arch/x86/kernel/cpu/mcheck/mce.c | 10 +++++----- arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 2 +- arch/x86/kernel/signal.c | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index afd3cdf6f8ad..713926b62cbb 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -159,7 +159,7 @@ enum mcp_flags { }; void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); -int mce_notify_user(void); +int mce_notify_irq(void); DECLARE_PER_CPU(struct mce, injectm); extern struct file_operations mce_chrdev_ops; diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 7d858fb4ce67..a3a235a53f09 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -65,7 +65,7 @@ static void raise_mce(unsigned long data) memset(&b, 0xff, sizeof(mce_banks_t)); printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu); machine_check_poll(0, &b); - mce_notify_user(); + mce_notify_irq(); printk(KERN_INFO "Finished machine check poll on CPU %d\n", cpu); } diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index f051a7807ab4..13e1b7ffe73a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -348,7 +348,7 @@ asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs) ack_APIC_irq(); exit_idle(); irq_enter(); - mce_notify_user(); + mce_notify_irq(); irq_exit(); } #endif @@ -356,7 +356,7 @@ asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs) static void mce_report_event(struct pt_regs *regs) { if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) { - mce_notify_user(); + mce_notify_irq(); return; } @@ -968,7 +968,7 @@ static void mcheck_timer(unsigned long data) * polling interval, otherwise increase the polling interval. */ n = &__get_cpu_var(next_interval); - if (mce_notify_user()) + if (mce_notify_irq()) *n = max(*n/2, HZ/100); else *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); @@ -989,7 +989,7 @@ static DECLARE_WORK(mce_trigger_work, mce_do_trigger); * Can be called from interrupt context, but not from machine check/NMI * context. */ -int mce_notify_user(void) +int mce_notify_irq(void) { /* Not more than two messages every minute */ static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); @@ -1014,7 +1014,7 @@ int mce_notify_user(void) } return 0; } -EXPORT_SYMBOL_GPL(mce_notify_user); +EXPORT_SYMBOL_GPL(mce_notify_irq); /* * Initialize Machine Checks for a CPU. diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index eff3740501a3..b7c5a2470b40 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -80,7 +80,7 @@ static int cmci_supported(int *banks) static void intel_threshold_interrupt(void) { machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); - mce_notify_user(); + mce_notify_irq(); } static void print_update(char *type, int *hdr, int num) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index d0851e3f77eb..d5dc15bce005 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -860,7 +860,7 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) #ifdef CONFIG_X86_NEW_MCE /* notify userspace of pending MCEs */ if (thread_info_flags & _TIF_MCE_NOTIFY) - mce_notify_user(); + mce_notify_irq(); #endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ /* deal with pending signal delivery */ -- cgit v1.2.3 From 8fa8dd9e3aafb7b440b7d54219891615abc6390e Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:58 +0200 Subject: x86, mce: define MCE_VECTOR Add MCE_VECTOR for the #MC exception. Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/irq_vectors.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 28477e4f2d49..1b35c4357ea8 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -25,6 +25,7 @@ */ #define NMI_VECTOR 0x02 +#define MCE_VECTOR 0x12 /* * IDT vectors usable for external interrupt sources start -- cgit v1.2.3 From 9b1beaf2b551a8a1604f104025b24e9c535c8963 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:59 +0200 Subject: x86, mce: support action-optional machine checks Newer Intel CPUs support a new class of machine checks called recoverable action optional. Action Optional means that the CPU detected some form of corruption in the background and tells the OS about using a machine check exception. The OS can then take appropiate action, like killing the process with the corrupted data or logging the event properly to disk. This is done by the new generic high level memory failure handler added in a earlier patch. The high level handler takes the address with the failed memory and does the appropiate action, like killing the process. In this version of the patch the high level handler is stubbed out with a weak function to not create a direct dependency on the hwpoison branch. The high level handler cannot be directly called from the machine check exception though, because it has to run in a defined process context to be able to sleep when taking VM locks (it is not expected to sleep for a long time, just do so in some exceptional cases like lock contention) Thus the MCE handler has to queue a work item for process context, trigger process context and then call the high level handler from there. This patch adds two path to process context: through a per thread kernel exit notify_user() callback or through a high priority work item. The first runs when the process exits back to user space, the other when it goes to sleep and there is no higher priority process. The machine check handler will schedule both, and whoever runs first will grab the event. This is done because quick reaction to this event is critical to avoid a potential more fatal machine check when the corruption is consumed. There is a simple lock less ring buffer to queue the corrupted addresses between the exception handler and the process context handler. Then in process context it just calls the high level VM code with the corrupted PFNs. The code adds the required code to extract the failed address from the CPU's machine check registers. It doesn't try to handle all possible cases -- the specification has 6 different ways to specify memory address -- but only the linear address. Most of the required checking has been already done earlier in the mce_severity rule checking engine. Following the Intel recommendations Action Optional errors are only enabled for known situations (encoded in MCACODs). The errors are ignored otherwise, because they are action optional. v2: Improve comment, disable preemption while processing ring buffer (reported by Ying Huang) Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 1 + arch/x86/kernel/cpu/mcheck/mce.c | 133 +++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/signal.c | 2 +- 3 files changed, 135 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 713926b62cbb..82978ad12072 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -160,6 +160,7 @@ enum mcp_flags { void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); int mce_notify_irq(void); +void mce_notify_process(void); DECLARE_PER_CPU(struct mce, injectm); extern struct file_operations mce_chrdev_ops; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 13e1b7ffe73a..d4e7b5947a0e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -105,6 +106,8 @@ static inline int skip_bank_init(int i) return i < BITS_PER_LONG && test_bit(i, &dont_init_banks); } +static DEFINE_PER_CPU(struct work_struct, mce_work); + /* Do initial initialization of a struct mce */ void mce_setup(struct mce *m) { @@ -312,6 +315,61 @@ static void mce_wrmsrl(u32 msr, u64 v) wrmsrl(msr, v); } +/* + * Simple lockless ring to communicate PFNs from the exception handler with the + * process context work function. This is vastly simplified because there's + * only a single reader and a single writer. + */ +#define MCE_RING_SIZE 16 /* we use one entry less */ + +struct mce_ring { + unsigned short start; + unsigned short end; + unsigned long ring[MCE_RING_SIZE]; +}; +static DEFINE_PER_CPU(struct mce_ring, mce_ring); + +/* Runs with CPU affinity in workqueue */ +static int mce_ring_empty(void) +{ + struct mce_ring *r = &__get_cpu_var(mce_ring); + + return r->start == r->end; +} + +static int mce_ring_get(unsigned long *pfn) +{ + struct mce_ring *r; + int ret = 0; + + *pfn = 0; + get_cpu(); + r = &__get_cpu_var(mce_ring); + if (r->start == r->end) + goto out; + *pfn = r->ring[r->start]; + r->start = (r->start + 1) % MCE_RING_SIZE; + ret = 1; +out: + put_cpu(); + return ret; +} + +/* Always runs in MCE context with preempt off */ +static int mce_ring_add(unsigned long pfn) +{ + struct mce_ring *r = &__get_cpu_var(mce_ring); + unsigned next; + + next = (r->end + 1) % MCE_RING_SIZE; + if (next == r->start) + return -1; + r->ring[r->end] = pfn; + wmb(); + r->end = next; + return 0; +} + int mce_available(struct cpuinfo_x86 *c) { if (mce_disabled) @@ -319,6 +377,15 @@ int mce_available(struct cpuinfo_x86 *c) return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); } +static void mce_schedule_work(void) +{ + if (!mce_ring_empty()) { + struct work_struct *work = &__get_cpu_var(mce_work); + if (!work_pending(work)) + schedule_work(work); + } +} + /* * Get the address of the instruction at the time of the machine check * error. @@ -349,6 +416,7 @@ asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs) exit_idle(); irq_enter(); mce_notify_irq(); + mce_schedule_work(); irq_exit(); } #endif @@ -357,6 +425,13 @@ static void mce_report_event(struct pt_regs *regs) { if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) { mce_notify_irq(); + /* + * Triggering the work queue here is just an insurance + * policy in case the syscall exit notify handler + * doesn't run soon enough or ends up running on the + * wrong CPU (can happen when audit sleeps) + */ + mce_schedule_work(); return; } @@ -731,6 +806,23 @@ reset: return ret; } +/* + * Check if the address reported by the CPU is in a format we can parse. + * It would be possible to add code for most other cases, but all would + * be somewhat complicated (e.g. segment offset would require an instruction + * parser). So only support physical addresses upto page granuality for now. + */ +static int mce_usable_address(struct mce *m) +{ + if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV)) + return 0; + if ((m->misc & 0x3f) > PAGE_SHIFT) + return 0; + if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS) + return 0; + return 1; +} + static void mce_clear_state(unsigned long *toclear) { int i; @@ -865,6 +957,16 @@ void do_machine_check(struct pt_regs *regs, long error_code) if (m.status & MCI_STATUS_ADDRV) m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); + /* + * Action optional error. Queue address for later processing. + * When the ring overflows we just ignore the AO error. + * RED-PEN add some logging mechanism when + * usable_address or mce_add_ring fails. + * RED-PEN don't ignore overflow for tolerant == 0 + */ + if (severity == MCE_AO_SEVERITY && mce_usable_address(&m)) + mce_ring_add(m.addr >> PAGE_SHIFT); + mce_get_rip(&m, regs); mce_log(&m); @@ -916,6 +1018,36 @@ out: } EXPORT_SYMBOL_GPL(do_machine_check); +/* dummy to break dependency. actual code is in mm/memory-failure.c */ +void __attribute__((weak)) memory_failure(unsigned long pfn, int vector) +{ + printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn); +} + +/* + * Called after mce notification in process context. This code + * is allowed to sleep. Call the high level VM handler to process + * any corrupted pages. + * Assume that the work queue code only calls this one at a time + * per CPU. + * Note we don't disable preemption, so this code might run on the wrong + * CPU. In this case the event is picked up by the scheduled work queue. + * This is merely a fast path to expedite processing in some common + * cases. + */ +void mce_notify_process(void) +{ + unsigned long pfn; + mce_notify_irq(); + while (mce_ring_get(&pfn)) + memory_failure(pfn, MCE_VECTOR); +} + +static void mce_process_work(struct work_struct *dummy) +{ + mce_notify_process(); +} + #ifdef CONFIG_X86_MCE_INTEL /*** * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog @@ -1204,6 +1336,7 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) mce_init(); mce_cpu_features(c); mce_init_timer(); + INIT_WORK(&__get_cpu_var(mce_work), mce_process_work); } /* diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index d5dc15bce005..4976888094f0 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -860,7 +860,7 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) #ifdef CONFIG_X86_NEW_MCE /* notify userspace of pending MCEs */ if (thread_info_flags & _TIF_MCE_NOTIFY) - mce_notify_irq(); + mce_notify_process(); #endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ /* deal with pending signal delivery */ -- cgit v1.2.3 From 8051dbd2dfd1427cc102888d7d96bf39de0be150 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Tue, 2 Jun 2009 16:53:23 +0900 Subject: x86, mce: fix for mce counters Make the MCE counters work on 32bit and add poll count in arch_irq_stat_cpu. Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/irq.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index eff46b5de62f..9773395aa758 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -95,7 +95,7 @@ static int show_other_interrupts(struct seq_file *p, int prec) seq_printf(p, " Threshold APIC interrupts\n"); # endif #endif -#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64) +#ifdef CONFIG_X86_NEW_MCE seq_printf(p, "%*s: ", prec, "MCE"); for_each_online_cpu(j) seq_printf(p, "%10u ", per_cpu(mce_exception_count, j)); @@ -172,9 +172,6 @@ u64 arch_irq_stat_cpu(unsigned int cpu) { u64 sum = irq_stats(cpu)->__nmi_count; -#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64) - sum += per_cpu(mce_exception_count, cpu); -#endif #ifdef CONFIG_X86_LOCAL_APIC sum += irq_stats(cpu)->apic_timer_irqs; sum += irq_stats(cpu)->irq_spurious_count; @@ -191,6 +188,10 @@ u64 arch_irq_stat_cpu(unsigned int cpu) # ifdef CONFIG_X86_MCE_THRESHOLD sum += irq_stats(cpu)->irq_threshold_count; # endif +#endif +#ifdef CONFIG_X86_NEW_MCE + sum += per_cpu(mce_exception_count, cpu); + sum += per_cpu(mce_poll_count, cpu); #endif return sum; } -- cgit v1.2.3 From 77e26cca20013e9352a8df86a54640543304a23a Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Thu, 11 Jun 2009 16:04:35 +0900 Subject: x86, mce: Fix mce printing This patch: - Adds print_mce_head() instead of first flag - Makes the header to be printed always - Stops double printing of corrected errors [ This portion originates from Huang Ying's patch ] Originally-From: Huang Ying Signed-off-by: Hidetoshi Seto LKML-Reference: <4A30AC83.5010708@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index d4e7b5947a0e..6a3127ecb5cc 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -180,12 +180,8 @@ void mce_log(struct mce *mce) set_bit(0, ¬ify_user); } -static void print_mce(struct mce *m, int *first) +static void print_mce(struct mce *m) { - if (*first) { - printk(KERN_EMERG "\n" KERN_EMERG "HARDWARE ERROR\n"); - *first = 0; - } printk(KERN_EMERG "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", m->extcpu, m->mcgstatus, m->bank, m->status); @@ -208,6 +204,11 @@ static void print_mce(struct mce *m, int *first) m->apicid); } +static void print_mce_head(void) +{ + printk(KERN_EMERG "\n" KERN_EMERG "HARDWARE ERROR\n"); +} + static void print_mce_tail(void) { printk(KERN_EMERG "This is not a software problem!\n" @@ -234,7 +235,6 @@ static void wait_for_panic(void) static void mce_panic(char *msg, struct mce *final, char *exp) { int i; - int first = 1; /* * Make sure only one CPU runs in machine check panic @@ -245,23 +245,27 @@ static void mce_panic(char *msg, struct mce *final, char *exp) bust_spinlocks(1); console_verbose(); + print_mce_head(); /* First print corrected ones that are still unlogged */ for (i = 0; i < MCE_LOG_LEN; i++) { struct mce *m = &mcelog.entry[i]; - if ((m->status & MCI_STATUS_VAL) && - !(m->status & MCI_STATUS_UC)) - print_mce(m, &first); + if (!(m->status & MCI_STATUS_VAL)) + continue; + if (!(m->status & MCI_STATUS_UC)) + print_mce(m); } /* Now print uncorrected but with the final one last */ for (i = 0; i < MCE_LOG_LEN; i++) { struct mce *m = &mcelog.entry[i]; if (!(m->status & MCI_STATUS_VAL)) continue; + if (!(m->status & MCI_STATUS_UC)) + continue; if (!final || memcmp(m, final, sizeof(struct mce))) - print_mce(m, &first); + print_mce(m); } if (final) - print_mce(final, &first); + print_mce(final); if (cpu_missing) printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n"); print_mce_tail(); -- cgit v1.2.3 From 62fdac5913f71f8f200bd2c9bd59a02e9a1498e9 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Thu, 11 Jun 2009 16:06:07 +0900 Subject: x86, mce: Add boot options for corrected errors This patch introduces three boot options (no_cmci, dont_log_ce and ignore_ce) to control handling for corrected errors. The "mce=no_cmci" boot option disables the CMCI feature. Since CMCI is a new feature so having boot controls to disable it will be a help if the hardware is misbehaving. The "mce=dont_log_ce" boot option disables logging for corrected errors. All reported corrected errors will be cleared silently. This option will be useful if you never care about corrected errors. The "mce=ignore_ce" boot option disables features for corrected errors, i.e. polling timer and cmci. All corrected events are not cleared and kept in bank MSRs. Usually this disablement is not recommended, however it will be a help if there are some conflict with the BIOS or hardware monitoring applications etc., that clears corrected events in banks instead of OS. [ And trivial cleanup (space -> tab) for doc is included. ] Signed-off-by: Hidetoshi Seto Reviewed-by: Andi Kleen LKML-Reference: <4A30ACDF.5030408@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- Documentation/x86/x86_64/boot-options.txt | 36 +++++++++++++++++++++++++------ arch/x86/include/asm/mce.h | 2 ++ arch/x86/kernel/cpu/mcheck/mce.c | 19 ++++++++++++++-- arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 3 +++ 4 files changed, 52 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt index 0ee5e3b212f3..fa2bed07d21e 100644 --- a/Documentation/x86/x86_64/boot-options.txt +++ b/Documentation/x86/x86_64/boot-options.txt @@ -7,12 +7,36 @@ Machine check Please see Documentation/x86/x86_64/machinecheck for sysfs runtime tunables. - mce=off disable machine check - mce=bootlog Enable logging of machine checks left over from booting. - Disabled by default on AMD because some BIOS leave bogus ones. - If your BIOS doesn't do that it's a good idea to enable though - to make sure you log even machine check events that result - in a reboot. On Intel systems it is enabled by default. + mce=off + Disable machine check + mce=no_cmci + Disable CMCI(Corrected Machine Check Interrupt) that + Intel processor supports. Usually this disablement is + not recommended, but it might be handy if your hardware + is misbehaving. + Note that you'll get more problems without CMCI than with + due to the shared banks, i.e. you might get duplicated + error logs. + mce=dont_log_ce + Don't make logs for corrected errors. All events reported + as corrected are silently cleared by OS. + This option will be useful if you have no interest in any + of corrected errors. + mce=ignore_ce + Disable features for corrected errors, e.g. polling timer + and CMCI. All events reported as corrected are not cleared + by OS and remained in its error banks. + Usually this disablement is not recommended, however if + there is an agent checking/clearing corrected errors + (e.g. BIOS or hardware monitoring applications), conflicting + with OS's error handling, and you cannot deactivate the agent, + then this option will be a help. + mce=bootlog + Enable logging of machine checks left over from booting. + Disabled by default on AMD because some BIOS leave bogus ones. + If your BIOS doesn't do that it's a good idea to enable though + to make sure you log even machine check events that result + in a reboot. On Intel systems it is enabled by default. mce=nobootlog Disable boot machine check logging. mce=tolerancelevel[,monarchtimeout] (number,number) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 82978ad12072..540a466e50f5 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -119,6 +119,8 @@ extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); #define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1) #ifdef CONFIG_X86_MCE_INTEL +extern int mce_cmci_disabled; +extern int mce_ignore_ce; void mce_intel_feature_init(struct cpuinfo_x86 *c); void cmci_clear(void); void cmci_reenable(void); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 6a3127ecb5cc..fabba15e4558 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -84,6 +84,9 @@ static int rip_msr; static int mce_bootlog = -1; static int monarch_timeout = -1; static int mce_panic_timeout; +static int mce_dont_log_ce; +int mce_cmci_disabled; +int mce_ignore_ce; int mce_ser; static char trigger[128]; @@ -526,7 +529,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) * Don't get the IP here because it's unlikely to * have anything to do with the actual error location. */ - if (!(flags & MCP_DONTLOG)) { + if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) { mce_log(&m); add_taint(TAINT_MACHINE_CHECK); } @@ -1307,6 +1310,9 @@ static void mce_init_timer(void) struct timer_list *t = &__get_cpu_var(mce_timer); int *n = &__get_cpu_var(next_interval); + if (mce_ignore_ce) + return; + *n = check_interval * HZ; if (!*n) return; @@ -1517,7 +1523,10 @@ static struct miscdevice mce_log_device = { }; /* - * mce=off disables machine check + * mce=off Disables machine check + * mce=no_cmci Disables CMCI + * mce=dont_log_ce Clears corrected events silently, no log created for CEs. + * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared. * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above) * monarchtimeout is how long to wait for other CPUs on machine * check, or 0 to not wait @@ -1532,6 +1541,12 @@ static int __init mcheck_enable(char *str) str++; if (!strcmp(str, "off")) mce_disabled = 1; + else if (!strcmp(str, "no_cmci")) + mce_cmci_disabled = 1; + else if (!strcmp(str, "dont_log_ce")) + mce_dont_log_ce = 1; + else if (!strcmp(str, "ignore_ce")) + mce_ignore_ce = 1; else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) mce_bootlog = (str[0] == 'b'); else if (isdigit(str[0])) { diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index b7c5a2470b40..046087e9808f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -57,6 +57,9 @@ static int cmci_supported(int *banks) { u64 cap; + if (mce_cmci_disabled || mce_ignore_ce) + return 0; + /* * Vendor check is not strictly needed, but the initial * initialization is vendor keyed and this -- cgit v1.2.3 From 63b852a6b67d0820d388b0ecd0da83ccb4048b8d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 13 May 2009 22:56:24 +0000 Subject: asm-generic: rename termios.h, signal.h and mman.h The existing asm-generic versions are incomplete and included by some architectures. New architectures should be able to use a generic version, so rename the existing files and change all users, which lets us add the new files. Signed-off-by: Remis Lima Baima Signed-off-by: Arnd Bergmann --- arch/alpha/include/asm/signal.h | 2 +- arch/arm/include/asm/mman.h | 2 +- arch/arm/include/asm/signal.h | 2 +- arch/avr32/include/asm/mman.h | 2 +- arch/avr32/include/asm/signal.h | 2 +- arch/avr32/include/asm/termios.h | 2 +- arch/blackfin/include/asm/signal.h | 2 +- arch/cris/include/asm/mman.h | 2 +- arch/cris/include/asm/signal.h | 2 +- arch/frv/include/asm/mman.h | 2 +- arch/frv/include/asm/termios.h | 2 +- arch/h8300/include/asm/mman.h | 2 +- arch/h8300/include/asm/signal.h | 2 +- arch/ia64/include/asm/mman.h | 2 +- arch/ia64/include/asm/signal.h | 2 +- arch/m32r/include/asm/mman.h | 2 +- arch/m32r/include/asm/signal.h | 2 +- arch/m68k/include/asm/mman.h | 2 +- arch/m68k/include/asm/signal.h | 2 +- arch/microblaze/include/asm/signal.h | 2 +- arch/microblaze/include/asm/termios.h | 2 +- arch/mips/include/asm/signal.h | 2 +- arch/mn10300/include/asm/mman.h | 2 +- arch/mn10300/include/asm/signal.h | 2 +- arch/powerpc/include/asm/mman.h | 2 +- arch/powerpc/include/asm/signal.h | 2 +- arch/powerpc/include/asm/termios.h | 2 +- arch/s390/include/asm/mman.h | 2 +- arch/s390/include/asm/signal.h | 2 +- arch/s390/include/asm/termios.h | 2 +- arch/sh/include/asm/mman.h | 2 +- arch/sh/include/asm/signal.h | 2 +- arch/sparc/include/asm/mman.h | 2 +- arch/sparc/include/asm/signal.h | 2 +- arch/x86/include/asm/mman.h | 2 +- arch/x86/include/asm/signal.h | 2 +- include/asm-generic/Kbuild | 4 +- include/asm-generic/mman-common.h | 41 +++++++++++++++++++ include/asm-generic/mman.h | 41 ------------------- include/asm-generic/signal-defs.h | 28 +++++++++++++ include/asm-generic/signal.h | 28 ------------- include/asm-generic/termios-base.h | 77 +++++++++++++++++++++++++++++++++++ include/asm-generic/termios.h | 77 ----------------------------------- 43 files changed, 184 insertions(+), 184 deletions(-) create mode 100644 include/asm-generic/mman-common.h delete mode 100644 include/asm-generic/mman.h create mode 100644 include/asm-generic/signal-defs.h delete mode 100644 include/asm-generic/signal.h create mode 100644 include/asm-generic/termios-base.h delete mode 100644 include/asm-generic/termios.h (limited to 'arch/x86') diff --git a/arch/alpha/include/asm/signal.h b/arch/alpha/include/asm/signal.h index 13c2305d35ef..a9388300abb1 100644 --- a/arch/alpha/include/asm/signal.h +++ b/arch/alpha/include/asm/signal.h @@ -111,7 +111,7 @@ typedef unsigned long sigset_t; #define SIG_UNBLOCK 2 /* for unblocking signals */ #define SIG_SETMASK 3 /* for setting the signal mask */ -#include +#include #ifdef __KERNEL__ struct osf_sigaction { diff --git a/arch/arm/include/asm/mman.h b/arch/arm/include/asm/mman.h index 54570d2e95b7..fc26976d8e3a 100644 --- a/arch/arm/include/asm/mman.h +++ b/arch/arm/include/asm/mman.h @@ -1,7 +1,7 @@ #ifndef __ARM_MMAN_H__ #define __ARM_MMAN_H__ -#include +#include #define MAP_GROWSDOWN 0x0100 /* stack-like segment */ #define MAP_DENYWRITE 0x0800 /* ETXTBSY */ diff --git a/arch/arm/include/asm/signal.h b/arch/arm/include/asm/signal.h index d0fb487aba4f..43ba0fb1c8ad 100644 --- a/arch/arm/include/asm/signal.h +++ b/arch/arm/include/asm/signal.h @@ -111,7 +111,7 @@ typedef unsigned long sigset_t; #define MINSIGSTKSZ 2048 #define SIGSTKSZ 8192 -#include +#include #ifdef __KERNEL__ struct old_sigaction { diff --git a/arch/avr32/include/asm/mman.h b/arch/avr32/include/asm/mman.h index 648f91e7187a..9a92b15f6a66 100644 --- a/arch/avr32/include/asm/mman.h +++ b/arch/avr32/include/asm/mman.h @@ -1,7 +1,7 @@ #ifndef __ASM_AVR32_MMAN_H__ #define __ASM_AVR32_MMAN_H__ -#include +#include #define MAP_GROWSDOWN 0x0100 /* stack-like segment */ #define MAP_DENYWRITE 0x0800 /* ETXTBSY */ diff --git a/arch/avr32/include/asm/signal.h b/arch/avr32/include/asm/signal.h index caffefeeba1f..8790dfc10d5b 100644 --- a/arch/avr32/include/asm/signal.h +++ b/arch/avr32/include/asm/signal.h @@ -112,7 +112,7 @@ typedef unsigned long sigset_t; #define MINSIGSTKSZ 2048 #define SIGSTKSZ 8192 -#include +#include #ifdef __KERNEL__ struct old_sigaction { diff --git a/arch/avr32/include/asm/termios.h b/arch/avr32/include/asm/termios.h index 0152aba35154..dd7e9da25488 100644 --- a/arch/avr32/include/asm/termios.h +++ b/arch/avr32/include/asm/termios.h @@ -55,7 +55,7 @@ struct termio { */ #define INIT_C_CC "\003\034\177\025\004\0\1\0\021\023\032\0\022\017\027\026\0" -#include +#include #endif /* __KERNEL__ */ diff --git a/arch/blackfin/include/asm/signal.h b/arch/blackfin/include/asm/signal.h index 87951d251458..2eea90794454 100644 --- a/arch/blackfin/include/asm/signal.h +++ b/arch/blackfin/include/asm/signal.h @@ -104,7 +104,7 @@ typedef unsigned long sigset_t; #define MINSIGSTKSZ 2048 #define SIGSTKSZ 8192 -#include +#include #ifdef __KERNEL__ struct old_sigaction { diff --git a/arch/cris/include/asm/mman.h b/arch/cris/include/asm/mman.h index 1c35e1b66b46..b7f0afba3ce0 100644 --- a/arch/cris/include/asm/mman.h +++ b/arch/cris/include/asm/mman.h @@ -3,7 +3,7 @@ /* verbatim copy of asm-i386/ version */ -#include +#include #define MAP_GROWSDOWN 0x0100 /* stack-like segment */ #define MAP_DENYWRITE 0x0800 /* ETXTBSY */ diff --git a/arch/cris/include/asm/signal.h b/arch/cris/include/asm/signal.h index 349ae682b568..ea6af9aad76c 100644 --- a/arch/cris/include/asm/signal.h +++ b/arch/cris/include/asm/signal.h @@ -106,7 +106,7 @@ typedef unsigned long sigset_t; #define MINSIGSTKSZ 2048 #define SIGSTKSZ 8192 -#include +#include #ifdef __KERNEL__ struct old_sigaction { diff --git a/arch/frv/include/asm/mman.h b/arch/frv/include/asm/mman.h index b4371e928683..58c1d11e2ac7 100644 --- a/arch/frv/include/asm/mman.h +++ b/arch/frv/include/asm/mman.h @@ -1,7 +1,7 @@ #ifndef __ASM_MMAN_H__ #define __ASM_MMAN_H__ -#include +#include #define MAP_GROWSDOWN 0x0100 /* stack-like segment */ #define MAP_DENYWRITE 0x0800 /* ETXTBSY */ diff --git a/arch/frv/include/asm/termios.h b/arch/frv/include/asm/termios.h index a62fb5872375..b4868aafe79c 100644 --- a/arch/frv/include/asm/termios.h +++ b/arch/frv/include/asm/termios.h @@ -52,7 +52,7 @@ struct termio { /* ioctl (fd, TIOCSERGETLSR, &result) where result may be as below */ #ifdef __KERNEL__ -#include +#include #endif #endif /* _ASM_TERMIOS_H */ diff --git a/arch/h8300/include/asm/mman.h b/arch/h8300/include/asm/mman.h index b9f104f22a36..cf35f0a6f12e 100644 --- a/arch/h8300/include/asm/mman.h +++ b/arch/h8300/include/asm/mman.h @@ -1,7 +1,7 @@ #ifndef __H8300_MMAN_H__ #define __H8300_MMAN_H__ -#include +#include #define MAP_GROWSDOWN 0x0100 /* stack-like segment */ #define MAP_DENYWRITE 0x0800 /* ETXTBSY */ diff --git a/arch/h8300/include/asm/signal.h b/arch/h8300/include/asm/signal.h index 7bc15048a64f..fd8b66e40dca 100644 --- a/arch/h8300/include/asm/signal.h +++ b/arch/h8300/include/asm/signal.h @@ -105,7 +105,7 @@ typedef unsigned long sigset_t; #define MINSIGSTKSZ 2048 #define SIGSTKSZ 8192 -#include +#include #ifdef __KERNEL__ struct old_sigaction { diff --git a/arch/ia64/include/asm/mman.h b/arch/ia64/include/asm/mman.h index c73b87832a1e..48cf8b98a0b4 100644 --- a/arch/ia64/include/asm/mman.h +++ b/arch/ia64/include/asm/mman.h @@ -8,7 +8,7 @@ * David Mosberger-Tang , Hewlett-Packard Co */ -#include +#include #define MAP_GROWSDOWN 0x00100 /* stack-like segment */ #define MAP_GROWSUP 0x00200 /* register stack-like segment */ diff --git a/arch/ia64/include/asm/signal.h b/arch/ia64/include/asm/signal.h index 4f5ca5643cb1..b166248d49a4 100644 --- a/arch/ia64/include/asm/signal.h +++ b/arch/ia64/include/asm/signal.h @@ -114,7 +114,7 @@ #endif /* __KERNEL__ */ -#include +#include # ifndef __ASSEMBLY__ diff --git a/arch/m32r/include/asm/mman.h b/arch/m32r/include/asm/mman.h index 516a8973b130..04a5f40aa401 100644 --- a/arch/m32r/include/asm/mman.h +++ b/arch/m32r/include/asm/mman.h @@ -1,7 +1,7 @@ #ifndef __M32R_MMAN_H__ #define __M32R_MMAN_H__ -#include +#include #define MAP_GROWSDOWN 0x0100 /* stack-like segment */ #define MAP_DENYWRITE 0x0800 /* ETXTBSY */ diff --git a/arch/m32r/include/asm/signal.h b/arch/m32r/include/asm/signal.h index 1a607066bc64..9c1acb2b1a92 100644 --- a/arch/m32r/include/asm/signal.h +++ b/arch/m32r/include/asm/signal.h @@ -107,7 +107,7 @@ typedef unsigned long sigset_t; #define MINSIGSTKSZ 2048 #define SIGSTKSZ 8192 -#include +#include #ifdef __KERNEL__ struct old_sigaction { diff --git a/arch/m68k/include/asm/mman.h b/arch/m68k/include/asm/mman.h index 1626d37f4898..9f5c4c4b3c7b 100644 --- a/arch/m68k/include/asm/mman.h +++ b/arch/m68k/include/asm/mman.h @@ -1,7 +1,7 @@ #ifndef __M68K_MMAN_H__ #define __M68K_MMAN_H__ -#include +#include #define MAP_GROWSDOWN 0x0100 /* stack-like segment */ #define MAP_DENYWRITE 0x0800 /* ETXTBSY */ diff --git a/arch/m68k/include/asm/signal.h b/arch/m68k/include/asm/signal.h index 08788fdefde0..5bc09c787a11 100644 --- a/arch/m68k/include/asm/signal.h +++ b/arch/m68k/include/asm/signal.h @@ -103,7 +103,7 @@ typedef unsigned long sigset_t; #define MINSIGSTKSZ 2048 #define SIGSTKSZ 8192 -#include +#include #ifdef __KERNEL__ struct old_sigaction { diff --git a/arch/microblaze/include/asm/signal.h b/arch/microblaze/include/asm/signal.h index 9676fad3486c..46bc2267d949 100644 --- a/arch/microblaze/include/asm/signal.h +++ b/arch/microblaze/include/asm/signal.h @@ -90,7 +90,7 @@ # ifndef __ASSEMBLY__ # include -# include +# include /* Avoid too many header ordering problems. */ struct siginfo; diff --git a/arch/microblaze/include/asm/termios.h b/arch/microblaze/include/asm/termios.h index 102d77258668..47a46d1fbe26 100644 --- a/arch/microblaze/include/asm/termios.h +++ b/arch/microblaze/include/asm/termios.h @@ -81,7 +81,7 @@ struct termio { #ifdef __KERNEL__ -#include +#include #endif /* __KERNEL__ */ diff --git a/arch/mips/include/asm/signal.h b/arch/mips/include/asm/signal.h index bee5153aca48..c783f364938c 100644 --- a/arch/mips/include/asm/signal.h +++ b/arch/mips/include/asm/signal.h @@ -109,7 +109,7 @@ typedef unsigned long old_sigset_t; /* at least 32 bits */ #define SIG_UNBLOCK 2 /* for unblocking signals */ #define SIG_SETMASK 3 /* for setting the signal mask */ -#include +#include struct sigaction { unsigned int sa_flags; diff --git a/arch/mn10300/include/asm/mman.h b/arch/mn10300/include/asm/mman.h index b7986b65addf..d04fac1da5aa 100644 --- a/arch/mn10300/include/asm/mman.h +++ b/arch/mn10300/include/asm/mman.h @@ -12,7 +12,7 @@ #ifndef _ASM_MMAN_H #define _ASM_MMAN_H -#include +#include #define MAP_GROWSDOWN 0x0100 /* stack-like segment */ #define MAP_DENYWRITE 0x0800 /* ETXTBSY */ diff --git a/arch/mn10300/include/asm/signal.h b/arch/mn10300/include/asm/signal.h index e98817cec5f7..7e891fce2370 100644 --- a/arch/mn10300/include/asm/signal.h +++ b/arch/mn10300/include/asm/signal.h @@ -115,7 +115,7 @@ typedef unsigned long sigset_t; #define MINSIGSTKSZ 2048 #define SIGSTKSZ 8192 -#include +#include #ifdef __KERNEL__ struct old_sigaction { diff --git a/arch/powerpc/include/asm/mman.h b/arch/powerpc/include/asm/mman.h index e7b99bac9f48..7b1c49811a24 100644 --- a/arch/powerpc/include/asm/mman.h +++ b/arch/powerpc/include/asm/mman.h @@ -1,7 +1,7 @@ #ifndef _ASM_POWERPC_MMAN_H #define _ASM_POWERPC_MMAN_H -#include +#include /* * This program is free software; you can redistribute it and/or diff --git a/arch/powerpc/include/asm/signal.h b/arch/powerpc/include/asm/signal.h index 69f709d8e8e7..3eb13be11d8f 100644 --- a/arch/powerpc/include/asm/signal.h +++ b/arch/powerpc/include/asm/signal.h @@ -94,7 +94,7 @@ typedef struct { #define MINSIGSTKSZ 2048 #define SIGSTKSZ 8192 -#include +#include struct old_sigaction { __sighandler_t sa_handler; diff --git a/arch/powerpc/include/asm/termios.h b/arch/powerpc/include/asm/termios.h index 2c14fea07c8a..a24f48704a34 100644 --- a/arch/powerpc/include/asm/termios.h +++ b/arch/powerpc/include/asm/termios.h @@ -78,7 +78,7 @@ struct termio { #ifdef __KERNEL__ -#include +#include #endif /* __KERNEL__ */ diff --git a/arch/s390/include/asm/mman.h b/arch/s390/include/asm/mman.h index da01432e8f44..f63fe7b431ed 100644 --- a/arch/s390/include/asm/mman.h +++ b/arch/s390/include/asm/mman.h @@ -9,7 +9,7 @@ #ifndef __S390_MMAN_H__ #define __S390_MMAN_H__ -#include +#include #define MAP_GROWSDOWN 0x0100 /* stack-like segment */ #define MAP_DENYWRITE 0x0800 /* ETXTBSY */ diff --git a/arch/s390/include/asm/signal.h b/arch/s390/include/asm/signal.h index f6cfddb278cb..cdf5cb2fe03f 100644 --- a/arch/s390/include/asm/signal.h +++ b/arch/s390/include/asm/signal.h @@ -115,7 +115,7 @@ typedef unsigned long sigset_t; #define MINSIGSTKSZ 2048 #define SIGSTKSZ 8192 -#include +#include #ifdef __KERNEL__ struct old_sigaction { diff --git a/arch/s390/include/asm/termios.h b/arch/s390/include/asm/termios.h index 67f66278f533..bc3a35cefc96 100644 --- a/arch/s390/include/asm/termios.h +++ b/arch/s390/include/asm/termios.h @@ -60,7 +60,7 @@ struct termio { #define user_termios_to_kernel_termios(k, u) copy_from_user(k, u, sizeof(struct termios2)) #define kernel_termios_to_user_termios(u, k) copy_to_user(u, k, sizeof(struct termios2)) -#include +#include #endif /* __KERNEL__ */ diff --git a/arch/sh/include/asm/mman.h b/arch/sh/include/asm/mman.h index 156eb0225cf6..7d8b72c91a5f 100644 --- a/arch/sh/include/asm/mman.h +++ b/arch/sh/include/asm/mman.h @@ -1,7 +1,7 @@ #ifndef __ASM_SH_MMAN_H #define __ASM_SH_MMAN_H -#include +#include #define MAP_GROWSDOWN 0x0100 /* stack-like segment */ #define MAP_DENYWRITE 0x0800 /* ETXTBSY */ diff --git a/arch/sh/include/asm/signal.h b/arch/sh/include/asm/signal.h index 5c5c1e852089..9cc5f0144689 100644 --- a/arch/sh/include/asm/signal.h +++ b/arch/sh/include/asm/signal.h @@ -106,7 +106,7 @@ typedef unsigned long sigset_t; #define MINSIGSTKSZ 2048 #define SIGSTKSZ 8192 -#include +#include #ifdef __KERNEL__ struct old_sigaction { diff --git a/arch/sparc/include/asm/mman.h b/arch/sparc/include/asm/mman.h index fdfbbf0a4736..988192e8e956 100644 --- a/arch/sparc/include/asm/mman.h +++ b/arch/sparc/include/asm/mman.h @@ -1,7 +1,7 @@ #ifndef __SPARC_MMAN_H__ #define __SPARC_MMAN_H__ -#include +#include /* SunOS'ified... */ diff --git a/arch/sparc/include/asm/signal.h b/arch/sparc/include/asm/signal.h index cba45206b7f2..e49b828a2471 100644 --- a/arch/sparc/include/asm/signal.h +++ b/arch/sparc/include/asm/signal.h @@ -176,7 +176,7 @@ struct sigstack { #define SA_STATIC_ALLOC 0x8000 #endif -#include +#include struct __new_sigaction { __sighandler_t sa_handler; diff --git a/arch/x86/include/asm/mman.h b/arch/x86/include/asm/mman.h index 90bc4108a4fd..751af2550ed9 100644 --- a/arch/x86/include/asm/mman.h +++ b/arch/x86/include/asm/mman.h @@ -1,7 +1,7 @@ #ifndef _ASM_X86_MMAN_H #define _ASM_X86_MMAN_H -#include +#include #define MAP_32BIT 0x40 /* only give out 32bit addresses */ diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h index 7761a5d554bb..598457cbd0f8 100644 --- a/arch/x86/include/asm/signal.h +++ b/arch/x86/include/asm/signal.h @@ -117,7 +117,7 @@ typedef unsigned long sigset_t; #define MINSIGSTKSZ 2048 #define SIGSTKSZ 8192 -#include +#include #ifndef __ASSEMBLY__ diff --git a/include/asm-generic/Kbuild b/include/asm-generic/Kbuild index 4c9932a2503f..460b08d51e2e 100644 --- a/include/asm-generic/Kbuild +++ b/include/asm-generic/Kbuild @@ -2,9 +2,9 @@ header-y += errno-base.h header-y += errno.h header-y += fcntl.h header-y += ioctl.h -header-y += mman.h +header-y += mman-common.h header-y += poll.h -header-y += signal.h +header-y += signal-defs.h header-y += statfs.h unifdef-y += int-l64.h diff --git a/include/asm-generic/mman-common.h b/include/asm-generic/mman-common.h new file mode 100644 index 000000000000..3b69ad34189a --- /dev/null +++ b/include/asm-generic/mman-common.h @@ -0,0 +1,41 @@ +#ifndef __ASM_GENERIC_MMAN_COMMON_H +#define __ASM_GENERIC_MMAN_COMMON_H + +/* + Author: Michael S. Tsirkin , Mellanox Technologies Ltd. + Based on: asm-xxx/mman.h +*/ + +#define PROT_READ 0x1 /* page can be read */ +#define PROT_WRITE 0x2 /* page can be written */ +#define PROT_EXEC 0x4 /* page can be executed */ +#define PROT_SEM 0x8 /* page may be used for atomic ops */ +#define PROT_NONE 0x0 /* page can not be accessed */ +#define PROT_GROWSDOWN 0x01000000 /* mprotect flag: extend change to start of growsdown vma */ +#define PROT_GROWSUP 0x02000000 /* mprotect flag: extend change to end of growsup vma */ + +#define MAP_SHARED 0x01 /* Share changes */ +#define MAP_PRIVATE 0x02 /* Changes are private */ +#define MAP_TYPE 0x0f /* Mask for type of mapping */ +#define MAP_FIXED 0x10 /* Interpret addr exactly */ +#define MAP_ANONYMOUS 0x20 /* don't use a file */ + +#define MS_ASYNC 1 /* sync memory asynchronously */ +#define MS_INVALIDATE 2 /* invalidate the caches */ +#define MS_SYNC 4 /* synchronous memory sync */ + +#define MADV_NORMAL 0 /* no further special treatment */ +#define MADV_RANDOM 1 /* expect random page references */ +#define MADV_SEQUENTIAL 2 /* expect sequential page references */ +#define MADV_WILLNEED 3 /* will need these pages */ +#define MADV_DONTNEED 4 /* don't need these pages */ + +/* common parameters: try to keep these consistent across architectures */ +#define MADV_REMOVE 9 /* remove these pages & resources */ +#define MADV_DONTFORK 10 /* don't inherit across fork */ +#define MADV_DOFORK 11 /* do inherit across fork */ + +/* compatibility flags */ +#define MAP_FILE 0 + +#endif /* __ASM_GENERIC_MMAN_COMMON_H */ diff --git a/include/asm-generic/mman.h b/include/asm-generic/mman.h deleted file mode 100644 index 5e3dde2ee5ad..000000000000 --- a/include/asm-generic/mman.h +++ /dev/null @@ -1,41 +0,0 @@ -#ifndef _ASM_GENERIC_MMAN_H -#define _ASM_GENERIC_MMAN_H - -/* - Author: Michael S. Tsirkin , Mellanox Technologies Ltd. - Based on: asm-xxx/mman.h -*/ - -#define PROT_READ 0x1 /* page can be read */ -#define PROT_WRITE 0x2 /* page can be written */ -#define PROT_EXEC 0x4 /* page can be executed */ -#define PROT_SEM 0x8 /* page may be used for atomic ops */ -#define PROT_NONE 0x0 /* page can not be accessed */ -#define PROT_GROWSDOWN 0x01000000 /* mprotect flag: extend change to start of growsdown vma */ -#define PROT_GROWSUP 0x02000000 /* mprotect flag: extend change to end of growsup vma */ - -#define MAP_SHARED 0x01 /* Share changes */ -#define MAP_PRIVATE 0x02 /* Changes are private */ -#define MAP_TYPE 0x0f /* Mask for type of mapping */ -#define MAP_FIXED 0x10 /* Interpret addr exactly */ -#define MAP_ANONYMOUS 0x20 /* don't use a file */ - -#define MS_ASYNC 1 /* sync memory asynchronously */ -#define MS_INVALIDATE 2 /* invalidate the caches */ -#define MS_SYNC 4 /* synchronous memory sync */ - -#define MADV_NORMAL 0 /* no further special treatment */ -#define MADV_RANDOM 1 /* expect random page references */ -#define MADV_SEQUENTIAL 2 /* expect sequential page references */ -#define MADV_WILLNEED 3 /* will need these pages */ -#define MADV_DONTNEED 4 /* don't need these pages */ - -/* common parameters: try to keep these consistent across architectures */ -#define MADV_REMOVE 9 /* remove these pages & resources */ -#define MADV_DONTFORK 10 /* don't inherit across fork */ -#define MADV_DOFORK 11 /* do inherit across fork */ - -/* compatibility flags */ -#define MAP_FILE 0 - -#endif diff --git a/include/asm-generic/signal-defs.h b/include/asm-generic/signal-defs.h new file mode 100644 index 000000000000..00f95df54297 --- /dev/null +++ b/include/asm-generic/signal-defs.h @@ -0,0 +1,28 @@ +#ifndef __ASM_GENERIC_SIGNAL_DEFS_H +#define __ASM_GENERIC_SIGNAL_DEFS_H + +#include + +#ifndef SIG_BLOCK +#define SIG_BLOCK 0 /* for blocking signals */ +#endif +#ifndef SIG_UNBLOCK +#define SIG_UNBLOCK 1 /* for unblocking signals */ +#endif +#ifndef SIG_SETMASK +#define SIG_SETMASK 2 /* for setting the signal mask */ +#endif + +#ifndef __ASSEMBLY__ +typedef void __signalfn_t(int); +typedef __signalfn_t __user *__sighandler_t; + +typedef void __restorefn_t(void); +typedef __restorefn_t __user *__sigrestore_t; + +#define SIG_DFL ((__force __sighandler_t)0) /* default signal handling */ +#define SIG_IGN ((__force __sighandler_t)1) /* ignore signal */ +#define SIG_ERR ((__force __sighandler_t)-1) /* error return from signal */ +#endif + +#endif /* __ASM_GENERIC_SIGNAL_DEFS_H */ diff --git a/include/asm-generic/signal.h b/include/asm-generic/signal.h deleted file mode 100644 index dae1d8720076..000000000000 --- a/include/asm-generic/signal.h +++ /dev/null @@ -1,28 +0,0 @@ -#ifndef __ASM_GENERIC_SIGNAL_H -#define __ASM_GENERIC_SIGNAL_H - -#include - -#ifndef SIG_BLOCK -#define SIG_BLOCK 0 /* for blocking signals */ -#endif -#ifndef SIG_UNBLOCK -#define SIG_UNBLOCK 1 /* for unblocking signals */ -#endif -#ifndef SIG_SETMASK -#define SIG_SETMASK 2 /* for setting the signal mask */ -#endif - -#ifndef __ASSEMBLY__ -typedef void __signalfn_t(int); -typedef __signalfn_t __user *__sighandler_t; - -typedef void __restorefn_t(void); -typedef __restorefn_t __user *__sigrestore_t; - -#define SIG_DFL ((__force __sighandler_t)0) /* default signal handling */ -#define SIG_IGN ((__force __sighandler_t)1) /* ignore signal */ -#define SIG_ERR ((__force __sighandler_t)-1) /* error return from signal */ -#endif - -#endif /* __ASM_GENERIC_SIGNAL_H */ diff --git a/include/asm-generic/termios-base.h b/include/asm-generic/termios-base.h new file mode 100644 index 000000000000..0a769feb22b0 --- /dev/null +++ b/include/asm-generic/termios-base.h @@ -0,0 +1,77 @@ +/* termios.h: generic termios/termio user copying/translation + */ + +#ifndef _ASM_GENERIC_TERMIOS_BASE_H +#define _ASM_GENERIC_TERMIOS_BASE_H + +#include + +#ifndef __ARCH_TERMIO_GETPUT + +/* + * Translate a "termio" structure into a "termios". Ugh. + */ +static inline int user_termio_to_kernel_termios(struct ktermios *termios, + struct termio __user *termio) +{ + unsigned short tmp; + + if (get_user(tmp, &termio->c_iflag) < 0) + goto fault; + termios->c_iflag = (0xffff0000 & termios->c_iflag) | tmp; + + if (get_user(tmp, &termio->c_oflag) < 0) + goto fault; + termios->c_oflag = (0xffff0000 & termios->c_oflag) | tmp; + + if (get_user(tmp, &termio->c_cflag) < 0) + goto fault; + termios->c_cflag = (0xffff0000 & termios->c_cflag) | tmp; + + if (get_user(tmp, &termio->c_lflag) < 0) + goto fault; + termios->c_lflag = (0xffff0000 & termios->c_lflag) | tmp; + + if (get_user(termios->c_line, &termio->c_line) < 0) + goto fault; + + if (copy_from_user(termios->c_cc, termio->c_cc, NCC) != 0) + goto fault; + + return 0; + + fault: + return -EFAULT; +} + +/* + * Translate a "termios" structure into a "termio". Ugh. + */ +static inline int kernel_termios_to_user_termio(struct termio __user *termio, + struct ktermios *termios) +{ + if (put_user(termios->c_iflag, &termio->c_iflag) < 0 || + put_user(termios->c_oflag, &termio->c_oflag) < 0 || + put_user(termios->c_cflag, &termio->c_cflag) < 0 || + put_user(termios->c_lflag, &termio->c_lflag) < 0 || + put_user(termios->c_line, &termio->c_line) < 0 || + copy_to_user(termio->c_cc, termios->c_cc, NCC) != 0) + return -EFAULT; + + return 0; +} + +#ifndef user_termios_to_kernel_termios +#define user_termios_to_kernel_termios(k, u) copy_from_user(k, u, sizeof(struct termios)) +#endif + +#ifndef kernel_termios_to_user_termios +#define kernel_termios_to_user_termios(u, k) copy_to_user(u, k, sizeof(struct termios)) +#endif + +#define user_termios_to_kernel_termios_1(k, u) copy_from_user(k, u, sizeof(struct termios)) +#define kernel_termios_to_user_termios_1(u, k) copy_to_user(u, k, sizeof(struct termios)) + +#endif /* __ARCH_TERMIO_GETPUT */ + +#endif /* _ASM_GENERIC_TERMIOS_BASE_H */ diff --git a/include/asm-generic/termios.h b/include/asm-generic/termios.h deleted file mode 100644 index 7d39ecc92d94..000000000000 --- a/include/asm-generic/termios.h +++ /dev/null @@ -1,77 +0,0 @@ -/* termios.h: generic termios/termio user copying/translation - */ - -#ifndef _ASM_GENERIC_TERMIOS_H -#define _ASM_GENERIC_TERMIOS_H - -#include - -#ifndef __ARCH_TERMIO_GETPUT - -/* - * Translate a "termio" structure into a "termios". Ugh. - */ -static inline int user_termio_to_kernel_termios(struct ktermios *termios, - struct termio __user *termio) -{ - unsigned short tmp; - - if (get_user(tmp, &termio->c_iflag) < 0) - goto fault; - termios->c_iflag = (0xffff0000 & termios->c_iflag) | tmp; - - if (get_user(tmp, &termio->c_oflag) < 0) - goto fault; - termios->c_oflag = (0xffff0000 & termios->c_oflag) | tmp; - - if (get_user(tmp, &termio->c_cflag) < 0) - goto fault; - termios->c_cflag = (0xffff0000 & termios->c_cflag) | tmp; - - if (get_user(tmp, &termio->c_lflag) < 0) - goto fault; - termios->c_lflag = (0xffff0000 & termios->c_lflag) | tmp; - - if (get_user(termios->c_line, &termio->c_line) < 0) - goto fault; - - if (copy_from_user(termios->c_cc, termio->c_cc, NCC) != 0) - goto fault; - - return 0; - - fault: - return -EFAULT; -} - -/* - * Translate a "termios" structure into a "termio". Ugh. - */ -static inline int kernel_termios_to_user_termio(struct termio __user *termio, - struct ktermios *termios) -{ - if (put_user(termios->c_iflag, &termio->c_iflag) < 0 || - put_user(termios->c_oflag, &termio->c_oflag) < 0 || - put_user(termios->c_cflag, &termio->c_cflag) < 0 || - put_user(termios->c_lflag, &termio->c_lflag) < 0 || - put_user(termios->c_line, &termio->c_line) < 0 || - copy_to_user(termio->c_cc, termios->c_cc, NCC) != 0) - return -EFAULT; - - return 0; -} - -#ifndef user_termios_to_kernel_termios -#define user_termios_to_kernel_termios(k, u) copy_from_user(k, u, sizeof(struct termios)) -#endif - -#ifndef kernel_termios_to_user_termios -#define kernel_termios_to_user_termios(u, k) copy_to_user(u, k, sizeof(struct termios)) -#endif - -#define user_termios_to_kernel_termios_1(k, u) copy_from_user(k, u, sizeof(struct termios)) -#define kernel_termios_to_user_termios_1(u, k) copy_to_user(u, k, sizeof(struct termios)) - -#endif /* __ARCH_TERMIO_GETPUT */ - -#endif /* _ASM_GENERIC_TERMIOS_H */ -- cgit v1.2.3 From c31ae4bb4a9fa4606a74c0a4fb61b74f804e861e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 13 May 2009 22:56:25 +0000 Subject: asm-generic: introduce asm/bitsperlong.h This provides a reliable way for asm-generic/types.h and other files to find out if it is running on a 32 or 64 bit platform. We cannot use CONFIG_64BIT for this in headers that are included from user space because CONFIG symbols are not available there. We also cannot do it inside of asm/types.h because some headers need the word size but cannot include types.h. The solution is to introduce a new header that defines both __BITS_PER_LONG for user space and BITS_PER_LONG for usage in the kernel. The asm-generic version falls back to 32 bit unless the architecture overrides it, which I did for all 64 bit platforms. Signed-off-by: Remis Lima Baima Signed-off-by: Arnd Bergmann --- arch/alpha/include/asm/bitsperlong.h | 8 ++++++++ arch/alpha/include/asm/types.h | 3 --- arch/arm/include/asm/bitsperlong.h | 1 + arch/avr32/include/asm/bitsperlong.h | 1 + arch/blackfin/include/asm/bitsperlong.h | 1 + arch/cris/include/asm/bitsperlong.h | 1 + arch/frv/include/asm/bitsperlong.h | 1 + arch/h8300/include/asm/bitsperlong.h | 1 + arch/ia64/include/asm/bitsperlong.h | 8 ++++++++ arch/ia64/include/asm/types.h | 7 ------- arch/m32r/include/asm/bitsperlong.h | 1 + arch/m68k/include/asm/bitsperlong.h | 1 + arch/microblaze/include/asm/bitsperlong.h | 1 + arch/mips/include/asm/bitsperlong.h | 8 ++++++++ arch/mips/include/asm/types.h | 3 --- arch/mn10300/include/asm/bitsperlong.h | 1 + arch/parisc/include/asm/bitsperlong.h | 20 +++++++++++++++++++ arch/parisc/include/asm/types.h | 8 -------- arch/powerpc/include/asm/bitsperlong.h | 12 ++++++++++++ arch/powerpc/include/asm/types.h | 9 --------- arch/s390/include/asm/bitsperlong.h | 13 +++++++++++++ arch/s390/include/asm/types.h | 6 ------ arch/sh/include/asm/bitsperlong.h | 1 + arch/sparc/include/asm/bitsperlong.h | 13 +++++++++++++ arch/sparc/include/asm/types.h | 4 ---- arch/x86/include/asm/bitsperlong.h | 13 +++++++++++++ arch/x86/include/asm/types.h | 6 ------ arch/xtensa/include/asm/bitsperlong.h | 1 + include/asm-generic/Kbuild | 1 + include/asm-generic/Kbuild.asm | 1 + include/asm-generic/bitsperlong.h | 32 +++++++++++++++++++++++++++++++ include/asm-generic/int-l64.h | 2 ++ include/asm-generic/int-ll64.h | 2 ++ 33 files changed, 145 insertions(+), 46 deletions(-) create mode 100644 arch/alpha/include/asm/bitsperlong.h create mode 100644 arch/arm/include/asm/bitsperlong.h create mode 100644 arch/avr32/include/asm/bitsperlong.h create mode 100644 arch/blackfin/include/asm/bitsperlong.h create mode 100644 arch/cris/include/asm/bitsperlong.h create mode 100644 arch/frv/include/asm/bitsperlong.h create mode 100644 arch/h8300/include/asm/bitsperlong.h create mode 100644 arch/ia64/include/asm/bitsperlong.h create mode 100644 arch/m32r/include/asm/bitsperlong.h create mode 100644 arch/m68k/include/asm/bitsperlong.h create mode 100644 arch/microblaze/include/asm/bitsperlong.h create mode 100644 arch/mips/include/asm/bitsperlong.h create mode 100644 arch/mn10300/include/asm/bitsperlong.h create mode 100644 arch/parisc/include/asm/bitsperlong.h create mode 100644 arch/powerpc/include/asm/bitsperlong.h create mode 100644 arch/s390/include/asm/bitsperlong.h create mode 100644 arch/sh/include/asm/bitsperlong.h create mode 100644 arch/sparc/include/asm/bitsperlong.h create mode 100644 arch/x86/include/asm/bitsperlong.h create mode 100644 arch/xtensa/include/asm/bitsperlong.h create mode 100644 include/asm-generic/bitsperlong.h (limited to 'arch/x86') diff --git a/arch/alpha/include/asm/bitsperlong.h b/arch/alpha/include/asm/bitsperlong.h new file mode 100644 index 000000000000..ad57f7868203 --- /dev/null +++ b/arch/alpha/include/asm/bitsperlong.h @@ -0,0 +1,8 @@ +#ifndef __ASM_ALPHA_BITSPERLONG_H +#define __ASM_ALPHA_BITSPERLONG_H + +#define __BITS_PER_LONG 64 + +#include + +#endif /* __ASM_ALPHA_BITSPERLONG_H */ diff --git a/arch/alpha/include/asm/types.h b/arch/alpha/include/asm/types.h index f072f344497e..bd621ecd1eb3 100644 --- a/arch/alpha/include/asm/types.h +++ b/arch/alpha/include/asm/types.h @@ -25,9 +25,6 @@ typedef unsigned int umode_t; * These aren't exported outside the kernel to avoid name space clashes */ #ifdef __KERNEL__ - -#define BITS_PER_LONG 64 - #ifndef __ASSEMBLY__ typedef u64 dma_addr_t; diff --git a/arch/arm/include/asm/bitsperlong.h b/arch/arm/include/asm/bitsperlong.h new file mode 100644 index 000000000000..6dc0bb0c13b2 --- /dev/null +++ b/arch/arm/include/asm/bitsperlong.h @@ -0,0 +1 @@ +#include diff --git a/arch/avr32/include/asm/bitsperlong.h b/arch/avr32/include/asm/bitsperlong.h new file mode 100644 index 000000000000..6dc0bb0c13b2 --- /dev/null +++ b/arch/avr32/include/asm/bitsperlong.h @@ -0,0 +1 @@ +#include diff --git a/arch/blackfin/include/asm/bitsperlong.h b/arch/blackfin/include/asm/bitsperlong.h new file mode 100644 index 000000000000..6dc0bb0c13b2 --- /dev/null +++ b/arch/blackfin/include/asm/bitsperlong.h @@ -0,0 +1 @@ +#include diff --git a/arch/cris/include/asm/bitsperlong.h b/arch/cris/include/asm/bitsperlong.h new file mode 100644 index 000000000000..6dc0bb0c13b2 --- /dev/null +++ b/arch/cris/include/asm/bitsperlong.h @@ -0,0 +1 @@ +#include diff --git a/arch/frv/include/asm/bitsperlong.h b/arch/frv/include/asm/bitsperlong.h new file mode 100644 index 000000000000..6dc0bb0c13b2 --- /dev/null +++ b/arch/frv/include/asm/bitsperlong.h @@ -0,0 +1 @@ +#include diff --git a/arch/h8300/include/asm/bitsperlong.h b/arch/h8300/include/asm/bitsperlong.h new file mode 100644 index 000000000000..6dc0bb0c13b2 --- /dev/null +++ b/arch/h8300/include/asm/bitsperlong.h @@ -0,0 +1 @@ +#include diff --git a/arch/ia64/include/asm/bitsperlong.h b/arch/ia64/include/asm/bitsperlong.h new file mode 100644 index 000000000000..ec4db3c970b7 --- /dev/null +++ b/arch/ia64/include/asm/bitsperlong.h @@ -0,0 +1,8 @@ +#ifndef __ASM_IA64_BITSPERLONG_H +#define __ASM_IA64_BITSPERLONG_H + +#define __BITS_PER_LONG 64 + +#include + +#endif /* __ASM_IA64_BITSPERLONG_H */ diff --git a/arch/ia64/include/asm/types.h b/arch/ia64/include/asm/types.h index e36b3716e718..fbf1ed3b44ce 100644 --- a/arch/ia64/include/asm/types.h +++ b/arch/ia64/include/asm/types.h @@ -19,10 +19,6 @@ # define __IA64_UL(x) (x) # define __IA64_UL_CONST(x) x -# ifdef __KERNEL__ -# define BITS_PER_LONG 64 -# endif - #else # define __IA64_UL(x) ((unsigned long)(x)) # define __IA64_UL_CONST(x) x##UL @@ -34,10 +30,7 @@ typedef unsigned int umode_t; */ # ifdef __KERNEL__ -#define BITS_PER_LONG 64 - /* DMA addresses are 64-bits wide, in general. */ - typedef u64 dma_addr_t; # endif /* __KERNEL__ */ diff --git a/arch/m32r/include/asm/bitsperlong.h b/arch/m32r/include/asm/bitsperlong.h new file mode 100644 index 000000000000..6dc0bb0c13b2 --- /dev/null +++ b/arch/m32r/include/asm/bitsperlong.h @@ -0,0 +1 @@ +#include diff --git a/arch/m68k/include/asm/bitsperlong.h b/arch/m68k/include/asm/bitsperlong.h new file mode 100644 index 000000000000..6dc0bb0c13b2 --- /dev/null +++ b/arch/m68k/include/asm/bitsperlong.h @@ -0,0 +1 @@ +#include diff --git a/arch/microblaze/include/asm/bitsperlong.h b/arch/microblaze/include/asm/bitsperlong.h new file mode 100644 index 000000000000..6dc0bb0c13b2 --- /dev/null +++ b/arch/microblaze/include/asm/bitsperlong.h @@ -0,0 +1 @@ +#include diff --git a/arch/mips/include/asm/bitsperlong.h b/arch/mips/include/asm/bitsperlong.h new file mode 100644 index 000000000000..3e4c10a8e787 --- /dev/null +++ b/arch/mips/include/asm/bitsperlong.h @@ -0,0 +1,8 @@ +#ifndef __ASM_MIPS_BITSPERLONG_H +#define __ASM_MIPS_BITSPERLONG_H + +#define __BITS_PER_LONG _MIPS_SZLONG + +#include + +#endif /* __ASM_MIPS_BITSPERLONG_H */ diff --git a/arch/mips/include/asm/types.h b/arch/mips/include/asm/types.h index 7956e69a3bd5..544a2854598f 100644 --- a/arch/mips/include/asm/types.h +++ b/arch/mips/include/asm/types.h @@ -31,9 +31,6 @@ typedef unsigned short umode_t; * These aren't exported outside the kernel to avoid name space clashes */ #ifdef __KERNEL__ - -#define BITS_PER_LONG _MIPS_SZLONG - #ifndef __ASSEMBLY__ #if (defined(CONFIG_HIGHMEM) && defined(CONFIG_64BIT_PHYS_ADDR)) \ diff --git a/arch/mn10300/include/asm/bitsperlong.h b/arch/mn10300/include/asm/bitsperlong.h new file mode 100644 index 000000000000..6dc0bb0c13b2 --- /dev/null +++ b/arch/mn10300/include/asm/bitsperlong.h @@ -0,0 +1 @@ +#include diff --git a/arch/parisc/include/asm/bitsperlong.h b/arch/parisc/include/asm/bitsperlong.h new file mode 100644 index 000000000000..75196b415d3f --- /dev/null +++ b/arch/parisc/include/asm/bitsperlong.h @@ -0,0 +1,20 @@ +#ifndef __ASM_PARISC_BITSPERLONG_H +#define __ASM_PARISC_BITSPERLONG_H + +/* + * using CONFIG_* outside of __KERNEL__ is wrong, + * __LP64__ was also removed from headers, so what + * is the right approach on parisc? + * -arnd + */ +#if (defined(__KERNEL__) && defined(CONFIG_64BIT)) || defined (__LP64__) +#define __BITS_PER_LONG 64 +#define SHIFT_PER_LONG 6 +#else +#define __BITS_PER_LONG 32 +#define SHIFT_PER_LONG 5 +#endif + +#include + +#endif /* __ASM_PARISC_BITSPERLONG_H */ diff --git a/arch/parisc/include/asm/types.h b/arch/parisc/include/asm/types.h index 7f5a39bfb4ce..20135cc80039 100644 --- a/arch/parisc/include/asm/types.h +++ b/arch/parisc/include/asm/types.h @@ -14,14 +14,6 @@ typedef unsigned short umode_t; */ #ifdef __KERNEL__ -#ifdef CONFIG_64BIT -#define BITS_PER_LONG 64 -#define SHIFT_PER_LONG 6 -#else -#define BITS_PER_LONG 32 -#define SHIFT_PER_LONG 5 -#endif - #ifndef __ASSEMBLY__ /* Dma addresses are 32-bits wide. */ diff --git a/arch/powerpc/include/asm/bitsperlong.h b/arch/powerpc/include/asm/bitsperlong.h new file mode 100644 index 000000000000..5f1659032c40 --- /dev/null +++ b/arch/powerpc/include/asm/bitsperlong.h @@ -0,0 +1,12 @@ +#ifndef __ASM_POWERPC_BITSPERLONG_H +#define __ASM_POWERPC_BITSPERLONG_H + +#if defined(__powerpc64__) +# define __BITS_PER_LONG 64 +#else +# define __BITS_PER_LONG 32 +#endif + +#include + +#endif /* __ASM_POWERPC_BITSPERLONG_H */ diff --git a/arch/powerpc/include/asm/types.h b/arch/powerpc/include/asm/types.h index 7ce27a52bb34..a5aea0ca34e9 100644 --- a/arch/powerpc/include/asm/types.h +++ b/arch/powerpc/include/asm/types.h @@ -40,15 +40,6 @@ typedef struct { #endif /* __ASSEMBLY__ */ #ifdef __KERNEL__ -/* - * These aren't exported outside the kernel to avoid name space clashes - */ -#ifdef __powerpc64__ -#define BITS_PER_LONG 64 -#else -#define BITS_PER_LONG 32 -#endif - #ifndef __ASSEMBLY__ typedef __vector128 vector128; diff --git a/arch/s390/include/asm/bitsperlong.h b/arch/s390/include/asm/bitsperlong.h new file mode 100644 index 000000000000..6b235aea9c66 --- /dev/null +++ b/arch/s390/include/asm/bitsperlong.h @@ -0,0 +1,13 @@ +#ifndef __ASM_S390_BITSPERLONG_H +#define __ASM_S390_BITSPERLONG_H + +#ifndef __s390x__ +#define __BITS_PER_LONG 32 +#else +#define __BITS_PER_LONG 64 +#endif + +#include + +#endif /* __ASM_S390_BITSPERLONG_H */ + diff --git a/arch/s390/include/asm/types.h b/arch/s390/include/asm/types.h index 3dc3fc228812..04d6b95a89c6 100644 --- a/arch/s390/include/asm/types.h +++ b/arch/s390/include/asm/types.h @@ -28,12 +28,6 @@ typedef __signed__ long saddr_t; */ #ifdef __KERNEL__ -#ifndef __s390x__ -#define BITS_PER_LONG 32 -#else -#define BITS_PER_LONG 64 -#endif - #ifndef __ASSEMBLY__ typedef u64 dma64_addr_t; diff --git a/arch/sh/include/asm/bitsperlong.h b/arch/sh/include/asm/bitsperlong.h new file mode 100644 index 000000000000..6dc0bb0c13b2 --- /dev/null +++ b/arch/sh/include/asm/bitsperlong.h @@ -0,0 +1 @@ +#include diff --git a/arch/sparc/include/asm/bitsperlong.h b/arch/sparc/include/asm/bitsperlong.h new file mode 100644 index 000000000000..40dcaa3aaa56 --- /dev/null +++ b/arch/sparc/include/asm/bitsperlong.h @@ -0,0 +1,13 @@ +#ifndef __ASM_ALPHA_BITSPERLONG_H +#define __ASM_ALPHA_BITSPERLONG_H + +#if defined(__sparc__) && defined(__arch64__) +#define __BITS_PER_LONG 64 +#else +#define __BITS_PER_LONG 32 +#endif + +#include + +#endif /* __ASM_ALPHA_BITSPERLONG_H */ + diff --git a/arch/sparc/include/asm/types.h b/arch/sparc/include/asm/types.h index 2237118825d0..de671d73baed 100644 --- a/arch/sparc/include/asm/types.h +++ b/arch/sparc/include/asm/types.h @@ -21,8 +21,6 @@ typedef unsigned short umode_t; #ifdef __KERNEL__ -#define BITS_PER_LONG 64 - #ifndef __ASSEMBLY__ /* Dma addresses come in generic and 64-bit flavours. */ @@ -46,8 +44,6 @@ typedef unsigned short umode_t; #ifdef __KERNEL__ -#define BITS_PER_LONG 32 - #ifndef __ASSEMBLY__ typedef u32 dma_addr_t; diff --git a/arch/x86/include/asm/bitsperlong.h b/arch/x86/include/asm/bitsperlong.h new file mode 100644 index 000000000000..b0ae1c4dc791 --- /dev/null +++ b/arch/x86/include/asm/bitsperlong.h @@ -0,0 +1,13 @@ +#ifndef __ASM_X86_BITSPERLONG_H +#define __ASM_X86_BITSPERLONG_H + +#ifdef __x86_64__ +# define __BITS_PER_LONG 64 +#else +# define __BITS_PER_LONG 32 +#endif + +#include + +#endif /* __ASM_X86_BITSPERLONG_H */ + diff --git a/arch/x86/include/asm/types.h b/arch/x86/include/asm/types.h index e6f736320077..09b97745772f 100644 --- a/arch/x86/include/asm/types.h +++ b/arch/x86/include/asm/types.h @@ -14,12 +14,6 @@ typedef unsigned short umode_t; */ #ifdef __KERNEL__ -#ifdef CONFIG_X86_32 -# define BITS_PER_LONG 32 -#else -# define BITS_PER_LONG 64 -#endif - #ifndef __ASSEMBLY__ typedef u64 dma64_addr_t; diff --git a/arch/xtensa/include/asm/bitsperlong.h b/arch/xtensa/include/asm/bitsperlong.h new file mode 100644 index 000000000000..6dc0bb0c13b2 --- /dev/null +++ b/arch/xtensa/include/asm/bitsperlong.h @@ -0,0 +1 @@ +#include diff --git a/include/asm-generic/Kbuild b/include/asm-generic/Kbuild index 460b08d51e2e..cbb437875f5c 100644 --- a/include/asm-generic/Kbuild +++ b/include/asm-generic/Kbuild @@ -1,3 +1,4 @@ +header-y += bitsperlong.h header-y += errno-base.h header-y += errno.h header-y += fcntl.h diff --git a/include/asm-generic/Kbuild.asm b/include/asm-generic/Kbuild.asm index 70d185534b9d..290910e4ede4 100644 --- a/include/asm-generic/Kbuild.asm +++ b/include/asm-generic/Kbuild.asm @@ -9,6 +9,7 @@ unifdef-y += a.out.h endif unifdef-y += auxvec.h unifdef-y += byteorder.h +unifdef-y += bitsperlong.h unifdef-y += errno.h unifdef-y += fcntl.h unifdef-y += ioctl.h diff --git a/include/asm-generic/bitsperlong.h b/include/asm-generic/bitsperlong.h new file mode 100644 index 000000000000..4ae54e07de83 --- /dev/null +++ b/include/asm-generic/bitsperlong.h @@ -0,0 +1,32 @@ +#ifndef __ASM_GENERIC_BITS_PER_LONG +#define __ASM_GENERIC_BITS_PER_LONG + +/* + * There seems to be no way of detecting this automatically from user + * space, so 64 bit architectures should override this in their + * bitsperlong.h. In particular, an architecture that supports + * both 32 and 64 bit user space must not rely on CONFIG_64BIT + * to decide it, but rather check a compiler provided macro. + */ +#ifndef __BITS_PER_LONG +#define __BITS_PER_LONG 32 +#endif + +#ifdef __KERNEL__ + +#ifdef CONFIG_64BIT +#define BITS_PER_LONG 64 +#else +#define BITS_PER_LONG 32 +#endif /* CONFIG_64BIT */ + +/* + * FIXME: The check currently breaks x86-64 build, so it's + * temporarily disabled. Please fix x86-64 and reenable + */ +#if 0 && BITS_PER_LONG != __BITS_PER_LONG +#error Inconsistent word size. Check asm/bitsperlong.h +#endif + +#endif /* __KERNEL__ */ +#endif /* __ASM_GENERIC_BITS_PER_LONG */ diff --git a/include/asm-generic/int-l64.h b/include/asm-generic/int-l64.h index 2af9b75d77db..1ca3efc976cc 100644 --- a/include/asm-generic/int-l64.h +++ b/include/asm-generic/int-l64.h @@ -8,6 +8,8 @@ #ifndef _ASM_GENERIC_INT_L64_H #define _ASM_GENERIC_INT_L64_H +#include + #ifndef __ASSEMBLY__ /* * __xx is ok: it doesn't pollute the POSIX namespace. Use these in the diff --git a/include/asm-generic/int-ll64.h b/include/asm-generic/int-ll64.h index f9bc9ac29b36..f394147c0739 100644 --- a/include/asm-generic/int-ll64.h +++ b/include/asm-generic/int-ll64.h @@ -8,6 +8,8 @@ #ifndef _ASM_GENERIC_INT_LL64_H #define _ASM_GENERIC_INT_LL64_H +#include + #ifndef __ASSEMBLY__ /* * __xx is ok: it doesn't pollute the POSIX namespace. Use these in the -- cgit v1.2.3 From 72099ed2719fc5829bd79c6ca9d1783ed026eb37 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 13 May 2009 22:56:29 +0000 Subject: asm-generic: rename atomic.h to atomic-long.h The existing asm-generic/atomic.h only defines the atomic_long type. This renames it to atomic-long.h so we have a place to add a truly generic atomic.h that can be used on all non-SMP systems. Signed-off-by: Remis Lima Baima Signed-off-by: Arnd Bergmann Acked-by: Ingo Molnar --- arch/alpha/include/asm/atomic.h | 2 +- arch/arm/include/asm/atomic.h | 2 +- arch/avr32/include/asm/atomic.h | 2 +- arch/blackfin/include/asm/atomic.h | 2 +- arch/cris/include/asm/atomic.h | 2 +- arch/frv/include/asm/atomic.h | 2 +- arch/h8300/include/asm/atomic.h | 2 +- arch/ia64/include/asm/atomic.h | 2 +- arch/m32r/include/asm/atomic.h | 2 +- arch/m68k/include/asm/atomic_mm.h | 2 +- arch/m68k/include/asm/atomic_no.h | 2 +- arch/microblaze/include/asm/atomic.h | 2 +- arch/mips/include/asm/atomic.h | 2 +- arch/mn10300/include/asm/atomic.h | 2 +- arch/parisc/include/asm/atomic.h | 2 +- arch/powerpc/include/asm/atomic.h | 2 +- arch/s390/include/asm/atomic.h | 2 +- arch/sh/include/asm/atomic.h | 2 +- arch/sparc/include/asm/atomic_32.h | 2 +- arch/sparc/include/asm/atomic_64.h | 2 +- arch/x86/include/asm/atomic_32.h | 2 +- arch/x86/include/asm/atomic_64.h | 2 +- arch/xtensa/include/asm/atomic.h | 2 +- include/asm-generic/atomic-long.h | 258 +++++++++++++++++++++++++++++++++++ include/asm-generic/atomic.h | 258 ----------------------------------- include/asm-generic/bitops/atomic.h | 1 + 26 files changed, 282 insertions(+), 281 deletions(-) create mode 100644 include/asm-generic/atomic-long.h delete mode 100644 include/asm-generic/atomic.h (limited to 'arch/x86') diff --git a/arch/alpha/include/asm/atomic.h b/arch/alpha/include/asm/atomic.h index 62b363584b2b..610dff44d94b 100644 --- a/arch/alpha/include/asm/atomic.h +++ b/arch/alpha/include/asm/atomic.h @@ -256,5 +256,5 @@ static __inline__ int atomic64_add_unless(atomic64_t *v, long a, long u) #define smp_mb__before_atomic_inc() smp_mb() #define smp_mb__after_atomic_inc() smp_mb() -#include +#include #endif /* _ALPHA_ATOMIC_H */ diff --git a/arch/arm/include/asm/atomic.h b/arch/arm/include/asm/atomic.h index 16b52f397983..9e07fe507029 100644 --- a/arch/arm/include/asm/atomic.h +++ b/arch/arm/include/asm/atomic.h @@ -249,6 +249,6 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u) #define smp_mb__before_atomic_inc() smp_mb() #define smp_mb__after_atomic_inc() smp_mb() -#include +#include #endif #endif diff --git a/arch/avr32/include/asm/atomic.h b/arch/avr32/include/asm/atomic.h index 318815107748..b131c27ddf57 100644 --- a/arch/avr32/include/asm/atomic.h +++ b/arch/avr32/include/asm/atomic.h @@ -196,6 +196,6 @@ static inline int atomic_sub_if_positive(int i, atomic_t *v) #define smp_mb__before_atomic_inc() barrier() #define smp_mb__after_atomic_inc() barrier() -#include +#include #endif /* __ASM_AVR32_ATOMIC_H */ diff --git a/arch/blackfin/include/asm/atomic.h b/arch/blackfin/include/asm/atomic.h index 94b2a9b19451..7bbf44e4ddf9 100644 --- a/arch/blackfin/include/asm/atomic.h +++ b/arch/blackfin/include/asm/atomic.h @@ -208,6 +208,6 @@ static inline void atomic_set_mask(unsigned int mask, atomic_t *v) #define atomic_sub_and_test(i,v) (atomic_sub_return((i), (v)) == 0) #define atomic_dec_and_test(v) (atomic_sub_return(1, (v)) == 0) -#include +#include #endif /* __ARCH_BLACKFIN_ATOMIC __ */ diff --git a/arch/cris/include/asm/atomic.h b/arch/cris/include/asm/atomic.h index 5718dd8902a1..a6aca819e9f3 100644 --- a/arch/cris/include/asm/atomic.h +++ b/arch/cris/include/asm/atomic.h @@ -158,5 +158,5 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u) #define smp_mb__before_atomic_inc() barrier() #define smp_mb__after_atomic_inc() barrier() -#include +#include #endif diff --git a/arch/frv/include/asm/atomic.h b/arch/frv/include/asm/atomic.h index 296c35cfb207..0409d981fd39 100644 --- a/arch/frv/include/asm/atomic.h +++ b/arch/frv/include/asm/atomic.h @@ -194,5 +194,5 @@ static __inline__ int atomic_add_unless(atomic_t *v, int a, int u) #define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) -#include +#include #endif /* _ASM_ATOMIC_H */ diff --git a/arch/h8300/include/asm/atomic.h b/arch/h8300/include/asm/atomic.h index 833186c8dc3b..33c8c0fa9583 100644 --- a/arch/h8300/include/asm/atomic.h +++ b/arch/h8300/include/asm/atomic.h @@ -141,5 +141,5 @@ static __inline__ void atomic_set_mask(unsigned long mask, unsigned long *v) #define smp_mb__before_atomic_inc() barrier() #define smp_mb__after_atomic_inc() barrier() -#include +#include #endif /* __ARCH_H8300_ATOMIC __ */ diff --git a/arch/ia64/include/asm/atomic.h b/arch/ia64/include/asm/atomic.h index d37292bd9875..88405cb0832a 100644 --- a/arch/ia64/include/asm/atomic.h +++ b/arch/ia64/include/asm/atomic.h @@ -216,5 +216,5 @@ atomic64_add_negative (__s64 i, atomic64_t *v) #define smp_mb__before_atomic_inc() barrier() #define smp_mb__after_atomic_inc() barrier() -#include +#include #endif /* _ASM_IA64_ATOMIC_H */ diff --git a/arch/m32r/include/asm/atomic.h b/arch/m32r/include/asm/atomic.h index 2eed30f84080..63f0cf0f50dd 100644 --- a/arch/m32r/include/asm/atomic.h +++ b/arch/m32r/include/asm/atomic.h @@ -314,5 +314,5 @@ static __inline__ void atomic_set_mask(unsigned long mask, atomic_t *addr) #define smp_mb__before_atomic_inc() barrier() #define smp_mb__after_atomic_inc() barrier() -#include +#include #endif /* _ASM_M32R_ATOMIC_H */ diff --git a/arch/m68k/include/asm/atomic_mm.h b/arch/m68k/include/asm/atomic_mm.h index eb0ab9d4ee77..88b7af20a996 100644 --- a/arch/m68k/include/asm/atomic_mm.h +++ b/arch/m68k/include/asm/atomic_mm.h @@ -192,5 +192,5 @@ static __inline__ int atomic_add_unless(atomic_t *v, int a, int u) #define smp_mb__before_atomic_inc() barrier() #define smp_mb__after_atomic_inc() barrier() -#include +#include #endif /* __ARCH_M68K_ATOMIC __ */ diff --git a/arch/m68k/include/asm/atomic_no.h b/arch/m68k/include/asm/atomic_no.h index 6bb674855a3f..5674cb9449bd 100644 --- a/arch/m68k/include/asm/atomic_no.h +++ b/arch/m68k/include/asm/atomic_no.h @@ -151,5 +151,5 @@ static __inline__ int atomic_add_unless(atomic_t *v, int a, int u) #define atomic_dec_return(v) atomic_sub_return(1,(v)) #define atomic_inc_return(v) atomic_add_return(1,(v)) -#include +#include #endif /* __ARCH_M68KNOMMU_ATOMIC __ */ diff --git a/arch/microblaze/include/asm/atomic.h b/arch/microblaze/include/asm/atomic.h index a448d94ab721..0de612ad7cb2 100644 --- a/arch/microblaze/include/asm/atomic.h +++ b/arch/microblaze/include/asm/atomic.h @@ -118,6 +118,6 @@ static inline int atomic_dec_if_positive(atomic_t *v) #define smp_mb__before_atomic_inc() barrier() #define smp_mb__after_atomic_inc() barrier() -#include +#include #endif /* _ASM_MICROBLAZE_ATOMIC_H */ diff --git a/arch/mips/include/asm/atomic.h b/arch/mips/include/asm/atomic.h index 1b332e15ab52..eb7f01cfd1ac 100644 --- a/arch/mips/include/asm/atomic.h +++ b/arch/mips/include/asm/atomic.h @@ -793,6 +793,6 @@ static __inline__ int atomic64_add_unless(atomic64_t *v, long a, long u) #define smp_mb__before_atomic_inc() smp_llsc_mb() #define smp_mb__after_atomic_inc() smp_llsc_mb() -#include +#include #endif /* _ASM_ATOMIC_H */ diff --git a/arch/mn10300/include/asm/atomic.h b/arch/mn10300/include/asm/atomic.h index bc064825f9b1..5bf5be9566de 100644 --- a/arch/mn10300/include/asm/atomic.h +++ b/arch/mn10300/include/asm/atomic.h @@ -151,7 +151,7 @@ static inline void atomic_clear_mask(unsigned long mask, unsigned long *addr) #define smp_mb__before_atomic_inc() barrier() #define smp_mb__after_atomic_inc() barrier() -#include +#include #endif /* __KERNEL__ */ #endif /* _ASM_ATOMIC_H */ diff --git a/arch/parisc/include/asm/atomic.h b/arch/parisc/include/asm/atomic.h index ada3e5364d82..7eeaff944360 100644 --- a/arch/parisc/include/asm/atomic.h +++ b/arch/parisc/include/asm/atomic.h @@ -338,6 +338,6 @@ static __inline__ int atomic64_add_unless(atomic64_t *v, long a, long u) #endif /* CONFIG_64BIT */ -#include +#include #endif /* _ASM_PARISC_ATOMIC_H_ */ diff --git a/arch/powerpc/include/asm/atomic.h b/arch/powerpc/include/asm/atomic.h index b401950f5259..b7d2d07b6f96 100644 --- a/arch/powerpc/include/asm/atomic.h +++ b/arch/powerpc/include/asm/atomic.h @@ -472,6 +472,6 @@ static __inline__ int atomic64_add_unless(atomic64_t *v, long a, long u) #endif /* __powerpc64__ */ -#include +#include #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_ATOMIC_H_ */ diff --git a/arch/s390/include/asm/atomic.h b/arch/s390/include/asm/atomic.h index de432f2de2d2..fca9dffcc669 100644 --- a/arch/s390/include/asm/atomic.h +++ b/arch/s390/include/asm/atomic.h @@ -275,6 +275,6 @@ static __inline__ int atomic64_add_unless(atomic64_t *v, #define smp_mb__before_atomic_inc() smp_mb() #define smp_mb__after_atomic_inc() smp_mb() -#include +#include #endif /* __KERNEL__ */ #endif /* __ARCH_S390_ATOMIC__ */ diff --git a/arch/sh/include/asm/atomic.h b/arch/sh/include/asm/atomic.h index 6327ffbb1992..a5647d0cd179 100644 --- a/arch/sh/include/asm/atomic.h +++ b/arch/sh/include/asm/atomic.h @@ -84,5 +84,5 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u) #define smp_mb__before_atomic_inc() barrier() #define smp_mb__after_atomic_inc() barrier() -#include +#include #endif /* __ASM_SH_ATOMIC_H */ diff --git a/arch/sparc/include/asm/atomic_32.h b/arch/sparc/include/asm/atomic_32.h index bb91b1248cd1..f0d343c3b956 100644 --- a/arch/sparc/include/asm/atomic_32.h +++ b/arch/sparc/include/asm/atomic_32.h @@ -161,5 +161,5 @@ static inline int __atomic24_sub(int i, atomic24_t *v) #endif /* !(__KERNEL__) */ -#include +#include #endif /* !(__ARCH_SPARC_ATOMIC__) */ diff --git a/arch/sparc/include/asm/atomic_64.h b/arch/sparc/include/asm/atomic_64.h index a0a706492696..f2e48009989e 100644 --- a/arch/sparc/include/asm/atomic_64.h +++ b/arch/sparc/include/asm/atomic_64.h @@ -114,5 +114,5 @@ static inline int atomic64_add_unless(atomic64_t *v, long a, long u) #define smp_mb__before_atomic_inc() barrier() #define smp_mb__after_atomic_inc() barrier() -#include +#include #endif /* !(__ARCH_SPARC64_ATOMIC__) */ diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h index 85b46fba4229..c83d31486081 100644 --- a/arch/x86/include/asm/atomic_32.h +++ b/arch/x86/include/asm/atomic_32.h @@ -247,5 +247,5 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u) #define smp_mb__before_atomic_inc() barrier() #define smp_mb__after_atomic_inc() barrier() -#include +#include #endif /* _ASM_X86_ATOMIC_32_H */ diff --git a/arch/x86/include/asm/atomic_64.h b/arch/x86/include/asm/atomic_64.h index 8c21731984da..0d6360220007 100644 --- a/arch/x86/include/asm/atomic_64.h +++ b/arch/x86/include/asm/atomic_64.h @@ -455,5 +455,5 @@ static inline void atomic_or_long(unsigned long *v1, unsigned long v2) #define smp_mb__before_atomic_inc() barrier() #define smp_mb__after_atomic_inc() barrier() -#include +#include #endif /* _ASM_X86_ATOMIC_64_H */ diff --git a/arch/xtensa/include/asm/atomic.h b/arch/xtensa/include/asm/atomic.h index 67ad67bed8c1..22d6dde42619 100644 --- a/arch/xtensa/include/asm/atomic.h +++ b/arch/xtensa/include/asm/atomic.h @@ -292,7 +292,7 @@ static inline void atomic_set_mask(unsigned int mask, atomic_t *v) #define smp_mb__before_atomic_inc() barrier() #define smp_mb__after_atomic_inc() barrier() -#include +#include #endif /* __KERNEL__ */ #endif /* _XTENSA_ATOMIC_H */ diff --git a/include/asm-generic/atomic-long.h b/include/asm-generic/atomic-long.h new file mode 100644 index 000000000000..76e27d66c055 --- /dev/null +++ b/include/asm-generic/atomic-long.h @@ -0,0 +1,258 @@ +#ifndef _ASM_GENERIC_ATOMIC_LONG_H +#define _ASM_GENERIC_ATOMIC_LONG_H +/* + * Copyright (C) 2005 Silicon Graphics, Inc. + * Christoph Lameter + * + * Allows to provide arch independent atomic definitions without the need to + * edit all arch specific atomic.h files. + */ + +#include + +/* + * Suppport for atomic_long_t + * + * Casts for parameters are avoided for existing atomic functions in order to + * avoid issues with cast-as-lval under gcc 4.x and other limitations that the + * macros of a platform may have. + */ + +#if BITS_PER_LONG == 64 + +typedef atomic64_t atomic_long_t; + +#define ATOMIC_LONG_INIT(i) ATOMIC64_INIT(i) + +static inline long atomic_long_read(atomic_long_t *l) +{ + atomic64_t *v = (atomic64_t *)l; + + return (long)atomic64_read(v); +} + +static inline void atomic_long_set(atomic_long_t *l, long i) +{ + atomic64_t *v = (atomic64_t *)l; + + atomic64_set(v, i); +} + +static inline void atomic_long_inc(atomic_long_t *l) +{ + atomic64_t *v = (atomic64_t *)l; + + atomic64_inc(v); +} + +static inline void atomic_long_dec(atomic_long_t *l) +{ + atomic64_t *v = (atomic64_t *)l; + + atomic64_dec(v); +} + +static inline void atomic_long_add(long i, atomic_long_t *l) +{ + atomic64_t *v = (atomic64_t *)l; + + atomic64_add(i, v); +} + +static inline void atomic_long_sub(long i, atomic_long_t *l) +{ + atomic64_t *v = (atomic64_t *)l; + + atomic64_sub(i, v); +} + +static inline int atomic_long_sub_and_test(long i, atomic_long_t *l) +{ + atomic64_t *v = (atomic64_t *)l; + + return atomic64_sub_and_test(i, v); +} + +static inline int atomic_long_dec_and_test(atomic_long_t *l) +{ + atomic64_t *v = (atomic64_t *)l; + + return atomic64_dec_and_test(v); +} + +static inline int atomic_long_inc_and_test(atomic_long_t *l) +{ + atomic64_t *v = (atomic64_t *)l; + + return atomic64_inc_and_test(v); +} + +static inline int atomic_long_add_negative(long i, atomic_long_t *l) +{ + atomic64_t *v = (atomic64_t *)l; + + return atomic64_add_negative(i, v); +} + +static inline long atomic_long_add_return(long i, atomic_long_t *l) +{ + atomic64_t *v = (atomic64_t *)l; + + return (long)atomic64_add_return(i, v); +} + +static inline long atomic_long_sub_return(long i, atomic_long_t *l) +{ + atomic64_t *v = (atomic64_t *)l; + + return (long)atomic64_sub_return(i, v); +} + +static inline long atomic_long_inc_return(atomic_long_t *l) +{ + atomic64_t *v = (atomic64_t *)l; + + return (long)atomic64_inc_return(v); +} + +static inline long atomic_long_dec_return(atomic_long_t *l) +{ + atomic64_t *v = (atomic64_t *)l; + + return (long)atomic64_dec_return(v); +} + +static inline long atomic_long_add_unless(atomic_long_t *l, long a, long u) +{ + atomic64_t *v = (atomic64_t *)l; + + return (long)atomic64_add_unless(v, a, u); +} + +#define atomic_long_inc_not_zero(l) atomic64_inc_not_zero((atomic64_t *)(l)) + +#define atomic_long_cmpxchg(l, old, new) \ + (atomic64_cmpxchg((atomic64_t *)(l), (old), (new))) +#define atomic_long_xchg(v, new) \ + (atomic64_xchg((atomic64_t *)(l), (new))) + +#else /* BITS_PER_LONG == 64 */ + +typedef atomic_t atomic_long_t; + +#define ATOMIC_LONG_INIT(i) ATOMIC_INIT(i) +static inline long atomic_long_read(atomic_long_t *l) +{ + atomic_t *v = (atomic_t *)l; + + return (long)atomic_read(v); +} + +static inline void atomic_long_set(atomic_long_t *l, long i) +{ + atomic_t *v = (atomic_t *)l; + + atomic_set(v, i); +} + +static inline void atomic_long_inc(atomic_long_t *l) +{ + atomic_t *v = (atomic_t *)l; + + atomic_inc(v); +} + +static inline void atomic_long_dec(atomic_long_t *l) +{ + atomic_t *v = (atomic_t *)l; + + atomic_dec(v); +} + +static inline void atomic_long_add(long i, atomic_long_t *l) +{ + atomic_t *v = (atomic_t *)l; + + atomic_add(i, v); +} + +static inline void atomic_long_sub(long i, atomic_long_t *l) +{ + atomic_t *v = (atomic_t *)l; + + atomic_sub(i, v); +} + +static inline int atomic_long_sub_and_test(long i, atomic_long_t *l) +{ + atomic_t *v = (atomic_t *)l; + + return atomic_sub_and_test(i, v); +} + +static inline int atomic_long_dec_and_test(atomic_long_t *l) +{ + atomic_t *v = (atomic_t *)l; + + return atomic_dec_and_test(v); +} + +static inline int atomic_long_inc_and_test(atomic_long_t *l) +{ + atomic_t *v = (atomic_t *)l; + + return atomic_inc_and_test(v); +} + +static inline int atomic_long_add_negative(long i, atomic_long_t *l) +{ + atomic_t *v = (atomic_t *)l; + + return atomic_add_negative(i, v); +} + +static inline long atomic_long_add_return(long i, atomic_long_t *l) +{ + atomic_t *v = (atomic_t *)l; + + return (long)atomic_add_return(i, v); +} + +static inline long atomic_long_sub_return(long i, atomic_long_t *l) +{ + atomic_t *v = (atomic_t *)l; + + return (long)atomic_sub_return(i, v); +} + +static inline long atomic_long_inc_return(atomic_long_t *l) +{ + atomic_t *v = (atomic_t *)l; + + return (long)atomic_inc_return(v); +} + +static inline long atomic_long_dec_return(atomic_long_t *l) +{ + atomic_t *v = (atomic_t *)l; + + return (long)atomic_dec_return(v); +} + +static inline long atomic_long_add_unless(atomic_long_t *l, long a, long u) +{ + atomic_t *v = (atomic_t *)l; + + return (long)atomic_add_unless(v, a, u); +} + +#define atomic_long_inc_not_zero(l) atomic_inc_not_zero((atomic_t *)(l)) + +#define atomic_long_cmpxchg(l, old, new) \ + (atomic_cmpxchg((atomic_t *)(l), (old), (new))) +#define atomic_long_xchg(v, new) \ + (atomic_xchg((atomic_t *)(v), (new))) + +#endif /* BITS_PER_LONG == 64 */ + +#endif /* _ASM_GENERIC_ATOMIC_LONG_H */ diff --git a/include/asm-generic/atomic.h b/include/asm-generic/atomic.h deleted file mode 100644 index 3673a13b6703..000000000000 --- a/include/asm-generic/atomic.h +++ /dev/null @@ -1,258 +0,0 @@ -#ifndef _ASM_GENERIC_ATOMIC_H -#define _ASM_GENERIC_ATOMIC_H -/* - * Copyright (C) 2005 Silicon Graphics, Inc. - * Christoph Lameter - * - * Allows to provide arch independent atomic definitions without the need to - * edit all arch specific atomic.h files. - */ - -#include - -/* - * Suppport for atomic_long_t - * - * Casts for parameters are avoided for existing atomic functions in order to - * avoid issues with cast-as-lval under gcc 4.x and other limitations that the - * macros of a platform may have. - */ - -#if BITS_PER_LONG == 64 - -typedef atomic64_t atomic_long_t; - -#define ATOMIC_LONG_INIT(i) ATOMIC64_INIT(i) - -static inline long atomic_long_read(atomic_long_t *l) -{ - atomic64_t *v = (atomic64_t *)l; - - return (long)atomic64_read(v); -} - -static inline void atomic_long_set(atomic_long_t *l, long i) -{ - atomic64_t *v = (atomic64_t *)l; - - atomic64_set(v, i); -} - -static inline void atomic_long_inc(atomic_long_t *l) -{ - atomic64_t *v = (atomic64_t *)l; - - atomic64_inc(v); -} - -static inline void atomic_long_dec(atomic_long_t *l) -{ - atomic64_t *v = (atomic64_t *)l; - - atomic64_dec(v); -} - -static inline void atomic_long_add(long i, atomic_long_t *l) -{ - atomic64_t *v = (atomic64_t *)l; - - atomic64_add(i, v); -} - -static inline void atomic_long_sub(long i, atomic_long_t *l) -{ - atomic64_t *v = (atomic64_t *)l; - - atomic64_sub(i, v); -} - -static inline int atomic_long_sub_and_test(long i, atomic_long_t *l) -{ - atomic64_t *v = (atomic64_t *)l; - - return atomic64_sub_and_test(i, v); -} - -static inline int atomic_long_dec_and_test(atomic_long_t *l) -{ - atomic64_t *v = (atomic64_t *)l; - - return atomic64_dec_and_test(v); -} - -static inline int atomic_long_inc_and_test(atomic_long_t *l) -{ - atomic64_t *v = (atomic64_t *)l; - - return atomic64_inc_and_test(v); -} - -static inline int atomic_long_add_negative(long i, atomic_long_t *l) -{ - atomic64_t *v = (atomic64_t *)l; - - return atomic64_add_negative(i, v); -} - -static inline long atomic_long_add_return(long i, atomic_long_t *l) -{ - atomic64_t *v = (atomic64_t *)l; - - return (long)atomic64_add_return(i, v); -} - -static inline long atomic_long_sub_return(long i, atomic_long_t *l) -{ - atomic64_t *v = (atomic64_t *)l; - - return (long)atomic64_sub_return(i, v); -} - -static inline long atomic_long_inc_return(atomic_long_t *l) -{ - atomic64_t *v = (atomic64_t *)l; - - return (long)atomic64_inc_return(v); -} - -static inline long atomic_long_dec_return(atomic_long_t *l) -{ - atomic64_t *v = (atomic64_t *)l; - - return (long)atomic64_dec_return(v); -} - -static inline long atomic_long_add_unless(atomic_long_t *l, long a, long u) -{ - atomic64_t *v = (atomic64_t *)l; - - return (long)atomic64_add_unless(v, a, u); -} - -#define atomic_long_inc_not_zero(l) atomic64_inc_not_zero((atomic64_t *)(l)) - -#define atomic_long_cmpxchg(l, old, new) \ - (atomic64_cmpxchg((atomic64_t *)(l), (old), (new))) -#define atomic_long_xchg(v, new) \ - (atomic64_xchg((atomic64_t *)(l), (new))) - -#else /* BITS_PER_LONG == 64 */ - -typedef atomic_t atomic_long_t; - -#define ATOMIC_LONG_INIT(i) ATOMIC_INIT(i) -static inline long atomic_long_read(atomic_long_t *l) -{ - atomic_t *v = (atomic_t *)l; - - return (long)atomic_read(v); -} - -static inline void atomic_long_set(atomic_long_t *l, long i) -{ - atomic_t *v = (atomic_t *)l; - - atomic_set(v, i); -} - -static inline void atomic_long_inc(atomic_long_t *l) -{ - atomic_t *v = (atomic_t *)l; - - atomic_inc(v); -} - -static inline void atomic_long_dec(atomic_long_t *l) -{ - atomic_t *v = (atomic_t *)l; - - atomic_dec(v); -} - -static inline void atomic_long_add(long i, atomic_long_t *l) -{ - atomic_t *v = (atomic_t *)l; - - atomic_add(i, v); -} - -static inline void atomic_long_sub(long i, atomic_long_t *l) -{ - atomic_t *v = (atomic_t *)l; - - atomic_sub(i, v); -} - -static inline int atomic_long_sub_and_test(long i, atomic_long_t *l) -{ - atomic_t *v = (atomic_t *)l; - - return atomic_sub_and_test(i, v); -} - -static inline int atomic_long_dec_and_test(atomic_long_t *l) -{ - atomic_t *v = (atomic_t *)l; - - return atomic_dec_and_test(v); -} - -static inline int atomic_long_inc_and_test(atomic_long_t *l) -{ - atomic_t *v = (atomic_t *)l; - - return atomic_inc_and_test(v); -} - -static inline int atomic_long_add_negative(long i, atomic_long_t *l) -{ - atomic_t *v = (atomic_t *)l; - - return atomic_add_negative(i, v); -} - -static inline long atomic_long_add_return(long i, atomic_long_t *l) -{ - atomic_t *v = (atomic_t *)l; - - return (long)atomic_add_return(i, v); -} - -static inline long atomic_long_sub_return(long i, atomic_long_t *l) -{ - atomic_t *v = (atomic_t *)l; - - return (long)atomic_sub_return(i, v); -} - -static inline long atomic_long_inc_return(atomic_long_t *l) -{ - atomic_t *v = (atomic_t *)l; - - return (long)atomic_inc_return(v); -} - -static inline long atomic_long_dec_return(atomic_long_t *l) -{ - atomic_t *v = (atomic_t *)l; - - return (long)atomic_dec_return(v); -} - -static inline long atomic_long_add_unless(atomic_long_t *l, long a, long u) -{ - atomic_t *v = (atomic_t *)l; - - return (long)atomic_add_unless(v, a, u); -} - -#define atomic_long_inc_not_zero(l) atomic_inc_not_zero((atomic_t *)(l)) - -#define atomic_long_cmpxchg(l, old, new) \ - (atomic_cmpxchg((atomic_t *)(l), (old), (new))) -#define atomic_long_xchg(v, new) \ - (atomic_xchg((atomic_t *)(v), (new))) - -#endif /* BITS_PER_LONG == 64 */ - -#endif /* _ASM_GENERIC_ATOMIC_H */ diff --git a/include/asm-generic/bitops/atomic.h b/include/asm-generic/bitops/atomic.h index 4657f3e410fc..c8946465e63a 100644 --- a/include/asm-generic/bitops/atomic.h +++ b/include/asm-generic/bitops/atomic.h @@ -2,6 +2,7 @@ #define _ASM_GENERIC_BITOPS_ATOMIC_H_ #include +#include #ifdef CONFIG_SMP #include -- cgit v1.2.3 From 5b17e1cd8928ae65932758ce6478ac6d3e9a86b2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 13 May 2009 22:56:30 +0000 Subject: asm-generic: rename page.h and uaccess.h The current asm-generic/page.h only contains the get_order function, and asm-generic/uaccess.h only implements unaligned accesses. This renames the file to getorder.h and uaccess-unaligned.h to make room for new page.h and uaccess.h file that will be usable by all simple (e.g. nommu) architectures. Signed-off-by: Remis Lima Baima Signed-off-by: Arnd Bergmann --- arch/alpha/include/asm/page.h | 2 +- arch/arm/include/asm/page.h | 2 +- arch/blackfin/include/asm/page.h | 2 +- arch/cris/include/asm/page.h | 2 +- arch/frv/include/asm/page.h | 2 +- arch/h8300/include/asm/page.h | 2 +- arch/m32r/include/asm/page.h | 2 +- arch/m68k/include/asm/page_mm.h | 2 +- arch/m68k/include/asm/page_no.h | 2 +- arch/microblaze/include/asm/page.h | 2 +- arch/mips/include/asm/page.h | 2 +- arch/parisc/include/asm/page.h | 2 +- arch/parisc/include/asm/uaccess.h | 2 +- arch/powerpc/include/asm/page_32.h | 2 +- arch/powerpc/include/asm/page_64.h | 2 +- arch/s390/include/asm/page.h | 2 +- arch/sh/include/asm/page.h | 2 +- arch/sparc/include/asm/page_32.h | 2 +- arch/sparc/include/asm/page_64.h | 2 +- arch/sparc/include/asm/uaccess_64.h | 2 +- arch/um/include/asm/page.h | 2 +- arch/x86/include/asm/page.h | 2 +- arch/xtensa/include/asm/page.h | 2 +- include/asm-generic/getorder.h | 24 ++++++++++++++++++++++++ include/asm-generic/page.h | 24 ------------------------ include/asm-generic/uaccess-unaligned.h | 26 ++++++++++++++++++++++++++ include/asm-generic/uaccess.h | 26 -------------------------- 27 files changed, 73 insertions(+), 73 deletions(-) create mode 100644 include/asm-generic/getorder.h delete mode 100644 include/asm-generic/page.h create mode 100644 include/asm-generic/uaccess-unaligned.h delete mode 100644 include/asm-generic/uaccess.h (limited to 'arch/x86') diff --git a/arch/alpha/include/asm/page.h b/arch/alpha/include/asm/page.h index 0995f9d13417..07af062544fb 100644 --- a/arch/alpha/include/asm/page.h +++ b/arch/alpha/include/asm/page.h @@ -93,6 +93,6 @@ typedef struct page *pgtable_t; VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) #include -#include +#include #endif /* _ALPHA_PAGE_H */ diff --git a/arch/arm/include/asm/page.h b/arch/arm/include/asm/page.h index 7b522770f29d..be962c1349c4 100644 --- a/arch/arm/include/asm/page.h +++ b/arch/arm/include/asm/page.h @@ -202,6 +202,6 @@ typedef struct page *pgtable_t; (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) | \ VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) -#include +#include #endif diff --git a/arch/blackfin/include/asm/page.h b/arch/blackfin/include/asm/page.h index 344f6a8c1f22..3ea2016a1d4a 100644 --- a/arch/blackfin/include/asm/page.h +++ b/arch/blackfin/include/asm/page.h @@ -81,7 +81,7 @@ extern unsigned long memory_end; #define virt_addr_valid(kaddr) (((void *)(kaddr) >= (void *)PAGE_OFFSET) && \ ((void *)(kaddr) < (void *)memory_end)) -#include +#include #endif /* __ASSEMBLY__ */ diff --git a/arch/cris/include/asm/page.h b/arch/cris/include/asm/page.h index f3fdbd09c34c..be45ee366be9 100644 --- a/arch/cris/include/asm/page.h +++ b/arch/cris/include/asm/page.h @@ -68,7 +68,7 @@ typedef struct page *pgtable_t; VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) #include -#include +#include #endif /* _CRIS_PAGE_H */ diff --git a/arch/frv/include/asm/page.h b/arch/frv/include/asm/page.h index bd9c220094c7..25c6a5002355 100644 --- a/arch/frv/include/asm/page.h +++ b/arch/frv/include/asm/page.h @@ -73,6 +73,6 @@ extern unsigned long max_pfn; #endif /* __ASSEMBLY__ */ #include -#include +#include #endif /* _ASM_PAGE_H */ diff --git a/arch/h8300/include/asm/page.h b/arch/h8300/include/asm/page.h index 0b6acf0b03aa..837381a2df46 100644 --- a/arch/h8300/include/asm/page.h +++ b/arch/h8300/include/asm/page.h @@ -73,6 +73,6 @@ extern unsigned long memory_end; #endif /* __ASSEMBLY__ */ #include -#include +#include #endif /* _H8300_PAGE_H */ diff --git a/arch/m32r/include/asm/page.h b/arch/m32r/include/asm/page.h index c9333089fe11..11777f7a5628 100644 --- a/arch/m32r/include/asm/page.h +++ b/arch/m32r/include/asm/page.h @@ -82,6 +82,6 @@ typedef struct page *pgtable_t; #define devmem_is_allowed(x) 1 #include -#include +#include #endif /* _ASM_M32R_PAGE_H */ diff --git a/arch/m68k/include/asm/page_mm.h b/arch/m68k/include/asm/page_mm.h index a34b8bad7847..d009f3ea39ab 100644 --- a/arch/m68k/include/asm/page_mm.h +++ b/arch/m68k/include/asm/page_mm.h @@ -223,6 +223,6 @@ static inline __attribute_const__ int __virt_to_node_shift(void) #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) -#include +#include #endif /* _M68K_PAGE_H */ diff --git a/arch/m68k/include/asm/page_no.h b/arch/m68k/include/asm/page_no.h index 3a1ede4544cb..9aa3f90f4855 100644 --- a/arch/m68k/include/asm/page_no.h +++ b/arch/m68k/include/asm/page_no.h @@ -72,6 +72,6 @@ extern unsigned long memory_end; #endif /* __ASSEMBLY__ */ -#include +#include #endif /* _M68KNOMMU_PAGE_H */ diff --git a/arch/microblaze/include/asm/page.h b/arch/microblaze/include/asm/page.h index 7238dcfcc517..962c210e5b9a 100644 --- a/arch/microblaze/include/asm/page.h +++ b/arch/microblaze/include/asm/page.h @@ -135,6 +135,6 @@ extern unsigned int memory_size; #endif /* __KERNEL__ */ #include -#include +#include #endif /* _ASM_MICROBLAZE_PAGE_H */ diff --git a/arch/mips/include/asm/page.h b/arch/mips/include/asm/page.h index 9f946e4ca057..72c80d2034c2 100644 --- a/arch/mips/include/asm/page.h +++ b/arch/mips/include/asm/page.h @@ -189,6 +189,6 @@ typedef struct { unsigned long pgprot; } pgprot_t; #define CAC_ADDR(addr) ((addr) - UNCAC_BASE + PAGE_OFFSET) #include -#include +#include #endif /* _ASM_PAGE_H */ diff --git a/arch/parisc/include/asm/page.h b/arch/parisc/include/asm/page.h index 7bc5125d7d4c..a84cc1f925f6 100644 --- a/arch/parisc/include/asm/page.h +++ b/arch/parisc/include/asm/page.h @@ -159,6 +159,6 @@ extern int npmem_ranges; VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) #include -#include +#include #endif /* _PARISC_PAGE_H */ diff --git a/arch/parisc/include/asm/uaccess.h b/arch/parisc/include/asm/uaccess.h index cd4c0b2a8e70..7cf799d70b4c 100644 --- a/arch/parisc/include/asm/uaccess.h +++ b/arch/parisc/include/asm/uaccess.h @@ -7,7 +7,7 @@ #include #include #include -#include +#include #define VERIFY_READ 0 #define VERIFY_WRITE 1 diff --git a/arch/powerpc/include/asm/page_32.h b/arch/powerpc/include/asm/page_32.h index a0e3f6e6b4ee..bd0849dbcaaa 100644 --- a/arch/powerpc/include/asm/page_32.h +++ b/arch/powerpc/include/asm/page_32.h @@ -41,7 +41,7 @@ extern void clear_pages(void *page, int order); static inline void clear_page(void *page) { clear_pages(page, 0); } extern void copy_page(void *to, void *from); -#include +#include #define PGD_T_LOG2 (__builtin_ffs(sizeof(pgd_t)) - 1) #define PTE_T_LOG2 (__builtin_ffs(sizeof(pte_t)) - 1) diff --git a/arch/powerpc/include/asm/page_64.h b/arch/powerpc/include/asm/page_64.h index 043bfdfe4f73..5817a3b747e5 100644 --- a/arch/powerpc/include/asm/page_64.h +++ b/arch/powerpc/include/asm/page_64.h @@ -180,6 +180,6 @@ do { \ (test_thread_flag(TIF_32BIT) ? \ VM_STACK_DEFAULT_FLAGS32 : VM_STACK_DEFAULT_FLAGS64) -#include +#include #endif /* _ASM_POWERPC_PAGE_64_H */ diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index 32e8f6aa4384..3e3594d01f83 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -150,7 +150,7 @@ void arch_alloc_page(struct page *page, int order); VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) #include -#include +#include #define __HAVE_ARCH_GATE_AREA 1 diff --git a/arch/sh/include/asm/page.h b/arch/sh/include/asm/page.h index 9c6d21ec0240..49592c780a6e 100644 --- a/arch/sh/include/asm/page.h +++ b/arch/sh/include/asm/page.h @@ -163,7 +163,7 @@ typedef struct page *pgtable_t; VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) #include -#include +#include /* vDSO support */ #ifdef CONFIG_VSYSCALL diff --git a/arch/sparc/include/asm/page_32.h b/arch/sparc/include/asm/page_32.h index d1806edc0958..f72080bdda94 100644 --- a/arch/sparc/include/asm/page_32.h +++ b/arch/sparc/include/asm/page_32.h @@ -152,6 +152,6 @@ extern unsigned long pfn_base; VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) #include -#include +#include #endif /* _SPARC_PAGE_H */ diff --git a/arch/sparc/include/asm/page_64.h b/arch/sparc/include/asm/page_64.h index 4274ed13ddb2..f0d09b401036 100644 --- a/arch/sparc/include/asm/page_64.h +++ b/arch/sparc/include/asm/page_64.h @@ -132,6 +132,6 @@ typedef struct page *pgtable_t; #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) -#include +#include #endif /* _SPARC64_PAGE_H */ diff --git a/arch/sparc/include/asm/uaccess_64.h b/arch/sparc/include/asm/uaccess_64.h index c64e767a3e4b..a38c03238918 100644 --- a/arch/sparc/include/asm/uaccess_64.h +++ b/arch/sparc/include/asm/uaccess_64.h @@ -12,7 +12,7 @@ #include #include #include -#include +#include #endif #ifndef __ASSEMBLY__ diff --git a/arch/um/include/asm/page.h b/arch/um/include/asm/page.h index 55f28a0bae6d..4cc9b6cf480a 100644 --- a/arch/um/include/asm/page.h +++ b/arch/um/include/asm/page.h @@ -116,7 +116,7 @@ extern unsigned long uml_physmem; #define virt_addr_valid(v) pfn_valid(phys_to_pfn(__pa(v))) #include -#include +#include #endif /* __ASSEMBLY__ */ #endif /* __UM_PAGE_H */ diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h index 89ed9d70b0aa..625c3f0e741a 100644 --- a/arch/x86/include/asm/page.h +++ b/arch/x86/include/asm/page.h @@ -56,7 +56,7 @@ extern bool __virt_addr_valid(unsigned long kaddr); #endif /* __ASSEMBLY__ */ #include -#include +#include #define __HAVE_ARCH_GATE_AREA 1 diff --git a/arch/xtensa/include/asm/page.h b/arch/xtensa/include/asm/page.h index 17e0c5383b10..161bb89e98c8 100644 --- a/arch/xtensa/include/asm/page.h +++ b/arch/xtensa/include/asm/page.h @@ -129,7 +129,7 @@ static inline __attribute_const__ int get_order(unsigned long size) #else -# include +# include #endif diff --git a/include/asm-generic/getorder.h b/include/asm-generic/getorder.h new file mode 100644 index 000000000000..67e7245dc9b3 --- /dev/null +++ b/include/asm-generic/getorder.h @@ -0,0 +1,24 @@ +#ifndef __ASM_GENERIC_GETORDER_H +#define __ASM_GENERIC_GETORDER_H + +#ifndef __ASSEMBLY__ + +#include + +/* Pure 2^n version of get_order */ +static inline __attribute_const__ int get_order(unsigned long size) +{ + int order; + + size = (size - 1) >> (PAGE_SHIFT - 1); + order = -1; + do { + size >>= 1; + order++; + } while (size); + return order; +} + +#endif /* __ASSEMBLY__ */ + +#endif /* __ASM_GENERIC_GETORDER_H */ diff --git a/include/asm-generic/page.h b/include/asm-generic/page.h deleted file mode 100644 index 14db733b8e68..000000000000 --- a/include/asm-generic/page.h +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef _ASM_GENERIC_PAGE_H -#define _ASM_GENERIC_PAGE_H - -#ifndef __ASSEMBLY__ - -#include - -/* Pure 2^n version of get_order */ -static __inline__ __attribute_const__ int get_order(unsigned long size) -{ - int order; - - size = (size - 1) >> (PAGE_SHIFT - 1); - order = -1; - do { - size >>= 1; - order++; - } while (size); - return order; -} - -#endif /* __ASSEMBLY__ */ - -#endif /* _ASM_GENERIC_PAGE_H */ diff --git a/include/asm-generic/uaccess-unaligned.h b/include/asm-generic/uaccess-unaligned.h new file mode 100644 index 000000000000..67deb898f0c5 --- /dev/null +++ b/include/asm-generic/uaccess-unaligned.h @@ -0,0 +1,26 @@ +#ifndef __ASM_GENERIC_UACCESS_UNALIGNED_H +#define __ASM_GENERIC_UACCESS_UNALIGNED_H + +/* + * This macro should be used instead of __get_user() when accessing + * values at locations that are not known to be aligned. + */ +#define __get_user_unaligned(x, ptr) \ +({ \ + __typeof__ (*(ptr)) __x; \ + __copy_from_user(&__x, (ptr), sizeof(*(ptr))) ? -EFAULT : 0; \ + (x) = __x; \ +}) + + +/* + * This macro should be used instead of __put_user() when accessing + * values at locations that are not known to be aligned. + */ +#define __put_user_unaligned(x, ptr) \ +({ \ + __typeof__ (*(ptr)) __x = (x); \ + __copy_to_user((ptr), &__x, sizeof(*(ptr))) ? -EFAULT : 0; \ +}) + +#endif /* __ASM_GENERIC_UACCESS_UNALIGNED_H */ diff --git a/include/asm-generic/uaccess.h b/include/asm-generic/uaccess.h deleted file mode 100644 index 549cb3a1640a..000000000000 --- a/include/asm-generic/uaccess.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef _ASM_GENERIC_UACCESS_H_ -#define _ASM_GENERIC_UACCESS_H_ - -/* - * This macro should be used instead of __get_user() when accessing - * values at locations that are not known to be aligned. - */ -#define __get_user_unaligned(x, ptr) \ -({ \ - __typeof__ (*(ptr)) __x; \ - __copy_from_user(&__x, (ptr), sizeof(*(ptr))) ? -EFAULT : 0; \ - (x) = __x; \ -}) - - -/* - * This macro should be used instead of __put_user() when accessing - * values at locations that are not known to be aligned. - */ -#define __put_user_unaligned(x, ptr) \ -({ \ - __typeof__ (*(ptr)) __x = (x); \ - __copy_to_user((ptr), &__x, sizeof(*(ptr))) ? -EFAULT : 0; \ -}) - -#endif /* _ASM_GENERIC_UACCESS_H */ -- cgit v1.2.3 From 8c5dd8f43367f4f266dd616f11658005bc2d20ef Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 4 Jun 2009 19:14:22 -0700 Subject: x86: handle initrd that extends into unusable memory On a system where system memory (according e820) is not covered by mtrr, mtrr_trim_memory converts a portion of memory to reserved, but bootloader has already put the initrd in that range. Thus, we need to have 64bit to use relocate_initrd too. [ Impact: fix using initrd when mtrr_trim_memory happen ] Signed-off-by: Yinghai Lu Signed-off-by: H. Peter Anvin Cc: stable@kernel.org --- arch/x86/kernel/setup.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d1c636bf31a7..be5ae80f897f 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -301,15 +301,13 @@ static void __init reserve_brk(void) #ifdef CONFIG_BLK_DEV_INITRD -#ifdef CONFIG_X86_32 - #define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) static void __init relocate_initrd(void) { u64 ramdisk_image = boot_params.hdr.ramdisk_image; u64 ramdisk_size = boot_params.hdr.ramdisk_size; - u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT; + u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; u64 ramdisk_here; unsigned long slop, clen, mapaddr; char *p, *q; @@ -365,14 +363,13 @@ static void __init relocate_initrd(void) ramdisk_image, ramdisk_image + ramdisk_size - 1, ramdisk_here, ramdisk_here + ramdisk_size - 1); } -#endif static void __init reserve_initrd(void) { u64 ramdisk_image = boot_params.hdr.ramdisk_image; u64 ramdisk_size = boot_params.hdr.ramdisk_size; u64 ramdisk_end = ramdisk_image + ramdisk_size; - u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT; + u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; if (!boot_params.hdr.type_of_loader || !ramdisk_image || !ramdisk_size) @@ -402,14 +399,8 @@ static void __init reserve_initrd(void) return; } -#ifdef CONFIG_X86_32 relocate_initrd(); -#else - printk(KERN_ERR "initrd extends beyond end of memory " - "(0x%08llx > 0x%08llx)\ndisabling initrd\n", - ramdisk_end, end_of_lowmem); - initrd_start = 0; -#endif + free_early(ramdisk_image, ramdisk_end); } #else -- cgit v1.2.3 From 1260866a271abc84274345580f3d613d3bceff87 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 11 May 2009 13:22:00 +0100 Subject: x86: Provide _sdata in the vmlinux.lds.S file _sdata is a common symbol defined by many architectures and made available to the kernel via asm-generic/sections.h. Kmemleak uses this symbol when scanning the data sections. [ Impact: add new global symbol ] Signed-off-by: Catalin Marinas LKML-Reference: <20090511122105.26556.96593.stgit@pc1117.cambridge.arm.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 4c85b2e2bb65..367e87882041 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -108,6 +108,8 @@ SECTIONS /* Data */ . = ALIGN(PAGE_SIZE); .data : AT(ADDR(.data) - LOAD_OFFSET) { + /* Start of data section */ + _sdata = .; DATA_DATA CONSTRUCTORS -- cgit v1.2.3 From 55cd63676e0c5710fbe1ea86dfd9f8ea9aaa90f2 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 12 Jun 2009 11:36:52 +0300 Subject: x86: make zap_low_mapping could be used early Only one cpu is there, just call __flush_tlb for it. Fixes the following boot warning on x86: [ 0.000000] Memory: 885032k/915540k available (5993k kernel code, 29844k reserved, 3842k data, 428k init, 0k highmem) [ 0.000000] virtual kernel memory layout: [ 0.000000] fixmap : 0xffe17000 - 0xfffff000 (1952 kB) [ 0.000000] vmalloc : 0xf8615000 - 0xffe15000 ( 120 MB) [ 0.000000] lowmem : 0xc0000000 - 0xf7e15000 ( 894 MB) [ 0.000000] .init : 0xc19a5000 - 0xc1a10000 ( 428 kB) [ 0.000000] .data : 0xc15da4bb - 0xc199af6c (3842 kB) [ 0.000000] .text : 0xc1000000 - 0xc15da4bb (5993 kB) [ 0.000000] Checking if this processor honours the WP bit even in supervisor mode...Ok. [ 0.000000] ------------[ cut here ]------------ [ 0.000000] WARNING: at kernel/smp.c:369 smp_call_function_many+0x50/0x1b0() [ 0.000000] Hardware name: System Product Name [ 0.000000] Modules linked in: [ 0.000000] Pid: 0, comm: swapper Not tainted 2.6.30-tip #52504 [ 0.000000] Call Trace: [ 0.000000] [] warn_slowpath_common+0x65/0x95 [ 0.000000] [] warn_slowpath_null+0x12/0x15 [ 0.000000] [] smp_call_function_many+0x50/0x1b0 [ 0.000000] [] ? do_flush_tlb_all+0x0/0x41 [ 0.000000] [] ? do_flush_tlb_all+0x0/0x41 [ 0.000000] [] smp_call_function+0x31/0x58 [ 0.000000] [] ? do_flush_tlb_all+0x0/0x41 [ 0.000000] [] on_each_cpu+0x26/0x65 [ 0.000000] [] flush_tlb_all+0x19/0x1b [ 0.000000] [] zap_low_mappings+0x4d/0x56 [ 0.000000] [] ? printk+0x14/0x17 [ 0.000000] [] mem_init+0x23d/0x245 [ 0.000000] [] start_kernel+0x17a/0x2d5 [ 0.000000] [] ? unknown_bootoption+0x0/0x19a [ 0.000000] [] __init_begin+0x39/0x41 [ 0.000000] ---[ end trace 4eaa2a86a8e2da22 ]--- Reported-by: Ingo Molnar Signed-off-by: Yinghai Lu Signed-off-by: Pekka Enberg --- arch/x86/include/asm/tlbflush.h | 2 +- arch/x86/kernel/smpboot.c | 2 +- arch/x86/mm/init_32.c | 10 +++++++--- 3 files changed, 9 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index a5ecc9c33e92..7f3eba08e7de 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -172,6 +172,6 @@ static inline void flush_tlb_kernel_range(unsigned long start, flush_tlb_all(); } -extern void zap_low_mappings(void); +extern void zap_low_mappings(bool early); #endif /* _ASM_X86_TLBFLUSH_H */ diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 7c80007ea5f7..2fecda69ee64 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -873,7 +873,7 @@ int __cpuinit native_cpu_up(unsigned int cpu) err = do_boot_cpu(apicid, cpu); - zap_low_mappings(); + zap_low_mappings(false); low_mappings = 0; #else err = do_boot_cpu(apicid, cpu); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 949708d7a481..9ff3c0816d15 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -564,7 +564,7 @@ static inline void save_pg_dir(void) } #endif /* !CONFIG_ACPI_SLEEP */ -void zap_low_mappings(void) +void zap_low_mappings(bool early) { int i; @@ -581,7 +581,11 @@ void zap_low_mappings(void) set_pgd(swapper_pg_dir+i, __pgd(0)); #endif } - flush_tlb_all(); + + if (early) + __flush_tlb(); + else + flush_tlb_all(); } pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP); @@ -956,7 +960,7 @@ void __init mem_init(void) test_wp_bit(); save_pg_dir(); - zap_low_mappings(); + zap_low_mappings(true); } #ifdef CONFIG_MEMORY_HOTPLUG -- cgit v1.2.3 From dff5da6d09daaab40a8741dce0ed3c2e94079de2 Mon Sep 17 00:00:00 2001 From: Yong Wang Date: Fri, 12 Jun 2009 16:08:55 +0800 Subject: perf_counter/x86: Add a quirk for Atom processors The fixed-function performance counters do not work on current Atom processors. Use the general-purpose ones instead. Signed-off-by: Yong Wang Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: <20090612080855.GA2286@ywang-moblin2.bj.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 895c82e78455..275bc142cd5d 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -968,6 +968,13 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) if (!x86_pmu.num_counters_fixed) return -1; + /* + * Quirk, IA32_FIXED_CTRs do not work on current Atom processors: + */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && + boot_cpu_data.x86_model == 28) + return -1; + event = hwc->config & ARCH_PERFMON_EVENT_MASK; if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) -- cgit v1.2.3 From 2d5bf28fb9e3c178db4f5536e2fe38d3a5ed7f40 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Wed, 3 Jun 2009 21:46:09 -0400 Subject: x86 module: merge the same functions in module_32.c and module_64.c Merge the same functions both in module_32.c and module_64.c into module.c. This is the first step to merge both of them finally. Signed-off-by: WANG Cong Cc: Ingo Molnar Signed-off-by: Rusty Russell --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/module.c | 98 +++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/module_32.c | 62 ---------------------------- arch/x86/kernel/module_64.c | 61 ---------------------------- 4 files changed, 99 insertions(+), 124 deletions(-) create mode 100644 arch/x86/kernel/module.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 4f78bd682125..1cd55c769cd0 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -73,7 +73,7 @@ obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o obj-$(CONFIG_KPROBES) += kprobes.o -obj-$(CONFIG_MODULES) += module_$(BITS).o +obj-$(CONFIG_MODULES) += module.o module_$(BITS).o obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o obj-$(CONFIG_KGDB) += kgdb.o diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c new file mode 100644 index 000000000000..1ea42d083494 --- /dev/null +++ b/arch/x86/kernel/module.c @@ -0,0 +1,98 @@ +/* Kernel module help for x86. + Copyright (C) 2001 Rusty Russell. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(fmt...) +#endif + +/* Free memory returned from module_alloc */ +void module_free(struct module *mod, void *module_region) +{ + vfree(module_region); + /* FIXME: If module_region == mod->init_region, trim exception + table entries. */ +} + +/* We don't need anything special. */ +int module_frob_arch_sections(Elf_Ehdr *hdr, + Elf_Shdr *sechdrs, + char *secstrings, + struct module *mod) +{ + return 0; +} + +int module_finalize(const Elf_Ehdr *hdr, + const Elf_Shdr *sechdrs, + struct module *me) +{ + const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, + *para = NULL; + char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; + + for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { + if (!strcmp(".text", secstrings + s->sh_name)) + text = s; + if (!strcmp(".altinstructions", secstrings + s->sh_name)) + alt = s; + if (!strcmp(".smp_locks", secstrings + s->sh_name)) + locks = s; + if (!strcmp(".parainstructions", secstrings + s->sh_name)) + para = s; + } + + if (alt) { + /* patch .altinstructions */ + void *aseg = (void *)alt->sh_addr; + apply_alternatives(aseg, aseg + alt->sh_size); + } + if (locks && text) { + void *lseg = (void *)locks->sh_addr; + void *tseg = (void *)text->sh_addr; + alternatives_smp_module_add(me, me->name, + lseg, lseg + locks->sh_size, + tseg, tseg + text->sh_size); + } + + if (para) { + void *pseg = (void *)para->sh_addr; + apply_paravirt(pseg, pseg + para->sh_size); + } + + return module_bug_finalize(hdr, sechdrs, me); +} + +void module_arch_cleanup(struct module *mod) +{ + alternatives_smp_module_del(mod); + module_bug_cleanup(mod); +} diff --git a/arch/x86/kernel/module_32.c b/arch/x86/kernel/module_32.c index 0edd819050e7..6145e68feb04 100644 --- a/arch/x86/kernel/module_32.c +++ b/arch/x86/kernel/module_32.c @@ -37,23 +37,6 @@ void *module_alloc(unsigned long size) } -/* Free memory returned from module_alloc */ -void module_free(struct module *mod, void *module_region) -{ - vfree(module_region); - /* FIXME: If module_region == mod->init_region, trim exception - table entries. */ -} - -/* We don't need anything special. */ -int module_frob_arch_sections(Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, - char *secstrings, - struct module *mod) -{ - return 0; -} - int apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex, @@ -105,48 +88,3 @@ int apply_relocate_add(Elf32_Shdr *sechdrs, return -ENOEXEC; } -int module_finalize(const Elf_Ehdr *hdr, - const Elf_Shdr *sechdrs, - struct module *me) -{ - const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, - *para = NULL; - char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; - - for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { - if (!strcmp(".text", secstrings + s->sh_name)) - text = s; - if (!strcmp(".altinstructions", secstrings + s->sh_name)) - alt = s; - if (!strcmp(".smp_locks", secstrings + s->sh_name)) - locks = s; - if (!strcmp(".parainstructions", secstrings + s->sh_name)) - para = s; - } - - if (alt) { - /* patch .altinstructions */ - void *aseg = (void *)alt->sh_addr; - apply_alternatives(aseg, aseg + alt->sh_size); - } - if (locks && text) { - void *lseg = (void *)locks->sh_addr; - void *tseg = (void *)text->sh_addr; - alternatives_smp_module_add(me, me->name, - lseg, lseg + locks->sh_size, - tseg, tseg + text->sh_size); - } - - if (para) { - void *pseg = (void *)para->sh_addr; - apply_paravirt(pseg, pseg + para->sh_size); - } - - return module_bug_finalize(hdr, sechdrs, me); -} - -void module_arch_cleanup(struct module *mod) -{ - alternatives_smp_module_del(mod); - module_bug_cleanup(mod); -} diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module_64.c index c23880b90b5c..68ec7d87bb47 100644 --- a/arch/x86/kernel/module_64.c +++ b/arch/x86/kernel/module_64.c @@ -33,13 +33,6 @@ #define DEBUGP(fmt...) #ifndef CONFIG_UML -void module_free(struct module *mod, void *module_region) -{ - vfree(module_region); - /* FIXME: If module_region == mod->init_region, trim exception - table entries. */ -} - void *module_alloc(unsigned long size) { struct vm_struct *area; @@ -58,15 +51,6 @@ void *module_alloc(unsigned long size) } #endif -/* We don't need anything special. */ -int module_frob_arch_sections(Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, - char *secstrings, - struct module *mod) -{ - return 0; -} - int apply_relocate_add(Elf64_Shdr *sechdrs, const char *strtab, unsigned int symindex, @@ -147,48 +131,3 @@ int apply_relocate(Elf_Shdr *sechdrs, return -ENOSYS; } -int module_finalize(const Elf_Ehdr *hdr, - const Elf_Shdr *sechdrs, - struct module *me) -{ - const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, - *para = NULL; - char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; - - for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { - if (!strcmp(".text", secstrings + s->sh_name)) - text = s; - if (!strcmp(".altinstructions", secstrings + s->sh_name)) - alt = s; - if (!strcmp(".smp_locks", secstrings + s->sh_name)) - locks = s; - if (!strcmp(".parainstructions", secstrings + s->sh_name)) - para = s; - } - - if (alt) { - /* patch .altinstructions */ - void *aseg = (void *)alt->sh_addr; - apply_alternatives(aseg, aseg + alt->sh_size); - } - if (locks && text) { - void *lseg = (void *)locks->sh_addr; - void *tseg = (void *)text->sh_addr; - alternatives_smp_module_add(me, me->name, - lseg, lseg + locks->sh_size, - tseg, tseg + text->sh_size); - } - - if (para) { - void *pseg = (void *)para->sh_addr; - apply_paravirt(pseg, pseg + para->sh_size); - } - - return module_bug_finalize(hdr, sechdrs, me); -} - -void module_arch_cleanup(struct module *mod) -{ - alternatives_smp_module_del(mod); - module_bug_cleanup(mod); -} -- cgit v1.2.3 From 0fdc83b950df9e2eb45db6fca9c3d92c66fd5028 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Wed, 3 Jun 2009 21:46:19 -0400 Subject: x86 module: merge the rest functions with macros Merge the rest functions together, with proper preprocessing directives. Finally remove module_{32|64}.c. Signed-off-by: WANG Cong Cc: Ingo Molnar Signed-off-by: Rusty Russell --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/module.c | 160 ++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/module_32.c | 90 ------------------------- arch/x86/kernel/module_64.c | 133 ------------------------------------ 4 files changed, 161 insertions(+), 224 deletions(-) delete mode 100644 arch/x86/kernel/module_32.c delete mode 100644 arch/x86/kernel/module_64.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 1cd55c769cd0..f3477bb84566 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -73,7 +73,7 @@ obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o obj-$(CONFIG_KPROBES) += kprobes.o -obj-$(CONFIG_MODULES) += module.o module_$(BITS).o +obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o obj-$(CONFIG_KGDB) += kgdb.o diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 1ea42d083494..b92fbcf48609 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -34,6 +34,32 @@ #define DEBUGP(fmt...) #endif +#if defined(CONFIG_UML) || defined(CONFIG_X86_32) +void *module_alloc(unsigned long size) +{ + if (size == 0) + return NULL; + return vmalloc_exec(size); +} +#else /*X86_64*/ +void *module_alloc(unsigned long size) +{ + struct vm_struct *area; + + if (!size) + return NULL; + size = PAGE_ALIGN(size); + if (size > MODULES_LEN) + return NULL; + + area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END); + if (!area) + return NULL; + + return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL_EXEC); +} +#endif + /* Free memory returned from module_alloc */ void module_free(struct module *mod, void *module_region) { @@ -51,6 +77,140 @@ int module_frob_arch_sections(Elf_Ehdr *hdr, return 0; } +#ifdef CONFIG_X86_32 +int apply_relocate(Elf32_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *me) +{ + unsigned int i; + Elf32_Rel *rel = (void *)sechdrs[relsec].sh_addr; + Elf32_Sym *sym; + uint32_t *location; + + DEBUGP("Applying relocate section %u to %u\n", relsec, + sechdrs[relsec].sh_info); + for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { + /* This is where to make the change */ + location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr + + rel[i].r_offset; + /* This is the symbol it is referring to. Note that all + undefined symbols have been resolved. */ + sym = (Elf32_Sym *)sechdrs[symindex].sh_addr + + ELF32_R_SYM(rel[i].r_info); + + switch (ELF32_R_TYPE(rel[i].r_info)) { + case R_386_32: + /* We add the value into the location given */ + *location += sym->st_value; + break; + case R_386_PC32: + /* Add the value, subtract its postition */ + *location += sym->st_value - (uint32_t)location; + break; + default: + printk(KERN_ERR "module %s: Unknown relocation: %u\n", + me->name, ELF32_R_TYPE(rel[i].r_info)); + return -ENOEXEC; + } + } + return 0; +} + +int apply_relocate_add(Elf32_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *me) +{ + printk(KERN_ERR "module %s: ADD RELOCATION unsupported\n", + me->name); + return -ENOEXEC; +} +#else /*X86_64*/ +int apply_relocate_add(Elf64_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *me) +{ + unsigned int i; + Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr; + Elf64_Sym *sym; + void *loc; + u64 val; + + DEBUGP("Applying relocate section %u to %u\n", relsec, + sechdrs[relsec].sh_info); + for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { + /* This is where to make the change */ + loc = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr + + rel[i].r_offset; + + /* This is the symbol it is referring to. Note that all + undefined symbols have been resolved. */ + sym = (Elf64_Sym *)sechdrs[symindex].sh_addr + + ELF64_R_SYM(rel[i].r_info); + + DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n", + (int)ELF64_R_TYPE(rel[i].r_info), + sym->st_value, rel[i].r_addend, (u64)loc); + + val = sym->st_value + rel[i].r_addend; + + switch (ELF64_R_TYPE(rel[i].r_info)) { + case R_X86_64_NONE: + break; + case R_X86_64_64: + *(u64 *)loc = val; + break; + case R_X86_64_32: + *(u32 *)loc = val; + if (val != *(u32 *)loc) + goto overflow; + break; + case R_X86_64_32S: + *(s32 *)loc = val; + if ((s64)val != *(s32 *)loc) + goto overflow; + break; + case R_X86_64_PC32: + val -= (u64)loc; + *(u32 *)loc = val; +#if 0 + if ((s64)val != *(s32 *)loc) + goto overflow; +#endif + break; + default: + printk(KERN_ERR "module %s: Unknown rela relocation: %llu\n", + me->name, ELF64_R_TYPE(rel[i].r_info)); + return -ENOEXEC; + } + } + return 0; + +overflow: + printk(KERN_ERR "overflow in relocation type %d val %Lx\n", + (int)ELF64_R_TYPE(rel[i].r_info), val); + printk(KERN_ERR "`%s' likely not compiled with -mcmodel=kernel\n", + me->name); + return -ENOEXEC; +} + +int apply_relocate(Elf_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *me) +{ + printk(KERN_ERR "non add relocation not supported\n"); + return -ENOSYS; +} + +#endif + int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *me) diff --git a/arch/x86/kernel/module_32.c b/arch/x86/kernel/module_32.c deleted file mode 100644 index 6145e68feb04..000000000000 --- a/arch/x86/kernel/module_32.c +++ /dev/null @@ -1,90 +0,0 @@ -/* Kernel module help for i386. - Copyright (C) 2001 Rusty Russell. - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -*/ -#include -#include -#include -#include -#include -#include -#include - -#if 0 -#define DEBUGP printk -#else -#define DEBUGP(fmt...) -#endif - -void *module_alloc(unsigned long size) -{ - if (size == 0) - return NULL; - return vmalloc_exec(size); -} - - -int apply_relocate(Elf32_Shdr *sechdrs, - const char *strtab, - unsigned int symindex, - unsigned int relsec, - struct module *me) -{ - unsigned int i; - Elf32_Rel *rel = (void *)sechdrs[relsec].sh_addr; - Elf32_Sym *sym; - uint32_t *location; - - DEBUGP("Applying relocate section %u to %u\n", relsec, - sechdrs[relsec].sh_info); - for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { - /* This is where to make the change */ - location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr - + rel[i].r_offset; - /* This is the symbol it is referring to. Note that all - undefined symbols have been resolved. */ - sym = (Elf32_Sym *)sechdrs[symindex].sh_addr - + ELF32_R_SYM(rel[i].r_info); - - switch (ELF32_R_TYPE(rel[i].r_info)) { - case R_386_32: - /* We add the value into the location given */ - *location += sym->st_value; - break; - case R_386_PC32: - /* Add the value, subtract its postition */ - *location += sym->st_value - (uint32_t)location; - break; - default: - printk(KERN_ERR "module %s: Unknown relocation: %u\n", - me->name, ELF32_R_TYPE(rel[i].r_info)); - return -ENOEXEC; - } - } - return 0; -} - -int apply_relocate_add(Elf32_Shdr *sechdrs, - const char *strtab, - unsigned int symindex, - unsigned int relsec, - struct module *me) -{ - printk(KERN_ERR "module %s: ADD RELOCATION unsupported\n", - me->name); - return -ENOEXEC; -} - diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module_64.c deleted file mode 100644 index 68ec7d87bb47..000000000000 --- a/arch/x86/kernel/module_64.c +++ /dev/null @@ -1,133 +0,0 @@ -/* Kernel module help for x86-64 - Copyright (C) 2001 Rusty Russell. - Copyright (C) 2002,2003 Andi Kleen, SuSE Labs. - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -*/ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#define DEBUGP(fmt...) - -#ifndef CONFIG_UML -void *module_alloc(unsigned long size) -{ - struct vm_struct *area; - - if (!size) - return NULL; - size = PAGE_ALIGN(size); - if (size > MODULES_LEN) - return NULL; - - area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END); - if (!area) - return NULL; - - return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL_EXEC); -} -#endif - -int apply_relocate_add(Elf64_Shdr *sechdrs, - const char *strtab, - unsigned int symindex, - unsigned int relsec, - struct module *me) -{ - unsigned int i; - Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr; - Elf64_Sym *sym; - void *loc; - u64 val; - - DEBUGP("Applying relocate section %u to %u\n", relsec, - sechdrs[relsec].sh_info); - for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { - /* This is where to make the change */ - loc = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr - + rel[i].r_offset; - - /* This is the symbol it is referring to. Note that all - undefined symbols have been resolved. */ - sym = (Elf64_Sym *)sechdrs[symindex].sh_addr - + ELF64_R_SYM(rel[i].r_info); - - DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n", - (int)ELF64_R_TYPE(rel[i].r_info), - sym->st_value, rel[i].r_addend, (u64)loc); - - val = sym->st_value + rel[i].r_addend; - - switch (ELF64_R_TYPE(rel[i].r_info)) { - case R_X86_64_NONE: - break; - case R_X86_64_64: - *(u64 *)loc = val; - break; - case R_X86_64_32: - *(u32 *)loc = val; - if (val != *(u32 *)loc) - goto overflow; - break; - case R_X86_64_32S: - *(s32 *)loc = val; - if ((s64)val != *(s32 *)loc) - goto overflow; - break; - case R_X86_64_PC32: - val -= (u64)loc; - *(u32 *)loc = val; -#if 0 - if ((s64)val != *(s32 *)loc) - goto overflow; -#endif - break; - default: - printk(KERN_ERR "module %s: Unknown rela relocation: %llu\n", - me->name, ELF64_R_TYPE(rel[i].r_info)); - return -ENOEXEC; - } - } - return 0; - -overflow: - printk(KERN_ERR "overflow in relocation type %d val %Lx\n", - (int)ELF64_R_TYPE(rel[i].r_info), val); - printk(KERN_ERR "`%s' likely not compiled with -mcmodel=kernel\n", - me->name); - return -ENOEXEC; -} - -int apply_relocate(Elf_Shdr *sechdrs, - const char *strtab, - unsigned int symindex, - unsigned int relsec, - struct module *me) -{ - printk(KERN_ERR "non add relocation not supported\n"); - return -ENOSYS; -} - -- cgit v1.2.3 From c398df30d5caad626ac72bfab0361a7b0f67a661 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Wed, 3 Jun 2009 21:46:46 -0400 Subject: module: merge module_alloc() finally As Christoph Hellwig suggested, module_alloc() actually can be unified for i386 and x86_64 (of course, also UML). Signed-off-by: WANG Cong Cc: Christoph Hellwig Cc: 'Ingo Molnar' Signed-off-by: Rusty Russell --- arch/um/include/asm/pgtable.h | 7 ++++++- arch/x86/include/asm/pgtable_32_types.h | 4 ++++ arch/x86/kernel/module.c | 12 ++---------- 3 files changed, 12 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/um/include/asm/pgtable.h b/arch/um/include/asm/pgtable.h index 58da2480a7f4..9ce3f165111a 100644 --- a/arch/um/include/asm/pgtable.h +++ b/arch/um/include/asm/pgtable.h @@ -53,16 +53,21 @@ extern unsigned long end_iomem; #else # define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE) #endif +#define MODULES_VADDR VMALLOC_START +#define MODULES_END VMALLOC_END +#define MODULES_LEN (MODULES_VADDR - MODULES_END) #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY) #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) #define _PAGE_CHG_MASK (PAGE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY) - +#define __PAGE_KERNEL_EXEC \ + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED) #define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED) #define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED) #define PAGE_COPY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) #define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) #define PAGE_KERNEL __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED) +#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC) /* * The i386 can't do page protection for execute, and considers that the same diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h index 2733fad45f98..5e67c1532314 100644 --- a/arch/x86/include/asm/pgtable_32_types.h +++ b/arch/x86/include/asm/pgtable_32_types.h @@ -46,6 +46,10 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */ # define VMALLOC_END (FIXADDR_START - 2 * PAGE_SIZE) #endif +#define MODULES_VADDR VMALLOC_START +#define MODULES_END VMALLOC_END +#define MODULES_LEN (MODULES_VADDR - MODULES_END) + #define MAXMEM (VMALLOC_END - PAGE_OFFSET - __VMALLOC_RESERVE) #endif /* _ASM_X86_PGTABLE_32_DEFS_H */ diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index b92fbcf48609..894bb718a6fb 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -34,14 +34,6 @@ #define DEBUGP(fmt...) #endif -#if defined(CONFIG_UML) || defined(CONFIG_X86_32) -void *module_alloc(unsigned long size) -{ - if (size == 0) - return NULL; - return vmalloc_exec(size); -} -#else /*X86_64*/ void *module_alloc(unsigned long size) { struct vm_struct *area; @@ -56,9 +48,9 @@ void *module_alloc(unsigned long size) if (!area) return NULL; - return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL_EXEC); + return __vmalloc_area(area, GFP_KERNEL | __GFP_HIGHMEM, + PAGE_KERNEL_EXEC); } -#endif /* Free memory returned from module_alloc */ void module_free(struct module *mod, void *module_region) -- cgit v1.2.3 From 5933048c69edb546f1e93c26dc93816f0be9f754 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 12 Jun 2009 21:47:04 -0600 Subject: module: cleanup FIXME comments about trimming exception table entries. Everyone cut and paste this comment from my original one. We now do it generically, so cut the comments. Signed-off-by: Rusty Russell Cc: Amerigo Wang --- arch/avr32/kernel/module.c | 2 -- arch/cris/kernel/module.c | 2 -- arch/frv/kernel/module.c | 2 -- arch/h8300/kernel/module.c | 2 -- arch/m32r/kernel/module.c | 2 -- arch/m68k/kernel/module.c | 2 -- arch/m68knommu/kernel/module.c | 2 -- arch/mips/kernel/module.c | 2 -- arch/mn10300/kernel/module.c | 2 -- arch/parisc/kernel/module.c | 2 -- arch/powerpc/kernel/module.c | 2 -- arch/s390/kernel/module.c | 2 -- arch/sh/kernel/module.c | 2 -- arch/sparc/kernel/module.c | 2 -- arch/x86/kernel/module.c | 2 -- arch/xtensa/kernel/module.c | 2 -- 16 files changed, 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/avr32/kernel/module.c b/arch/avr32/kernel/module.c index 1167fe9cf6c4..98f94d041d9c 100644 --- a/arch/avr32/kernel/module.c +++ b/arch/avr32/kernel/module.c @@ -32,8 +32,6 @@ void module_free(struct module *mod, void *module_region) mod->arch.syminfo = NULL; vfree(module_region); - /* FIXME: if module_region == mod->init_region, trim exception - * table entries. */ } static inline int check_rela(Elf32_Rela *rela, struct module *module, diff --git a/arch/cris/kernel/module.c b/arch/cris/kernel/module.c index a187833febc8..abc13e368b90 100644 --- a/arch/cris/kernel/module.c +++ b/arch/cris/kernel/module.c @@ -48,8 +48,6 @@ void *module_alloc(unsigned long size) void module_free(struct module *mod, void *module_region) { FREE_MODULE(module_region); - /* FIXME: If module_region == mod->init_region, trim exception - table entries. */ } /* We don't need anything special. */ diff --git a/arch/frv/kernel/module.c b/arch/frv/kernel/module.c index 850d168f69fc..711763c8a6f3 100644 --- a/arch/frv/kernel/module.c +++ b/arch/frv/kernel/module.c @@ -35,8 +35,6 @@ void *module_alloc(unsigned long size) void module_free(struct module *mod, void *module_region) { vfree(module_region); - /* FIXME: If module_region == mod->init_region, trim exception - table entries. */ } /* We don't need anything special. */ diff --git a/arch/h8300/kernel/module.c b/arch/h8300/kernel/module.c index cfc9127d2ced..0865e291c20d 100644 --- a/arch/h8300/kernel/module.c +++ b/arch/h8300/kernel/module.c @@ -23,8 +23,6 @@ void *module_alloc(unsigned long size) void module_free(struct module *mod, void *module_region) { vfree(module_region); - /* FIXME: If module_region == mod->init_region, trim exception - table entries. */ } /* We don't need anything special. */ diff --git a/arch/m32r/kernel/module.c b/arch/m32r/kernel/module.c index 8d4205794380..cb5f37d78d49 100644 --- a/arch/m32r/kernel/module.c +++ b/arch/m32r/kernel/module.c @@ -44,8 +44,6 @@ void *module_alloc(unsigned long size) void module_free(struct module *mod, void *module_region) { vfree(module_region); - /* FIXME: If module_region == mod->init_region, trim exception - table entries. */ } /* We don't need anything special. */ diff --git a/arch/m68k/kernel/module.c b/arch/m68k/kernel/module.c index 774862bc6977..cd6bcb1c957e 100644 --- a/arch/m68k/kernel/module.c +++ b/arch/m68k/kernel/module.c @@ -31,8 +31,6 @@ void *module_alloc(unsigned long size) void module_free(struct module *mod, void *module_region) { vfree(module_region); - /* FIXME: If module_region == mod->init_region, trim exception - table entries. */ } /* We don't need anything special. */ diff --git a/arch/m68knommu/kernel/module.c b/arch/m68knommu/kernel/module.c index 3b1a2ff61ddc..d11ffae7956a 100644 --- a/arch/m68knommu/kernel/module.c +++ b/arch/m68knommu/kernel/module.c @@ -23,8 +23,6 @@ void *module_alloc(unsigned long size) void module_free(struct module *mod, void *module_region) { vfree(module_region); - /* FIXME: If module_region == mod->init_region, trim exception - table entries. */ } /* We don't need anything special. */ diff --git a/arch/mips/kernel/module.c b/arch/mips/kernel/module.c index 1f60e27523d9..3e9100dcc12d 100644 --- a/arch/mips/kernel/module.c +++ b/arch/mips/kernel/module.c @@ -68,8 +68,6 @@ void *module_alloc(unsigned long size) void module_free(struct module *mod, void *module_region) { vfree(module_region); - /* FIXME: If module_region == mod->init_region, trim exception - table entries. */ } int module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, diff --git a/arch/mn10300/kernel/module.c b/arch/mn10300/kernel/module.c index 6b287f2e8e84..4fa0e3648d8e 100644 --- a/arch/mn10300/kernel/module.c +++ b/arch/mn10300/kernel/module.c @@ -48,8 +48,6 @@ void *module_alloc(unsigned long size) void module_free(struct module *mod, void *module_region) { vfree(module_region); - /* FIXME: If module_region == mod->init_region, trim exception - * table entries. */ } /* diff --git a/arch/parisc/kernel/module.c b/arch/parisc/kernel/module.c index ecd1c5024447..ef5caf2e6ed0 100644 --- a/arch/parisc/kernel/module.c +++ b/arch/parisc/kernel/module.c @@ -267,8 +267,6 @@ void module_free(struct module *mod, void *module_region) mod->arch.section = NULL; vfree(module_region); - /* FIXME: If module_region == mod->init_region, trim exception - table entries. */ } /* Additional bytes needed in front of individual sections */ diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c index 43e7e3a7f130..477c663e0140 100644 --- a/arch/powerpc/kernel/module.c +++ b/arch/powerpc/kernel/module.c @@ -43,8 +43,6 @@ void *module_alloc(unsigned long size) void module_free(struct module *mod, void *module_region) { vfree(module_region); - /* FIXME: If module_region == mod->init_region, trim exception - table entries. */ } static const Elf_Shdr *find_section(const Elf_Ehdr *hdr, diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c index eed4a00cb676..ab2e3ed28abc 100644 --- a/arch/s390/kernel/module.c +++ b/arch/s390/kernel/module.c @@ -56,8 +56,6 @@ void *module_alloc(unsigned long size) void module_free(struct module *mod, void *module_region) { vfree(module_region); - /* FIXME: If module_region == mod->init_region, trim exception - table entries. */ } static void diff --git a/arch/sh/kernel/module.c b/arch/sh/kernel/module.c index c19b0f7d2cc1..c2efdcde266f 100644 --- a/arch/sh/kernel/module.c +++ b/arch/sh/kernel/module.c @@ -46,8 +46,6 @@ void *module_alloc(unsigned long size) void module_free(struct module *mod, void *module_region) { vfree(module_region); - /* FIXME: If module_region == mod->init_region, trim exception - table entries. */ } /* We don't need anything special. */ diff --git a/arch/sparc/kernel/module.c b/arch/sparc/kernel/module.c index 90273765e81f..0ee642f63234 100644 --- a/arch/sparc/kernel/module.c +++ b/arch/sparc/kernel/module.c @@ -75,8 +75,6 @@ void *module_alloc(unsigned long size) void module_free(struct module *mod, void *module_region) { vfree(module_region); - /* FIXME: If module_region == mod->init_region, trim exception - table entries. */ } /* Make generic code ignore STT_REGISTER dummy undefined symbols. */ diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 894bb718a6fb..89f386f044e4 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -56,8 +56,6 @@ void *module_alloc(unsigned long size) void module_free(struct module *mod, void *module_region) { vfree(module_region); - /* FIXME: If module_region == mod->init_region, trim exception - table entries. */ } /* We don't need anything special. */ diff --git a/arch/xtensa/kernel/module.c b/arch/xtensa/kernel/module.c index 3981a466c779..c1accea8cb56 100644 --- a/arch/xtensa/kernel/module.c +++ b/arch/xtensa/kernel/module.c @@ -34,8 +34,6 @@ void *module_alloc(unsigned long size) void module_free(struct module *mod, void *module_region) { vfree(module_region); - /* FIXME: If module_region == mod->init_region, trim exception - table entries. */ } int module_frob_arch_sections(Elf32_Ehdr *hdr, -- cgit v1.2.3 From 1028375e93a7aa4dbe466947d1c65f368b1f61c1 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 12 Jun 2009 22:26:59 -0600 Subject: lguest: clean up lguest_init_IRQ Copy from arch/x86/kernel/irqinit_32.c: we don't use the vectors beyond LGUEST_IRQS (if any), but we might as well set them all. Signed-off-by: Rusty Russell --- arch/x86/lguest/boot.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 4e0c26559395..2392a7a171c2 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -628,13 +628,12 @@ static void __init lguest_init_IRQ(void) { unsigned int i; - for (i = 0; i < LGUEST_IRQS; i++) { - int vector = FIRST_EXTERNAL_VECTOR + i; + for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { /* Some systems map "vectors" to interrupts weirdly. Lguest has * a straightforward 1 to 1 mapping, so force that here. */ - __get_cpu_var(vector_irq)[vector] = i; - if (vector != SYSCALL_VECTOR) - set_intr_gate(vector, interrupt[i]); + __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR; + if (i != SYSCALL_VECTOR) + set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); } /* This call is required to set up for 4k stacks, where we have * separate stacks for hard and soft interrupts. */ -- cgit v1.2.3 From a32a8813d0173163ba44d8f9556e0d89fdc4fb46 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 12 Jun 2009 22:27:02 -0600 Subject: lguest: improve interrupt handling, speed up stream networking lguest never checked for pending interrupts when enabling interrupts, and things still worked. However, it makes a significant difference to TCP performance, so it's time we fixed it by introducing a pending_irq flag and checking it on irq_restore and irq_enable. These two routines are now too big to patch into the 8/10 bytes patch space, so we drop that code. Note: The high latency on interrupt delivery had a very curious effect: once everything else was optimized, networking without GSO was faster than networking with GSO, since more interrupts were sent and hence a greater chance of one getting through to the Guest! Note2: (Almost) Closing the same loophole for iret doesn't have any measurable effect, so I'm leaving that patch for the moment. Before: 1GB tcpblast Guest->Host: 30.7 seconds 1GB tcpblast Guest->Host (no GSO): 76.0 seconds After: 1GB tcpblast Guest->Host: 6.8 seconds 1GB tcpblast Guest->Host (no GSO): 27.8 seconds Signed-off-by: Rusty Russell --- arch/x86/include/asm/lguest_hcall.h | 1 + arch/x86/lguest/boot.c | 21 +++++++++++++++------ arch/x86/lguest/i386_head.S | 2 -- drivers/lguest/core.c | 7 ++++--- drivers/lguest/hypercalls.c | 4 ++++ drivers/lguest/interrupts_and_traps.c | 16 +++++++++++++--- drivers/lguest/lg.h | 4 ++-- include/linux/lguest.h | 4 ++++ 8 files changed, 43 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h index faae1996487b..f9a9f7811248 100644 --- a/arch/x86/include/asm/lguest_hcall.h +++ b/arch/x86/include/asm/lguest_hcall.h @@ -17,6 +17,7 @@ #define LHCALL_LOAD_TLS 16 #define LHCALL_NOTIFY 17 #define LHCALL_LOAD_GDT_ENTRY 18 +#define LHCALL_SEND_INTERRUPTS 19 #define LGUEST_TRAP_ENTRY 0x1F diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 2392a7a171c2..37b8c1d3e022 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -205,6 +205,12 @@ PV_CALLEE_SAVE_REGS_THUNK(save_fl); static void restore_fl(unsigned long flags) { lguest_data.irq_enabled = flags; + mb(); + /* Null hcall forces interrupt delivery now, if irq_pending is + * set to X86_EFLAGS_IF (ie. an interrupt is pending, and flags + * enables interrupts. */ + if (flags & lguest_data.irq_pending) + kvm_hypercall0(LHCALL_SEND_INTERRUPTS); } PV_CALLEE_SAVE_REGS_THUNK(restore_fl); @@ -219,6 +225,11 @@ PV_CALLEE_SAVE_REGS_THUNK(irq_disable); static void irq_enable(void) { lguest_data.irq_enabled = X86_EFLAGS_IF; + mb(); + /* Null hcall forces interrupt delivery now. */ + if (lguest_data.irq_pending) + kvm_hypercall0(LHCALL_SEND_INTERRUPTS); + } PV_CALLEE_SAVE_REGS_THUNK(irq_enable); @@ -972,10 +983,10 @@ static void lguest_restart(char *reason) * * Our current solution is to allow the paravirt back end to optionally patch * over the indirect calls to replace them with something more efficient. We - * patch the four most commonly called functions: disable interrupts, enable - * interrupts, restore interrupts and save interrupts. We usually have 6 or 10 - * bytes to patch into: the Guest versions of these operations are small enough - * that we can fit comfortably. + * patch two of the simplest of the most commonly called functions: disable + * interrupts and save interrupts. We usually have 6 or 10 bytes to patch + * into: the Guest versions of these operations are small enough that we can + * fit comfortably. * * First we need assembly templates of each of the patchable Guest operations, * and these are in i386_head.S. */ @@ -986,8 +997,6 @@ static const struct lguest_insns const char *start, *end; } lguest_insns[] = { [PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli }, - [PARAVIRT_PATCH(pv_irq_ops.irq_enable)] = { lgstart_sti, lgend_sti }, - [PARAVIRT_PATCH(pv_irq_ops.restore_fl)] = { lgstart_popf, lgend_popf }, [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf }, }; diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S index f79541989471..3e0c5545d59c 100644 --- a/arch/x86/lguest/i386_head.S +++ b/arch/x86/lguest/i386_head.S @@ -46,8 +46,6 @@ ENTRY(lguest_entry) .globl lgstart_##name; .globl lgend_##name LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled) -LGUEST_PATCH(sti, movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled) -LGUEST_PATCH(popf, movl %eax, lguest_data+LGUEST_DATA_irq_enabled) LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax) /*:*/ diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index 8ca1def5b142..03fbc88c0023 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -189,6 +189,7 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) /* We stop running once the Guest is dead. */ while (!cpu->lg->dead) { unsigned int irq; + bool more; /* First we run any hypercalls the Guest wants done. */ if (cpu->hcall) @@ -213,9 +214,9 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) /* Check if there are any interrupts which can be delivered now: * if so, this sets up the hander to be executed when we next * run the Guest. */ - irq = interrupt_pending(cpu); + irq = interrupt_pending(cpu, &more); if (irq < LGUEST_IRQS) - try_deliver_interrupt(cpu, irq); + try_deliver_interrupt(cpu, irq, more); /* All long-lived kernel loops need to check with this horrible * thing called the freezer. If the Host is trying to suspend, @@ -233,7 +234,7 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) set_current_state(TASK_INTERRUPTIBLE); /* Just before we sleep, make sure nothing snuck in * which we should be doing. */ - if (interrupt_pending(cpu) < LGUEST_IRQS + if (interrupt_pending(cpu, &more) < LGUEST_IRQS || cpu->break_out) set_current_state(TASK_RUNNING); else diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c index 54d66f05fefa..f252b71ae79e 100644 --- a/drivers/lguest/hypercalls.c +++ b/drivers/lguest/hypercalls.c @@ -37,6 +37,10 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) /* This call does nothing, except by breaking out of the Guest * it makes us process all the asynchronous hypercalls. */ break; + case LHCALL_SEND_INTERRUPTS: + /* This call does nothing too, but by breaking out of the Guest + * it makes us process any pending interrupts. */ + break; case LHCALL_LGUEST_INIT: /* You can't get here unless you're already initialized. Don't * do that. */ diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c index a8c966fee1e4..5a10754b4790 100644 --- a/drivers/lguest/interrupts_and_traps.c +++ b/drivers/lguest/interrupts_and_traps.c @@ -131,7 +131,7 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, * interrupt_pending() returns the first pending interrupt which isn't blocked * by the Guest. It is called before every entry to the Guest, and just before * we go to sleep when the Guest has halted itself. */ -unsigned int interrupt_pending(struct lg_cpu *cpu) +unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more) { unsigned int irq; DECLARE_BITMAP(blk, LGUEST_IRQS); @@ -149,13 +149,14 @@ unsigned int interrupt_pending(struct lg_cpu *cpu) /* Find the first interrupt. */ irq = find_first_bit(blk, LGUEST_IRQS); + *more = find_next_bit(blk, LGUEST_IRQS, irq+1); return irq; } /* This actually diverts the Guest to running an interrupt handler, once an * interrupt has been identified by interrupt_pending(). */ -void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq) +void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more) { struct desc_struct *idt; @@ -178,8 +179,12 @@ void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq) u32 irq_enabled; if (get_user(irq_enabled, &cpu->lg->lguest_data->irq_enabled)) irq_enabled = 0; - if (!irq_enabled) + if (!irq_enabled) { + /* Make sure they know an IRQ is pending. */ + put_user(X86_EFLAGS_IF, + &cpu->lg->lguest_data->irq_pending); return; + } } /* Look at the IDT entry the Guest gave us for this interrupt. The @@ -202,6 +207,11 @@ void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq) * here is a compromise which means at least it gets updated every * timer interrupt. */ write_timestamp(cpu); + + /* If there are no other interrupts we want to deliver, clear + * the pending flag. */ + if (!more) + put_user(0, &cpu->lg->lguest_data->irq_pending); } /*:*/ diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index 6743cf147d97..573896533ac9 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -139,8 +139,8 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user); #define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT) /* interrupts_and_traps.c: */ -unsigned int interrupt_pending(struct lg_cpu *cpu); -void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq); +unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more); +void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more); bool deliver_trap(struct lg_cpu *cpu, unsigned int num); void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int i, u32 low, u32 hi); diff --git a/include/linux/lguest.h b/include/linux/lguest.h index 175e63f4a8c0..7bc1440fc473 100644 --- a/include/linux/lguest.h +++ b/include/linux/lguest.h @@ -30,6 +30,10 @@ struct lguest_data /* Wallclock time set by the Host. */ struct timespec time; + /* Interrupt pending set by the Host. The Guest should do a hypercall + * if it re-enables interrupts and sees this set (to X86_EFLAGS_IF). */ + int irq_pending; + /* Async hypercall ring. Instead of directly making hypercalls, we can * place them in here for processing the next time the Host wants. * This batching can be quite efficient. */ -- cgit v1.2.3 From 61f4bc83fea248a3092beb7ba43daa5629615513 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 12 Jun 2009 22:27:03 -0600 Subject: lguest: optimize by coding restore_flags and irq_enable in assembler. The downside of the last patch which made restore_flags and irq_enable check interrupts is that they are now too big to be patched directly into the callsites, so the C versions are always used. But the C versions go via PV_CALLEE_SAVE_REGS_THUNK which saves all the registers. In fact, we don't need any registers in the fast path, so we can do better than this if we actually code them in assembler. The results are in the noise, but since it's about the same amount of code, it's worth applying. 1GB Guest->Host: input(suppressed),output(suppressed) Before: Seconds: 0:16.53 Packets: 377268,753673 Interrupts: 22461,24297 Notifications: 1(5245),21303(732370) Net IRQs triggered: 377023(245),42578(711095) After: Seconds: 0:16.48 Packets: 377289,753673 Interrupts: 22281,24465 Notifications: 1(5245),21296(732377) Net IRQs triggered: 377060(229),42564(711109) Signed-off-by: Rusty Russell --- arch/x86/kernel/asm-offsets_32.c | 1 + arch/x86/lguest/boot.c | 45 +++++++++++-------------------- arch/x86/lguest/i386_head.S | 58 +++++++++++++++++++++++++++++++++++++++- 3 files changed, 74 insertions(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 1a830cbd7015..dfdbf6403895 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -126,6 +126,7 @@ void foo(void) #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) BLANK(); OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); + OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending); OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir); BLANK(); diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 37b8c1d3e022..514f4d0d2bfa 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -179,7 +179,7 @@ static void lguest_end_context_switch(struct task_struct *next) paravirt_end_context_switch(next); } -/*G:033 +/*G:032 * After that diversion we return to our first native-instruction * replacements: four functions for interrupt control. * @@ -199,41 +199,28 @@ static unsigned long save_fl(void) { return lguest_data.irq_enabled; } -PV_CALLEE_SAVE_REGS_THUNK(save_fl); - -/* restore_flags() just sets the flags back to the value given. */ -static void restore_fl(unsigned long flags) -{ - lguest_data.irq_enabled = flags; - mb(); - /* Null hcall forces interrupt delivery now, if irq_pending is - * set to X86_EFLAGS_IF (ie. an interrupt is pending, and flags - * enables interrupts. */ - if (flags & lguest_data.irq_pending) - kvm_hypercall0(LHCALL_SEND_INTERRUPTS); -} -PV_CALLEE_SAVE_REGS_THUNK(restore_fl); /* Interrupts go off... */ static void irq_disable(void) { lguest_data.irq_enabled = 0; } -PV_CALLEE_SAVE_REGS_THUNK(irq_disable); -/* Interrupts go on... */ -static void irq_enable(void) -{ - lguest_data.irq_enabled = X86_EFLAGS_IF; - mb(); - /* Null hcall forces interrupt delivery now. */ - if (lguest_data.irq_pending) - kvm_hypercall0(LHCALL_SEND_INTERRUPTS); +/* Let's pause a moment. Remember how I said these are called so often? + * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to + * break some rules. In particular, these functions are assumed to save their + * own registers if they need to: normal C functions assume they can trash the + * eax register. To use normal C functions, we use + * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the + * C function, then restores it. */ +PV_CALLEE_SAVE_REGS_THUNK(save_fl); +PV_CALLEE_SAVE_REGS_THUNK(irq_disable); +/*:*/ -} -PV_CALLEE_SAVE_REGS_THUNK(irq_enable); +/* These are in i386_head.S */ +extern void lg_irq_enable(void); +extern void lg_restore_fl(unsigned long flags); -/*:*/ /*M:003 Note that we don't check for outstanding interrupts when we re-enable * them (or when we unmask an interrupt). This seems to work for the moment, * since interrupts are rare and we'll just get the interrupt on the next timer @@ -1041,9 +1028,9 @@ __init void lguest_init(void) /* interrupt-related operations */ pv_irq_ops.init_IRQ = lguest_init_IRQ; pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl); - pv_irq_ops.restore_fl = PV_CALLEE_SAVE(restore_fl); + pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl); pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable); - pv_irq_ops.irq_enable = PV_CALLEE_SAVE(irq_enable); + pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable); pv_irq_ops.safe_halt = lguest_safe_halt; /* init-time operations */ diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S index 3e0c5545d59c..a9c8cfe61cd4 100644 --- a/arch/x86/lguest/i386_head.S +++ b/arch/x86/lguest/i386_head.S @@ -47,7 +47,63 @@ ENTRY(lguest_entry) LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled) LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax) -/*:*/ + +/*G:033 But using those wrappers is inefficient (we'll see why that doesn't + * matter for save_fl and irq_disable later). If we write our routines + * carefully in assembler, we can avoid clobbering any registers and avoid + * jumping through the wrapper functions. + * + * I skipped over our first piece of assembler, but this one is worth studying + * in a bit more detail so I'll describe in easy stages. First, the routine + * to enable interrupts: */ +ENTRY(lg_irq_enable) + /* The reverse of irq_disable, this sets lguest_data.irq_enabled to + * X86_EFLAGS_IF (ie. "Interrupts enabled"). */ + movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled + /* But now we need to check if the Host wants to know: there might have + * been interrupts waiting to be delivered, in which case it will have + * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we + * jump to send_interrupts, otherwise we're done. */ + testl $0, lguest_data+LGUEST_DATA_irq_pending + jnz send_interrupts + /* One cool thing about x86 is that you can do many things without using + * a register. In this case, the normal path hasn't needed to save or + * restore any registers at all! */ + ret +send_interrupts: + /* OK, now we need a register: eax is used for the hypercall number, + * which is LHCALL_SEND_INTERRUPTS. + * + * We used not to bother with this pending detection at all, which was + * much simpler. Sooner or later the Host would realize it had to + * send us an interrupt. But that turns out to make performance 7 + * times worse on a simple tcp benchmark. So now we do this the hard + * way. */ + pushl %eax + movl $LHCALL_SEND_INTERRUPTS, %eax + /* This is a vmcall instruction (same thing that KVM uses). Older + * assembler versions might not know the "vmcall" instruction, so we + * create one manually here. */ + .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ + popl %eax + ret + +/* Finally, the "popf" or "restore flags" routine. The %eax register holds the + * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're + * enabling interrupts again, if it's 0 we're leaving them off. */ +ENTRY(lg_restore_fl) + /* This is just "lguest_data.irq_enabled = flags;" */ + movl %eax, lguest_data+LGUEST_DATA_irq_enabled + /* Now, if the %eax value has enabled interrupts and + * lguest_data.irq_pending is set, we want to tell the Host so it can + * deliver any outstanding interrupts. Fortunately, both values will + * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl" + * instruction will AND them together for us. If both are set, we + * jump to send_interrupts. */ + testl lguest_data+LGUEST_DATA_irq_pending, %eax + jnz send_interrupts + /* Again, the normal path has used no extra registers. Clever, huh? */ + ret /* These demark the EIP range where host should never deliver interrupts. */ .global lguest_noirq_start -- cgit v1.2.3 From 90603d15fa95605d1d08235b73e220d766f04bb0 Mon Sep 17 00:00:00 2001 From: Matias Zabaljauregui Date: Fri, 12 Jun 2009 22:27:06 -0600 Subject: lguest: use native_set_* macros, which properly handle 64-bit entries when PAE is activated Some cleanups and replace direct assignment with native_set_* macros which properly handle 64-bit entries when PAE is activated Signed-off-by: Matias Zabaljauregui Signed-off-by: Rusty Russell --- arch/x86/lguest/boot.c | 8 ++++---- drivers/lguest/page_tables.c | 35 ++++++++++++++++++----------------- 2 files changed, 22 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 514f4d0d2bfa..4f311e40d0aa 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -525,7 +525,7 @@ static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval) { - *ptep = pteval; + native_set_pte(ptep, pteval); lguest_pte_update(mm, addr, ptep); } @@ -534,9 +534,9 @@ static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, * changed. */ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) { - *pmdp = pmdval; + native_set_pmd(pmdp, pmdval); lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK, - (__pa(pmdp) & (PAGE_SIZE - 1)) / 4); + (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t)); } /* There are a couple of legacy places where the kernel sets a PTE, but we @@ -550,7 +550,7 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) * which brings boot back to 0.25 seconds. */ static void lguest_set_pte(pte_t *ptep, pte_t pteval) { - *ptep = pteval; + native_set_pte(ptep, pteval); if (cr3_changed) lazy_hcall1(LHCALL_FLUSH_TLB, 1); } diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index 496995370fbc..ffba723cd98d 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -90,7 +90,7 @@ static pte_t *spte_addr(pgd_t spgd, unsigned long vaddr) pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT); /* You should never call this if the PGD entry wasn't valid */ BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT)); - return &page[(vaddr >> PAGE_SHIFT) % PTRS_PER_PTE]; + return &page[pte_index(vaddr)]; } /* These two functions just like the above two, except they access the Guest @@ -105,7 +105,7 @@ static unsigned long gpte_addr(pgd_t gpgd, unsigned long vaddr) { unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT)); - return gpage + ((vaddr>>PAGE_SHIFT) % PTRS_PER_PTE) * sizeof(pte_t); + return gpage + pte_index(vaddr) * sizeof(pte_t); } /*:*/ @@ -171,7 +171,7 @@ static void release_pte(pte_t pte) /* Remember that get_user_pages_fast() took a reference to the page, in * get_pfn()? We have to put it back now. */ if (pte_flags(pte) & _PAGE_PRESENT) - put_page(pfn_to_page(pte_pfn(pte))); + put_page(pte_page(pte)); } /*:*/ @@ -273,7 +273,7 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) * table entry, even if the Guest says it's writable. That way * we will come back here when a write does actually occur, so * we can update the Guest's _PAGE_DIRTY flag. */ - *spte = gpte_to_spte(cpu, pte_wrprotect(gpte), 0); + native_set_pte(spte, gpte_to_spte(cpu, pte_wrprotect(gpte), 0)); /* Finally, we write the Guest PTE entry back: we've set the * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */ @@ -323,7 +323,7 @@ void pin_page(struct lg_cpu *cpu, unsigned long vaddr) } /*H:450 If we chase down the release_pgd() code, it looks like this: */ -static void release_pgd(struct lguest *lg, pgd_t *spgd) +static void release_pgd(pgd_t *spgd) { /* If the entry's not present, there's nothing to release. */ if (pgd_flags(*spgd) & _PAGE_PRESENT) { @@ -350,7 +350,7 @@ static void flush_user_mappings(struct lguest *lg, int idx) unsigned int i; /* Release every pgd entry up to the kernel's address. */ for (i = 0; i < pgd_index(lg->kernel_address); i++) - release_pgd(lg, lg->pgdirs[idx].pgdir + i); + release_pgd(lg->pgdirs[idx].pgdir + i); } /*H:440 (v) Flushing (throwing away) page tables, @@ -431,7 +431,7 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, /*H:430 (iv) Switching page tables * - * Now we've seen all the page table setting and manipulation, let's see what + * Now we've seen all the page table setting and manipulation, let's see * what happens when the Guest changes page tables (ie. changes the top-level * pgdir). This occurs on almost every context switch. */ void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) @@ -463,7 +463,7 @@ static void release_all_pagetables(struct lguest *lg) if (lg->pgdirs[i].pgdir) /* Every PGD entry except the Switcher at the top */ for (j = 0; j < SWITCHER_PGD_INDEX; j++) - release_pgd(lg, lg->pgdirs[i].pgdir + j); + release_pgd(lg->pgdirs[i].pgdir + j); } /* We also throw away everything when a Guest tells us it's changed a kernel @@ -581,7 +581,7 @@ void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 idx) pgdir = find_pgdir(lg, gpgdir); if (pgdir < ARRAY_SIZE(lg->pgdirs)) /* ... throw it away. */ - release_pgd(lg, lg->pgdirs[pgdir].pgdir + idx); + release_pgd(lg->pgdirs[pgdir].pgdir + idx); } /* Once we know how much memory we have we can construct simple identity @@ -726,8 +726,9 @@ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) * page is already mapped there, we don't have to copy them out * again. */ pfn = __pa(cpu->regs_page) >> PAGE_SHIFT; - regs_pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL)); - switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTRS_PER_PTE] = regs_pte; + native_set_pte(®s_pte, pfn_pte(pfn, PAGE_KERNEL)); + native_set_pte(&switcher_pte_page[pte_index((unsigned long)pages)], + regs_pte); } /*:*/ @@ -752,21 +753,21 @@ static __init void populate_switcher_pte_page(unsigned int cpu, /* The first entries are easy: they map the Switcher code. */ for (i = 0; i < pages; i++) { - pte[i] = mk_pte(switcher_page[i], - __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)); + native_set_pte(&pte[i], mk_pte(switcher_page[i], + __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); } /* The only other thing we map is this CPU's pair of pages. */ i = pages + cpu*2; /* First page (Guest registers) is writable from the Guest */ - pte[i] = pfn_pte(page_to_pfn(switcher_page[i]), - __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW)); + native_set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_page[i]), + __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW))); /* The second page contains the "struct lguest_ro_state", and is * read-only. */ - pte[i+1] = pfn_pte(page_to_pfn(switcher_page[i+1]), - __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)); + native_set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_page[i+1]), + __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); } /* We've made it through the page table code. Perhaps our tired brains are -- cgit v1.2.3 From ebe0ba84f55950a89cb7af94c7ffc35ee3992f9e Mon Sep 17 00:00:00 2001 From: Matias Zabaljauregui Date: Sat, 30 May 2009 15:48:08 -0300 Subject: lguest: replace hypercall name LHCALL_SET_PMD with LHCALL_SET_PGD replace LHCALL_SET_PMD with LHCALL_SET_PGD hypercall name (That's really what it is, and the confusion gets worse with PAE support) Signed-off-by: Matias Zabaljauregui Signed-off-by: Rusty Russell Reported-by: Jeremy Fitzhardinge --- arch/x86/include/asm/lguest_hcall.h | 2 +- arch/x86/lguest/boot.c | 2 +- drivers/lguest/hypercalls.c | 4 ++-- drivers/lguest/lg.h | 2 +- drivers/lguest/page_tables.c | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h index f9a9f7811248..05b9c198e4ba 100644 --- a/arch/x86/include/asm/lguest_hcall.h +++ b/arch/x86/include/asm/lguest_hcall.h @@ -13,7 +13,7 @@ #define LHCALL_SET_CLOCKEVENT 9 #define LHCALL_HALT 10 #define LHCALL_SET_PTE 14 -#define LHCALL_SET_PMD 15 +#define LHCALL_SET_PGD 15 #define LHCALL_LOAD_TLS 16 #define LHCALL_NOTIFY 17 #define LHCALL_LOAD_GDT_ENTRY 18 diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 4f311e40d0aa..943a75ef70b9 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -535,7 +535,7 @@ static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) { native_set_pmd(pmdp, pmdval); - lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK, + lazy_hcall2(LHCALL_SET_PGD, __pa(pmdp) & PAGE_MASK, (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t)); } diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c index f252b71ae79e..51149ca14617 100644 --- a/drivers/lguest/hypercalls.c +++ b/drivers/lguest/hypercalls.c @@ -79,8 +79,8 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) case LHCALL_SET_PTE: guest_set_pte(cpu, args->arg1, args->arg2, __pte(args->arg3)); break; - case LHCALL_SET_PMD: - guest_set_pmd(cpu->lg, args->arg1, args->arg2); + case LHCALL_SET_PGD: + guest_set_pgd(cpu->lg, args->arg1, args->arg2); break; case LHCALL_SET_CLOCKEVENT: guest_set_clockevent(cpu, args->arg1); diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index 74af503ad63c..cacc2da2058d 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -169,7 +169,7 @@ void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt); int init_guest_pagetable(struct lguest *lg); void free_guest_pagetable(struct lguest *lg); void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable); -void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i); +void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 i); void guest_pagetable_clear_all(struct lg_cpu *cpu); void guest_pagetable_flush_user(struct lg_cpu *cpu); void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir, diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index ffba723cd98d..6a54d76b6236 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -568,7 +568,7 @@ void guest_set_pte(struct lg_cpu *cpu, * * So with that in mind here's our code to to update a (top-level) PGD entry: */ -void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 idx) +void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx) { int pgdir; -- cgit v1.2.3 From cefcad1773197523e11e18b669f245e6a8d32058 Mon Sep 17 00:00:00 2001 From: Matias Zabaljauregui Date: Fri, 12 Jun 2009 22:27:07 -0600 Subject: lguest: Add support for kvm_hypercall4() Add support for kvm_hypercall4(); PAE wants it. Signed-off-by: Matias Zabaljauregui Signed-off-by: Rusty Russell --- arch/x86/include/asm/lguest_hcall.h | 9 +++++---- arch/x86/lguest/boot.c | 26 ++++++++++++++++++++------ 2 files changed, 25 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h index 05b9c198e4ba..b14b3552a4db 100644 --- a/arch/x86/include/asm/lguest_hcall.h +++ b/arch/x86/include/asm/lguest_hcall.h @@ -35,8 +35,8 @@ * * We use the KVM hypercall mechanism. Eighteen hypercalls are * available: the hypercall number is put in the %eax register, and the - * arguments (when required) are placed in %ebx, %ecx and %edx. If a return - * value makes sense, it's returned in %eax. + * arguments (when required) are placed in %ebx, %ecx, %edx and %esi. + * If a return value makes sense, it's returned in %eax. * * Grossly invalid calls result in Sudden Death at the hands of the vengeful * Host, rather than returning failure. This reflects Winston Churchill's @@ -48,8 +48,9 @@ #define LHCALL_RING_SIZE 64 struct hcall_args { - /* These map directly onto eax, ebx, ecx, edx in struct lguest_regs */ - unsigned long arg0, arg1, arg2, arg3; + /* These map directly onto eax, ebx, ecx, edx and esi + * in struct lguest_regs */ + unsigned long arg0, arg1, arg2, arg3, arg4; }; #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 943a75ef70b9..d12f554e5f6a 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -87,7 +87,7 @@ struct lguest_data lguest_data = { /*G:037 async_hcall() is pretty simple: I'm quite proud of it really. We have a * ring buffer of stored hypercalls which the Host will run though next time we - * do a normal hypercall. Each entry in the ring has 4 slots for the hypercall + * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall * arguments, and a "hcall_status" word which is 0 if the call is ready to go, * and 255 once the Host has finished with it. * @@ -96,7 +96,8 @@ struct lguest_data lguest_data = { * effect of causing the Host to run all the stored calls in the ring buffer * which empties it for next time! */ static void async_hcall(unsigned long call, unsigned long arg1, - unsigned long arg2, unsigned long arg3) + unsigned long arg2, unsigned long arg3, + unsigned long arg4) { /* Note: This code assumes we're uniprocessor. */ static unsigned int next_call; @@ -108,12 +109,13 @@ static void async_hcall(unsigned long call, unsigned long arg1, local_irq_save(flags); if (lguest_data.hcall_status[next_call] != 0xFF) { /* Table full, so do normal hcall which will flush table. */ - kvm_hypercall3(call, arg1, arg2, arg3); + kvm_hypercall4(call, arg1, arg2, arg3, arg4); } else { lguest_data.hcalls[next_call].arg0 = call; lguest_data.hcalls[next_call].arg1 = arg1; lguest_data.hcalls[next_call].arg2 = arg2; lguest_data.hcalls[next_call].arg3 = arg3; + lguest_data.hcalls[next_call].arg4 = arg4; /* Arguments must all be written before we mark it to go */ wmb(); lguest_data.hcall_status[next_call] = 0; @@ -141,7 +143,7 @@ static void lazy_hcall1(unsigned long call, if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) kvm_hypercall1(call, arg1); else - async_hcall(call, arg1, 0, 0); + async_hcall(call, arg1, 0, 0, 0); } static void lazy_hcall2(unsigned long call, @@ -151,7 +153,7 @@ static void lazy_hcall2(unsigned long call, if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) kvm_hypercall2(call, arg1, arg2); else - async_hcall(call, arg1, arg2, 0); + async_hcall(call, arg1, arg2, 0, 0); } static void lazy_hcall3(unsigned long call, @@ -162,7 +164,19 @@ static void lazy_hcall3(unsigned long call, if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) kvm_hypercall3(call, arg1, arg2, arg3); else - async_hcall(call, arg1, arg2, arg3); + async_hcall(call, arg1, arg2, arg3, 0); +} + +static void lazy_hcall4(unsigned long call, + unsigned long arg1, + unsigned long arg2, + unsigned long arg3, + unsigned long arg4) +{ + if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) + kvm_hypercall4(call, arg1, arg2, arg3, arg4); + else + async_hcall(call, arg1, arg2, arg3, arg4); } /* When lazy mode is turned off reset the per-cpu lazy mode variable and then -- cgit v1.2.3 From acdd0b6292b282c4511897ac2691a47befbf1c6a Mon Sep 17 00:00:00 2001 From: Matias Zabaljauregui Date: Fri, 12 Jun 2009 22:27:07 -0600 Subject: lguest: PAE support This version requires that host and guest have the same PAE status. NX cap is not offered to the guest, yet. Signed-off-by: Matias Zabaljauregui Signed-off-by: Rusty Russell --- Documentation/lguest/lguest.txt | 1 - arch/x86/include/asm/lguest.h | 7 +- arch/x86/include/asm/lguest_hcall.h | 3 +- arch/x86/lguest/Kconfig | 1 - arch/x86/lguest/boot.c | 71 +++++++- drivers/lguest/Kconfig | 2 +- drivers/lguest/hypercalls.c | 10 + drivers/lguest/lg.h | 5 + drivers/lguest/page_tables.c | 351 ++++++++++++++++++++++++++++++++---- 9 files changed, 403 insertions(+), 48 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/lguest/lguest.txt b/Documentation/lguest/lguest.txt index 28c747362f95..efb3a6a045a2 100644 --- a/Documentation/lguest/lguest.txt +++ b/Documentation/lguest/lguest.txt @@ -37,7 +37,6 @@ Running Lguest: "Paravirtualized guest support" = Y "Lguest guest support" = Y "High Memory Support" = off/4GB - "PAE (Physical Address Extension) Support" = N "Alignment value to which kernel should be aligned" = 0x100000 (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and CONFIG_PHYSICAL_ALIGN=0x100000) diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h index 1caf57628b9c..313389cd50d2 100644 --- a/arch/x86/include/asm/lguest.h +++ b/arch/x86/include/asm/lguest.h @@ -17,8 +17,13 @@ /* Pages for switcher itself, then two pages per cpu */ #define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids) -/* We map at -4M for ease of mapping into the guest (one PTE page). */ +/* We map at -4M (-2M when PAE is activated) for ease of mapping + * into the guest (one PTE page). */ +#ifdef CONFIG_X86_PAE +#define SWITCHER_ADDR 0xFFE00000 +#else #define SWITCHER_ADDR 0xFFC00000 +#endif /* Found in switcher.S */ extern unsigned long default_idt_entries[]; diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h index b14b3552a4db..d31c4a684078 100644 --- a/arch/x86/include/asm/lguest_hcall.h +++ b/arch/x86/include/asm/lguest_hcall.h @@ -12,6 +12,7 @@ #define LHCALL_TS 8 #define LHCALL_SET_CLOCKEVENT 9 #define LHCALL_HALT 10 +#define LHCALL_SET_PMD 13 #define LHCALL_SET_PTE 14 #define LHCALL_SET_PGD 15 #define LHCALL_LOAD_TLS 16 @@ -33,7 +34,7 @@ * operations? There are two ways: the direct way is to make a "hypercall", * to make requests of the Host Itself. * - * We use the KVM hypercall mechanism. Eighteen hypercalls are + * We use the KVM hypercall mechanism. Seventeen hypercalls are * available: the hypercall number is put in the %eax register, and the * arguments (when required) are placed in %ebx, %ecx, %edx and %esi. * If a return value makes sense, it's returned in %eax. diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig index 8dab8f7844d3..38718041efc3 100644 --- a/arch/x86/lguest/Kconfig +++ b/arch/x86/lguest/Kconfig @@ -2,7 +2,6 @@ config LGUEST_GUEST bool "Lguest guest support" select PARAVIRT depends on X86_32 - depends on !X86_PAE select VIRTIO select VIRTIO_RING select VIRTIO_CONSOLE diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index d12f554e5f6a..7bc65f0f62c4 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -167,6 +167,7 @@ static void lazy_hcall3(unsigned long call, async_hcall(call, arg1, arg2, arg3, 0); } +#ifdef CONFIG_X86_PAE static void lazy_hcall4(unsigned long call, unsigned long arg1, unsigned long arg2, @@ -178,6 +179,7 @@ static void lazy_hcall4(unsigned long call, else async_hcall(call, arg1, arg2, arg3, arg4); } +#endif /* When lazy mode is turned off reset the per-cpu lazy mode variable and then * issue the do-nothing hypercall to flush any stored calls. */ @@ -380,8 +382,8 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, case 1: /* Basic feature request. */ /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */ *cx &= 0x00002201; - /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU. */ - *dx &= 0x07808111; + /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU, PAE. */ + *dx &= 0x07808151; /* The Host can do a nice optimization if it knows that the * kernel mappings (addresses above 0xC0000000 or whatever * PAGE_OFFSET is set to) haven't changed. But Linux calls @@ -400,6 +402,11 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, if (*ax > 0x80000008) *ax = 0x80000008; break; + case 0x80000001: + /* Here we should fix nx cap depending on host. */ + /* For this version of PAE, we just clear NX bit. */ + *dx &= ~(1 << 20); + break; } } @@ -533,7 +540,12 @@ static void lguest_write_cr4(unsigned long val) static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { +#ifdef CONFIG_X86_PAE + lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr, + ptep->pte_low, ptep->pte_high); +#else lazy_hcall3(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low); +#endif } static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, @@ -543,15 +555,37 @@ static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, lguest_pte_update(mm, addr, ptep); } -/* The Guest calls this to set a top-level entry. Again, we set the entry then - * tell the Host which top-level page we changed, and the index of the entry we - * changed. */ +/* The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd + * to set a middle-level entry when PAE is activated. + * Again, we set the entry then tell the Host which page we changed, + * and the index of the entry we changed. */ +#ifdef CONFIG_X86_PAE +static void lguest_set_pud(pud_t *pudp, pud_t pudval) +{ + native_set_pud(pudp, pudval); + + /* 32 bytes aligned pdpt address and the index. */ + lazy_hcall2(LHCALL_SET_PGD, __pa(pudp) & 0xFFFFFFE0, + (__pa(pudp) & 0x1F) / sizeof(pud_t)); +} + +static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) +{ + native_set_pmd(pmdp, pmdval); + lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK, + (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t)); +} +#else + +/* The Guest calls lguest_set_pmd to set a top-level entry when PAE is not + * activated. */ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) { native_set_pmd(pmdp, pmdval); lazy_hcall2(LHCALL_SET_PGD, __pa(pmdp) & PAGE_MASK, (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t)); } +#endif /* There are a couple of legacy places where the kernel sets a PTE, but we * don't know the top level any more. This is useless for us, since we don't @@ -569,6 +603,26 @@ static void lguest_set_pte(pte_t *ptep, pte_t pteval) lazy_hcall1(LHCALL_FLUSH_TLB, 1); } +#ifdef CONFIG_X86_PAE +static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte) +{ + native_set_pte_atomic(ptep, pte); + if (cr3_changed) + lazy_hcall1(LHCALL_FLUSH_TLB, 1); +} + +void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +{ + native_pte_clear(mm, addr, ptep); + lguest_pte_update(mm, addr, ptep); +} + +void lguest_pmd_clear(pmd_t *pmdp) +{ + lguest_set_pmd(pmdp, __pmd(0)); +} +#endif + /* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on * native page table operations. On native hardware you can set a new page * table entry whenever you want, but if you want to remove one you have to do @@ -1035,6 +1089,7 @@ __init void lguest_init(void) pv_info.name = "lguest"; pv_info.paravirt_enabled = 1; pv_info.kernel_rpl = 1; + pv_info.shared_kernel_pmd = 1; /* We set up all the lguest overrides for sensitive operations. These * are detailed with the operations themselves. */ @@ -1080,6 +1135,12 @@ __init void lguest_init(void) pv_mmu_ops.set_pte = lguest_set_pte; pv_mmu_ops.set_pte_at = lguest_set_pte_at; pv_mmu_ops.set_pmd = lguest_set_pmd; +#ifdef CONFIG_X86_PAE + pv_mmu_ops.set_pte_atomic = lguest_set_pte_atomic; + pv_mmu_ops.pte_clear = lguest_pte_clear; + pv_mmu_ops.pmd_clear = lguest_pmd_clear; + pv_mmu_ops.set_pud = lguest_set_pud; +#endif pv_mmu_ops.read_cr2 = lguest_read_cr2; pv_mmu_ops.read_cr3 = lguest_read_cr3; pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu; diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig index a3d3cbab359a..8f63845db830 100644 --- a/drivers/lguest/Kconfig +++ b/drivers/lguest/Kconfig @@ -1,6 +1,6 @@ config LGUEST tristate "Linux hypervisor example code" - depends on X86_32 && EXPERIMENTAL && !X86_PAE && FUTEX + depends on X86_32 && EXPERIMENTAL && FUTEX select HVC_DRIVER ---help--- This is a very simple module which allows you to run diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c index 51149ca14617..c29ffa19cb74 100644 --- a/drivers/lguest/hypercalls.c +++ b/drivers/lguest/hypercalls.c @@ -77,11 +77,21 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) guest_set_stack(cpu, args->arg1, args->arg2, args->arg3); break; case LHCALL_SET_PTE: +#ifdef CONFIG_X86_PAE + guest_set_pte(cpu, args->arg1, args->arg2, + __pte(args->arg3 | (u64)args->arg4 << 32)); +#else guest_set_pte(cpu, args->arg1, args->arg2, __pte(args->arg3)); +#endif break; case LHCALL_SET_PGD: guest_set_pgd(cpu->lg, args->arg1, args->arg2); break; +#ifdef CONFIG_X86_PAE + case LHCALL_SET_PMD: + guest_set_pmd(cpu->lg, args->arg1, args->arg2); + break; +#endif case LHCALL_SET_CLOCKEVENT: guest_set_clockevent(cpu, args->arg1); break; diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index cacc2da2058d..6201ce59e886 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -137,6 +137,8 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user); * in the kernel. */ #define pgd_flags(x) (pgd_val(x) & ~PAGE_MASK) #define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT) +#define pmd_flags(x) (pmd_val(x) & ~PAGE_MASK) +#define pmd_pfn(x) (pmd_val(x) >> PAGE_SHIFT) /* interrupts_and_traps.c: */ unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more); @@ -170,6 +172,9 @@ int init_guest_pagetable(struct lguest *lg); void free_guest_pagetable(struct lguest *lg); void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable); void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 i); +#ifdef CONFIG_X86_PAE +void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i); +#endif void guest_pagetable_clear_all(struct lg_cpu *cpu); void guest_pagetable_flush_user(struct lg_cpu *cpu); void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir, diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index 6a54d76b6236..5e2c26adcf06 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -53,6 +53,17 @@ * page. */ #define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1) +/* For PAE we need the PMD index as well. We use the last 2MB, so we + * will need the last pmd entry of the last pmd page. */ +#ifdef CONFIG_X86_PAE +#define SWITCHER_PMD_INDEX (PTRS_PER_PMD - 1) +#define RESERVE_MEM 2U +#define CHECK_GPGD_MASK _PAGE_PRESENT +#else +#define RESERVE_MEM 4U +#define CHECK_GPGD_MASK _PAGE_TABLE +#endif + /* We actually need a separate PTE page for each CPU. Remember that after the * Switcher code itself comes two pages for each CPU, and we don't want this * CPU's guest to see the pages of any other CPU. */ @@ -73,23 +84,58 @@ static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr) { unsigned int index = pgd_index(vaddr); +#ifndef CONFIG_X86_PAE /* We kill any Guest trying to touch the Switcher addresses. */ if (index >= SWITCHER_PGD_INDEX) { kill_guest(cpu, "attempt to access switcher pages"); index = 0; } +#endif /* Return a pointer index'th pgd entry for the i'th page table. */ return &cpu->lg->pgdirs[i].pgdir[index]; } +#ifdef CONFIG_X86_PAE +/* This routine then takes the PGD entry given above, which contains the + * address of the PMD page. It then returns a pointer to the PMD entry for the + * given address. */ +static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) +{ + unsigned int index = pmd_index(vaddr); + pmd_t *page; + + /* We kill any Guest trying to touch the Switcher addresses. */ + if (pgd_index(vaddr) == SWITCHER_PGD_INDEX && + index >= SWITCHER_PMD_INDEX) { + kill_guest(cpu, "attempt to access switcher pages"); + index = 0; + } + + /* You should never call this if the PGD entry wasn't valid */ + BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT)); + page = __va(pgd_pfn(spgd) << PAGE_SHIFT); + + return &page[index]; +} +#endif + /* This routine then takes the page directory entry returned above, which * contains the address of the page table entry (PTE) page. It then returns a * pointer to the PTE entry for the given address. */ -static pte_t *spte_addr(pgd_t spgd, unsigned long vaddr) +static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) { +#ifdef CONFIG_X86_PAE + pmd_t *pmd = spmd_addr(cpu, spgd, vaddr); + pte_t *page = __va(pmd_pfn(*pmd) << PAGE_SHIFT); + + /* You should never call this if the PMD entry wasn't valid */ + BUG_ON(!(pmd_flags(*pmd) & _PAGE_PRESENT)); +#else pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT); /* You should never call this if the PGD entry wasn't valid */ BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT)); +#endif + return &page[pte_index(vaddr)]; } @@ -101,10 +147,31 @@ static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) return cpu->lg->pgdirs[cpu->cpu_pgd].gpgdir + index * sizeof(pgd_t); } -static unsigned long gpte_addr(pgd_t gpgd, unsigned long vaddr) +#ifdef CONFIG_X86_PAE +static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr) { unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT)); + return gpage + pmd_index(vaddr) * sizeof(pmd_t); +} +#endif + +static unsigned long gpte_addr(struct lg_cpu *cpu, + pgd_t gpgd, unsigned long vaddr) +{ +#ifdef CONFIG_X86_PAE + pmd_t gpmd; +#endif + unsigned long gpage; + + BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT)); +#ifdef CONFIG_X86_PAE + gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); + gpage = pmd_pfn(gpmd) << PAGE_SHIFT; + BUG_ON(!(pmd_flags(gpmd) & _PAGE_PRESENT)); +#else + gpage = pgd_pfn(gpgd) << PAGE_SHIFT; +#endif return gpage + pte_index(vaddr) * sizeof(pte_t); } /*:*/ @@ -184,11 +251,20 @@ static void check_gpte(struct lg_cpu *cpu, pte_t gpte) static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd) { - if ((pgd_flags(gpgd) & ~_PAGE_TABLE) || + if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) || (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) kill_guest(cpu, "bad page directory entry"); } +#ifdef CONFIG_X86_PAE +static void check_gpmd(struct lg_cpu *cpu, pmd_t gpmd) +{ + if ((pmd_flags(gpmd) & ~_PAGE_TABLE) || + (pmd_pfn(gpmd) >= cpu->lg->pfn_limit)) + kill_guest(cpu, "bad page middle directory entry"); +} +#endif + /*H:330 * (i) Looking up a page table entry when the Guest faults. * @@ -207,6 +283,11 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) pte_t gpte; pte_t *spte; +#ifdef CONFIG_X86_PAE + pmd_t *spmd; + pmd_t gpmd; +#endif + /* First step: get the top-level Guest page table entry. */ gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); /* Toplevel not present? We can't map it in. */ @@ -228,12 +309,40 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) check_gpgd(cpu, gpgd); /* And we copy the flags to the shadow PGD entry. The page * number in the shadow PGD is the page we just allocated. */ - *spgd = __pgd(__pa(ptepage) | pgd_flags(gpgd)); + set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd))); } +#ifdef CONFIG_X86_PAE + gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); + /* middle level not present? We can't map it in. */ + if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) + return false; + + /* Now look at the matching shadow entry. */ + spmd = spmd_addr(cpu, *spgd, vaddr); + + if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) { + /* No shadow entry: allocate a new shadow PTE page. */ + unsigned long ptepage = get_zeroed_page(GFP_KERNEL); + + /* This is not really the Guest's fault, but killing it is + * simple for this corner case. */ + if (!ptepage) { + kill_guest(cpu, "out of memory allocating pte page"); + return false; + } + + /* We check that the Guest pmd is OK. */ + check_gpmd(cpu, gpmd); + + /* And we copy the flags to the shadow PMD entry. The page + * number in the shadow PMD is the page we just allocated. */ + native_set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd))); + } +#endif /* OK, now we look at the lower level in the Guest page table: keep its * address, because we might update it later. */ - gpte_ptr = gpte_addr(gpgd, vaddr); + gpte_ptr = gpte_addr(cpu, gpgd, vaddr); gpte = lgread(cpu, gpte_ptr, pte_t); /* If this page isn't in the Guest page tables, we can't page it in. */ @@ -259,7 +368,7 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) gpte = pte_mkdirty(gpte); /* Get the pointer to the shadow PTE entry we're going to set. */ - spte = spte_addr(*spgd, vaddr); + spte = spte_addr(cpu, *spgd, vaddr); /* If there was a valid shadow PTE entry here before, we release it. * This can happen with a write to a previously read-only entry. */ release_pte(*spte); @@ -301,14 +410,23 @@ static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr) pgd_t *spgd; unsigned long flags; +#ifdef CONFIG_X86_PAE + pmd_t *spmd; +#endif /* Look at the current top level entry: is it present? */ spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) return false; +#ifdef CONFIG_X86_PAE + spmd = spmd_addr(cpu, *spgd, vaddr); + if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) + return false; +#endif + /* Check the flags on the pte entry itself: it must be present and * writable. */ - flags = pte_flags(*(spte_addr(*spgd, vaddr))); + flags = pte_flags(*(spte_addr(cpu, *spgd, vaddr))); return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); } @@ -322,6 +440,41 @@ void pin_page(struct lg_cpu *cpu, unsigned long vaddr) kill_guest(cpu, "bad stack page %#lx", vaddr); } +#ifdef CONFIG_X86_PAE +static void release_pmd(pmd_t *spmd) +{ + /* If the entry's not present, there's nothing to release. */ + if (pmd_flags(*spmd) & _PAGE_PRESENT) { + unsigned int i; + pte_t *ptepage = __va(pmd_pfn(*spmd) << PAGE_SHIFT); + /* For each entry in the page, we might need to release it. */ + for (i = 0; i < PTRS_PER_PTE; i++) + release_pte(ptepage[i]); + /* Now we can free the page of PTEs */ + free_page((long)ptepage); + /* And zero out the PMD entry so we never release it twice. */ + native_set_pmd(spmd, __pmd(0)); + } +} + +static void release_pgd(pgd_t *spgd) +{ + /* If the entry's not present, there's nothing to release. */ + if (pgd_flags(*spgd) & _PAGE_PRESENT) { + unsigned int i; + pmd_t *pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); + + for (i = 0; i < PTRS_PER_PMD; i++) + release_pmd(&pmdpage[i]); + + /* Now we can free the page of PMDs */ + free_page((long)pmdpage); + /* And zero out the PGD entry so we never release it twice. */ + set_pgd(spgd, __pgd(0)); + } +} + +#else /* !CONFIG_X86_PAE */ /*H:450 If we chase down the release_pgd() code, it looks like this: */ static void release_pgd(pgd_t *spgd) { @@ -341,7 +494,7 @@ static void release_pgd(pgd_t *spgd) *spgd = __pgd(0); } } - +#endif /*H:445 We saw flush_user_mappings() twice: once from the flush_user_mappings() * hypercall and once in new_pgdir() when we re-used a top-level pgdir page. * It simply releases every PTE page from 0 up to the Guest's kernel address. */ @@ -370,6 +523,9 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr) pgd_t gpgd; pte_t gpte; +#ifdef CONFIG_X86_PAE + pmd_t gpmd; +#endif /* First step: get the top-level Guest page table entry. */ gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); /* Toplevel not present? We can't map it in. */ @@ -378,7 +534,13 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr) return -1UL; } - gpte = lgread(cpu, gpte_addr(gpgd, vaddr), pte_t); + gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t); +#ifdef CONFIG_X86_PAE + gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); + if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) + kill_guest(cpu, "Bad address %#lx", vaddr); +#endif + gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t); if (!(pte_flags(gpte) & _PAGE_PRESENT)) kill_guest(cpu, "Bad address %#lx", vaddr); @@ -405,6 +567,9 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, int *blank_pgdir) { unsigned int next; +#ifdef CONFIG_X86_PAE + pmd_t *pmd_table; +#endif /* We pick one entry at random to throw out. Choosing the Least * Recently Used might be better, but this is easy. */ @@ -416,10 +581,27 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, /* If the allocation fails, just keep using the one we have */ if (!cpu->lg->pgdirs[next].pgdir) next = cpu->cpu_pgd; - else - /* This is a blank page, so there are no kernel - * mappings: caller must map the stack! */ + else { +#ifdef CONFIG_X86_PAE + /* In PAE mode, allocate a pmd page and populate the + * last pgd entry. */ + pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL); + if (!pmd_table) { + free_page((long)cpu->lg->pgdirs[next].pgdir); + set_pgd(cpu->lg->pgdirs[next].pgdir, __pgd(0)); + next = cpu->cpu_pgd; + } else { + set_pgd(cpu->lg->pgdirs[next].pgdir + + SWITCHER_PGD_INDEX, + __pgd(__pa(pmd_table) | _PAGE_PRESENT)); + /* This is a blank page, so there are no kernel + * mappings: caller must map the stack! */ + *blank_pgdir = 1; + } +#else *blank_pgdir = 1; +#endif + } } /* Record which Guest toplevel this shadows. */ cpu->lg->pgdirs[next].gpgdir = gpgdir; @@ -460,10 +642,25 @@ static void release_all_pagetables(struct lguest *lg) /* Every shadow pagetable this Guest has */ for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) - if (lg->pgdirs[i].pgdir) + if (lg->pgdirs[i].pgdir) { +#ifdef CONFIG_X86_PAE + pgd_t *spgd; + pmd_t *pmdpage; + unsigned int k; + + /* Get the last pmd page. */ + spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX; + pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); + + /* And release the pmd entries of that pmd page, + * except for the switcher pmd. */ + for (k = 0; k < SWITCHER_PMD_INDEX; k++) + release_pmd(&pmdpage[k]); +#endif /* Every PGD entry except the Switcher at the top */ for (j = 0; j < SWITCHER_PGD_INDEX; j++) release_pgd(lg->pgdirs[i].pgdir + j); + } } /* We also throw away everything when a Guest tells us it's changed a kernel @@ -504,24 +701,37 @@ static void do_set_pte(struct lg_cpu *cpu, int idx, { /* Look up the matching shadow page directory entry. */ pgd_t *spgd = spgd_addr(cpu, idx, vaddr); +#ifdef CONFIG_X86_PAE + pmd_t *spmd; +#endif /* If the top level isn't present, there's no entry to update. */ if (pgd_flags(*spgd) & _PAGE_PRESENT) { - /* Otherwise, we start by releasing the existing entry. */ - pte_t *spte = spte_addr(*spgd, vaddr); - release_pte(*spte); - - /* If they're setting this entry as dirty or accessed, we might - * as well put that entry they've given us in now. This shaves - * 10% off a copy-on-write micro-benchmark. */ - if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { - check_gpte(cpu, gpte); - *spte = gpte_to_spte(cpu, gpte, - pte_flags(gpte) & _PAGE_DIRTY); - } else - /* Otherwise kill it and we can demand_page() it in - * later. */ - *spte = __pte(0); +#ifdef CONFIG_X86_PAE + spmd = spmd_addr(cpu, *spgd, vaddr); + if (pmd_flags(*spmd) & _PAGE_PRESENT) { +#endif + /* Otherwise, we start by releasing + * the existing entry. */ + pte_t *spte = spte_addr(cpu, *spgd, vaddr); + release_pte(*spte); + + /* If they're setting this entry as dirty or accessed, + * we might as well put that entry they've given us + * in now. This shaves 10% off a + * copy-on-write micro-benchmark. */ + if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { + check_gpte(cpu, gpte); + native_set_pte(spte, + gpte_to_spte(cpu, gpte, + pte_flags(gpte) & _PAGE_DIRTY)); + } else + /* Otherwise kill it and we can demand_page() + * it in later. */ + native_set_pte(spte, __pte(0)); +#ifdef CONFIG_X86_PAE + } +#endif } } @@ -572,8 +782,6 @@ void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx) { int pgdir; - /* The kernel seems to try to initialize this early on: we ignore its - * attempts to map over the Switcher. */ if (idx >= SWITCHER_PGD_INDEX) return; @@ -583,6 +791,12 @@ void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx) /* ... throw it away. */ release_pgd(lg->pgdirs[pgdir].pgdir + idx); } +#ifdef CONFIG_X86_PAE +void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx) +{ + guest_pagetable_clear_all(&lg->cpus[0]); +} +#endif /* Once we know how much memory we have we can construct simple identity * (which set virtual == physical) and linear mappings @@ -596,8 +810,16 @@ static unsigned long setup_pagetables(struct lguest *lg, { pgd_t __user *pgdir; pte_t __user *linear; - unsigned int mapped_pages, i, linear_pages, phys_linear; unsigned long mem_base = (unsigned long)lg->mem_base; + unsigned int mapped_pages, i, linear_pages; +#ifdef CONFIG_X86_PAE + pmd_t __user *pmds; + unsigned int j; + pgd_t pgd; + pmd_t pmd; +#else + unsigned int phys_linear; +#endif /* We have mapped_pages frames to map, so we need * linear_pages page tables to map them. */ @@ -610,6 +832,9 @@ static unsigned long setup_pagetables(struct lguest *lg, /* Now we use the next linear_pages pages as pte pages */ linear = (void *)pgdir - linear_pages * PAGE_SIZE; +#ifdef CONFIG_X86_PAE + pmds = (void *)linear - PAGE_SIZE; +#endif /* Linear mapping is easy: put every page's address into the * mapping in order. */ for (i = 0; i < mapped_pages; i++) { @@ -621,6 +846,22 @@ static unsigned long setup_pagetables(struct lguest *lg, /* The top level points to the linear page table pages above. * We setup the identity and linear mappings here. */ +#ifdef CONFIG_X86_PAE + for (i = 0, j; i < mapped_pages && j < PTRS_PER_PMD; + i += PTRS_PER_PTE, j++) { + native_set_pmd(&pmd, __pmd(((unsigned long)(linear + i) + - mem_base) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); + + if (copy_to_user(&pmds[j], &pmd, sizeof(pmd)) != 0) + return -EFAULT; + } + + set_pgd(&pgd, __pgd(((u32)pmds - mem_base) | _PAGE_PRESENT)); + if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0) + return -EFAULT; + if (copy_to_user(&pgdir[3], &pgd, sizeof(pgd)) != 0) + return -EFAULT; +#else phys_linear = (unsigned long)linear - mem_base; for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) { pgd_t pgd; @@ -633,6 +874,7 @@ static unsigned long setup_pagetables(struct lguest *lg, &pgd, sizeof(pgd))) return -EFAULT; } +#endif /* We return the top level (guest-physical) address: remember where * this is. */ @@ -648,7 +890,10 @@ int init_guest_pagetable(struct lguest *lg) u64 mem; u32 initrd_size; struct boot_params __user *boot = (struct boot_params *)lg->mem_base; - +#ifdef CONFIG_X86_PAE + pgd_t *pgd; + pmd_t *pmd_table; +#endif /* Get the Guest memory size and the ramdisk size from the boot header * located at lg->mem_base (Guest address 0). */ if (copy_from_user(&mem, &boot->e820_map[0].size, sizeof(mem)) @@ -663,6 +908,15 @@ int init_guest_pagetable(struct lguest *lg) lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); if (!lg->pgdirs[0].pgdir) return -ENOMEM; +#ifdef CONFIG_X86_PAE + pgd = lg->pgdirs[0].pgdir; + pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL); + if (!pmd_table) + return -ENOMEM; + + set_pgd(pgd + SWITCHER_PGD_INDEX, + __pgd(__pa(pmd_table) | _PAGE_PRESENT)); +#endif lg->cpus[0].cpu_pgd = 0; return 0; } @@ -672,17 +926,24 @@ void page_table_guest_data_init(struct lg_cpu *cpu) { /* We get the kernel address: above this is all kernel memory. */ if (get_user(cpu->lg->kernel_address, - &cpu->lg->lguest_data->kernel_address) - /* We tell the Guest that it can't use the top 4MB of virtual - * addresses used by the Switcher. */ - || put_user(4U*1024*1024, &cpu->lg->lguest_data->reserve_mem) - || put_user(cpu->lg->pgdirs[0].gpgdir, &cpu->lg->lguest_data->pgdir)) + &cpu->lg->lguest_data->kernel_address) + /* We tell the Guest that it can't use the top 2 or 4 MB + * of virtual addresses used by the Switcher. */ + || put_user(RESERVE_MEM * 1024 * 1024, + &cpu->lg->lguest_data->reserve_mem) + || put_user(cpu->lg->pgdirs[0].gpgdir, + &cpu->lg->lguest_data->pgdir)) kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); /* In flush_user_mappings() we loop from 0 to * "pgd_index(lg->kernel_address)". This assumes it won't hit the * Switcher mappings, so check that now. */ +#ifdef CONFIG_X86_PAE + if (pgd_index(cpu->lg->kernel_address) == SWITCHER_PGD_INDEX && + pmd_index(cpu->lg->kernel_address) == SWITCHER_PMD_INDEX) +#else if (pgd_index(cpu->lg->kernel_address) >= SWITCHER_PGD_INDEX) +#endif kill_guest(cpu, "bad kernel address %#lx", cpu->lg->kernel_address); } @@ -708,16 +969,30 @@ void free_guest_pagetable(struct lguest *lg) void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) { pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); - pgd_t switcher_pgd; pte_t regs_pte; unsigned long pfn; +#ifdef CONFIG_X86_PAE + pmd_t switcher_pmd; + pmd_t *pmd_table; + + native_set_pmd(&switcher_pmd, pfn_pmd(__pa(switcher_pte_page) >> + PAGE_SHIFT, PAGE_KERNEL_EXEC)); + + pmd_table = __va(pgd_pfn(cpu->lg-> + pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX]) + << PAGE_SHIFT); + native_set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd); +#else + pgd_t switcher_pgd; + /* Make the last PGD entry for this Guest point to the Switcher's PTE * page for this CPU (with appropriate flags). */ switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC); cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; +#endif /* We also change the Switcher PTE page. When we're running the Guest, * we want the Guest's "regs" page to appear where the first Switcher * page for this CPU is. This is an optimization: when the Switcher -- cgit v1.2.3 From ce4b3c55475e451cb489e857640396c37ca88974 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 18 Apr 2009 13:44:57 +0200 Subject: PM/ACPI/x86: Fix sparse warning in arch/x86/kernel/acpi/sleep.c One of the numbers in arch/x86/kernel/acpi/sleep.c is long, but it is not annotated appropriately, so sparese warns about it. Fix that. [rjw: added the changelog.] Signed-off-by: Rafael J. Wysocki --- arch/x86/kernel/acpi/sleep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 7c243a2c5115..ca93638ba430 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -104,7 +104,7 @@ int acpi_save_state_mem(void) initial_gs = per_cpu_offset(smp_processor_id()); #endif initial_code = (unsigned long)wakeup_long64; - saved_magic = 0x123456789abcdef0; + saved_magic = 0x123456789abcdef0L; #endif /* CONFIG_64BIT */ return 0; -- cgit v1.2.3 From f6783d20d4b85b360b4a6f86bdbd9282a4a7004c Mon Sep 17 00:00:00 2001 From: Sergio Luis Date: Tue, 28 Apr 2009 00:26:22 +0200 Subject: x86: unify power/cpu_(32|64) headers First step towards the unification of cpu_32.c and cpu_64.c. This commit unifies the headers of such files, making both of them use the same header files. It also remove the uneeded . Signed-off-by: Sergio Luis Signed-off-by: Lauro Salmito Signed-off-by: Rafael J. Wysocki --- arch/x86/power/cpu_32.c | 6 +++++- arch/x86/power/cpu_64.c | 8 +++++--- 2 files changed, 10 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/power/cpu_32.c b/arch/x86/power/cpu_32.c index ce702c5b3a2c..12a9c871d2ec 100644 --- a/arch/x86/power/cpu_32.c +++ b/arch/x86/power/cpu_32.c @@ -7,9 +7,13 @@ * Copyright (c) 2001 Patrick Mochel */ -#include #include +#include + +#include +#include #include +#include #include #include #include diff --git a/arch/x86/power/cpu_64.c b/arch/x86/power/cpu_64.c index 5343540f2607..39b27b72e3c5 100644 --- a/arch/x86/power/cpu_64.c +++ b/arch/x86/power/cpu_64.c @@ -8,12 +8,14 @@ * Copyright (c) 2001 Patrick Mochel */ -#include #include -#include -#include +#include + #include +#include #include +#include +#include #include #include -- cgit v1.2.3 From 833b2ca0795526898a66c7b6770273bb16567e19 Mon Sep 17 00:00:00 2001 From: Sergio Luis Date: Tue, 28 Apr 2009 00:26:50 +0200 Subject: x86: unify power/cpu_(32|64) global variables Aiming total unification of cpu_32.c and cpu_64.c, in this step we do unify the global variables and existing forward declarations for such files. Signed-off-by: Sergio Luis Signed-off-by: Lauro Salmito Signed-off-by: Rafael J. Wysocki --- arch/x86/power/cpu_32.c | 7 +++++++ arch/x86/power/cpu_64.c | 10 ++++++++++ 2 files changed, 17 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/power/cpu_32.c b/arch/x86/power/cpu_32.c index 12a9c871d2ec..de1a86b2cffa 100644 --- a/arch/x86/power/cpu_32.c +++ b/arch/x86/power/cpu_32.c @@ -18,12 +18,19 @@ #include #include +#ifdef CONFIG_X86_32 static struct saved_context saved_context; unsigned long saved_context_ebx; unsigned long saved_context_esp, saved_context_ebp; unsigned long saved_context_esi, saved_context_edi; unsigned long saved_context_eflags; +#else +/* CONFIG_X86_64 */ +static void fix_processor_context(void); + +struct saved_context saved_context; +#endif static void __save_processor_state(struct saved_context *ctxt) { diff --git a/arch/x86/power/cpu_64.c b/arch/x86/power/cpu_64.c index 39b27b72e3c5..6ce0eca847c3 100644 --- a/arch/x86/power/cpu_64.c +++ b/arch/x86/power/cpu_64.c @@ -19,9 +19,19 @@ #include #include +#ifdef CONFIG_X86_32 +static struct saved_context saved_context; + +unsigned long saved_context_ebx; +unsigned long saved_context_esp, saved_context_ebp; +unsigned long saved_context_esi, saved_context_edi; +unsigned long saved_context_eflags; +#else +/* CONFIG_X86_64 */ static void fix_processor_context(void); struct saved_context saved_context; +#endif /** * __save_processor_state - save CPU registers before creating a -- cgit v1.2.3 From f9ebbe53e79c5978d0e8ead0843a3717b41ad3d5 Mon Sep 17 00:00:00 2001 From: Sergio Luis Date: Tue, 28 Apr 2009 00:27:00 +0200 Subject: x86: unify power/cpu_(32|64) regarding saving processor state In this step we do unify cpu_32.c and cpu_64.c functions that work on saving the processor state. Signed-off-by: Sergio Luis Signed-off-by: Lauro Salmito Signed-off-by: Rafael J. Wysocki --- arch/x86/power/cpu_32.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/power/cpu_64.c | 29 ++++++++++++++++++++++++++++- 2 files changed, 76 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/power/cpu_32.c b/arch/x86/power/cpu_32.c index de1a86b2cffa..294e78baff75 100644 --- a/arch/x86/power/cpu_32.c +++ b/arch/x86/power/cpu_32.c @@ -32,25 +32,65 @@ static void fix_processor_context(void); struct saved_context saved_context; #endif +/** + * __save_processor_state - save CPU registers before creating a + * hibernation image and before restoring the memory state from it + * @ctxt - structure to store the registers contents in + * + * NOTE: If there is a CPU register the modification of which by the + * boot kernel (ie. the kernel used for loading the hibernation image) + * might affect the operations of the restored target kernel (ie. the one + * saved in the hibernation image), then its contents must be saved by this + * function. In other words, if kernel A is hibernated and different + * kernel B is used for loading the hibernation image into memory, the + * kernel A's __save_processor_state() function must save all registers + * needed by kernel A, so that it can operate correctly after the resume + * regardless of what kernel B does in the meantime. + */ static void __save_processor_state(struct saved_context *ctxt) { +#ifdef CONFIG_X86_32 mtrr_save_fixed_ranges(NULL); +#endif kernel_fpu_begin(); /* * descriptor tables */ +#ifdef CONFIG_X86_32 store_gdt(&ctxt->gdt); store_idt(&ctxt->idt); +#else +/* CONFIG_X86_64 */ + store_gdt((struct desc_ptr *)&ctxt->gdt_limit); + store_idt((struct desc_ptr *)&ctxt->idt_limit); +#endif store_tr(ctxt->tr); + /* XMM0..XMM15 should be handled by kernel_fpu_begin(). */ /* * segment registers */ +#ifdef CONFIG_X86_32 savesegment(es, ctxt->es); savesegment(fs, ctxt->fs); savesegment(gs, ctxt->gs); savesegment(ss, ctxt->ss); +#else +/* CONFIG_X86_64 */ + asm volatile ("movw %%ds, %0" : "=m" (ctxt->ds)); + asm volatile ("movw %%es, %0" : "=m" (ctxt->es)); + asm volatile ("movw %%fs, %0" : "=m" (ctxt->fs)); + asm volatile ("movw %%gs, %0" : "=m" (ctxt->gs)); + asm volatile ("movw %%ss, %0" : "=m" (ctxt->ss)); + + rdmsrl(MSR_FS_BASE, ctxt->fs_base); + rdmsrl(MSR_GS_BASE, ctxt->gs_base); + rdmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); + mtrr_save_fixed_ranges(NULL); + + rdmsrl(MSR_EFER, ctxt->efer); +#endif /* * control registers @@ -58,7 +98,13 @@ static void __save_processor_state(struct saved_context *ctxt) ctxt->cr0 = read_cr0(); ctxt->cr2 = read_cr2(); ctxt->cr3 = read_cr3(); +#ifdef CONFIG_X86_32 ctxt->cr4 = read_cr4_safe(); +#else +/* CONFIG_X86_64 */ + ctxt->cr4 = read_cr4(); + ctxt->cr8 = read_cr8(); +#endif } /* Needed by apm.c */ @@ -66,7 +112,9 @@ void save_processor_state(void) { __save_processor_state(&saved_context); } +#ifdef CONFIG_X86_32 EXPORT_SYMBOL(save_processor_state); +#endif static void do_fpu_end(void) { diff --git a/arch/x86/power/cpu_64.c b/arch/x86/power/cpu_64.c index 6ce0eca847c3..11ea7d0ba5d9 100644 --- a/arch/x86/power/cpu_64.c +++ b/arch/x86/power/cpu_64.c @@ -50,19 +50,35 @@ struct saved_context saved_context; */ static void __save_processor_state(struct saved_context *ctxt) { +#ifdef CONFIG_X86_32 + mtrr_save_fixed_ranges(NULL); +#endif kernel_fpu_begin(); /* * descriptor tables */ +#ifdef CONFIG_X86_32 + store_gdt(&ctxt->gdt); + store_idt(&ctxt->idt); +#else +/* CONFIG_X86_64 */ store_gdt((struct desc_ptr *)&ctxt->gdt_limit); store_idt((struct desc_ptr *)&ctxt->idt_limit); +#endif store_tr(ctxt->tr); /* XMM0..XMM15 should be handled by kernel_fpu_begin(). */ /* * segment registers */ +#ifdef CONFIG_X86_32 + savesegment(es, ctxt->es); + savesegment(fs, ctxt->fs); + savesegment(gs, ctxt->gs); + savesegment(ss, ctxt->ss); +#else +/* CONFIG_X86_64 */ asm volatile ("movw %%ds, %0" : "=m" (ctxt->ds)); asm volatile ("movw %%es, %0" : "=m" (ctxt->es)); asm volatile ("movw %%fs, %0" : "=m" (ctxt->fs)); @@ -74,21 +90,32 @@ static void __save_processor_state(struct saved_context *ctxt) rdmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); mtrr_save_fixed_ranges(NULL); + rdmsrl(MSR_EFER, ctxt->efer); +#endif + /* * control registers */ - rdmsrl(MSR_EFER, ctxt->efer); ctxt->cr0 = read_cr0(); ctxt->cr2 = read_cr2(); ctxt->cr3 = read_cr3(); +#ifdef CONFIG_X86_32 + ctxt->cr4 = read_cr4_safe(); +#else +/* CONFIG_X86_64 */ ctxt->cr4 = read_cr4(); ctxt->cr8 = read_cr8(); +#endif } +/* Needed by apm.c */ void save_processor_state(void) { __save_processor_state(&saved_context); } +#ifdef CONFIG_X86_32 +EXPORT_SYMBOL(save_processor_state); +#endif static void do_fpu_end(void) { -- cgit v1.2.3 From 3134d04b7790f7239b221f16c2d97db4d96ac3c0 Mon Sep 17 00:00:00 2001 From: Sergio Luis Date: Tue, 28 Apr 2009 00:27:05 +0200 Subject: x86: unify power/cpu_(32|64) regarding restoring processor state In this step we do unify cpu_32.c and cpu_64.c functions that work on restoring the saved processor state. Also, we do eliminate the forward declaration of fix_processor_context() for X86_64, as it's not needed anymore. Signed-off-by: Sergio Luis Signed-off-by: Lauro Salmito Signed-off-by: Rafael J. Wysocki --- arch/x86/power/cpu_32.c | 55 +++++++++++++++++++++- arch/x86/power/cpu_64.c | 118 +++++++++++++++++++++++++++++++++--------------- 2 files changed, 135 insertions(+), 38 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/power/cpu_32.c b/arch/x86/power/cpu_32.c index 294e78baff75..29b9c0a1ca62 100644 --- a/arch/x86/power/cpu_32.c +++ b/arch/x86/power/cpu_32.c @@ -27,8 +27,6 @@ unsigned long saved_context_esi, saved_context_edi; unsigned long saved_context_eflags; #else /* CONFIG_X86_64 */ -static void fix_processor_context(void); - struct saved_context saved_context; #endif @@ -136,6 +134,11 @@ static void fix_processor_context(void) * similar stupidity. */ +#ifdef CONFIG_X86_64 + get_cpu_gdt_table(cpu)[GDT_ENTRY_TSS].type = 9; + + syscall_init(); /* This sets MSR_*STAR and related */ +#endif load_TR_desc(); /* This does ltr */ load_LDT(¤t->active_mm->context); /* This does lldt */ @@ -143,6 +146,7 @@ static void fix_processor_context(void) * Now maybe reload the debug registers */ if (current->thread.debugreg7) { +#ifdef CONFIG_X86_32 set_debugreg(current->thread.debugreg0, 0); set_debugreg(current->thread.debugreg1, 1); set_debugreg(current->thread.debugreg2, 2); @@ -150,18 +154,40 @@ static void fix_processor_context(void) /* no 4 and 5 */ set_debugreg(current->thread.debugreg6, 6); set_debugreg(current->thread.debugreg7, 7); +#else + /* CONFIG_X86_64 */ + loaddebug(¤t->thread, 0); + loaddebug(¤t->thread, 1); + loaddebug(¤t->thread, 2); + loaddebug(¤t->thread, 3); + /* no 4 and 5 */ + loaddebug(¤t->thread, 6); + loaddebug(¤t->thread, 7); +#endif } } +/** + * __restore_processor_state - restore the contents of CPU registers saved + * by __save_processor_state() + * @ctxt - structure to load the registers contents from + */ static void __restore_processor_state(struct saved_context *ctxt) { /* * control registers */ /* cr4 was introduced in the Pentium CPU */ +#ifdef CONFIG_X86_32 if (ctxt->cr4) write_cr4(ctxt->cr4); +#else +/* CONFIG X86_64 */ + wrmsrl(MSR_EFER, ctxt->efer); + write_cr8(ctxt->cr8); + write_cr4(ctxt->cr4); +#endif write_cr3(ctxt->cr3); write_cr2(ctxt->cr2); write_cr0(ctxt->cr0); @@ -170,12 +196,19 @@ static void __restore_processor_state(struct saved_context *ctxt) * now restore the descriptor tables to their proper values * ltr is done i fix_processor_context(). */ +#ifdef CONFIG_X86_32 load_gdt(&ctxt->gdt); load_idt(&ctxt->idt); +#else +/* CONFIG_X86_64 */ + load_gdt((const struct desc_ptr *)&ctxt->gdt_limit); + load_idt((const struct desc_ptr *)&ctxt->idt_limit); +#endif /* * segment registers */ +#ifdef CONFIG_X86_32 loadsegment(es, ctxt->es); loadsegment(fs, ctxt->fs); loadsegment(gs, ctxt->gs); @@ -186,6 +219,18 @@ static void __restore_processor_state(struct saved_context *ctxt) */ if (boot_cpu_has(X86_FEATURE_SEP)) enable_sep_cpu(); +#else +/* CONFIG_X86_64 */ + asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds)); + asm volatile ("movw %0, %%es" :: "r" (ctxt->es)); + asm volatile ("movw %0, %%fs" :: "r" (ctxt->fs)); + load_gs_index(ctxt->gs); + asm volatile ("movw %0, %%ss" :: "r" (ctxt->ss)); + + wrmsrl(MSR_FS_BASE, ctxt->fs_base); + wrmsrl(MSR_GS_BASE, ctxt->gs_base); + wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); +#endif /* * restore XCR0 for xsave capable cpu's. @@ -194,9 +239,13 @@ static void __restore_processor_state(struct saved_context *ctxt) xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); fix_processor_context(); + do_fpu_end(); mtrr_ap_init(); + +#ifdef CONFIG_X86_32 mcheck_init(&boot_cpu_data); +#endif } /* Needed by apm.c */ @@ -204,4 +253,6 @@ void restore_processor_state(void) { __restore_processor_state(&saved_context); } +#ifdef CONFIG_X86_32 EXPORT_SYMBOL(restore_processor_state); +#endif diff --git a/arch/x86/power/cpu_64.c b/arch/x86/power/cpu_64.c index 11ea7d0ba5d9..5c8bdc002b8b 100644 --- a/arch/x86/power/cpu_64.c +++ b/arch/x86/power/cpu_64.c @@ -28,8 +28,6 @@ unsigned long saved_context_esi, saved_context_edi; unsigned long saved_context_eflags; #else /* CONFIG_X86_64 */ -static void fix_processor_context(void); - struct saved_context saved_context; #endif @@ -120,11 +118,57 @@ EXPORT_SYMBOL(save_processor_state); static void do_fpu_end(void) { /* - * Restore FPU regs if necessary + * Restore FPU regs if necessary. */ kernel_fpu_end(); } +static void fix_processor_context(void) +{ + int cpu = smp_processor_id(); + struct tss_struct *t = &per_cpu(init_tss, cpu); + + set_tss_desc(cpu, t); /* + * This just modifies memory; should not be + * necessary. But... This is necessary, because + * 386 hardware has concept of busy TSS or some + * similar stupidity. + */ + +#ifdef CONFIG_X86_64 + get_cpu_gdt_table(cpu)[GDT_ENTRY_TSS].type = 9; + + syscall_init(); /* This sets MSR_*STAR and related */ +#endif + load_TR_desc(); /* This does ltr */ + load_LDT(¤t->active_mm->context); /* This does lldt */ + + /* + * Now maybe reload the debug registers + */ + if (current->thread.debugreg7) { +#ifdef CONFIG_X86_32 + set_debugreg(current->thread.debugreg0, 0); + set_debugreg(current->thread.debugreg1, 1); + set_debugreg(current->thread.debugreg2, 2); + set_debugreg(current->thread.debugreg3, 3); + /* no 4 and 5 */ + set_debugreg(current->thread.debugreg6, 6); + set_debugreg(current->thread.debugreg7, 7); +#else + /* CONFIG_X86_64 */ + loaddebug(¤t->thread, 0); + loaddebug(¤t->thread, 1); + loaddebug(¤t->thread, 2); + loaddebug(¤t->thread, 3); + /* no 4 and 5 */ + loaddebug(¤t->thread, 6); + loaddebug(¤t->thread, 7); +#endif + } + +} + /** * __restore_processor_state - restore the contents of CPU registers saved * by __save_processor_state() @@ -135,9 +179,16 @@ static void __restore_processor_state(struct saved_context *ctxt) /* * control registers */ + /* cr4 was introduced in the Pentium CPU */ +#ifdef CONFIG_X86_32 + if (ctxt->cr4) + write_cr4(ctxt->cr4); +#else +/* CONFIG X86_64 */ wrmsrl(MSR_EFER, ctxt->efer); write_cr8(ctxt->cr8); write_cr4(ctxt->cr4); +#endif write_cr3(ctxt->cr3); write_cr2(ctxt->cr2); write_cr0(ctxt->cr0); @@ -146,13 +197,31 @@ static void __restore_processor_state(struct saved_context *ctxt) * now restore the descriptor tables to their proper values * ltr is done i fix_processor_context(). */ +#ifdef CONFIG_X86_32 + load_gdt(&ctxt->gdt); + load_idt(&ctxt->idt); +#else +/* CONFIG_X86_64 */ load_gdt((const struct desc_ptr *)&ctxt->gdt_limit); load_idt((const struct desc_ptr *)&ctxt->idt_limit); - +#endif /* * segment registers */ +#ifdef CONFIG_X86_32 + loadsegment(es, ctxt->es); + loadsegment(fs, ctxt->fs); + loadsegment(gs, ctxt->gs); + loadsegment(ss, ctxt->ss); + + /* + * sysenter MSRs + */ + if (boot_cpu_has(X86_FEATURE_SEP)) + enable_sep_cpu(); +#else +/* CONFIG_X86_64 */ asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds)); asm volatile ("movw %0, %%es" :: "r" (ctxt->es)); asm volatile ("movw %0, %%fs" :: "r" (ctxt->fs)); @@ -162,6 +231,7 @@ static void __restore_processor_state(struct saved_context *ctxt) wrmsrl(MSR_FS_BASE, ctxt->fs_base); wrmsrl(MSR_GS_BASE, ctxt->gs_base); wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); +#endif /* * restore XCR0 for xsave capable cpu's. @@ -173,41 +243,17 @@ static void __restore_processor_state(struct saved_context *ctxt) do_fpu_end(); mtrr_ap_init(); + +#ifdef CONFIG_X86_32 + mcheck_init(&boot_cpu_data); +#endif } +/* Needed by apm.c */ void restore_processor_state(void) { __restore_processor_state(&saved_context); } - -static void fix_processor_context(void) -{ - int cpu = smp_processor_id(); - struct tss_struct *t = &per_cpu(init_tss, cpu); - - /* - * This just modifies memory; should not be necessary. But... This - * is necessary, because 386 hardware has concept of busy TSS or some - * similar stupidity. - */ - set_tss_desc(cpu, t); - - get_cpu_gdt_table(cpu)[GDT_ENTRY_TSS].type = 9; - - syscall_init(); /* This sets MSR_*STAR and related */ - load_TR_desc(); /* This does ltr */ - load_LDT(¤t->active_mm->context); /* This does lldt */ - - /* - * Now maybe reload the debug registers - */ - if (current->thread.debugreg7){ - loaddebug(¤t->thread, 0); - loaddebug(¤t->thread, 1); - loaddebug(¤t->thread, 2); - loaddebug(¤t->thread, 3); - /* no 4 and 5 */ - loaddebug(¤t->thread, 6); - loaddebug(¤t->thread, 7); - } -} +#ifdef CONFIG_X86_32 +EXPORT_SYMBOL(restore_processor_state); +#endif -- cgit v1.2.3 From 6d48becd33a7921694ba1955ba91604d648020f1 Mon Sep 17 00:00:00 2001 From: Sergio Luis Date: Tue, 28 Apr 2009 00:27:18 +0200 Subject: x86: unify power/cpu_(32|64) copyright notes In this step, we do unify the copyright notes for both files cpu_32.c and cpu_64.c, making such files exactly the same. It's the last step before the actual unification, that will rename one of them to cpu.c and remove the other one. Signed-off-by: Sergio Luis Signed-off-by: Lauro Salmito Signed-off-by: Rafael J. Wysocki --- arch/x86/power/cpu_32.c | 3 ++- arch/x86/power/cpu_64.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/power/cpu_32.c b/arch/x86/power/cpu_32.c index 29b9c0a1ca62..ca0ecaed7d4c 100644 --- a/arch/x86/power/cpu_32.c +++ b/arch/x86/power/cpu_32.c @@ -1,8 +1,9 @@ /* - * Suspend support specific for i386. + * Suspend and hibernation support for i386/x86-64. * * Distribute under GPLv2 * + * Copyright (c) 2007 Rafael J. Wysocki * Copyright (c) 2002 Pavel Machek * Copyright (c) 2001 Patrick Mochel */ diff --git a/arch/x86/power/cpu_64.c b/arch/x86/power/cpu_64.c index 5c8bdc002b8b..d277ef1eea51 100644 --- a/arch/x86/power/cpu_64.c +++ b/arch/x86/power/cpu_64.c @@ -1,5 +1,5 @@ /* - * Suspend and hibernation support for x86-64 + * Suspend support specific for i386/x86-64. * * Distribute under GPLv2 * -- cgit v1.2.3 From 08687aec71bc9134fe336e561f6877bacf74fc0a Mon Sep 17 00:00:00 2001 From: Sergio Luis Date: Tue, 28 Apr 2009 00:27:22 +0200 Subject: x86: unify power/cpu_(32|64).c This is the last unification step. Here we do remove one of the files and rename the left one as cpu.c, as both are now the same. Also update power/Makefile, telling it to build cpu.o, instead of cpu_(32|64).o Signed-off-by: Sergio Luis Signed-off-by: Lauro Salmito Signed-off-by: Rafael J. Wysocki --- arch/x86/power/Makefile | 2 +- arch/x86/power/cpu.c | 259 ++++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/power/cpu_32.c | 259 ------------------------------------------------ arch/x86/power/cpu_64.c | 259 ------------------------------------------------ 4 files changed, 260 insertions(+), 519 deletions(-) create mode 100644 arch/x86/power/cpu.c delete mode 100644 arch/x86/power/cpu_32.c delete mode 100644 arch/x86/power/cpu_64.c (limited to 'arch/x86') diff --git a/arch/x86/power/Makefile b/arch/x86/power/Makefile index 58b32db33125..de2abbd07544 100644 --- a/arch/x86/power/Makefile +++ b/arch/x86/power/Makefile @@ -3,5 +3,5 @@ nostackp := $(call cc-option, -fno-stack-protector) CFLAGS_cpu_$(BITS).o := $(nostackp) -obj-$(CONFIG_PM_SLEEP) += cpu_$(BITS).o +obj-$(CONFIG_PM_SLEEP) += cpu.o obj-$(CONFIG_HIBERNATION) += hibernate_$(BITS).o hibernate_asm_$(BITS).o diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c new file mode 100644 index 000000000000..d277ef1eea51 --- /dev/null +++ b/arch/x86/power/cpu.c @@ -0,0 +1,259 @@ +/* + * Suspend support specific for i386/x86-64. + * + * Distribute under GPLv2 + * + * Copyright (c) 2007 Rafael J. Wysocki + * Copyright (c) 2002 Pavel Machek + * Copyright (c) 2001 Patrick Mochel + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_X86_32 +static struct saved_context saved_context; + +unsigned long saved_context_ebx; +unsigned long saved_context_esp, saved_context_ebp; +unsigned long saved_context_esi, saved_context_edi; +unsigned long saved_context_eflags; +#else +/* CONFIG_X86_64 */ +struct saved_context saved_context; +#endif + +/** + * __save_processor_state - save CPU registers before creating a + * hibernation image and before restoring the memory state from it + * @ctxt - structure to store the registers contents in + * + * NOTE: If there is a CPU register the modification of which by the + * boot kernel (ie. the kernel used for loading the hibernation image) + * might affect the operations of the restored target kernel (ie. the one + * saved in the hibernation image), then its contents must be saved by this + * function. In other words, if kernel A is hibernated and different + * kernel B is used for loading the hibernation image into memory, the + * kernel A's __save_processor_state() function must save all registers + * needed by kernel A, so that it can operate correctly after the resume + * regardless of what kernel B does in the meantime. + */ +static void __save_processor_state(struct saved_context *ctxt) +{ +#ifdef CONFIG_X86_32 + mtrr_save_fixed_ranges(NULL); +#endif + kernel_fpu_begin(); + + /* + * descriptor tables + */ +#ifdef CONFIG_X86_32 + store_gdt(&ctxt->gdt); + store_idt(&ctxt->idt); +#else +/* CONFIG_X86_64 */ + store_gdt((struct desc_ptr *)&ctxt->gdt_limit); + store_idt((struct desc_ptr *)&ctxt->idt_limit); +#endif + store_tr(ctxt->tr); + + /* XMM0..XMM15 should be handled by kernel_fpu_begin(). */ + /* + * segment registers + */ +#ifdef CONFIG_X86_32 + savesegment(es, ctxt->es); + savesegment(fs, ctxt->fs); + savesegment(gs, ctxt->gs); + savesegment(ss, ctxt->ss); +#else +/* CONFIG_X86_64 */ + asm volatile ("movw %%ds, %0" : "=m" (ctxt->ds)); + asm volatile ("movw %%es, %0" : "=m" (ctxt->es)); + asm volatile ("movw %%fs, %0" : "=m" (ctxt->fs)); + asm volatile ("movw %%gs, %0" : "=m" (ctxt->gs)); + asm volatile ("movw %%ss, %0" : "=m" (ctxt->ss)); + + rdmsrl(MSR_FS_BASE, ctxt->fs_base); + rdmsrl(MSR_GS_BASE, ctxt->gs_base); + rdmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); + mtrr_save_fixed_ranges(NULL); + + rdmsrl(MSR_EFER, ctxt->efer); +#endif + + /* + * control registers + */ + ctxt->cr0 = read_cr0(); + ctxt->cr2 = read_cr2(); + ctxt->cr3 = read_cr3(); +#ifdef CONFIG_X86_32 + ctxt->cr4 = read_cr4_safe(); +#else +/* CONFIG_X86_64 */ + ctxt->cr4 = read_cr4(); + ctxt->cr8 = read_cr8(); +#endif +} + +/* Needed by apm.c */ +void save_processor_state(void) +{ + __save_processor_state(&saved_context); +} +#ifdef CONFIG_X86_32 +EXPORT_SYMBOL(save_processor_state); +#endif + +static void do_fpu_end(void) +{ + /* + * Restore FPU regs if necessary. + */ + kernel_fpu_end(); +} + +static void fix_processor_context(void) +{ + int cpu = smp_processor_id(); + struct tss_struct *t = &per_cpu(init_tss, cpu); + + set_tss_desc(cpu, t); /* + * This just modifies memory; should not be + * necessary. But... This is necessary, because + * 386 hardware has concept of busy TSS or some + * similar stupidity. + */ + +#ifdef CONFIG_X86_64 + get_cpu_gdt_table(cpu)[GDT_ENTRY_TSS].type = 9; + + syscall_init(); /* This sets MSR_*STAR and related */ +#endif + load_TR_desc(); /* This does ltr */ + load_LDT(¤t->active_mm->context); /* This does lldt */ + + /* + * Now maybe reload the debug registers + */ + if (current->thread.debugreg7) { +#ifdef CONFIG_X86_32 + set_debugreg(current->thread.debugreg0, 0); + set_debugreg(current->thread.debugreg1, 1); + set_debugreg(current->thread.debugreg2, 2); + set_debugreg(current->thread.debugreg3, 3); + /* no 4 and 5 */ + set_debugreg(current->thread.debugreg6, 6); + set_debugreg(current->thread.debugreg7, 7); +#else + /* CONFIG_X86_64 */ + loaddebug(¤t->thread, 0); + loaddebug(¤t->thread, 1); + loaddebug(¤t->thread, 2); + loaddebug(¤t->thread, 3); + /* no 4 and 5 */ + loaddebug(¤t->thread, 6); + loaddebug(¤t->thread, 7); +#endif + } + +} + +/** + * __restore_processor_state - restore the contents of CPU registers saved + * by __save_processor_state() + * @ctxt - structure to load the registers contents from + */ +static void __restore_processor_state(struct saved_context *ctxt) +{ + /* + * control registers + */ + /* cr4 was introduced in the Pentium CPU */ +#ifdef CONFIG_X86_32 + if (ctxt->cr4) + write_cr4(ctxt->cr4); +#else +/* CONFIG X86_64 */ + wrmsrl(MSR_EFER, ctxt->efer); + write_cr8(ctxt->cr8); + write_cr4(ctxt->cr4); +#endif + write_cr3(ctxt->cr3); + write_cr2(ctxt->cr2); + write_cr0(ctxt->cr0); + + /* + * now restore the descriptor tables to their proper values + * ltr is done i fix_processor_context(). + */ +#ifdef CONFIG_X86_32 + load_gdt(&ctxt->gdt); + load_idt(&ctxt->idt); +#else +/* CONFIG_X86_64 */ + load_gdt((const struct desc_ptr *)&ctxt->gdt_limit); + load_idt((const struct desc_ptr *)&ctxt->idt_limit); +#endif + + /* + * segment registers + */ +#ifdef CONFIG_X86_32 + loadsegment(es, ctxt->es); + loadsegment(fs, ctxt->fs); + loadsegment(gs, ctxt->gs); + loadsegment(ss, ctxt->ss); + + /* + * sysenter MSRs + */ + if (boot_cpu_has(X86_FEATURE_SEP)) + enable_sep_cpu(); +#else +/* CONFIG_X86_64 */ + asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds)); + asm volatile ("movw %0, %%es" :: "r" (ctxt->es)); + asm volatile ("movw %0, %%fs" :: "r" (ctxt->fs)); + load_gs_index(ctxt->gs); + asm volatile ("movw %0, %%ss" :: "r" (ctxt->ss)); + + wrmsrl(MSR_FS_BASE, ctxt->fs_base); + wrmsrl(MSR_GS_BASE, ctxt->gs_base); + wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); +#endif + + /* + * restore XCR0 for xsave capable cpu's. + */ + if (cpu_has_xsave) + xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); + + fix_processor_context(); + + do_fpu_end(); + mtrr_ap_init(); + +#ifdef CONFIG_X86_32 + mcheck_init(&boot_cpu_data); +#endif +} + +/* Needed by apm.c */ +void restore_processor_state(void) +{ + __restore_processor_state(&saved_context); +} +#ifdef CONFIG_X86_32 +EXPORT_SYMBOL(restore_processor_state); +#endif diff --git a/arch/x86/power/cpu_32.c b/arch/x86/power/cpu_32.c deleted file mode 100644 index ca0ecaed7d4c..000000000000 --- a/arch/x86/power/cpu_32.c +++ /dev/null @@ -1,259 +0,0 @@ -/* - * Suspend and hibernation support for i386/x86-64. - * - * Distribute under GPLv2 - * - * Copyright (c) 2007 Rafael J. Wysocki - * Copyright (c) 2002 Pavel Machek - * Copyright (c) 2001 Patrick Mochel - */ - -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#ifdef CONFIG_X86_32 -static struct saved_context saved_context; - -unsigned long saved_context_ebx; -unsigned long saved_context_esp, saved_context_ebp; -unsigned long saved_context_esi, saved_context_edi; -unsigned long saved_context_eflags; -#else -/* CONFIG_X86_64 */ -struct saved_context saved_context; -#endif - -/** - * __save_processor_state - save CPU registers before creating a - * hibernation image and before restoring the memory state from it - * @ctxt - structure to store the registers contents in - * - * NOTE: If there is a CPU register the modification of which by the - * boot kernel (ie. the kernel used for loading the hibernation image) - * might affect the operations of the restored target kernel (ie. the one - * saved in the hibernation image), then its contents must be saved by this - * function. In other words, if kernel A is hibernated and different - * kernel B is used for loading the hibernation image into memory, the - * kernel A's __save_processor_state() function must save all registers - * needed by kernel A, so that it can operate correctly after the resume - * regardless of what kernel B does in the meantime. - */ -static void __save_processor_state(struct saved_context *ctxt) -{ -#ifdef CONFIG_X86_32 - mtrr_save_fixed_ranges(NULL); -#endif - kernel_fpu_begin(); - - /* - * descriptor tables - */ -#ifdef CONFIG_X86_32 - store_gdt(&ctxt->gdt); - store_idt(&ctxt->idt); -#else -/* CONFIG_X86_64 */ - store_gdt((struct desc_ptr *)&ctxt->gdt_limit); - store_idt((struct desc_ptr *)&ctxt->idt_limit); -#endif - store_tr(ctxt->tr); - - /* XMM0..XMM15 should be handled by kernel_fpu_begin(). */ - /* - * segment registers - */ -#ifdef CONFIG_X86_32 - savesegment(es, ctxt->es); - savesegment(fs, ctxt->fs); - savesegment(gs, ctxt->gs); - savesegment(ss, ctxt->ss); -#else -/* CONFIG_X86_64 */ - asm volatile ("movw %%ds, %0" : "=m" (ctxt->ds)); - asm volatile ("movw %%es, %0" : "=m" (ctxt->es)); - asm volatile ("movw %%fs, %0" : "=m" (ctxt->fs)); - asm volatile ("movw %%gs, %0" : "=m" (ctxt->gs)); - asm volatile ("movw %%ss, %0" : "=m" (ctxt->ss)); - - rdmsrl(MSR_FS_BASE, ctxt->fs_base); - rdmsrl(MSR_GS_BASE, ctxt->gs_base); - rdmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); - mtrr_save_fixed_ranges(NULL); - - rdmsrl(MSR_EFER, ctxt->efer); -#endif - - /* - * control registers - */ - ctxt->cr0 = read_cr0(); - ctxt->cr2 = read_cr2(); - ctxt->cr3 = read_cr3(); -#ifdef CONFIG_X86_32 - ctxt->cr4 = read_cr4_safe(); -#else -/* CONFIG_X86_64 */ - ctxt->cr4 = read_cr4(); - ctxt->cr8 = read_cr8(); -#endif -} - -/* Needed by apm.c */ -void save_processor_state(void) -{ - __save_processor_state(&saved_context); -} -#ifdef CONFIG_X86_32 -EXPORT_SYMBOL(save_processor_state); -#endif - -static void do_fpu_end(void) -{ - /* - * Restore FPU regs if necessary. - */ - kernel_fpu_end(); -} - -static void fix_processor_context(void) -{ - int cpu = smp_processor_id(); - struct tss_struct *t = &per_cpu(init_tss, cpu); - - set_tss_desc(cpu, t); /* - * This just modifies memory; should not be - * necessary. But... This is necessary, because - * 386 hardware has concept of busy TSS or some - * similar stupidity. - */ - -#ifdef CONFIG_X86_64 - get_cpu_gdt_table(cpu)[GDT_ENTRY_TSS].type = 9; - - syscall_init(); /* This sets MSR_*STAR and related */ -#endif - load_TR_desc(); /* This does ltr */ - load_LDT(¤t->active_mm->context); /* This does lldt */ - - /* - * Now maybe reload the debug registers - */ - if (current->thread.debugreg7) { -#ifdef CONFIG_X86_32 - set_debugreg(current->thread.debugreg0, 0); - set_debugreg(current->thread.debugreg1, 1); - set_debugreg(current->thread.debugreg2, 2); - set_debugreg(current->thread.debugreg3, 3); - /* no 4 and 5 */ - set_debugreg(current->thread.debugreg6, 6); - set_debugreg(current->thread.debugreg7, 7); -#else - /* CONFIG_X86_64 */ - loaddebug(¤t->thread, 0); - loaddebug(¤t->thread, 1); - loaddebug(¤t->thread, 2); - loaddebug(¤t->thread, 3); - /* no 4 and 5 */ - loaddebug(¤t->thread, 6); - loaddebug(¤t->thread, 7); -#endif - } - -} - -/** - * __restore_processor_state - restore the contents of CPU registers saved - * by __save_processor_state() - * @ctxt - structure to load the registers contents from - */ -static void __restore_processor_state(struct saved_context *ctxt) -{ - /* - * control registers - */ - /* cr4 was introduced in the Pentium CPU */ -#ifdef CONFIG_X86_32 - if (ctxt->cr4) - write_cr4(ctxt->cr4); -#else -/* CONFIG X86_64 */ - wrmsrl(MSR_EFER, ctxt->efer); - write_cr8(ctxt->cr8); - write_cr4(ctxt->cr4); -#endif - write_cr3(ctxt->cr3); - write_cr2(ctxt->cr2); - write_cr0(ctxt->cr0); - - /* - * now restore the descriptor tables to their proper values - * ltr is done i fix_processor_context(). - */ -#ifdef CONFIG_X86_32 - load_gdt(&ctxt->gdt); - load_idt(&ctxt->idt); -#else -/* CONFIG_X86_64 */ - load_gdt((const struct desc_ptr *)&ctxt->gdt_limit); - load_idt((const struct desc_ptr *)&ctxt->idt_limit); -#endif - - /* - * segment registers - */ -#ifdef CONFIG_X86_32 - loadsegment(es, ctxt->es); - loadsegment(fs, ctxt->fs); - loadsegment(gs, ctxt->gs); - loadsegment(ss, ctxt->ss); - - /* - * sysenter MSRs - */ - if (boot_cpu_has(X86_FEATURE_SEP)) - enable_sep_cpu(); -#else -/* CONFIG_X86_64 */ - asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds)); - asm volatile ("movw %0, %%es" :: "r" (ctxt->es)); - asm volatile ("movw %0, %%fs" :: "r" (ctxt->fs)); - load_gs_index(ctxt->gs); - asm volatile ("movw %0, %%ss" :: "r" (ctxt->ss)); - - wrmsrl(MSR_FS_BASE, ctxt->fs_base); - wrmsrl(MSR_GS_BASE, ctxt->gs_base); - wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); -#endif - - /* - * restore XCR0 for xsave capable cpu's. - */ - if (cpu_has_xsave) - xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); - - fix_processor_context(); - - do_fpu_end(); - mtrr_ap_init(); - -#ifdef CONFIG_X86_32 - mcheck_init(&boot_cpu_data); -#endif -} - -/* Needed by apm.c */ -void restore_processor_state(void) -{ - __restore_processor_state(&saved_context); -} -#ifdef CONFIG_X86_32 -EXPORT_SYMBOL(restore_processor_state); -#endif diff --git a/arch/x86/power/cpu_64.c b/arch/x86/power/cpu_64.c deleted file mode 100644 index d277ef1eea51..000000000000 --- a/arch/x86/power/cpu_64.c +++ /dev/null @@ -1,259 +0,0 @@ -/* - * Suspend support specific for i386/x86-64. - * - * Distribute under GPLv2 - * - * Copyright (c) 2007 Rafael J. Wysocki - * Copyright (c) 2002 Pavel Machek - * Copyright (c) 2001 Patrick Mochel - */ - -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#ifdef CONFIG_X86_32 -static struct saved_context saved_context; - -unsigned long saved_context_ebx; -unsigned long saved_context_esp, saved_context_ebp; -unsigned long saved_context_esi, saved_context_edi; -unsigned long saved_context_eflags; -#else -/* CONFIG_X86_64 */ -struct saved_context saved_context; -#endif - -/** - * __save_processor_state - save CPU registers before creating a - * hibernation image and before restoring the memory state from it - * @ctxt - structure to store the registers contents in - * - * NOTE: If there is a CPU register the modification of which by the - * boot kernel (ie. the kernel used for loading the hibernation image) - * might affect the operations of the restored target kernel (ie. the one - * saved in the hibernation image), then its contents must be saved by this - * function. In other words, if kernel A is hibernated and different - * kernel B is used for loading the hibernation image into memory, the - * kernel A's __save_processor_state() function must save all registers - * needed by kernel A, so that it can operate correctly after the resume - * regardless of what kernel B does in the meantime. - */ -static void __save_processor_state(struct saved_context *ctxt) -{ -#ifdef CONFIG_X86_32 - mtrr_save_fixed_ranges(NULL); -#endif - kernel_fpu_begin(); - - /* - * descriptor tables - */ -#ifdef CONFIG_X86_32 - store_gdt(&ctxt->gdt); - store_idt(&ctxt->idt); -#else -/* CONFIG_X86_64 */ - store_gdt((struct desc_ptr *)&ctxt->gdt_limit); - store_idt((struct desc_ptr *)&ctxt->idt_limit); -#endif - store_tr(ctxt->tr); - - /* XMM0..XMM15 should be handled by kernel_fpu_begin(). */ - /* - * segment registers - */ -#ifdef CONFIG_X86_32 - savesegment(es, ctxt->es); - savesegment(fs, ctxt->fs); - savesegment(gs, ctxt->gs); - savesegment(ss, ctxt->ss); -#else -/* CONFIG_X86_64 */ - asm volatile ("movw %%ds, %0" : "=m" (ctxt->ds)); - asm volatile ("movw %%es, %0" : "=m" (ctxt->es)); - asm volatile ("movw %%fs, %0" : "=m" (ctxt->fs)); - asm volatile ("movw %%gs, %0" : "=m" (ctxt->gs)); - asm volatile ("movw %%ss, %0" : "=m" (ctxt->ss)); - - rdmsrl(MSR_FS_BASE, ctxt->fs_base); - rdmsrl(MSR_GS_BASE, ctxt->gs_base); - rdmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); - mtrr_save_fixed_ranges(NULL); - - rdmsrl(MSR_EFER, ctxt->efer); -#endif - - /* - * control registers - */ - ctxt->cr0 = read_cr0(); - ctxt->cr2 = read_cr2(); - ctxt->cr3 = read_cr3(); -#ifdef CONFIG_X86_32 - ctxt->cr4 = read_cr4_safe(); -#else -/* CONFIG_X86_64 */ - ctxt->cr4 = read_cr4(); - ctxt->cr8 = read_cr8(); -#endif -} - -/* Needed by apm.c */ -void save_processor_state(void) -{ - __save_processor_state(&saved_context); -} -#ifdef CONFIG_X86_32 -EXPORT_SYMBOL(save_processor_state); -#endif - -static void do_fpu_end(void) -{ - /* - * Restore FPU regs if necessary. - */ - kernel_fpu_end(); -} - -static void fix_processor_context(void) -{ - int cpu = smp_processor_id(); - struct tss_struct *t = &per_cpu(init_tss, cpu); - - set_tss_desc(cpu, t); /* - * This just modifies memory; should not be - * necessary. But... This is necessary, because - * 386 hardware has concept of busy TSS or some - * similar stupidity. - */ - -#ifdef CONFIG_X86_64 - get_cpu_gdt_table(cpu)[GDT_ENTRY_TSS].type = 9; - - syscall_init(); /* This sets MSR_*STAR and related */ -#endif - load_TR_desc(); /* This does ltr */ - load_LDT(¤t->active_mm->context); /* This does lldt */ - - /* - * Now maybe reload the debug registers - */ - if (current->thread.debugreg7) { -#ifdef CONFIG_X86_32 - set_debugreg(current->thread.debugreg0, 0); - set_debugreg(current->thread.debugreg1, 1); - set_debugreg(current->thread.debugreg2, 2); - set_debugreg(current->thread.debugreg3, 3); - /* no 4 and 5 */ - set_debugreg(current->thread.debugreg6, 6); - set_debugreg(current->thread.debugreg7, 7); -#else - /* CONFIG_X86_64 */ - loaddebug(¤t->thread, 0); - loaddebug(¤t->thread, 1); - loaddebug(¤t->thread, 2); - loaddebug(¤t->thread, 3); - /* no 4 and 5 */ - loaddebug(¤t->thread, 6); - loaddebug(¤t->thread, 7); -#endif - } - -} - -/** - * __restore_processor_state - restore the contents of CPU registers saved - * by __save_processor_state() - * @ctxt - structure to load the registers contents from - */ -static void __restore_processor_state(struct saved_context *ctxt) -{ - /* - * control registers - */ - /* cr4 was introduced in the Pentium CPU */ -#ifdef CONFIG_X86_32 - if (ctxt->cr4) - write_cr4(ctxt->cr4); -#else -/* CONFIG X86_64 */ - wrmsrl(MSR_EFER, ctxt->efer); - write_cr8(ctxt->cr8); - write_cr4(ctxt->cr4); -#endif - write_cr3(ctxt->cr3); - write_cr2(ctxt->cr2); - write_cr0(ctxt->cr0); - - /* - * now restore the descriptor tables to their proper values - * ltr is done i fix_processor_context(). - */ -#ifdef CONFIG_X86_32 - load_gdt(&ctxt->gdt); - load_idt(&ctxt->idt); -#else -/* CONFIG_X86_64 */ - load_gdt((const struct desc_ptr *)&ctxt->gdt_limit); - load_idt((const struct desc_ptr *)&ctxt->idt_limit); -#endif - - /* - * segment registers - */ -#ifdef CONFIG_X86_32 - loadsegment(es, ctxt->es); - loadsegment(fs, ctxt->fs); - loadsegment(gs, ctxt->gs); - loadsegment(ss, ctxt->ss); - - /* - * sysenter MSRs - */ - if (boot_cpu_has(X86_FEATURE_SEP)) - enable_sep_cpu(); -#else -/* CONFIG_X86_64 */ - asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds)); - asm volatile ("movw %0, %%es" :: "r" (ctxt->es)); - asm volatile ("movw %0, %%fs" :: "r" (ctxt->fs)); - load_gs_index(ctxt->gs); - asm volatile ("movw %0, %%ss" :: "r" (ctxt->ss)); - - wrmsrl(MSR_FS_BASE, ctxt->fs_base); - wrmsrl(MSR_GS_BASE, ctxt->gs_base); - wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); -#endif - - /* - * restore XCR0 for xsave capable cpu's. - */ - if (cpu_has_xsave) - xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); - - fix_processor_context(); - - do_fpu_end(); - mtrr_ap_init(); - -#ifdef CONFIG_X86_32 - mcheck_init(&boot_cpu_data); -#endif -} - -/* Needed by apm.c */ -void restore_processor_state(void) -{ - __restore_processor_state(&saved_context); -} -#ifdef CONFIG_X86_32 -EXPORT_SYMBOL(restore_processor_state); -#endif -- cgit v1.2.3 From e39a71ef80877f4e30d808af9acceec80f4d2f7c Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Fri, 15 May 2009 00:53:26 +0200 Subject: PM: Rename device_power_down/up() Rename the functions performing "_noirq" dev_pm_ops operations from device_power_down() and device_power_up() to device_suspend_noirq() and device_resume_noirq(). The new function names are chosen to show that the functions are responsible for calling the _noirq() versions to finalize the suspend/resume operation. The current function names do not perform power down/up anymore so the names may be misleading. Global function renames: - device_power_down() -> device_suspend_noirq() - device_power_up() -> device_resume_noirq() Static function renames: - suspend_device_noirq() -> __device_suspend_noirq() - resume_device_noirq() -> __device_resume_noirq() Signed-off-by: Magnus Damm Acked-by: Greg Kroah-Hartman Acked-by: Len Brown Signed-off-by: Rafael J. Wysocki --- arch/x86/kernel/apm_32.c | 8 ++++---- drivers/base/power/main.c | 26 +++++++++++++------------- drivers/xen/manage.c | 10 +++++----- include/linux/pm.h | 4 ++-- kernel/kexec.c | 8 ++++---- kernel/power/disk.c | 16 ++++++++-------- kernel/power/main.c | 4 ++-- 7 files changed, 38 insertions(+), 38 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 49e0939bac42..31ae547da159 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -1235,7 +1235,7 @@ static int suspend(int vetoable) device_suspend(PMSG_SUSPEND); - device_power_down(PMSG_SUSPEND); + device_suspend_noirq(PMSG_SUSPEND); local_irq_disable(); sysdev_suspend(PMSG_SUSPEND); @@ -1259,7 +1259,7 @@ static int suspend(int vetoable) sysdev_resume(); local_irq_enable(); - device_power_up(PMSG_RESUME); + device_resume_noirq(PMSG_RESUME); device_resume(PMSG_RESUME); queue_event(APM_NORMAL_RESUME, NULL); @@ -1277,7 +1277,7 @@ static void standby(void) { int err; - device_power_down(PMSG_SUSPEND); + device_suspend_noirq(PMSG_SUSPEND); local_irq_disable(); sysdev_suspend(PMSG_SUSPEND); @@ -1291,7 +1291,7 @@ static void standby(void) sysdev_resume(); local_irq_enable(); - device_power_up(PMSG_RESUME); + device_resume_noirq(PMSG_RESUME); } static apm_event_t get_event(void) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 3e4bc699bc0f..c5a35bc9d63b 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -315,13 +315,13 @@ static void pm_dev_err(struct device *dev, pm_message_t state, char *info, /*------------------------- Resume routines -------------------------*/ /** - * resume_device_noirq - Power on one device (early resume). + * __device_resume_noirq - Power on one device (early resume). * @dev: Device. * @state: PM transition of the system being carried out. * * Must be called with interrupts disabled. */ -static int resume_device_noirq(struct device *dev, pm_message_t state) +static int __device_resume_noirq(struct device *dev, pm_message_t state) { int error = 0; @@ -363,7 +363,7 @@ static void dpm_power_up(pm_message_t state) int error; dev->power.status = DPM_OFF; - error = resume_device_noirq(dev, state); + error = __device_resume_noirq(dev, state); if (error) pm_dev_err(dev, state, " early", error); } @@ -371,18 +371,18 @@ static void dpm_power_up(pm_message_t state) } /** - * device_power_up - Turn on all devices that need special attention. + * device_resume_noirq - Turn on all devices that need special attention. * @state: PM transition of the system being carried out. * * Call the "early" resume handlers and enable device drivers to receive * interrupts. */ -void device_power_up(pm_message_t state) +void device_resume_noirq(pm_message_t state) { dpm_power_up(state); resume_device_irqs(); } -EXPORT_SYMBOL_GPL(device_power_up); +EXPORT_SYMBOL_GPL(device_resume_noirq); /** * resume_device - Restore state for one device. @@ -577,13 +577,13 @@ static pm_message_t resume_event(pm_message_t sleep_state) } /** - * suspend_device_noirq - Shut down one device (late suspend). + * __device_suspend_noirq - Shut down one device (late suspend). * @dev: Device. * @state: PM transition of the system being carried out. * * This is called with interrupts off and only a single CPU running. */ -static int suspend_device_noirq(struct device *dev, pm_message_t state) +static int __device_suspend_noirq(struct device *dev, pm_message_t state) { int error = 0; @@ -602,7 +602,7 @@ static int suspend_device_noirq(struct device *dev, pm_message_t state) } /** - * device_power_down - Shut down special devices. + * device_suspend_noirq - Shut down special devices. * @state: PM transition of the system being carried out. * * Prevent device drivers from receiving interrupts and call the "late" @@ -610,7 +610,7 @@ static int suspend_device_noirq(struct device *dev, pm_message_t state) * * Must be called under dpm_list_mtx. */ -int device_power_down(pm_message_t state) +int device_suspend_noirq(pm_message_t state) { struct device *dev; int error = 0; @@ -618,7 +618,7 @@ int device_power_down(pm_message_t state) suspend_device_irqs(); mutex_lock(&dpm_list_mtx); list_for_each_entry_reverse(dev, &dpm_list, power.entry) { - error = suspend_device_noirq(dev, state); + error = __device_suspend_noirq(dev, state); if (error) { pm_dev_err(dev, state, " late", error); break; @@ -627,10 +627,10 @@ int device_power_down(pm_message_t state) } mutex_unlock(&dpm_list_mtx); if (error) - device_power_up(resume_event(state)); + device_resume_noirq(resume_event(state)); return error; } -EXPORT_SYMBOL_GPL(device_power_down); +EXPORT_SYMBOL_GPL(device_suspend_noirq); /** * suspend_device - Save state of one device. diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index fddc2025dece..d5b327ac4039 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -43,7 +43,7 @@ static int xen_suspend(void *data) if (err) { printk(KERN_ERR "xen_suspend: sysdev_suspend failed: %d\n", err); - device_power_up(PMSG_RESUME); + device_resume_noirq(PMSG_RESUME); return err; } @@ -69,7 +69,7 @@ static int xen_suspend(void *data) } sysdev_resume(); - device_power_up(PMSG_RESUME); + device_resume_noirq(PMSG_RESUME); return 0; } @@ -101,9 +101,9 @@ static void do_suspend(void) printk(KERN_DEBUG "suspending xenstore...\n"); xs_suspend(); - err = device_power_down(PMSG_SUSPEND); + err = device_suspend_noirq(PMSG_SUSPEND); if (err) { - printk(KERN_ERR "device_power_down failed: %d\n", err); + printk(KERN_ERR "device_suspend_noirq failed: %d\n", err); goto resume_devices; } @@ -119,7 +119,7 @@ static void do_suspend(void) } else xs_suspend_cancel(); - device_power_up(PMSG_RESUME); + device_resume_noirq(PMSG_RESUME); resume_devices: device_resume(PMSG_RESUME); diff --git a/include/linux/pm.h b/include/linux/pm.h index 1d4e2d289821..2170252074f3 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -382,12 +382,12 @@ struct dev_pm_info { #ifdef CONFIG_PM_SLEEP extern void device_pm_lock(void); extern int sysdev_resume(void); -extern void device_power_up(pm_message_t state); +extern void device_resume_noirq(pm_message_t state); extern void device_resume(pm_message_t state); extern void device_pm_unlock(void); extern int sysdev_suspend(pm_message_t state); -extern int device_power_down(pm_message_t state); +extern int device_suspend_noirq(pm_message_t state); extern int device_suspend(pm_message_t state); extern int device_prepare_suspend(pm_message_t state); diff --git a/kernel/kexec.c b/kernel/kexec.c index e4983770913b..5a3da87adae0 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1452,13 +1452,13 @@ int kernel_kexec(void) if (error) goto Resume_console; /* At this point, device_suspend() has been called, - * but *not* device_power_down(). We *must* - * device_power_down() now. Otherwise, drivers for + * but *not* device_suspend_noirq(). We *must* call + * device_suspend_noirq() now. Otherwise, drivers for * some devices (e.g. interrupt controllers) become * desynchronized with the actual state of the * hardware at resume time, and evil weirdness ensues. */ - error = device_power_down(PMSG_FREEZE); + error = device_suspend_noirq(PMSG_FREEZE); if (error) goto Resume_devices; error = disable_nonboot_cpus(); @@ -1486,7 +1486,7 @@ int kernel_kexec(void) local_irq_enable(); Enable_cpus: enable_nonboot_cpus(); - device_power_up(PMSG_RESTORE); + device_resume_noirq(PMSG_RESTORE); Resume_devices: device_resume(PMSG_RESTORE); Resume_console: diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 5cb080e7eebd..1c18bc894a2d 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -216,12 +216,12 @@ static int create_image(int platform_mode) return error; /* At this point, device_suspend() has been called, but *not* - * device_power_down(). We *must* call device_power_down() now. + * device_suspend_noirq(). We *must* call device_suspend_noirq() now. * Otherwise, drivers for some devices (e.g. interrupt controllers) * become desynchronized with the actual state of the hardware * at resume time, and evil weirdness ensues. */ - error = device_power_down(PMSG_FREEZE); + error = device_suspend_noirq(PMSG_FREEZE); if (error) { printk(KERN_ERR "PM: Some devices failed to power down, " "aborting hibernation\n"); @@ -262,7 +262,7 @@ static int create_image(int platform_mode) Power_up: sysdev_resume(); - /* NOTE: device_power_up() is just a resume() for devices + /* NOTE: device_resume_noirq() is just a resume() for devices * that suspended with irqs off ... no overall powerup. */ @@ -275,7 +275,7 @@ static int create_image(int platform_mode) Platform_finish: platform_finish(platform_mode); - device_power_up(in_suspend ? + device_resume_noirq(in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); return error; @@ -339,7 +339,7 @@ static int resume_target_kernel(bool platform_mode) { int error; - error = device_power_down(PMSG_QUIESCE); + error = device_suspend_noirq(PMSG_QUIESCE); if (error) { printk(KERN_ERR "PM: Some devices failed to power down, " "aborting resume\n"); @@ -394,7 +394,7 @@ static int resume_target_kernel(bool platform_mode) Cleanup: platform_restore_cleanup(platform_mode); - device_power_up(PMSG_RECOVER); + device_resume_noirq(PMSG_RECOVER); return error; } @@ -454,7 +454,7 @@ int hibernation_platform_enter(void) goto Resume_devices; } - error = device_power_down(PMSG_HIBERNATE); + error = device_suspend_noirq(PMSG_HIBERNATE); if (error) goto Resume_devices; @@ -479,7 +479,7 @@ int hibernation_platform_enter(void) Platofrm_finish: hibernation_ops->finish(); - device_power_up(PMSG_RESTORE); + device_suspend_noirq(PMSG_RESTORE); Resume_devices: entering_platform_hibernation = false; diff --git a/kernel/power/main.c b/kernel/power/main.c index 868028280d13..2f6638ee03c0 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -295,7 +295,7 @@ static int suspend_enter(suspend_state_t state) return error; } - error = device_power_down(PMSG_SUSPEND); + error = device_suspend_noirq(PMSG_SUSPEND); if (error) { printk(KERN_ERR "PM: Some devices failed to power down\n"); goto Platfrom_finish; @@ -335,7 +335,7 @@ static int suspend_enter(suspend_state_t state) suspend_ops->wake(); Power_up_devices: - device_power_up(PMSG_RESUME); + device_resume_noirq(PMSG_RESUME); Platfrom_finish: if (suspend_ops->finish) -- cgit v1.2.3 From d161630297a20802d01c55847bfcba85d2118a9f Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Sun, 24 May 2009 22:05:42 +0200 Subject: PM core: rename suspend and resume functions This patch (as1241) renames a bunch of functions in the PM core. Rather than go through a boring list of name changes, suffice it to say that in the end we have a bunch of pairs of functions: device_resume_noirq dpm_resume_noirq device_resume dpm_resume device_complete dpm_complete device_suspend_noirq dpm_suspend_noirq device_suspend dpm_suspend device_prepare dpm_prepare in which device_X does the X operation on a single device and dpm_X invokes device_X for all devices in the dpm_list. In addition, the old dpm_power_up and device_resume_noirq have been combined into a single function (dpm_resume_noirq). Lastly, dpm_suspend_start and dpm_resume_end are the renamed versions of the former top-level device_suspend and device_resume routines. Signed-off-by: Alan Stern Acked-by: Magnus Damm Signed-off-by: Rafael J. Wysocki --- arch/x86/kernel/apm_32.c | 14 ++++----- drivers/base/power/main.c | 80 ++++++++++++++++++++--------------------------- drivers/xen/manage.c | 16 +++++----- include/linux/pm.h | 11 +++---- kernel/kexec.c | 14 ++++----- kernel/power/disk.c | 30 +++++++++--------- kernel/power/main.c | 8 ++--- 7 files changed, 80 insertions(+), 93 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 31ae547da159..79302e9a33a4 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -1233,9 +1233,9 @@ static int suspend(int vetoable) int err; struct apm_user *as; - device_suspend(PMSG_SUSPEND); + dpm_suspend_start(PMSG_SUSPEND); - device_suspend_noirq(PMSG_SUSPEND); + dpm_suspend_noirq(PMSG_SUSPEND); local_irq_disable(); sysdev_suspend(PMSG_SUSPEND); @@ -1259,9 +1259,9 @@ static int suspend(int vetoable) sysdev_resume(); local_irq_enable(); - device_resume_noirq(PMSG_RESUME); + dpm_resume_noirq(PMSG_RESUME); - device_resume(PMSG_RESUME); + dpm_resume_end(PMSG_RESUME); queue_event(APM_NORMAL_RESUME, NULL); spin_lock(&user_list_lock); for (as = user_list; as != NULL; as = as->next) { @@ -1277,7 +1277,7 @@ static void standby(void) { int err; - device_suspend_noirq(PMSG_SUSPEND); + dpm_suspend_noirq(PMSG_SUSPEND); local_irq_disable(); sysdev_suspend(PMSG_SUSPEND); @@ -1291,7 +1291,7 @@ static void standby(void) sysdev_resume(); local_irq_enable(); - device_resume_noirq(PMSG_RESUME); + dpm_resume_noirq(PMSG_RESUME); } static apm_event_t get_event(void) @@ -1376,7 +1376,7 @@ static void check_events(void) ignore_bounce = 1; if ((event != APM_NORMAL_RESUME) || (ignore_normal_resume == 0)) { - device_resume(PMSG_RESUME); + dpm_resume_end(PMSG_RESUME); queue_event(event, NULL); } ignore_normal_resume = 0; diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index c5a35bc9d63b..1f3d82260db4 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -315,13 +315,13 @@ static void pm_dev_err(struct device *dev, pm_message_t state, char *info, /*------------------------- Resume routines -------------------------*/ /** - * __device_resume_noirq - Power on one device (early resume). + * device_resume_noirq - Power on one device (early resume). * @dev: Device. * @state: PM transition of the system being carried out. * * Must be called with interrupts disabled. */ -static int __device_resume_noirq(struct device *dev, pm_message_t state) +static int device_resume_noirq(struct device *dev, pm_message_t state) { int error = 0; @@ -344,16 +344,16 @@ static int __device_resume_noirq(struct device *dev, pm_message_t state) } /** - * dpm_power_up - Power on all regular (non-sysdev) devices. + * dpm_resume_noirq - Power on all regular (non-sysdev) devices. * @state: PM transition of the system being carried out. * - * Execute the appropriate "noirq resume" callback for all devices marked - * as DPM_OFF_IRQ. + * Call the "noirq" resume handlers for all devices marked as + * DPM_OFF_IRQ and enable device drivers to receive interrupts. * * Must be called under dpm_list_mtx. Device drivers should not receive * interrupts while it's being executed. */ -static void dpm_power_up(pm_message_t state) +void dpm_resume_noirq(pm_message_t state) { struct device *dev; @@ -363,33 +363,21 @@ static void dpm_power_up(pm_message_t state) int error; dev->power.status = DPM_OFF; - error = __device_resume_noirq(dev, state); + error = device_resume_noirq(dev, state); if (error) pm_dev_err(dev, state, " early", error); } mutex_unlock(&dpm_list_mtx); -} - -/** - * device_resume_noirq - Turn on all devices that need special attention. - * @state: PM transition of the system being carried out. - * - * Call the "early" resume handlers and enable device drivers to receive - * interrupts. - */ -void device_resume_noirq(pm_message_t state) -{ - dpm_power_up(state); resume_device_irqs(); } -EXPORT_SYMBOL_GPL(device_resume_noirq); +EXPORT_SYMBOL_GPL(dpm_resume_noirq); /** - * resume_device - Restore state for one device. + * device_resume - Restore state for one device. * @dev: Device. * @state: PM transition of the system being carried out. */ -static int resume_device(struct device *dev, pm_message_t state) +static int device_resume(struct device *dev, pm_message_t state) { int error = 0; @@ -462,7 +450,7 @@ static void dpm_resume(pm_message_t state) dev->power.status = DPM_RESUMING; mutex_unlock(&dpm_list_mtx); - error = resume_device(dev, state); + error = device_resume(dev, state); mutex_lock(&dpm_list_mtx); if (error) @@ -480,11 +468,11 @@ static void dpm_resume(pm_message_t state) } /** - * complete_device - Complete a PM transition for given device + * device_complete - Complete a PM transition for given device * @dev: Device. * @state: PM transition of the system being carried out. */ -static void complete_device(struct device *dev, pm_message_t state) +static void device_complete(struct device *dev, pm_message_t state) { down(&dev->sem); @@ -527,7 +515,7 @@ static void dpm_complete(pm_message_t state) dev->power.status = DPM_ON; mutex_unlock(&dpm_list_mtx); - complete_device(dev, state); + device_complete(dev, state); mutex_lock(&dpm_list_mtx); } @@ -540,19 +528,19 @@ static void dpm_complete(pm_message_t state) } /** - * device_resume - Restore state of each device in system. + * dpm_resume_end - Restore state of each device in system. * @state: PM transition of the system being carried out. * * Resume all the devices, unlock them all, and allow new * devices to be registered once again. */ -void device_resume(pm_message_t state) +void dpm_resume_end(pm_message_t state) { might_sleep(); dpm_resume(state); dpm_complete(state); } -EXPORT_SYMBOL_GPL(device_resume); +EXPORT_SYMBOL_GPL(dpm_resume_end); /*------------------------- Suspend routines -------------------------*/ @@ -577,13 +565,13 @@ static pm_message_t resume_event(pm_message_t sleep_state) } /** - * __device_suspend_noirq - Shut down one device (late suspend). + * device_suspend_noirq - Shut down one device (late suspend). * @dev: Device. * @state: PM transition of the system being carried out. * * This is called with interrupts off and only a single CPU running. */ -static int __device_suspend_noirq(struct device *dev, pm_message_t state) +static int device_suspend_noirq(struct device *dev, pm_message_t state) { int error = 0; @@ -602,15 +590,15 @@ static int __device_suspend_noirq(struct device *dev, pm_message_t state) } /** - * device_suspend_noirq - Shut down special devices. + * dpm_suspend_noirq - Power down all regular (non-sysdev) devices. * @state: PM transition of the system being carried out. * - * Prevent device drivers from receiving interrupts and call the "late" + * Prevent device drivers from receiving interrupts and call the "noirq" * suspend handlers. * * Must be called under dpm_list_mtx. */ -int device_suspend_noirq(pm_message_t state) +int dpm_suspend_noirq(pm_message_t state) { struct device *dev; int error = 0; @@ -618,7 +606,7 @@ int device_suspend_noirq(pm_message_t state) suspend_device_irqs(); mutex_lock(&dpm_list_mtx); list_for_each_entry_reverse(dev, &dpm_list, power.entry) { - error = __device_suspend_noirq(dev, state); + error = device_suspend_noirq(dev, state); if (error) { pm_dev_err(dev, state, " late", error); break; @@ -627,17 +615,17 @@ int device_suspend_noirq(pm_message_t state) } mutex_unlock(&dpm_list_mtx); if (error) - device_resume_noirq(resume_event(state)); + dpm_resume_noirq(resume_event(state)); return error; } -EXPORT_SYMBOL_GPL(device_suspend_noirq); +EXPORT_SYMBOL_GPL(dpm_suspend_noirq); /** - * suspend_device - Save state of one device. + * device_suspend - Save state of one device. * @dev: Device. * @state: PM transition of the system being carried out. */ -static int suspend_device(struct device *dev, pm_message_t state) +static int device_suspend(struct device *dev, pm_message_t state) { int error = 0; @@ -704,7 +692,7 @@ static int dpm_suspend(pm_message_t state) get_device(dev); mutex_unlock(&dpm_list_mtx); - error = suspend_device(dev, state); + error = device_suspend(dev, state); mutex_lock(&dpm_list_mtx); if (error) { @@ -723,11 +711,11 @@ static int dpm_suspend(pm_message_t state) } /** - * prepare_device - Execute the ->prepare() callback(s) for given device. + * device_prepare - Execute the ->prepare() callback(s) for given device. * @dev: Device. * @state: PM transition of the system being carried out. */ -static int prepare_device(struct device *dev, pm_message_t state) +static int device_prepare(struct device *dev, pm_message_t state) { int error = 0; @@ -781,7 +769,7 @@ static int dpm_prepare(pm_message_t state) dev->power.status = DPM_PREPARING; mutex_unlock(&dpm_list_mtx); - error = prepare_device(dev, state); + error = device_prepare(dev, state); mutex_lock(&dpm_list_mtx); if (error) { @@ -807,12 +795,12 @@ static int dpm_prepare(pm_message_t state) } /** - * device_suspend - Save state and stop all devices in system. + * dpm_suspend_start - Save state and stop all devices in system. * @state: PM transition of the system being carried out. * * Prepare and suspend all devices. */ -int device_suspend(pm_message_t state) +int dpm_suspend_start(pm_message_t state) { int error; @@ -822,7 +810,7 @@ int device_suspend(pm_message_t state) error = dpm_suspend(state); return error; } -EXPORT_SYMBOL_GPL(device_suspend); +EXPORT_SYMBOL_GPL(dpm_suspend_start); void __suspend_report_result(const char *function, void *fn, int ret) { diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index d5b327ac4039..10d03d7931c4 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -43,7 +43,7 @@ static int xen_suspend(void *data) if (err) { printk(KERN_ERR "xen_suspend: sysdev_suspend failed: %d\n", err); - device_resume_noirq(PMSG_RESUME); + dpm_resume_noirq(PMSG_RESUME); return err; } @@ -69,7 +69,7 @@ static int xen_suspend(void *data) } sysdev_resume(); - device_resume_noirq(PMSG_RESUME); + dpm_resume_noirq(PMSG_RESUME); return 0; } @@ -92,18 +92,18 @@ static void do_suspend(void) } #endif - err = device_suspend(PMSG_SUSPEND); + err = dpm_suspend_start(PMSG_SUSPEND); if (err) { - printk(KERN_ERR "xen suspend: device_suspend %d\n", err); + printk(KERN_ERR "xen suspend: dpm_suspend_start %d\n", err); goto out; } printk(KERN_DEBUG "suspending xenstore...\n"); xs_suspend(); - err = device_suspend_noirq(PMSG_SUSPEND); + err = dpm_suspend_noirq(PMSG_SUSPEND); if (err) { - printk(KERN_ERR "device_suspend_noirq failed: %d\n", err); + printk(KERN_ERR "dpm_suspend_noirq failed: %d\n", err); goto resume_devices; } @@ -119,10 +119,10 @@ static void do_suspend(void) } else xs_suspend_cancel(); - device_resume_noirq(PMSG_RESUME); + dpm_resume_noirq(PMSG_RESUME); resume_devices: - device_resume(PMSG_RESUME); + dpm_resume_end(PMSG_RESUME); /* Make sure timer events get retriggered on all CPUs */ clock_was_set(); diff --git a/include/linux/pm.h b/include/linux/pm.h index 2170252074f3..b3f74764a586 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -382,14 +382,13 @@ struct dev_pm_info { #ifdef CONFIG_PM_SLEEP extern void device_pm_lock(void); extern int sysdev_resume(void); -extern void device_resume_noirq(pm_message_t state); -extern void device_resume(pm_message_t state); +extern void dpm_resume_noirq(pm_message_t state); +extern void dpm_resume_end(pm_message_t state); extern void device_pm_unlock(void); extern int sysdev_suspend(pm_message_t state); -extern int device_suspend_noirq(pm_message_t state); -extern int device_suspend(pm_message_t state); -extern int device_prepare_suspend(pm_message_t state); +extern int dpm_suspend_noirq(pm_message_t state); +extern int dpm_suspend_start(pm_message_t state); extern void __suspend_report_result(const char *function, void *fn, int ret); @@ -403,7 +402,7 @@ extern void __suspend_report_result(const char *function, void *fn, int ret); #define device_pm_lock() do {} while (0) #define device_pm_unlock() do {} while (0) -static inline int device_suspend(pm_message_t state) +static inline int dpm_suspend_start(pm_message_t state) { return 0; } diff --git a/kernel/kexec.c b/kernel/kexec.c index 5a3da87adae0..ae1c35201cc8 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1448,17 +1448,17 @@ int kernel_kexec(void) goto Restore_console; } suspend_console(); - error = device_suspend(PMSG_FREEZE); + error = dpm_suspend_start(PMSG_FREEZE); if (error) goto Resume_console; - /* At this point, device_suspend() has been called, - * but *not* device_suspend_noirq(). We *must* call - * device_suspend_noirq() now. Otherwise, drivers for + /* At this point, dpm_suspend_start() has been called, + * but *not* dpm_suspend_noirq(). We *must* call + * dpm_suspend_noirq() now. Otherwise, drivers for * some devices (e.g. interrupt controllers) become * desynchronized with the actual state of the * hardware at resume time, and evil weirdness ensues. */ - error = device_suspend_noirq(PMSG_FREEZE); + error = dpm_suspend_noirq(PMSG_FREEZE); if (error) goto Resume_devices; error = disable_nonboot_cpus(); @@ -1486,9 +1486,9 @@ int kernel_kexec(void) local_irq_enable(); Enable_cpus: enable_nonboot_cpus(); - device_resume_noirq(PMSG_RESTORE); + dpm_resume_noirq(PMSG_RESTORE); Resume_devices: - device_resume(PMSG_RESTORE); + dpm_resume_end(PMSG_RESTORE); Resume_console: resume_console(); thaw_processes(); diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 1c18bc894a2d..a9beba68b6c7 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -215,13 +215,13 @@ static int create_image(int platform_mode) if (error) return error; - /* At this point, device_suspend() has been called, but *not* - * device_suspend_noirq(). We *must* call device_suspend_noirq() now. + /* At this point, dpm_suspend_start() has been called, but *not* + * dpm_suspend_noirq(). We *must* call dpm_suspend_noirq() now. * Otherwise, drivers for some devices (e.g. interrupt controllers) * become desynchronized with the actual state of the hardware * at resume time, and evil weirdness ensues. */ - error = device_suspend_noirq(PMSG_FREEZE); + error = dpm_suspend_noirq(PMSG_FREEZE); if (error) { printk(KERN_ERR "PM: Some devices failed to power down, " "aborting hibernation\n"); @@ -262,7 +262,7 @@ static int create_image(int platform_mode) Power_up: sysdev_resume(); - /* NOTE: device_resume_noirq() is just a resume() for devices + /* NOTE: dpm_resume_noirq() is just a resume() for devices * that suspended with irqs off ... no overall powerup. */ @@ -275,7 +275,7 @@ static int create_image(int platform_mode) Platform_finish: platform_finish(platform_mode); - device_resume_noirq(in_suspend ? + dpm_resume_noirq(in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); return error; @@ -304,7 +304,7 @@ int hibernation_snapshot(int platform_mode) goto Close; suspend_console(); - error = device_suspend(PMSG_FREEZE); + error = dpm_suspend_start(PMSG_FREEZE); if (error) goto Recover_platform; @@ -315,7 +315,7 @@ int hibernation_snapshot(int platform_mode) /* Control returns here after successful restore */ Resume_devices: - device_resume(in_suspend ? + dpm_resume_end(in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); resume_console(); Close: @@ -339,7 +339,7 @@ static int resume_target_kernel(bool platform_mode) { int error; - error = device_suspend_noirq(PMSG_QUIESCE); + error = dpm_suspend_noirq(PMSG_QUIESCE); if (error) { printk(KERN_ERR "PM: Some devices failed to power down, " "aborting resume\n"); @@ -394,7 +394,7 @@ static int resume_target_kernel(bool platform_mode) Cleanup: platform_restore_cleanup(platform_mode); - device_resume_noirq(PMSG_RECOVER); + dpm_resume_noirq(PMSG_RECOVER); return error; } @@ -414,10 +414,10 @@ int hibernation_restore(int platform_mode) pm_prepare_console(); suspend_console(); - error = device_suspend(PMSG_QUIESCE); + error = dpm_suspend_start(PMSG_QUIESCE); if (!error) { error = resume_target_kernel(platform_mode); - device_resume(PMSG_RECOVER); + dpm_resume_end(PMSG_RECOVER); } resume_console(); pm_restore_console(); @@ -447,14 +447,14 @@ int hibernation_platform_enter(void) entering_platform_hibernation = true; suspend_console(); - error = device_suspend(PMSG_HIBERNATE); + error = dpm_suspend_start(PMSG_HIBERNATE); if (error) { if (hibernation_ops->recover) hibernation_ops->recover(); goto Resume_devices; } - error = device_suspend_noirq(PMSG_HIBERNATE); + error = dpm_suspend_noirq(PMSG_HIBERNATE); if (error) goto Resume_devices; @@ -479,11 +479,11 @@ int hibernation_platform_enter(void) Platofrm_finish: hibernation_ops->finish(); - device_suspend_noirq(PMSG_RESTORE); + dpm_suspend_noirq(PMSG_RESTORE); Resume_devices: entering_platform_hibernation = false; - device_resume(PMSG_RESTORE); + dpm_resume_end(PMSG_RESTORE); resume_console(); Close: diff --git a/kernel/power/main.c b/kernel/power/main.c index 2f6638ee03c0..46386b9f8dd1 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -295,7 +295,7 @@ static int suspend_enter(suspend_state_t state) return error; } - error = device_suspend_noirq(PMSG_SUSPEND); + error = dpm_suspend_noirq(PMSG_SUSPEND); if (error) { printk(KERN_ERR "PM: Some devices failed to power down\n"); goto Platfrom_finish; @@ -335,7 +335,7 @@ static int suspend_enter(suspend_state_t state) suspend_ops->wake(); Power_up_devices: - device_resume_noirq(PMSG_RESUME); + dpm_resume_noirq(PMSG_RESUME); Platfrom_finish: if (suspend_ops->finish) @@ -363,7 +363,7 @@ int suspend_devices_and_enter(suspend_state_t state) } suspend_console(); suspend_test_start(); - error = device_suspend(PMSG_SUSPEND); + error = dpm_suspend_start(PMSG_SUSPEND); if (error) { printk(KERN_ERR "PM: Some devices failed to suspend\n"); goto Recover_platform; @@ -376,7 +376,7 @@ int suspend_devices_and_enter(suspend_state_t state) Resume_devices: suspend_test_start(); - device_resume(PMSG_RESUME); + dpm_resume_end(PMSG_RESUME); suspend_test_finish("resume devices"); resume_console(); Close: -- cgit v1.2.3