From bf80bbd7dcf525e41e0673fbaa8cd21d2344b460 Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Mon, 23 Mar 2015 10:42:52 -0500 Subject: x86/mce: Add an AMD severities-grading function Add a severities function that caters to AMD processors. This allows us to do some vendor-specific work within the function if necessary. Also, introduce a vendor flag bitfield for vendor-specific settings. The severities code uses this to define error scope based on the prescence of the flags field. This is based off of work by Boris Petkov. Testing details: Fam10h, Model 9h (Greyhound) Fam15h: Models 0h-0fh (Orochi), 30h-3fh (Kaveri) and 60h-6fh (Carrizo), Fam16h Model 00h-0fh (Kabini) Boris: Intel SNB AMD K8 (JH-E0) Signed-off-by: Aravind Gopalakrishnan Acked-by: Tony Luck Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. Peter Anvin Cc: Chen Yucong Cc: Andy Lutomirski Cc: linux-edac@vger.kernel.org Link: http://lkml.kernel.org/r/1427125373-2918-2-git-send-email-Aravind.Gopalakrishnan@amd.com [ Fixup build, clean up comments. ] Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce-severity.c | 56 +++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/mcheck/mce.c | 9 +++++ 2 files changed, 65 insertions(+) (limited to 'arch/x86/kernel/cpu/mcheck') diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 8bb433043a7f..e16f3f201e06 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -186,12 +186,68 @@ static int error_context(struct mce *m) return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL; } +/* + * See AMD Error Scope Hierarchy table in a newer BKDG. For example + * 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features" + */ +static int mce_severity_amd(struct mce *m, enum context ctx) +{ + /* Processor Context Corrupt, no need to fumble too much, die! */ + if (m->status & MCI_STATUS_PCC) + return MCE_PANIC_SEVERITY; + + if (m->status & MCI_STATUS_UC) { + + /* + * On older systems where overflow_recov flag is not present, we + * should simply panic if an error overflow occurs. If + * overflow_recov flag is present and set, then software can try + * to at least kill process to prolong system operation. + */ + if (mce_flags.overflow_recov) { + /* software can try to contain */ + if (!(m->mcgstatus & MCG_STATUS_RIPV)) + if (ctx == IN_KERNEL) + return MCE_PANIC_SEVERITY; + + /* kill current process */ + return MCE_AR_SEVERITY; + } else { + /* at least one error was not logged */ + if (m->status & MCI_STATUS_OVER) + return MCE_PANIC_SEVERITY; + } + + /* + * For any other case, return MCE_UC_SEVERITY so that we log the + * error and exit #MC handler. + */ + return MCE_UC_SEVERITY; + } + + /* + * deferred error: poll handler catches these and adds to mce_ring so + * memory-failure can take recovery actions. + */ + if (m->status & MCI_STATUS_DEFERRED) + return MCE_DEFERRED_SEVERITY; + + /* + * corrected error: poll handler catches these and passes responsibility + * of decoding the error to EDAC + */ + return MCE_KEEP_SEVERITY; +} + int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp) { enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP); enum context ctx = error_context(m); struct severity *s; + if (m->cpuvendor == X86_VENDOR_AMD) + return mce_severity_amd(m, ctx); + for (s = severities;; s++) { if ((m->status & s->mask) != s->result) continue; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 8548b714a16b..1189f1150a19 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -64,6 +64,7 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex); DEFINE_PER_CPU(unsigned, mce_exception_count); struct mce_bank *mce_banks __read_mostly; +struct mce_vendor_flags mce_flags __read_mostly; struct mca_config mca_cfg __read_mostly = { .bootlog = -1, @@ -1534,6 +1535,13 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) if (c->x86 == 6 && cfg->banks > 0) mce_banks[0].ctl = 0; + /* + * overflow_recov is supported for F15h Models 00h-0fh + * even though we don't have a CPUID bit for it. + */ + if (c->x86 == 0x15 && c->x86_model <= 0xf) + mce_flags.overflow_recov = 1; + /* * Turn off MC4_MISC thresholding banks on those models since * they're not supported there. @@ -1633,6 +1641,7 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) break; case X86_VENDOR_AMD: mce_amd_feature_init(c); + mce_flags.overflow_recov = cpuid_ebx(0x80000007) & 0x1; break; default: break; -- cgit v1.2.3 From 43eaa2a1ad70d72876cdbb2eb5450a2665e4770f Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Mon, 23 Mar 2015 10:42:53 -0500 Subject: x86/mce: Define mce_severity function pointer Rename mce_severity() to mce_severity_intel() and assign the mce_severity function pointer to mce_severity_amd() during init on AMD. This way, we can avoid a test to call mce_severity_amd every time we get into mce_severity(). And it's cleaner to do it this way. Signed-off-by: Aravind Gopalakrishnan Suggested-by: Tony Luck Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. Peter Anvin Cc: Chen Yucong Cc: Andy Lutomirski Cc: linux-edac Link: http://lkml.kernel.org/r/1427125373-2918-3-git-send-email-Aravind.Gopalakrishnan@amd.com Signed-off-by: Borislav Petkov --- arch/x86/include/asm/mce.h | 2 ++ arch/x86/kernel/cpu/mcheck/mce-internal.h | 2 +- arch/x86/kernel/cpu/mcheck/mce-severity.c | 19 ++++++++++++++----- arch/x86/kernel/cpu/mcheck/mce.c | 1 + 4 files changed, 18 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel/cpu/mcheck') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index b574fbf62d39..1f5a86d518db 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -134,9 +134,11 @@ extern int mce_p5_enabled; #ifdef CONFIG_X86_MCE int mcheck_init(void); void mcheck_cpu_init(struct cpuinfo_x86 *c); +void mcheck_vendor_init_severity(void); #else static inline int mcheck_init(void) { return 0; } static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {} +static inline void mcheck_vendor_init_severity(void) {} #endif #ifdef CONFIG_X86_ANCIENT_MCE diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index e12f0bfb45c1..fe32074b865b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -24,7 +24,7 @@ struct mce_bank { char attrname[ATTR_LEN]; /* attribute name */ }; -int mce_severity(struct mce *a, int tolerant, char **msg, bool is_excp); +extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp); struct dentry *mce_get_debugfs_dir(void); extern struct mce_bank *mce_banks; diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index e16f3f201e06..155c9261d3ef 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -190,8 +190,10 @@ static int error_context(struct mce *m) * See AMD Error Scope Hierarchy table in a newer BKDG. For example * 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features" */ -static int mce_severity_amd(struct mce *m, enum context ctx) +static int mce_severity_amd(struct mce *m, int tolerant, char **msg, bool is_excp) { + enum context ctx = error_context(m); + /* Processor Context Corrupt, no need to fumble too much, die! */ if (m->status & MCI_STATUS_PCC) return MCE_PANIC_SEVERITY; @@ -239,15 +241,12 @@ static int mce_severity_amd(struct mce *m, enum context ctx) return MCE_KEEP_SEVERITY; } -int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp) +static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool is_excp) { enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP); enum context ctx = error_context(m); struct severity *s; - if (m->cpuvendor == X86_VENDOR_AMD) - return mce_severity_amd(m, ctx); - for (s = severities;; s++) { if ((m->status & s->mask) != s->result) continue; @@ -272,6 +271,16 @@ int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp) } } +/* Default to mce_severity_intel */ +int (*mce_severity)(struct mce *m, int tolerant, char **msg, bool is_excp) = + mce_severity_intel; + +void __init mcheck_vendor_init_severity(void) +{ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + mce_severity = mce_severity_amd; +} + #ifdef CONFIG_DEBUG_FS static void *s_start(struct seq_file *f, loff_t *pos) { diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 1189f1150a19..c7df30748629 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -2026,6 +2026,7 @@ __setup("mce", mcheck_enable); int __init mcheck_init(void) { mcheck_intel_therm_init(); + mcheck_vendor_init_severity(); return 0; } -- cgit v1.2.3