From 7559e13fb4abe7880dfaf985d6a1630ca90a67ce Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Wed, 6 May 2015 06:58:55 -0500 Subject: x86/mce: Add support for deferred errors on AMD Deferred errors indicate error conditions that were not corrected, but those errors have not been consumed yet. They require no action from S/W (or action is optional). These errors provide info about a latent uncorrectable MCE that can occur when poisoned data is consumed by the processor. Newer AMD processors can generate deferred errors and can be configured to generate APIC interrupts on such events. SUCCOR stands for S/W UnCorrectable error COntainment and Recovery. It indicates support for data poisoning in HW and deferred error interrupts. Add new bitfield to mce_vendor_flags for this. We use this to verify presence of deferred error interrupts before we enable them in mce_amd.c. While at it, clarify comments in mce_vendor_flags to provide an indication of usages of the bitfields. Signed-off-by: Aravind Gopalakrishnan Cc: Tony Luck Cc: x86-ml Cc: linux-edac Link: http://lkml.kernel.org/r/1430913538-1415-4-git-send-email-Aravind.Gopalakrishnan@amd.com [ beef up commit message, do CPUID(8000_0007) only once. 
] Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel/cpu/mcheck/mce.c') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index e535533d5ab8..521e5016aca6 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1637,10 +1637,16 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) mce_intel_feature_init(c); mce_adjust_timer = cmci_intel_adjust_timer; break; - case X86_VENDOR_AMD: + + case X86_VENDOR_AMD: { + u32 ebx = cpuid_ebx(0x80000007); + mce_amd_feature_init(c); - mce_flags.overflow_recov = cpuid_ebx(0x80000007) & 0x1; + mce_flags.overflow_recov = !!(ebx & BIT(0)); + mce_flags.succor = !!(ebx & BIT(1)); break; + } + default: break; } -- cgit v1.2.3 From 5c31b2800d8d3e735e5ecac8fc13d1cf862fd330 Mon Sep 17 00:00:00 2001 From: Xie XiuQi Date: Tue, 26 May 2015 10:28:21 +0200 Subject: x86/mce: Fix monarch timeout setting through the mce= cmdline option Using "mce=1,10000000" on the kernel cmdline to change the monarch timeout does not work. The cause is that get_option() does parse a subsequent comma in the option string and signals that with a return value. So we don't need to check for a second comma ourselves. Signed-off-by: Xie XiuQi Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Link: http://lkml.kernel.org/r/1432120943-25028-1-git-send-email-xiexiuqi@huawei.com Link: http://lkml.kernel.org/r/1432628901-18044-19-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86/kernel/cpu/mcheck/mce.c') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 521e5016aca6..0cbcd3183acf 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -2014,11 +2014,8 @@ static int __init mcheck_enable(char *str) else if (!strcmp(str, "bios_cmci_threshold")) cfg->bios_cmci_threshold = true; else if (isdigit(str[0])) { - get_option(&str, &(cfg->tolerant)); - if (*str == ',') { - ++str; + if (get_option(&str, &cfg->tolerant) == 2) get_option(&str, &(cfg->monarch_timeout)); - } } else { pr_info("mce argument %s ignored. Please use /sys\n", str); return 0; -- cgit v1.2.3 From 88d538672ea26223bca08225bc49f4e65e71683d Mon Sep 17 00:00:00 2001 From: Ashok Raj Date: Thu, 4 Jun 2015 18:55:23 +0200 Subject: x86/mce: Add infrastructure to support Local MCE Initialize and prepare for handling LMCEs. Add a boot-time option to disable LMCEs. Signed-off-by: Ashok Raj [ Simplify stuff, align statements for better readability, reflow comments; kill unused lmce_clear(); save us an MSR write if LMCE is already enabled. ] Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1433436928-31903-16-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- Documentation/x86/x86_64/boot-options.txt | 3 +++ arch/x86/include/asm/mce.h | 5 ++++ arch/x86/kernel/cpu/mcheck/mce.c | 3 +++ arch/x86/kernel/cpu/mcheck/mce_intel.c | 43 +++++++++++++++++++++++++++++++ 4 files changed, 54 insertions(+) (limited to 'arch/x86/kernel/cpu/mcheck/mce.c') diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt index 5223479291a2..68ed3114c363 100644 --- a/Documentation/x86/x86_64/boot-options.txt +++ b/Documentation/x86/x86_64/boot-options.txt @@ -31,6 +31,9 @@ Machine check (e.g. BIOS or hardware monitoring applications), conflicting with OS's error handling, and you cannot deactivate the agent, then this option will be a help. + mce=no_lmce + Do not opt-in to Local MCE delivery. Use legacy method + to broadcast MCEs. mce=bootlog Enable logging of machine checks left over from booting. Disabled by default on AMD because some BIOS leave bogus ones. 
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index ae2bfb895994..982dfc3679ad 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -109,6 +109,7 @@ struct mce_log { struct mca_config { bool dont_log_ce; bool cmci_disabled; + bool lmce_disabled; bool ignore_ce; bool disabled; bool ser; @@ -184,12 +185,16 @@ void cmci_clear(void); void cmci_reenable(void); void cmci_rediscover(void); void cmci_recheck(void); +void lmce_clear(void); +void lmce_enable(void); #else static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { } static inline void cmci_clear(void) {} static inline void cmci_reenable(void) {} static inline void cmci_rediscover(void) {} static inline void cmci_recheck(void) {} +static inline void lmce_clear(void) {} +static inline void lmce_enable(void) {} #endif #ifdef CONFIG_X86_MCE_AMD diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 0cbcd3183acf..c8c6577b4ada 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1982,6 +1982,7 @@ void mce_disable_bank(int bank) /* * mce=off Disables machine check * mce=no_cmci Disables CMCI + * mce=no_lmce Disables LMCE * mce=dont_log_ce Clears corrected events silently, no log created for CEs. * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared. 
* mce=TOLERANCELEVEL[,monarchtimeout] (number, see above) @@ -2005,6 +2006,8 @@ static int __init mcheck_enable(char *str) cfg->disabled = true; else if (!strcmp(str, "no_cmci")) cfg->cmci_disabled = true; + else if (!strcmp(str, "no_lmce")) + cfg->lmce_disabled = true; else if (!strcmp(str, "dont_log_ce")) cfg->dont_log_ce = true; else if (!strcmp(str, "ignore_ce")) diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index b4a41cf030ed..2d872deb2c50 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -91,6 +91,36 @@ static int cmci_supported(int *banks) return !!(cap & MCG_CMCI_P); } +static bool lmce_supported(void) +{ + u64 tmp; + + if (mca_cfg.lmce_disabled) + return false; + + rdmsrl(MSR_IA32_MCG_CAP, tmp); + + /* + * LMCE depends on recovery support in the processor. Hence both + * MCG_SER_P and MCG_LMCE_P should be present in MCG_CAP. + */ + if ((tmp & (MCG_SER_P | MCG_LMCE_P)) != + (MCG_SER_P | MCG_LMCE_P)) + return false; + + /* + * BIOS should indicate support for LMCE by setting bit 20 in + * IA32_FEATURE_CONTROL without which touching MCG_EXT_CTL will + * generate a #GP fault. 
+ */ + rdmsrl(MSR_IA32_FEATURE_CONTROL, tmp); + if ((tmp & (FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_LMCE)) == + (FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_LMCE)) + return true; + + return false; +} + bool mce_intel_cmci_poll(void) { if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE) @@ -405,6 +435,19 @@ static void intel_init_cmci(void) cmci_recheck(); } +void intel_init_lmce(void) +{ + u64 val; + + if (!lmce_supported()) + return; + + rdmsrl(MSR_IA32_MCG_EXT_CTL, val); + + if (!(val & MCG_EXT_CTL_LMCE_EN)) + wrmsrl(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN); +} + void mce_intel_feature_init(struct cpuinfo_x86 *c) { intel_init_thermal(c); -- cgit v1.2.3 From 243d657eaf540db882f73497060da5a4f7d86a90 Mon Sep 17 00:00:00 2001 From: Ashok Raj Date: Thu, 4 Jun 2015 18:55:24 +0200 Subject: x86/mce: Handle Local MCE events Add the necessary changes to do_machine_check() to be able to process MCEs signaled as local MCEs. Typically, only recoverable errors (SRAR type) will be signaled as LMCE. The architecture does not restrict to only those errors, however. When errors are signaled as LMCE, there is no need for the MCE handler to perform rendezvous with other logical processors unlike earlier processors that would broadcast machine check errors. Signed-off-by: Ashok Raj Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1433436928-31903-17-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 32 ++++++++++++++++++++++++++------ arch/x86/kernel/cpu/mcheck/mce_intel.c | 1 + 2 files changed, 27 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel/cpu/mcheck/mce.c') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index c8c6577b4ada..ddc46d67d93e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1047,6 +1047,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) char *msg = "Unknown"; u64 recover_paddr = ~0ull; int flags = MF_ACTION_REQUIRED; + int lmce = 0; prev_state = ist_enter(regs); @@ -1074,11 +1075,20 @@ void do_machine_check(struct pt_regs *regs, long error_code) kill_it = 1; /* - * Go through all the banks in exclusion of the other CPUs. - * This way we don't report duplicated events on shared banks - * because the first one to see it will clear it. + * Check if this MCE is signaled to only this logical processor */ - order = mce_start(&no_way_out); + if (m.mcgstatus & MCG_STATUS_LMCES) + lmce = 1; + else { + /* + * Go through all the banks in exclusion of the other CPUs. + * This way we don't report duplicated events on shared banks + * because the first one to see it will clear it. + * If this is a Local MCE, then no need to perform rendezvous. + */ + order = mce_start(&no_way_out); + } + for (i = 0; i < cfg->banks; i++) { __clear_bit(i, toclear); if (!test_bit(i, valid_banks)) @@ -1155,8 +1165,18 @@ void do_machine_check(struct pt_regs *regs, long error_code) * Do most of the synchronization with other CPUs. * When there's any problem use only local no_way_out state. 
*/ - if (mce_end(order) < 0) - no_way_out = worst >= MCE_PANIC_SEVERITY; + if (!lmce) { + if (mce_end(order) < 0) + no_way_out = worst >= MCE_PANIC_SEVERITY; + } else { + /* + * Local MCE skipped calling mce_reign() + * If we found a fatal error, we need to panic here. + */ + if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) + mce_panic("Machine check from unknown source", + NULL, NULL); + } /* * At insane "tolerant" levels we take no action. Otherwise diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 2d872deb2c50..844f56c5616d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -452,4 +452,5 @@ void mce_intel_feature_init(struct cpuinfo_x86 *c) { intel_init_thermal(c); intel_init_cmci(); + intel_init_lmce(); } -- cgit v1.2.3