From 853d9b18f1e861d37e9b271742329f8c1176eabe Mon Sep 17 00:00:00 2001 From: Levente Kurusa Date: Fri, 29 Nov 2013 21:28:48 +0100 Subject: x86, mce: Call put_device on device_register failure This patch adds a call to put_device() when the device_register() call has failed. This is required so that the last reference to the device is given up. Signed-off-by: Levente Kurusa Link: http://lkml.kernel.org/r/5298F900.9000208@linux.com Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index b3218cdee95f..a389c1d859ec 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -2272,8 +2272,10 @@ static int mce_device_create(unsigned int cpu) dev->release = &mce_device_release; err = device_register(dev); - if (err) + if (err) { + put_device(dev); return err; + } for (i = 0; mce_device_attrs[i]; i++) { err = device_create_file(dev, mce_device_attrs[i]); -- cgit v1.2.3 From addccbb264e5e0e5762f4893f6df24afad327c8c Mon Sep 17 00:00:00 2001 From: "Chen, Gong" Date: Mon, 25 Nov 2013 02:15:00 -0500 Subject: ACPI, APEI, GHES: Do not report only correctable errors with SCI Currently SCI is employed to handle corrected errors - memory corrected errors, more specifically - but in fact SCI can still be used to handle any errors, e.g. uncorrected or even fatal ones, if enabled by the BIOS. Enable logging for those kinds of errors too. Signed-off-by: Chen, Gong Acked-by: Naveen N. Rao Cc: Tony Luck Link: http://lkml.kernel.org/r/1385363701-12387-1-git-send-email-gong.chen@linux.intel.com [ Boris: massage commit message, rename function arg. 
] Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce-apei.c | 14 ++++++++++---- drivers/acpi/apei/ghes.c | 3 +-- 2 files changed, 11 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c index de8b60a53f69..a1aef9533154 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-apei.c +++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c @@ -33,22 +33,28 @@ #include #include #include +#include #include #include "mce-internal.h" -void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err) +void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) { struct mce m; - /* Only corrected MC is reported */ - if (!corrected || !(mem_err->validation_bits & CPER_MEM_VALID_PA)) + if (!(mem_err->validation_bits & CPER_MEM_VALID_PA)) return; mce_setup(&m); m.bank = 1; - /* Fake a memory read corrected error with unknown channel */ + /* Fake a memory read error with unknown channel */ m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f; + + if (severity >= GHES_SEV_RECOVERABLE) + m.status |= MCI_STATUS_UC; + if (severity >= GHES_SEV_PANIC) + m.status |= MCI_STATUS_PCC; + m.addr = mem_err->physical_addr; mce_log(&m); mce_notify_irq(); diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index a30bc313787b..ce3683d93a13 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -453,8 +453,7 @@ static void ghes_do_proc(struct ghes *ghes, ghes_edac_report_mem_error(ghes, sev, mem_err); #ifdef CONFIG_X86_MCE - apei_mce_report_mem_error(sev == GHES_SEV_CORRECTED, - mem_err); + apei_mce_report_mem_error(sev, mem_err); #endif ghes_handle_memory_failure(gdata, sev); } -- cgit v1.2.3 From 4f75d8412792777a314ac5c1393a9ed43d695fd1 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 23 Dec 2013 18:05:02 +0100 Subject: x86, mce: Fix mce_start_timer semantics So mce_start_timer() has a 'cpu' argument which is supposed to mean to 
start a timer on that cpu. However, the code currently starts a timer on the *current* cpu the function runs on and causes the sanity-check in mce_timer_fn to fire: WARNING: CPU: 0 PID: 0 at arch/x86/kernel/cpu/mcheck/mce.c:1286 mce_timer_fn because it is running on the wrong cpu. This was triggered by Prarit Bhargava by offlining all the cpus in succession. Then, we were fiddling with the CMCI storm settings when starting the timer whereas there's no need for that - if there's a storm happening on this newly restarted cpu, we're going to be in normal CMCI mode initially and then when the CMCI interrupt starts firing, we're going to go to the polling mode with the timer real soon. Signed-off-by: Borislav Petkov Tested-by: Prarit Bhargava Cc: Tony Luck Reviewed-by: Chen, Gong Link: http://lkml.kernel.org/r/1387722156-5511-1-git-send-email-prarit@redhat.com --- arch/x86/kernel/cpu/mcheck/mce.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index a389c1d859ec..4d5419b249da 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1638,15 +1638,15 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) static void mce_start_timer(unsigned int cpu, struct timer_list *t) { - unsigned long iv = mce_adjust_timer(check_interval * HZ); - - __this_cpu_write(mce_next_interval, iv); + unsigned long iv = check_interval * HZ; if (mca_cfg.ignore_ce || !iv) return; + per_cpu(mce_next_interval, cpu) = iv; + t->expires = round_jiffies(jiffies + iv); - add_timer_on(t, smp_processor_id()); + add_timer_on(t, cpu); } static void __mcheck_cpu_init_timer(void) -- cgit v1.2.3