From 85c9306d44f757d2fb3b0e3e399080a025315c7f Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Mon, 12 Oct 2015 11:22:38 +0200 Subject: x86/ras/mce_amd_inj: Return early on invalid input Invalid inputs such as these are currently reported in dmesg as failing: $> echo sweet > flags [ 122.079139] flags_write: Invalid flags value: et even though the 'flags' attribute has been updated correctly: $> cat flags sw This is because userspace keeps writing the remaining buffer until it encounters an error. However, the input as a whole is wrong and we should not be writing anything to the file. Therefore, correct flags_write() to return -EINVAL immediately on bad input strings. Signed-off-by: Aravind Gopalakrishnan Signed-off-by: Borislav Petkov Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Link: http://lkml.kernel.org/r/1443190851-2172-2-git-send-email-Aravind.Gopalakrishnan@amd.com Link: http://lkml.kernel.org/r/1444641762-9437-3-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/ras/mce_amd_inj.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'arch/x86/ras/mce_amd_inj.c') diff --git a/arch/x86/ras/mce_amd_inj.c b/arch/x86/ras/mce_amd_inj.c index 17e35b5bf779..4fd8bb9b90b9 100644 --- a/arch/x86/ras/mce_amd_inj.c +++ b/arch/x86/ras/mce_amd_inj.c @@ -129,12 +129,9 @@ static ssize_t flags_write(struct file *filp, const char __user *ubuf, { char buf[MAX_FLAG_OPT_SIZE], *__buf; int err; - size_t ret; if (cnt > MAX_FLAG_OPT_SIZE) - cnt = MAX_FLAG_OPT_SIZE; - - ret = cnt; + return -EINVAL; if (copy_from_user(&buf, ubuf, cnt)) return -EFAULT; @@ -150,9 +147,9 @@ static ssize_t flags_write(struct file *filp, const char __user *ubuf, return err; } - *ppos += ret; + *ppos += cnt; - return ret; + return cnt; } static const struct file_operations flags_fops = { -- cgit v1.2.3 From a1300e50529795cd605da6a015d4944a18921db0 Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Mon, 12 Oct 2015 11:22:39 +0200 Subject: x86/ras/mce_amd_inj: Trigger deferred and thresholding errors interrupts Add the capability to trigger deferred error interrupts and threshold interrupts in order to test the APIC interrupt handler functionality for these type of errors. Update README section about the same too. Reported by: kbuild test robot Signed-off-by: Aravind Gopalakrishnan [ Cleanup comments. ] [ Include asm/irq_vectors.h directly so that misc randbuilds don't fail. ] Signed-off-by: Borislav Petkov Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Link: http://lkml.kernel.org/r/1443190851-2172-3-git-send-email-Aravind.Gopalakrishnan@amd.com Link: http://lkml.kernel.org/r/1444641762-9437-4-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/ras/mce_amd_inj.c | 42 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) (limited to 'arch/x86/ras/mce_amd_inj.c') diff --git a/arch/x86/ras/mce_amd_inj.c b/arch/x86/ras/mce_amd_inj.c index 4fd8bb9b90b9..4d3bafb540c2 100644 --- a/arch/x86/ras/mce_amd_inj.c +++ b/arch/x86/ras/mce_amd_inj.c @@ -17,7 +17,9 @@ #include #include #include + #include +#include #include "../kernel/cpu/mcheck/mce-internal.h" @@ -34,12 +36,16 @@ static u8 n_banks; enum injection_type { SW_INJ = 0, /* SW injection, simply decode the error */ HW_INJ, /* Trigger a #MC */ + DFR_INT_INJ, /* Trigger Deferred error interrupt */ + THR_INT_INJ, /* Trigger threshold interrupt */ N_INJ_TYPES, }; static const char * const flags_options[] = { [SW_INJ] = "sw", [HW_INJ] = "hw", + [DFR_INT_INJ] = "df", + [THR_INT_INJ] = "th", NULL }; @@ -182,6 +188,16 @@ static void trigger_mce(void *info) asm volatile("int $18"); } +static void trigger_dfr_int(void *info) +{ + asm volatile("int %0" :: "i" (DEFERRED_ERROR_VECTOR)); +} + +static void trigger_thr_int(void *info) +{ + asm volatile("int %0" :: "i" (THRESHOLD_APIC_VECTOR)); +} + static void do_inject(void) { u64 mcg_status = 0; @@ -202,6 +218,16 @@ static void do_inject(void) if (!(i_mce.status & MCI_STATUS_PCC)) mcg_status |= MCG_STATUS_RIPV; + /* + * Ensure necessary status bits for deferred errors: + * - MCx_STATUS[Deferred]: make sure it is a deferred error + * - MCx_STATUS[UC] cleared: deferred errors are _not_ UC + */ + if (inj_type == DFR_INT_INJ) { + i_mce.status |= MCI_STATUS_DEFERRED; + i_mce.status |= (i_mce.status & ~MCI_STATUS_UC); + } + get_online_cpus(); if (!cpu_online(cpu)) goto err; @@ -222,7 +248,16 @@ static void do_inject(void) toggle_hw_mce_inject(cpu, false); - smp_call_function_single(cpu, trigger_mce, NULL, 0); + switch (inj_type) { + case DFR_INT_INJ: + smp_call_function_single(cpu, trigger_dfr_int, NULL, 0); + break; + case THR_INT_INJ: + smp_call_function_single(cpu, trigger_thr_int, NULL, 0); + break; + default: + smp_call_function_single(cpu, trigger_mce, NULL, 0); + } err: put_online_cpus(); @@ -287,6 +322,11 @@ static const char readme_msg[] = "\t handle the error. Be warned: might cause system panic if MCi_STATUS[PCC] \n" "\t is set. Therefore, consider setting (debugfs_mountpoint)/mce/fake_panic \n" "\t before injecting.\n" +"\t - \"df\": Trigger APIC interrupt for Deferred error. Causes deferred \n" +"\t error APIC interrupt handler to handle the error if the feature is \n" +"\t is present in hardware. \n" +"\t - \"th\": Trigger APIC interrupt for Threshold errors. Causes threshold \n" +"\t APIC interrupt handler to handle the error. \n" "\n"; static ssize_t -- cgit v1.2.3 From fa20a2ed6fff717839ec03b6574ea0affcb58841 Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Mon, 12 Oct 2015 11:22:40 +0200 Subject: x86/ras/mce_amd_inj: Inject bank 4 errors on the NBC Bank 4 MCEs are logged and reported only on the node base core (NBC) in a socket. Refer to the D18F3x44[NbMcaToMstCpuEn] field in Fam10h and later BKDGs. The node base core (NBC) is the lowest numbered core in the node. This patch ensures that we inject the error on the NBC for bank 4 errors. Otherwise, triggering #MC or APIC interrupts on a core which is not the NBC would not have any effect on the system, i.e. we would not see any relevant output on kernel logs for the error we just injected. Signed-off-by: Aravind Gopalakrishnan [ Cleanup comments. ] [ Add a missing dependency on AMD_NB caught by Randy Dunlap. ] Signed-off-by: Borislav Petkov Acked-by: Randy Dunlap Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Link: http://lkml.kernel.org/r/1443190851-2172-4-git-send-email-Aravind.Gopalakrishnan@amd.com Link: http://lkml.kernel.org/r/1444641762-9437-5-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/ras/Kconfig | 4 +--- arch/x86/ras/mce_amd_inj.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 3 deletions(-) (limited to 'arch/x86/ras/mce_amd_inj.c') diff --git a/arch/x86/ras/Kconfig b/arch/x86/ras/Kconfig index 10fea5fc821e..df280da34825 100644 --- a/arch/x86/ras/Kconfig +++ b/arch/x86/ras/Kconfig @@ -1,11 +1,9 @@ config AMD_MCE_INJ tristate "Simple MCE injection interface for AMD processors" - depends on RAS && EDAC_DECODE_MCE && DEBUG_FS + depends on RAS && EDAC_DECODE_MCE && DEBUG_FS && AMD_NB default n help This is a simple debugfs interface to inject MCEs and test different aspects of the MCE handling code. WARNING: Do not even assume this interface is staying stable! - - diff --git a/arch/x86/ras/mce_amd_inj.c b/arch/x86/ras/mce_amd_inj.c index 4d3bafb540c2..55d38cfa46c2 100644 --- a/arch/x86/ras/mce_amd_inj.c +++ b/arch/x86/ras/mce_amd_inj.c @@ -17,8 +17,10 @@ #include #include #include +#include #include +#include #include #include "../kernel/cpu/mcheck/mce-internal.h" @@ -32,6 +34,7 @@ static struct dentry *dfs_inj; static u8 n_banks; #define MAX_FLAG_OPT_SIZE 3 +#define NBCFG 0x44 enum injection_type { SW_INJ = 0, /* SW injection, simply decode the error */ @@ -198,6 +201,45 @@ static void trigger_thr_int(void *info) asm volatile("int %0" :: "i" (THRESHOLD_APIC_VECTOR)); } +static u32 get_nbc_for_node(int node_id) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + u32 cores_per_node; + + cores_per_node = c->x86_max_cores / amd_get_nodes_per_socket(); + + return cores_per_node * node_id; +} + +static void toggle_nb_mca_mst_cpu(u16 nid) +{ + struct pci_dev *F3 = node_to_amd_nb(nid)->misc; + u32 val; + int err; + + if (!F3) + return; + + err = pci_read_config_dword(F3, NBCFG, &val); + if (err) { + pr_err("%s: Error reading F%dx%03x.\n", + __func__, PCI_FUNC(F3->devfn), NBCFG); + return; + } + + if (val & BIT(27)) + return; + + pr_err("%s: Set D18F3x44[NbMcaToMstCpuEn] which BIOS hasn't done.\n", + __func__); + + val |= BIT(27); + err = pci_write_config_dword(F3, NBCFG, val); + if (err) + pr_err("%s: Error writing F%dx%03x.\n", + __func__, PCI_FUNC(F3->devfn), NBCFG); +} + static void do_inject(void) { u64 mcg_status = 0; @@ -228,6 +270,16 @@ static void do_inject(void) i_mce.status |= (i_mce.status & ~MCI_STATUS_UC); } + /* + * For multi node CPUs, logging and reporting of bank 4 errors happens + * only on the node base core. Refer to D18F3x44[NbMcaToMstCpuEn] for + * Fam10h and later BKDGs. + */ + if (static_cpu_has(X86_FEATURE_AMD_DCM) && b == 4) { + toggle_nb_mca_mst_cpu(amd_get_nb_id(cpu)); + cpu = get_nbc_for_node(amd_get_nb_id(cpu)); + } + get_online_cpus(); if (!cpu_online(cpu)) goto err; -- cgit v1.2.3