From 473e90b2e8356f084dcf9c815a5170d4d4925897 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 19 May 2017 11:39:13 +0200 Subject: x86/mce: Convert threshold_bank.cpus from atomic_t to refcount_t The refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This allows to avoid accidental refcounter overflows that might lead to use-after-free situations. Suggested-by: Kees Cook Signed-off-by: Elena Reshetova Signed-off-by: Borislav Petkov Reviewed-by: Hans Liljestrand Reviewed-by: David Windsor Cc: Tony Luck Cc: Yazen Ghannam Cc: linux-edac Link: http://lkml.kernel.org/r/1492695536-5947-1-git-send-email-elena.reshetova@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/amd_nb.h | 3 ++- arch/x86/kernel/cpu/mcheck/mce_amd.c | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index 00c88a01301d..da181ad1d5f8 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -3,6 +3,7 @@ #include #include +#include struct amd_nb_bus_dev_range { u8 bus; @@ -55,7 +56,7 @@ struct threshold_bank { struct threshold_block *blocks; /* initialized to the number of CPUs on the node sharing this bank */ - atomic_t cpus; + refcount_t cpus; }; struct amd_northbridge { diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 6e4a047e4b68..41439ab41102 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -1202,7 +1202,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank) goto out; per_cpu(threshold_banks, cpu)[bank] = b; - atomic_inc(&b->cpus); + refcount_inc(&b->cpus); err = __threshold_add_blocks(b); @@ -1225,7 +1225,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank) per_cpu(threshold_banks, cpu)[bank] = b; if (is_shared_bank(bank)) { - 
atomic_set(&b->cpus, 1); + refcount_set(&b->cpus, 1); /* nb is already initialized, see above */ if (nb) { @@ -1289,7 +1289,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank) goto free_out; if (is_shared_bank(bank)) { - if (!atomic_dec_and_test(&b->cpus)) { + if (!refcount_dec_and_test(&b->cpus)) { __threshold_remove_blocks(b); per_cpu(threshold_banks, cpu)[bank] = NULL; return; -- cgit v1.2.3 From 37d43acfd79f9c53289e9990c344cbd5b4db4bd4 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Fri, 19 May 2017 11:39:14 +0200 Subject: x86/mce/AMD: Redo error logging from APIC LVT interrupt handlers We have support for the new SMCA MCA_DE{STAT,ADDR} registers in Linux. So we've used these registers in place of MCA_{STATUS,ADDR} on SMCA systems. However, the guidance for current SMCA implementations is to continue using MCA_{STATUS,ADDR} and to use MCA_DE{STAT,ADDR} only if a Deferred error was not found in the former registers. If we logged a Deferred error in MCA_STATUS then we should also clear MCA_DESTAT. This also means we shouldn't clear MCA_CONFIG[LogDeferredInMcaStat]. Rework __log_error() to only log an error and add helpers for the different error types being logged from the corresponding interrupt handlers. Boris: carve out common functionality into a _log_error_bank(). Cleanup comments, check MCi_STATUS bits before reading MSRs. Streamline flow. 
Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1493147772-2721-1-git-send-email-Yazen.Ghannam@amd.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 147 ++++++++++++++++++----------------- 1 file changed, 74 insertions(+), 73 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 41439ab41102..c511fa38ef4e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -471,20 +471,6 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, */ smca_high |= BIT(0); - /* - * SMCA logs Deferred Error information in MCA_DE{STAT,ADDR} - * registers with the option of additionally logging to - * MCA_{STATUS,ADDR} if MCA_CONFIG[LogDeferredInMcaStat] is set. - * - * This bit is usually set by BIOS to retain the old behavior - * for OSes that don't use the new registers. Linux supports the - * new registers so let's disable that additional logging here. - * - * MCA_CONFIG[LogDeferredInMcaStat] is bit 34 (bit 2 in the high - * portion of the MSR). - */ - smca_high &= ~BIT(2); - /* * SMCA sets the Deferred Error Interrupt type per bank. 
* @@ -755,37 +741,19 @@ out_err: } EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr); -static void -__log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc) +static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc) { - u32 msr_status = msr_ops.status(bank); - u32 msr_addr = msr_ops.addr(bank); struct mce m; - u64 status; - - WARN_ON_ONCE(deferred_err && threshold_err); - - if (deferred_err && mce_flags.smca) { - msr_status = MSR_AMD64_SMCA_MCx_DESTAT(bank); - msr_addr = MSR_AMD64_SMCA_MCx_DEADDR(bank); - } - - rdmsrl(msr_status, status); - - if (!(status & MCI_STATUS_VAL)) - return; mce_setup(&m); m.status = status; + m.misc = misc; m.bank = bank; m.tsc = rdtsc(); - if (threshold_err) - m.misc = misc; - if (m.status & MCI_STATUS_ADDRV) { - rdmsrl(msr_addr, m.addr); + m.addr = addr; /* * Extract [55:] where lsb is the least significant @@ -806,8 +774,6 @@ __log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc) } mce_log(&m); - - wrmsrl(msr_status, 0); } static inline void __smp_deferred_error_interrupt(void) @@ -832,45 +798,85 @@ asmlinkage __visible void __irq_entry smp_trace_deferred_error_interrupt(void) exiting_ack_irq(); } -/* APIC interrupt handler for deferred errors */ -static void amd_deferred_error_interrupt(void) +/* + * Returns true if the logged error is deferred. False, otherwise. + */ +static inline bool +_log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc) { - unsigned int bank; - u32 msr_status; - u64 status; + u64 status, addr = 0; - for (bank = 0; bank < mca_cfg.banks; ++bank) { - msr_status = (mce_flags.smca) ? 
MSR_AMD64_SMCA_MCx_DESTAT(bank) - : msr_ops.status(bank); + rdmsrl(msr_stat, status); + if (!(status & MCI_STATUS_VAL)) + return false; - rdmsrl(msr_status, status); + if (status & MCI_STATUS_ADDRV) + rdmsrl(msr_addr, addr); - if (!(status & MCI_STATUS_VAL) || - !(status & MCI_STATUS_DEFERRED)) - continue; + __log_error(bank, status, addr, misc); - __log_error(bank, true, false, 0); - break; - } + wrmsrl(status, 0); + + return status & MCI_STATUS_DEFERRED; } /* - * APIC Interrupt Handler + * We have three scenarios for checking for Deferred errors: + * + * 1) Non-SMCA systems check MCA_STATUS and log error if found. + * 2) SMCA systems check MCA_STATUS. If error is found then log it and also + * clear MCA_DESTAT. + * 3) SMCA systems check MCA_DESTAT, if error was not found in MCA_STATUS, and + * log it. */ +static void log_error_deferred(unsigned int bank) +{ + bool defrd; + + defrd = _log_error_bank(bank, msr_ops.status(bank), + msr_ops.addr(bank), 0); + + if (!mce_flags.smca) + return; + + /* Clear MCA_DESTAT if we logged the deferred error from MCA_STATUS. */ + if (defrd) { + wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0); + return; + } + + /* + * Only deferred errors are logged in MCA_DE{STAT,ADDR} so just check + * for a valid error. + */ + _log_error_bank(bank, MSR_AMD64_SMCA_MCx_DESTAT(bank), + MSR_AMD64_SMCA_MCx_DEADDR(bank), 0); +} + +/* APIC interrupt handler for deferred errors */ +static void amd_deferred_error_interrupt(void) +{ + unsigned int bank; + + for (bank = 0; bank < mca_cfg.banks; ++bank) + log_error_deferred(bank); +} + +static void log_error_thresholding(unsigned int bank, u64 misc) +{ + _log_error_bank(bank, msr_ops.status(bank), msr_ops.addr(bank), misc); +} /* - * threshold interrupt handler will service THRESHOLD_APIC_VECTOR. - * the interrupt goes off when error_count reaches threshold_limit. - * the handler will simply log mcelog w/ software defined bank number. + * Threshold interrupt handler will service THRESHOLD_APIC_VECTOR. 
The interrupt + * goes off when error_count reaches threshold_limit. */ - static void amd_threshold_interrupt(void) { u32 low = 0, high = 0, address = 0; unsigned int bank, block, cpu = smp_processor_id(); struct thresh_restart tr; - /* assume first bank caused it */ for (bank = 0; bank < mca_cfg.banks; ++bank) { if (!(per_cpu(bank_map, cpu) & (1 << bank))) continue; @@ -893,23 +899,18 @@ static void amd_threshold_interrupt(void) (high & MASK_LOCKED_HI)) continue; - /* - * Log the machine check that caused the threshold - * event. - */ - if (high & MASK_OVERFLOW_HI) - goto log; - } - } - return; + if (!(high & MASK_OVERFLOW_HI)) + continue; -log: - __log_error(bank, false, true, ((u64)high << 32) | low); + /* Log the MCE which caused the threshold event. */ + log_error_thresholding(bank, ((u64)high << 32) | low); - /* Reset threshold block after logging error. */ - memset(&tr, 0, sizeof(tr)); - tr.b = &per_cpu(threshold_banks, cpu)[bank]->blocks[block]; - threshold_restart_bank(&tr); + /* Reset threshold block after logging error. */ + memset(&tr, 0, sizeof(tr)); + tr.b = &per_cpu(threshold_banks, cpu)[bank]->blocks[block]; + threshold_restart_bank(&tr); + } + } } /* -- cgit v1.2.3 From 84bcc1d57f634ba8a55eda9a910c159467af0aac Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Fri, 19 May 2017 11:39:15 +0200 Subject: x86/mce/AMD: Carve out SMCA bank configuration Scalable MCA systems have a new MCA_CONFIG register that we use to configure each bank. We currently use this when we set up thresholding. However, this is logically separate. Group all SMCA-related initialization into a single function. 
Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1493147772-2721-2-git-send-email-Yazen.Ghannam@amd.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 76 ++++++++++++++++++------------------ 1 file changed, 38 insertions(+), 38 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index c511fa38ef4e..d00f299f2ada 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -164,17 +164,48 @@ static void default_deferred_error_interrupt(void) } void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt; -static void get_smca_bank_info(unsigned int bank) +static void smca_configure(unsigned int bank, unsigned int cpu) { - unsigned int i, hwid_mcatype, cpu = smp_processor_id(); + unsigned int i, hwid_mcatype; struct smca_hwid *s_hwid; - u32 high, instance_id; + u32 high, low; + u32 smca_config = MSR_AMD64_SMCA_MCx_CONFIG(bank); + + /* Set appropriate bits in MCA_CONFIG */ + if (!rdmsr_safe(smca_config, &low, &high)) { + /* + * OS is required to set the MCAX bit to acknowledge that it is + * now using the new MSR ranges and new registers under each + * bank. It also means that the OS will configure deferred + * errors in the new MCx_CONFIG register. If the bit is not set, + * uncorrectable errors will cause a system panic. + * + * MCA_CONFIG[MCAX] is bit 32 (0 in the high portion of the MSR.) + */ + high |= BIT(0); + + /* + * SMCA sets the Deferred Error Interrupt type per bank. + * + * MCA_CONFIG[DeferredIntTypeSupported] is bit 5, and tells us + * if the DeferredIntType bit field is available. + * + * MCA_CONFIG[DeferredIntType] is bits [38:37] ([6:5] in the + * high portion of the MSR). OS should set this to 0x1 to enable + * APIC based interrupt. First, check that no interrupt has been + * set. 
+ */ + if ((low & BIT(5)) && !((high >> 5) & 0x3)) + high |= BIT(5); + + wrmsr(smca_config, low, high); + } /* Collect bank_info using CPU 0 for now. */ if (cpu) return; - if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &instance_id, &high)) { + if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) { pr_warn("Failed to read MCA_IPID for bank %d\n", bank); return; } @@ -191,7 +222,7 @@ static void get_smca_bank_info(unsigned int bank) smca_get_name(s_hwid->bank_type)); smca_banks[bank].hwid = s_hwid; - smca_banks[bank].id = instance_id; + smca_banks[bank].id = low; smca_banks[bank].sysfs_id = s_hwid->count++; break; } @@ -433,7 +464,7 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, int offset, u32 misc_high) { unsigned int cpu = smp_processor_id(); - u32 smca_low, smca_high, smca_addr; + u32 smca_low, smca_high; struct threshold_block b; int new; @@ -457,37 +488,6 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, goto set_offset; } - smca_addr = MSR_AMD64_SMCA_MCx_CONFIG(bank); - - if (!rdmsr_safe(smca_addr, &smca_low, &smca_high)) { - /* - * OS is required to set the MCAX bit to acknowledge that it is - * now using the new MSR ranges and new registers under each - * bank. It also means that the OS will configure deferred - * errors in the new MCx_CONFIG register. If the bit is not set, - * uncorrectable errors will cause a system panic. - * - * MCA_CONFIG[MCAX] is bit 32 (0 in the high portion of the MSR.) - */ - smca_high |= BIT(0); - - /* - * SMCA sets the Deferred Error Interrupt type per bank. - * - * MCA_CONFIG[DeferredIntTypeSupported] is bit 5, and tells us - * if the DeferredIntType bit field is available. - * - * MCA_CONFIG[DeferredIntType] is bits [38:37] ([6:5] in the - * high portion of the MSR). OS should set this to 0x1 to enable - * APIC based interrupt. First, check that no interrupt has been - * set. 
- */ - if ((smca_low & BIT(5)) && !((smca_high >> 5) & 0x3)) - smca_high |= BIT(5); - - wrmsr(smca_addr, smca_low, smca_high); - } - /* Gather LVT offset for thresholding: */ if (rdmsr_safe(MSR_CU_DEF_ERR, &smca_low, &smca_high)) goto out; @@ -516,7 +516,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) for (bank = 0; bank < mca_cfg.banks; ++bank) { if (mce_flags.smca) - get_smca_bank_info(bank); + smca_configure(bank, cpu); for (block = 0; block < NR_BLOCKS; ++block) { address = get_block_address(cpu, address, low, high, bank, block); -- cgit v1.2.3 From a24b8c3409935bbc8e3c12131c473c92692403cd Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Tue, 13 Jun 2017 18:28:28 +0200 Subject: x86/mce/AMD: Use msr_stat when clearing MCA_STATUS The value of MCA_STATUS is used as the MSR when clearing MCA_STATUS. This may cause the following warning: unchecked MSR access error: WRMSR to 0x11b (tried to write 0x0000000000000000) Call Trace: smp_threshold_interrupt() threshold_interrupt() Use msr_stat instead which has the MSR address. 
Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Fixes: 37d43acfd79f ("x86/mce/AMD: Redo error logging from APIC LVT interrupt handlers") Link: http://lkml.kernel.org/r/20170613162835.30750-2-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index d00f299f2ada..d11f94e8e68a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -815,7 +815,7 @@ _log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc) __log_error(bank, status, addr, misc); - wrmsrl(status, 0); + wrmsrl(msr_stat, 0); return status & MCI_STATUS_DEFERRED; } -- cgit v1.2.3 From 17ef4af0ec0f97b369f304dc04d61722f3591c4b Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Tue, 13 Jun 2017 18:28:29 +0200 Subject: x86/mce/AMD: Use saved threshold block info in interrupt handler In the amd_threshold_interrupt() handler, we loop through every possible block in each bank and rediscover the block's address and whether it's usable, i.e. valid, counter present and not locked. However, we already have the address saved in the threshold blocks list for each CPU and bank. The list only contains blocks that have passed all the validity checks. Besides the redundancy, there's also a smp_call_function* in get_block_address() which causes a warning when servicing the interrupt: WARNING: CPU: 0 PID: 0 at kernel/smp.c:281 smp_call_function_single+0xdd/0xf0 ... Call Trace: rdmsr_safe_on_cpu() get_block_address.isra.2() amd_threshold_interrupt() smp_threshold_interrupt() threshold_interrupt() because we do get called in an interrupt handler *with* interrupts disabled, which can result in a deadlock. 
Drop the redundant valid checks and move the overflow check, logging and block reset into a separate function. Check the first block then iterate over the rest. This procedure is needed since the first block is used as the head of the list. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/20170613162835.30750-3-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 66 +++++++++++++++++++----------------- 1 file changed, 35 insertions(+), 31 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index d11f94e8e68a..9e314bcf67cc 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -867,49 +867,53 @@ static void log_error_thresholding(unsigned int bank, u64 misc) _log_error_bank(bank, msr_ops.status(bank), msr_ops.addr(bank), misc); } +static void log_and_reset_block(struct threshold_block *block) +{ + struct thresh_restart tr; + u32 low = 0, high = 0; + + if (!block) + return; + + if (rdmsr_safe(block->address, &low, &high)) + return; + + if (!(high & MASK_OVERFLOW_HI)) + return; + + /* Log the MCE which caused the threshold event. */ + log_error_thresholding(block->bank, ((u64)high << 32) | low); + + /* Reset threshold block after logging error. */ + memset(&tr, 0, sizeof(tr)); + tr.b = block; + threshold_restart_bank(&tr); +} + /* * Threshold interrupt handler will service THRESHOLD_APIC_VECTOR. The interrupt * goes off when error_count reaches threshold_limit. 
*/ static void amd_threshold_interrupt(void) { - u32 low = 0, high = 0, address = 0; - unsigned int bank, block, cpu = smp_processor_id(); - struct thresh_restart tr; + struct threshold_block *first_block = NULL, *block = NULL, *tmp = NULL; + unsigned int bank, cpu = smp_processor_id(); for (bank = 0; bank < mca_cfg.banks; ++bank) { if (!(per_cpu(bank_map, cpu) & (1 << bank))) continue; - for (block = 0; block < NR_BLOCKS; ++block) { - address = get_block_address(cpu, address, low, high, bank, block); - if (!address) - break; - - if (rdmsr_safe(address, &low, &high)) - break; - - if (!(high & MASK_VALID_HI)) { - if (block) - continue; - else - break; - } - - if (!(high & MASK_CNTP_HI) || - (high & MASK_LOCKED_HI)) - continue; - - if (!(high & MASK_OVERFLOW_HI)) - continue; - /* Log the MCE which caused the threshold event. */ - log_error_thresholding(bank, ((u64)high << 32) | low); + first_block = per_cpu(threshold_banks, cpu)[bank]->blocks; + if (!first_block) + continue; - /* Reset threshold block after logging error. */ - memset(&tr, 0, sizeof(tr)); - tr.b = &per_cpu(threshold_banks, cpu)[bank]->blocks[block]; - threshold_restart_bank(&tr); - } + /* + * The first block is also the head of the list. Check it first + * before iterating over the rest. + */ + log_and_reset_block(first_block); + list_for_each_entry_safe(block, tmp, &first_block->miscj, miscj) + log_and_reset_block(block); } } -- cgit v1.2.3 From bc8e80d56c1ecb35e65df392d7601d1427d14efe Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 13 Jun 2017 18:28:30 +0200 Subject: x86/mce: Merge mce_amd_inj into mce-inject Reuse mce_amd_inj's debugfs interface so that mce-inject can benefit from it too. The old functionality is still preserved under CONFIG_X86_MCELOG_LEGACY. 
Tested-by: Yazen Ghannam Signed-off-by: Borislav Petkov Acked-by: Yazen Ghannam Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Link: http://lkml.kernel.org/r/20170613162835.30750-4-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- arch/x86/Makefile | 2 - arch/x86/include/asm/processor.h | 5 + arch/x86/kernel/cpu/mcheck/mce-inject.c | 542 +++++++++++++++++++++++++++++++- arch/x86/ras/Kconfig | 11 - arch/x86/ras/Makefile | 2 - arch/x86/ras/mce_amd_inj.c | 492 ----------------------------- 7 files changed, 532 insertions(+), 524 deletions(-) delete mode 100644 arch/x86/ras/Makefile delete mode 100644 arch/x86/ras/mce_amd_inj.c (limited to 'arch') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0efb4c9497bc..4371b6b5cbe4 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1082,7 +1082,7 @@ config X86_MCE_THRESHOLD def_bool y config X86_MCE_INJECT - depends on X86_MCE && X86_LOCAL_APIC && X86_MCELOG_LEGACY + depends on X86_MCE && X86_LOCAL_APIC && DEBUG_FS tristate "Machine check injector support" ---help--- Provide support for injecting machine checks for testing purposes. diff --git a/arch/x86/Makefile b/arch/x86/Makefile index bf240b920473..ad2db82e9953 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -257,8 +257,6 @@ drivers-$(CONFIG_PM) += arch/x86/power/ drivers-$(CONFIG_FB) += arch/x86/video/ -drivers-$(CONFIG_RAS) += arch/x86/ras/ - #### # boot loader support. 
Several targets are kept for legacy purposes diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 3cada998a402..71f6fba95aa6 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -901,8 +901,13 @@ static inline int mpx_disable_management(void) } #endif /* CONFIG_X86_INTEL_MPX */ +#ifdef CONFIG_CPU_SUP_AMD extern u16 amd_get_nb_id(int cpu); extern u32 amd_get_nodes_per_socket(void); +#else +static inline u16 amd_get_nb_id(int cpu) { return 0; } +static inline u32 amd_get_nodes_per_socket(void) { return 0; } +#endif static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves) { diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 99165b206df3..7170186938e5 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -10,23 +10,108 @@ * Authors: * Andi Kleen * Ying Huang + * + * The AMD part (from mce_amd_inj.c): a simple MCE injection facility + * for testing different aspects of the RAS code. This driver should be + * built as module so that it can be loaded on production kernels for + * testing purposes. + * + * This file may be distributed under the terms of the GNU General Public + * License version 2. + * + * Copyright (c) 2010-17: Borislav Petkov + * Advanced Micro Devices Inc. 
*/ -#include -#include -#include -#include -#include + +#include +#include +#include #include -#include -#include -#include +#include #include -#include +#include +#include +#include +#include +#include +#include #include -#include -#include +#include +#include +#include +#include + +#include #include +#include +#include #include +#include + +#include "mce-internal.h" + +/* + * Collect all the MCi_XXX settings + */ +static struct mce i_mce; +static struct dentry *dfs_inj; + +static u8 n_banks; + +#define MAX_FLAG_OPT_SIZE 3 +#define NBCFG 0x44 + +enum injection_type { + SW_INJ = 0, /* SW injection, simply decode the error */ + HW_INJ, /* Trigger a #MC */ + DFR_INT_INJ, /* Trigger Deferred error interrupt */ + THR_INT_INJ, /* Trigger threshold interrupt */ + N_INJ_TYPES, +}; + +static const char * const flags_options[] = { + [SW_INJ] = "sw", + [HW_INJ] = "hw", + [DFR_INT_INJ] = "df", + [THR_INT_INJ] = "th", + NULL +}; + +/* Set default injection to SW_INJ */ +static enum injection_type inj_type = SW_INJ; + +#define MCE_INJECT_SET(reg) \ +static int inj_##reg##_set(void *data, u64 val) \ +{ \ + struct mce *m = (struct mce *)data; \ + \ + m->reg = val; \ + return 0; \ +} + +MCE_INJECT_SET(status); +MCE_INJECT_SET(misc); +MCE_INJECT_SET(addr); +MCE_INJECT_SET(synd); + +#define MCE_INJECT_GET(reg) \ +static int inj_##reg##_get(void *data, u64 *val) \ +{ \ + struct mce *m = (struct mce *)data; \ + \ + *val = m->reg; \ + return 0; \ +} + +MCE_INJECT_GET(status); +MCE_INJECT_GET(misc); +MCE_INJECT_GET(addr); +MCE_INJECT_GET(synd); + +DEFINE_SIMPLE_ATTRIBUTE(status_fops, inj_status_get, inj_status_set, "%llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(misc_fops, inj_misc_get, inj_misc_set, "%llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(addr_fops, inj_addr_get, inj_addr_set, "%llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(synd_fops, inj_synd_get, inj_synd_set, "%llx\n"); /* Update fake mce registers on current CPU. 
*/ static void inject_mce(struct mce *m) @@ -143,7 +228,7 @@ static int raise_local(void) return ret; } -static void raise_mce(struct mce *m) +static void __maybe_unused raise_mce(struct mce *m) { int context = MCJ_CTX(m->inject_flags); @@ -198,6 +283,7 @@ static void raise_mce(struct mce *m) } } +#ifdef CONFIG_X86_MCELOG_LEGACY /* Error injection interface */ static ssize_t mce_write(struct file *filp, const char __user *ubuf, size_t usize, loff_t *off) @@ -232,21 +318,445 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf, mutex_unlock(&mce_inject_mutex); return usize; } +#endif /* CONFIG_X86_MCELOG_LEGACY */ + +/* + * Caller needs to be make sure this cpu doesn't disappear + * from under us, i.e.: get_cpu/put_cpu. + */ +static int toggle_hw_mce_inject(unsigned int cpu, bool enable) +{ + u32 l, h; + int err; + + err = rdmsr_on_cpu(cpu, MSR_K7_HWCR, &l, &h); + if (err) { + pr_err("%s: error reading HWCR\n", __func__); + return err; + } + + enable ? (l |= BIT(18)) : (l &= ~BIT(18)); + + err = wrmsr_on_cpu(cpu, MSR_K7_HWCR, l, h); + if (err) + pr_err("%s: error writing HWCR\n", __func__); + + return err; +} + +static int __set_inj(const char *buf) +{ + int i; + + for (i = 0; i < N_INJ_TYPES; i++) { + if (!strncmp(flags_options[i], buf, strlen(flags_options[i]))) { + inj_type = i; + return 0; + } + } + return -EINVAL; +} + +static ssize_t flags_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[MAX_FLAG_OPT_SIZE]; + int n; + + n = sprintf(buf, "%s\n", flags_options[inj_type]); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, n); +} + +static ssize_t flags_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[MAX_FLAG_OPT_SIZE], *__buf; + int err; + + if (cnt > MAX_FLAG_OPT_SIZE) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt - 1] = 0; + + /* strip whitespace */ + __buf = strstrip(buf); + + err = __set_inj(__buf); + if (err) { + 
pr_err("%s: Invalid flags value: %s\n", __func__, __buf); + return err; + } + + *ppos += cnt; + + return cnt; +} + +static const struct file_operations flags_fops = { + .read = flags_read, + .write = flags_write, + .llseek = generic_file_llseek, +}; + +/* + * On which CPU to inject? + */ +MCE_INJECT_GET(extcpu); + +static int inj_extcpu_set(void *data, u64 val) +{ + struct mce *m = (struct mce *)data; + + if (val >= nr_cpu_ids || !cpu_online(val)) { + pr_err("%s: Invalid CPU: %llu\n", __func__, val); + return -EINVAL; + } + m->extcpu = val; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(extcpu_fops, inj_extcpu_get, inj_extcpu_set, "%llu\n"); + +static void trigger_mce(void *info) +{ + asm volatile("int $18"); +} + +static void trigger_dfr_int(void *info) +{ + asm volatile("int %0" :: "i" (DEFERRED_ERROR_VECTOR)); +} + +static void trigger_thr_int(void *info) +{ + asm volatile("int %0" :: "i" (THRESHOLD_APIC_VECTOR)); +} + +static u32 get_nbc_for_node(int node_id) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + u32 cores_per_node; + + cores_per_node = (c->x86_max_cores * smp_num_siblings) / amd_get_nodes_per_socket(); + + return cores_per_node * node_id; +} + +static void toggle_nb_mca_mst_cpu(u16 nid) +{ + struct amd_northbridge *nb; + struct pci_dev *F3; + u32 val; + int err; + + nb = node_to_amd_nb(nid); + if (!nb) + return; + + F3 = nb->misc; + if (!F3) + return; + + err = pci_read_config_dword(F3, NBCFG, &val); + if (err) { + pr_err("%s: Error reading F%dx%03x.\n", + __func__, PCI_FUNC(F3->devfn), NBCFG); + return; + } + + if (val & BIT(27)) + return; + + pr_err("%s: Set D18F3x44[NbMcaToMstCpuEn] which BIOS hasn't done.\n", + __func__); + + val |= BIT(27); + err = pci_write_config_dword(F3, NBCFG, val); + if (err) + pr_err("%s: Error writing F%dx%03x.\n", + __func__, PCI_FUNC(F3->devfn), NBCFG); +} -static int inject_init(void) +static void prepare_msrs(void *info) { + struct mce m = *(struct mce *)info; + u8 b = m.bank; + + wrmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); 
+ + if (boot_cpu_has(X86_FEATURE_SMCA)) { + if (m.inject_flags == DFR_INT_INJ) { + wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(b), m.status); + wrmsrl(MSR_AMD64_SMCA_MCx_DEADDR(b), m.addr); + } else { + wrmsrl(MSR_AMD64_SMCA_MCx_STATUS(b), m.status); + wrmsrl(MSR_AMD64_SMCA_MCx_ADDR(b), m.addr); + } + + wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), m.misc); + wrmsrl(MSR_AMD64_SMCA_MCx_SYND(b), m.synd); + } else { + wrmsrl(MSR_IA32_MCx_STATUS(b), m.status); + wrmsrl(MSR_IA32_MCx_ADDR(b), m.addr); + wrmsrl(MSR_IA32_MCx_MISC(b), m.misc); + } +} + +static void do_inject(void) +{ + u64 mcg_status = 0; + unsigned int cpu = i_mce.extcpu; + u8 b = i_mce.bank; + + rdtscll(i_mce.tsc); + + if (i_mce.misc) + i_mce.status |= MCI_STATUS_MISCV; + + if (i_mce.synd) + i_mce.status |= MCI_STATUS_SYNDV; + + if (inj_type == SW_INJ) { + mce_inject_log(&i_mce); + return; + } + + /* prep MCE global settings for the injection */ + mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV; + + if (!(i_mce.status & MCI_STATUS_PCC)) + mcg_status |= MCG_STATUS_RIPV; + + /* + * Ensure necessary status bits for deferred errors: + * - MCx_STATUS[Deferred]: make sure it is a deferred error + * - MCx_STATUS[UC] cleared: deferred errors are _not_ UC + */ + if (inj_type == DFR_INT_INJ) { + i_mce.status |= MCI_STATUS_DEFERRED; + i_mce.status |= (i_mce.status & ~MCI_STATUS_UC); + } + + /* + * For multi node CPUs, logging and reporting of bank 4 errors happens + * only on the node base core. Refer to D18F3x44[NbMcaToMstCpuEn] for + * Fam10h and later BKDGs. 
+ */ + if (static_cpu_has(X86_FEATURE_AMD_DCM) && + b == 4 && + boot_cpu_data.x86 < 0x17) { + toggle_nb_mca_mst_cpu(amd_get_nb_id(cpu)); + cpu = get_nbc_for_node(amd_get_nb_id(cpu)); + } + + get_online_cpus(); + if (!cpu_online(cpu)) + goto err; + + toggle_hw_mce_inject(cpu, true); + + i_mce.mcgstatus = mcg_status; + i_mce.inject_flags = inj_type; + smp_call_function_single(cpu, prepare_msrs, &i_mce, 0); + + toggle_hw_mce_inject(cpu, false); + + switch (inj_type) { + case DFR_INT_INJ: + smp_call_function_single(cpu, trigger_dfr_int, NULL, 0); + break; + case THR_INT_INJ: + smp_call_function_single(cpu, trigger_thr_int, NULL, 0); + break; + default: + smp_call_function_single(cpu, trigger_mce, NULL, 0); + } + +err: + put_online_cpus(); + +} + +/* + * This denotes into which bank we're injecting and triggers + * the injection, at the same time. + */ +static int inj_bank_set(void *data, u64 val) +{ + struct mce *m = (struct mce *)data; + + if (val >= n_banks) { + pr_err("Non-existent MCE bank: %llu\n", val); + return -EINVAL; + } + + m->bank = val; + do_inject(); + + return 0; +} + +MCE_INJECT_GET(bank); + +DEFINE_SIMPLE_ATTRIBUTE(bank_fops, inj_bank_get, inj_bank_set, "%llu\n"); + +static const char readme_msg[] = +"Description of the files and their usages:\n" +"\n" +"Note1: i refers to the bank number below.\n" +"Note2: See respective BKDGs for the exact bit definitions of the files below\n" +"as they mirror the hardware registers.\n" +"\n" +"status:\t Set MCi_STATUS: the bits in that MSR control the error type and\n" +"\t attributes of the error which caused the MCE.\n" +"\n" +"misc:\t Set MCi_MISC: provide auxiliary info about the error. It is mostly\n" +"\t used for error thresholding purposes and its validity is indicated by\n" +"\t MCi_STATUS[MiscV].\n" +"\n" +"synd:\t Set MCi_SYND: provide syndrome info about the error. 
Only valid on\n" +"\t Scalable MCA systems, and its validity is indicated by MCi_STATUS[SyndV].\n" +"\n" +"addr:\t Error address value to be written to MCi_ADDR. Log address information\n" +"\t associated with the error.\n" +"\n" +"cpu:\t The CPU to inject the error on.\n" +"\n" +"bank:\t Specify the bank you want to inject the error into: the number of\n" +"\t banks in a processor varies and is family/model-specific, therefore, the\n" +"\t supplied value is sanity-checked. Setting the bank value also triggers the\n" +"\t injection.\n" +"\n" +"flags:\t Injection type to be performed. Writing to this file will trigger a\n" +"\t real machine check, an APIC interrupt or invoke the error decoder routines\n" +"\t for AMD processors.\n" +"\n" +"\t Allowed error injection types:\n" +"\t - \"sw\": Software error injection. Decode error to a human-readable \n" +"\t format only. Safe to use.\n" +"\t - \"hw\": Hardware error injection. Causes the #MC exception handler to \n" +"\t handle the error. Be warned: might cause system panic if MCi_STATUS[PCC] \n" +"\t is set. Therefore, consider setting (debugfs_mountpoint)/mce/fake_panic \n" +"\t before injecting.\n" +"\t - \"df\": Trigger APIC interrupt for Deferred error. Causes deferred \n" +"\t error APIC interrupt handler to handle the error if the feature is \n" +"\t is present in hardware. \n" +"\t - \"th\": Trigger APIC interrupt for Threshold errors. Causes threshold \n" +"\t APIC interrupt handler to handle the error. 
\n" +"\n"; + +static ssize_t +inj_readme_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return simple_read_from_buffer(ubuf, cnt, ppos, + readme_msg, strlen(readme_msg)); +} + +static const struct file_operations readme_fops = { + .read = inj_readme_read, +}; + +static struct dfs_node { + char *name; + struct dentry *d; + const struct file_operations *fops; + umode_t perm; +} dfs_fls[] = { + { .name = "status", .fops = &status_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "misc", .fops = &misc_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "addr", .fops = &addr_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "synd", .fops = &synd_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "bank", .fops = &bank_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "flags", .fops = &flags_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "cpu", .fops = &extcpu_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "README", .fops = &readme_fops, .perm = S_IRUSR | S_IRGRP | S_IROTH }, +}; + +static int __init debugfs_init(void) +{ + unsigned int i; + u64 cap; + + rdmsrl(MSR_IA32_MCG_CAP, cap); + n_banks = cap & MCG_BANKCNT_MASK; + + dfs_inj = debugfs_create_dir("mce-inject", NULL); + if (!dfs_inj) + return -EINVAL; + + for (i = 0; i < ARRAY_SIZE(dfs_fls); i++) { + dfs_fls[i].d = debugfs_create_file(dfs_fls[i].name, + dfs_fls[i].perm, + dfs_inj, + &i_mce, + dfs_fls[i].fops); + + if (!dfs_fls[i].d) + goto err_dfs_add; + } + + return 0; + +err_dfs_add: + while (i-- > 0) + debugfs_remove(dfs_fls[i].d); + + debugfs_remove(dfs_inj); + dfs_inj = NULL; + + return -ENODEV; +} + +static int __init inject_init(void) +{ + int err; + if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL)) return -ENOMEM; + +#ifdef CONFIG_X86_MCELOG_LEGACY + register_mce_write_callback(mce_write); +#endif + + register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0, "mce_notify"); + + err = debugfs_init(); + if (err) { + free_cpumask_var(mce_inject_cpumask); + return err; + } + pr_info("Machine check 
injector initialized\n"); - register_mce_write_callback(mce_write); - register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0, - "mce_notify"); + return 0; } module_init(inject_init); + /* * Cannot tolerate unloading currently because we cannot * guarantee all openers of mce_chrdev will get a reference to us. */ +#ifndef CONFIG_X86_MCELOG_LEGACY +static void __exit inject_exit(void) +{ + + debugfs_remove_recursive(dfs_inj); + dfs_inj = NULL; + + memset(&dfs_fls, 0, sizeof(dfs_fls)); + + unregister_nmi_handler(NMI_LOCAL, "mce_notify"); + + free_cpumask_var(mce_inject_cpumask); +} + +module_exit(inject_exit); +#endif + MODULE_LICENSE("GPL"); diff --git a/arch/x86/ras/Kconfig b/arch/x86/ras/Kconfig index 2a2d89d39af6..bb026699ad19 100644 --- a/arch/x86/ras/Kconfig +++ b/arch/x86/ras/Kconfig @@ -1,13 +1,3 @@ -config MCE_AMD_INJ - tristate "Simple MCE injection interface for AMD processors" - depends on RAS && X86_MCE && DEBUG_FS && AMD_NB - default n - help - This is a simple debugfs interface to inject MCEs and test different - aspects of the MCE handling code. - - WARNING: Do not even assume this interface is staying stable! - config RAS_CEC bool "Correctable Errors Collector" depends on X86_MCE && MEMORY_FAILURE && DEBUG_FS @@ -20,4 +10,3 @@ config RAS_CEC Bear in mind that this is absolutely useless if your platform doesn't have ECC DIMMs and doesn't have DRAM ECC checking enabled in the BIOS. - diff --git a/arch/x86/ras/Makefile b/arch/x86/ras/Makefile deleted file mode 100644 index 5f94546db280..000000000000 --- a/arch/x86/ras/Makefile +++ /dev/null @@ -1,2 +0,0 @@ -obj-$(CONFIG_MCE_AMD_INJ) += mce_amd_inj.o - diff --git a/arch/x86/ras/mce_amd_inj.c b/arch/x86/ras/mce_amd_inj.c deleted file mode 100644 index 8730c2882fff..000000000000 --- a/arch/x86/ras/mce_amd_inj.c +++ /dev/null @@ -1,492 +0,0 @@ -/* - * A simple MCE injection facility for testing different aspects of the RAS - * code. 
This driver should be built as module so that it can be loaded - * on production kernels for testing purposes. - * - * This file may be distributed under the terms of the GNU General Public - * License version 2. - * - * Copyright (c) 2010-15: Borislav Petkov - * Advanced Micro Devices Inc. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include "../kernel/cpu/mcheck/mce-internal.h" - -/* - * Collect all the MCi_XXX settings - */ -static struct mce i_mce; -static struct dentry *dfs_inj; - -static u8 n_banks; - -#define MAX_FLAG_OPT_SIZE 3 -#define NBCFG 0x44 - -enum injection_type { - SW_INJ = 0, /* SW injection, simply decode the error */ - HW_INJ, /* Trigger a #MC */ - DFR_INT_INJ, /* Trigger Deferred error interrupt */ - THR_INT_INJ, /* Trigger threshold interrupt */ - N_INJ_TYPES, -}; - -static const char * const flags_options[] = { - [SW_INJ] = "sw", - [HW_INJ] = "hw", - [DFR_INT_INJ] = "df", - [THR_INT_INJ] = "th", - NULL -}; - -/* Set default injection to SW_INJ */ -static enum injection_type inj_type = SW_INJ; - -#define MCE_INJECT_SET(reg) \ -static int inj_##reg##_set(void *data, u64 val) \ -{ \ - struct mce *m = (struct mce *)data; \ - \ - m->reg = val; \ - return 0; \ -} - -MCE_INJECT_SET(status); -MCE_INJECT_SET(misc); -MCE_INJECT_SET(addr); -MCE_INJECT_SET(synd); - -#define MCE_INJECT_GET(reg) \ -static int inj_##reg##_get(void *data, u64 *val) \ -{ \ - struct mce *m = (struct mce *)data; \ - \ - *val = m->reg; \ - return 0; \ -} - -MCE_INJECT_GET(status); -MCE_INJECT_GET(misc); -MCE_INJECT_GET(addr); -MCE_INJECT_GET(synd); - -DEFINE_SIMPLE_ATTRIBUTE(status_fops, inj_status_get, inj_status_set, "%llx\n"); -DEFINE_SIMPLE_ATTRIBUTE(misc_fops, inj_misc_get, inj_misc_set, "%llx\n"); -DEFINE_SIMPLE_ATTRIBUTE(addr_fops, inj_addr_get, inj_addr_set, "%llx\n"); -DEFINE_SIMPLE_ATTRIBUTE(synd_fops, inj_synd_get, inj_synd_set, "%llx\n"); - -/* - * Caller needs to be make sure 
this cpu doesn't disappear - * from under us, i.e.: get_cpu/put_cpu. - */ -static int toggle_hw_mce_inject(unsigned int cpu, bool enable) -{ - u32 l, h; - int err; - - err = rdmsr_on_cpu(cpu, MSR_K7_HWCR, &l, &h); - if (err) { - pr_err("%s: error reading HWCR\n", __func__); - return err; - } - - enable ? (l |= BIT(18)) : (l &= ~BIT(18)); - - err = wrmsr_on_cpu(cpu, MSR_K7_HWCR, l, h); - if (err) - pr_err("%s: error writing HWCR\n", __func__); - - return err; -} - -static int __set_inj(const char *buf) -{ - int i; - - for (i = 0; i < N_INJ_TYPES; i++) { - if (!strncmp(flags_options[i], buf, strlen(flags_options[i]))) { - inj_type = i; - return 0; - } - } - return -EINVAL; -} - -static ssize_t flags_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[MAX_FLAG_OPT_SIZE]; - int n; - - n = sprintf(buf, "%s\n", flags_options[inj_type]); - - return simple_read_from_buffer(ubuf, cnt, ppos, buf, n); -} - -static ssize_t flags_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[MAX_FLAG_OPT_SIZE], *__buf; - int err; - - if (cnt > MAX_FLAG_OPT_SIZE) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt - 1] = 0; - - /* strip whitespace */ - __buf = strstrip(buf); - - err = __set_inj(__buf); - if (err) { - pr_err("%s: Invalid flags value: %s\n", __func__, __buf); - return err; - } - - *ppos += cnt; - - return cnt; -} - -static const struct file_operations flags_fops = { - .read = flags_read, - .write = flags_write, - .llseek = generic_file_llseek, -}; - -/* - * On which CPU to inject? 
- */ -MCE_INJECT_GET(extcpu); - -static int inj_extcpu_set(void *data, u64 val) -{ - struct mce *m = (struct mce *)data; - - if (val >= nr_cpu_ids || !cpu_online(val)) { - pr_err("%s: Invalid CPU: %llu\n", __func__, val); - return -EINVAL; - } - m->extcpu = val; - return 0; -} - -DEFINE_SIMPLE_ATTRIBUTE(extcpu_fops, inj_extcpu_get, inj_extcpu_set, "%llu\n"); - -static void trigger_mce(void *info) -{ - asm volatile("int $18"); -} - -static void trigger_dfr_int(void *info) -{ - asm volatile("int %0" :: "i" (DEFERRED_ERROR_VECTOR)); -} - -static void trigger_thr_int(void *info) -{ - asm volatile("int %0" :: "i" (THRESHOLD_APIC_VECTOR)); -} - -static u32 get_nbc_for_node(int node_id) -{ - struct cpuinfo_x86 *c = &boot_cpu_data; - u32 cores_per_node; - - cores_per_node = (c->x86_max_cores * smp_num_siblings) / amd_get_nodes_per_socket(); - - return cores_per_node * node_id; -} - -static void toggle_nb_mca_mst_cpu(u16 nid) -{ - struct pci_dev *F3 = node_to_amd_nb(nid)->misc; - u32 val; - int err; - - if (!F3) - return; - - err = pci_read_config_dword(F3, NBCFG, &val); - if (err) { - pr_err("%s: Error reading F%dx%03x.\n", - __func__, PCI_FUNC(F3->devfn), NBCFG); - return; - } - - if (val & BIT(27)) - return; - - pr_err("%s: Set D18F3x44[NbMcaToMstCpuEn] which BIOS hasn't done.\n", - __func__); - - val |= BIT(27); - err = pci_write_config_dword(F3, NBCFG, val); - if (err) - pr_err("%s: Error writing F%dx%03x.\n", - __func__, PCI_FUNC(F3->devfn), NBCFG); -} - -static void prepare_msrs(void *info) -{ - struct mce m = *(struct mce *)info; - u8 b = m.bank; - - wrmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); - - if (boot_cpu_has(X86_FEATURE_SMCA)) { - if (m.inject_flags == DFR_INT_INJ) { - wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(b), m.status); - wrmsrl(MSR_AMD64_SMCA_MCx_DEADDR(b), m.addr); - } else { - wrmsrl(MSR_AMD64_SMCA_MCx_STATUS(b), m.status); - wrmsrl(MSR_AMD64_SMCA_MCx_ADDR(b), m.addr); - } - - wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), m.misc); - wrmsrl(MSR_AMD64_SMCA_MCx_SYND(b), 
m.synd); - } else { - wrmsrl(MSR_IA32_MCx_STATUS(b), m.status); - wrmsrl(MSR_IA32_MCx_ADDR(b), m.addr); - wrmsrl(MSR_IA32_MCx_MISC(b), m.misc); - } -} - -static void do_inject(void) -{ - u64 mcg_status = 0; - unsigned int cpu = i_mce.extcpu; - u8 b = i_mce.bank; - - rdtscll(i_mce.tsc); - - if (i_mce.misc) - i_mce.status |= MCI_STATUS_MISCV; - - if (i_mce.synd) - i_mce.status |= MCI_STATUS_SYNDV; - - if (inj_type == SW_INJ) { - mce_inject_log(&i_mce); - return; - } - - /* prep MCE global settings for the injection */ - mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV; - - if (!(i_mce.status & MCI_STATUS_PCC)) - mcg_status |= MCG_STATUS_RIPV; - - /* - * Ensure necessary status bits for deferred errors: - * - MCx_STATUS[Deferred]: make sure it is a deferred error - * - MCx_STATUS[UC] cleared: deferred errors are _not_ UC - */ - if (inj_type == DFR_INT_INJ) { - i_mce.status |= MCI_STATUS_DEFERRED; - i_mce.status |= (i_mce.status & ~MCI_STATUS_UC); - } - - /* - * For multi node CPUs, logging and reporting of bank 4 errors happens - * only on the node base core. Refer to D18F3x44[NbMcaToMstCpuEn] for - * Fam10h and later BKDGs. - */ - if (static_cpu_has(X86_FEATURE_AMD_DCM) && - b == 4 && - boot_cpu_data.x86 < 0x17) { - toggle_nb_mca_mst_cpu(amd_get_nb_id(cpu)); - cpu = get_nbc_for_node(amd_get_nb_id(cpu)); - } - - get_online_cpus(); - if (!cpu_online(cpu)) - goto err; - - toggle_hw_mce_inject(cpu, true); - - i_mce.mcgstatus = mcg_status; - i_mce.inject_flags = inj_type; - smp_call_function_single(cpu, prepare_msrs, &i_mce, 0); - - toggle_hw_mce_inject(cpu, false); - - switch (inj_type) { - case DFR_INT_INJ: - smp_call_function_single(cpu, trigger_dfr_int, NULL, 0); - break; - case THR_INT_INJ: - smp_call_function_single(cpu, trigger_thr_int, NULL, 0); - break; - default: - smp_call_function_single(cpu, trigger_mce, NULL, 0); - } - -err: - put_online_cpus(); - -} - -/* - * This denotes into which bank we're injecting and triggers - * the injection, at the same time. 
- */ -static int inj_bank_set(void *data, u64 val) -{ - struct mce *m = (struct mce *)data; - - if (val >= n_banks) { - pr_err("Non-existent MCE bank: %llu\n", val); - return -EINVAL; - } - - m->bank = val; - do_inject(); - - return 0; -} - -MCE_INJECT_GET(bank); - -DEFINE_SIMPLE_ATTRIBUTE(bank_fops, inj_bank_get, inj_bank_set, "%llu\n"); - -static const char readme_msg[] = -"Description of the files and their usages:\n" -"\n" -"Note1: i refers to the bank number below.\n" -"Note2: See respective BKDGs for the exact bit definitions of the files below\n" -"as they mirror the hardware registers.\n" -"\n" -"status:\t Set MCi_STATUS: the bits in that MSR control the error type and\n" -"\t attributes of the error which caused the MCE.\n" -"\n" -"misc:\t Set MCi_MISC: provide auxiliary info about the error. It is mostly\n" -"\t used for error thresholding purposes and its validity is indicated by\n" -"\t MCi_STATUS[MiscV].\n" -"\n" -"synd:\t Set MCi_SYND: provide syndrome info about the error. Only valid on\n" -"\t Scalable MCA systems, and its validity is indicated by MCi_STATUS[SyndV].\n" -"\n" -"addr:\t Error address value to be written to MCi_ADDR. Log address information\n" -"\t associated with the error.\n" -"\n" -"cpu:\t The CPU to inject the error on.\n" -"\n" -"bank:\t Specify the bank you want to inject the error into: the number of\n" -"\t banks in a processor varies and is family/model-specific, therefore, the\n" -"\t supplied value is sanity-checked. Setting the bank value also triggers the\n" -"\t injection.\n" -"\n" -"flags:\t Injection type to be performed. Writing to this file will trigger a\n" -"\t real machine check, an APIC interrupt or invoke the error decoder routines\n" -"\t for AMD processors.\n" -"\n" -"\t Allowed error injection types:\n" -"\t - \"sw\": Software error injection. Decode error to a human-readable \n" -"\t format only. Safe to use.\n" -"\t - \"hw\": Hardware error injection. 
Causes the #MC exception handler to \n" -"\t handle the error. Be warned: might cause system panic if MCi_STATUS[PCC] \n" -"\t is set. Therefore, consider setting (debugfs_mountpoint)/mce/fake_panic \n" -"\t before injecting.\n" -"\t - \"df\": Trigger APIC interrupt for Deferred error. Causes deferred \n" -"\t error APIC interrupt handler to handle the error if the feature is \n" -"\t is present in hardware. \n" -"\t - \"th\": Trigger APIC interrupt for Threshold errors. Causes threshold \n" -"\t APIC interrupt handler to handle the error. \n" -"\n"; - -static ssize_t -inj_readme_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - return simple_read_from_buffer(ubuf, cnt, ppos, - readme_msg, strlen(readme_msg)); -} - -static const struct file_operations readme_fops = { - .read = inj_readme_read, -}; - -static struct dfs_node { - char *name; - struct dentry *d; - const struct file_operations *fops; - umode_t perm; -} dfs_fls[] = { - { .name = "status", .fops = &status_fops, .perm = S_IRUSR | S_IWUSR }, - { .name = "misc", .fops = &misc_fops, .perm = S_IRUSR | S_IWUSR }, - { .name = "addr", .fops = &addr_fops, .perm = S_IRUSR | S_IWUSR }, - { .name = "synd", .fops = &synd_fops, .perm = S_IRUSR | S_IWUSR }, - { .name = "bank", .fops = &bank_fops, .perm = S_IRUSR | S_IWUSR }, - { .name = "flags", .fops = &flags_fops, .perm = S_IRUSR | S_IWUSR }, - { .name = "cpu", .fops = &extcpu_fops, .perm = S_IRUSR | S_IWUSR }, - { .name = "README", .fops = &readme_fops, .perm = S_IRUSR | S_IRGRP | S_IROTH }, -}; - -static int __init init_mce_inject(void) -{ - unsigned int i; - u64 cap; - - rdmsrl(MSR_IA32_MCG_CAP, cap); - n_banks = cap & MCG_BANKCNT_MASK; - - dfs_inj = debugfs_create_dir("mce-inject", NULL); - if (!dfs_inj) - return -EINVAL; - - for (i = 0; i < ARRAY_SIZE(dfs_fls); i++) { - dfs_fls[i].d = debugfs_create_file(dfs_fls[i].name, - dfs_fls[i].perm, - dfs_inj, - &i_mce, - dfs_fls[i].fops); - - if (!dfs_fls[i].d) - goto err_dfs_add; - } - - return 
0; - -err_dfs_add: - while (i-- > 0) - debugfs_remove(dfs_fls[i].d); - - debugfs_remove(dfs_inj); - dfs_inj = NULL; - - return -ENODEV; -} - -static void __exit exit_mce_inject(void) -{ - - debugfs_remove_recursive(dfs_inj); - dfs_inj = NULL; - - memset(&dfs_fls, 0, sizeof(dfs_fls)); -} -module_init(init_mce_inject); -module_exit(exit_mce_inject); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Borislav Petkov "); -MODULE_AUTHOR("AMD Inc."); -MODULE_DESCRIPTION("MCE injection facility for RAS testing"); -- cgit v1.2.3 From fbe9ff9eafb66b78b79c135ebc24fd2ca5498217 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 13 Jun 2017 18:28:31 +0200 Subject: x86/mce: Get rid of register_mce_write_callback() Make the mcelog call a notifier which lands in the injector module and does the injection. This allows for mce-inject to be a normal kernel module now. Tested-by: Yazen Ghannam Signed-off-by: Borislav Petkov Acked-by: Yazen Ghannam Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Link: http://lkml.kernel.org/r/20170613162835.30750-5-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mce.h | 4 -- arch/x86/kernel/cpu/mcheck/dev-mcelog.c | 47 +++++++++++++++++----- arch/x86/kernel/cpu/mcheck/mce-inject.c | 66 +++++++++---------------------- arch/x86/kernel/cpu/mcheck/mce-internal.h | 6 ++- 4 files changed, 61 insertions(+), 62 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 3f9a3d2a5209..181264989db5 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -285,10 +285,6 @@ int mce_notify_irq(void); DECLARE_PER_CPU(struct mce, injectm); -extern void register_mce_write_callback(ssize_t (*)(struct file *filp, - const char __user *ubuf, - size_t usize, loff_t *off)); - /* Disable CMCI/polling for MCA bank claimed by firmware */ extern void mce_disable_bank(int bank); diff --git a/arch/x86/kernel/cpu/mcheck/dev-mcelog.c 
b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c index 9c632cb88546..a80427c30c93 100644 --- a/arch/x86/kernel/cpu/mcheck/dev-mcelog.c +++ b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c @@ -17,6 +17,8 @@ #include "mce-internal.h" +static BLOCKING_NOTIFIER_HEAD(mce_injector_chain); + static DEFINE_MUTEX(mce_chrdev_read_mutex); static char mce_helper[128]; @@ -345,24 +347,49 @@ static long mce_chrdev_ioctl(struct file *f, unsigned int cmd, } } -static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf, - size_t usize, loff_t *off); +void mce_register_injector_chain(struct notifier_block *nb) +{ + blocking_notifier_chain_register(&mce_injector_chain, nb); +} +EXPORT_SYMBOL_GPL(mce_register_injector_chain); -void register_mce_write_callback(ssize_t (*fn)(struct file *filp, - const char __user *ubuf, - size_t usize, loff_t *off)) +void mce_unregister_injector_chain(struct notifier_block *nb) { - mce_write = fn; + blocking_notifier_chain_unregister(&mce_injector_chain, nb); } -EXPORT_SYMBOL_GPL(register_mce_write_callback); +EXPORT_SYMBOL_GPL(mce_unregister_injector_chain); static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf, size_t usize, loff_t *off) { - if (mce_write) - return mce_write(filp, ubuf, usize, off); - else + struct mce m; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + /* + * There are some cases where real MSR reads could slip + * through. + */ + if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA)) + return -EIO; + + if ((unsigned long)usize > sizeof(struct mce)) + usize = sizeof(struct mce); + if (copy_from_user(&m, ubuf, usize)) + return -EFAULT; + + if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu)) return -EINVAL; + + /* + * Need to give user space some time to set everything up, + * so do it a jiffie or two later everywhere. 
+ */ + schedule_timeout(2); + + blocking_notifier_call_chain(&mce_injector_chain, 0, &m); + + return usize; } static const struct file_operations mce_chrdev_ops = { diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 7170186938e5..c21c1a73712a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -283,42 +283,24 @@ static void __maybe_unused raise_mce(struct mce *m) } } -#ifdef CONFIG_X86_MCELOG_LEGACY -/* Error injection interface */ -static ssize_t mce_write(struct file *filp, const char __user *ubuf, - size_t usize, loff_t *off) +static int mce_inject_raise(struct notifier_block *nb, unsigned long val, + void *data) { - struct mce m; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - /* - * There are some cases where real MSR reads could slip - * through. - */ - if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA)) - return -EIO; - - if ((unsigned long)usize > sizeof(struct mce)) - usize = sizeof(struct mce); - if (copy_from_user(&m, ubuf, usize)) - return -EFAULT; - - if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu)) - return -EINVAL; + struct mce *m = (struct mce *)data; - /* - * Need to give user space some time to set everything up, - * so do it a jiffie or two later everywhere. 
- */ - schedule_timeout(2); + if (!m) + return NOTIFY_DONE; mutex_lock(&mce_inject_mutex); - raise_mce(&m); + raise_mce(m); mutex_unlock(&mce_inject_mutex); - return usize; + + return NOTIFY_DONE; } -#endif /* CONFIG_X86_MCELOG_LEGACY */ + +static struct notifier_block inject_nb = { + .notifier_call = mce_inject_raise, +}; /* * Caller needs to be make sure this cpu doesn't disappear @@ -719,44 +701,34 @@ static int __init inject_init(void) if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL)) return -ENOMEM; -#ifdef CONFIG_X86_MCELOG_LEGACY - register_mce_write_callback(mce_write); -#endif - - register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0, "mce_notify"); - err = debugfs_init(); if (err) { free_cpumask_var(mce_inject_cpumask); return err; } + register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0, "mce_notify"); + mce_register_injector_chain(&inject_nb); + pr_info("Machine check injector initialized\n"); return 0; } -module_init(inject_init); - -/* - * Cannot tolerate unloading currently because we cannot - * guarantee all openers of mce_chrdev will get a reference to us. 
- */ -#ifndef CONFIG_X86_MCELOG_LEGACY static void __exit inject_exit(void) { + mce_unregister_injector_chain(&inject_nb); + unregister_nmi_handler(NMI_LOCAL, "mce_notify"); + debugfs_remove_recursive(dfs_inj); dfs_inj = NULL; memset(&dfs_fls, 0, sizeof(dfs_fls)); - unregister_nmi_handler(NMI_LOCAL, "mce_notify"); - free_cpumask_var(mce_inject_cpumask); } +module_init(inject_init); module_exit(inject_exit); -#endif - MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index 654ad0668d72..098530a93bb7 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -100,7 +100,11 @@ static inline bool mce_cmp(struct mce *m1, struct mce *m2) extern struct device_attribute dev_attr_trigger; #ifdef CONFIG_X86_MCELOG_LEGACY -extern void mce_work_trigger(void); +void mce_work_trigger(void); +void mce_register_injector_chain(struct notifier_block *nb); +void mce_unregister_injector_chain(struct notifier_block *nb); #else static inline void mce_work_trigger(void) { } +static inline void mce_register_injector_chain(struct notifier_block *nb) { } +static inline void mce_unregister_injector_chain(struct notifier_block *nb) { } #endif -- cgit v1.2.3 From 5c99881b33b460c40ff3fbe0d6d1242f54b4f2ed Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 13 Jun 2017 18:28:32 +0200 Subject: x86/mce: Clean up include files Not really needed. 
Tested-by: Yazen Ghannam Signed-off-by: Borislav Petkov Acked-by: Yazen Ghannam Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Link: http://lkml.kernel.org/r/20170613162835.30750-6-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce-inject.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index c21c1a73712a..00af8dd8fc59 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -25,20 +25,10 @@ #include #include -#include -#include -#include -#include #include -#include #include #include #include -#include -#include -#include -#include -#include #include #include -- cgit v1.2.3 From 86d2eac5a7045933a88c97f0453f22106bb90b54 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 13 Jun 2017 18:28:33 +0200 Subject: x86/mce/mce-inject: Preset the MCE injection struct Populate the MCE injection struct before doing initial injection so that values which don't change have sane defaults. 
Tested-by: Yazen Ghannam Signed-off-by: Borislav Petkov Acked-by: Yazen Ghannam Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Link: http://lkml.kernel.org/r/20170613162835.30750-7-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce-inject.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 00af8dd8fc59..231ad23b24a9 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -103,6 +103,13 @@ DEFINE_SIMPLE_ATTRIBUTE(misc_fops, inj_misc_get, inj_misc_set, "%llx\n"); DEFINE_SIMPLE_ATTRIBUTE(addr_fops, inj_addr_get, inj_addr_set, "%llx\n"); DEFINE_SIMPLE_ATTRIBUTE(synd_fops, inj_synd_get, inj_synd_set, "%llx\n"); +static void setup_inj_struct(struct mce *m) +{ + memset(m, 0, sizeof(struct mce)); + + m->cpuvendor = boot_cpu_data.x86_vendor; +} + /* Update fake mce registers on current CPU. */ static void inject_mce(struct mce *m) { @@ -700,6 +707,8 @@ static int __init inject_init(void) register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0, "mce_notify"); mce_register_injector_chain(&inject_nb); + setup_inj_struct(&i_mce); + pr_info("Machine check injector initialized\n"); return 0; -- cgit v1.2.3 From ec33838244c8535b23b8d24b167996fd1318bb68 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Tue, 13 Jun 2017 18:28:34 +0200 Subject: x86/mce: Don't disable MCA banks when offlining a CPU on AMD AMD systems have non-core, shared MCA banks within a die. These banks are controlled by a master CPU per die. If this CPU is offlined then all the shared banks are disabled in addition to the CPU's core banks. Also, Fam17h systems may have SMT enabled. The MCA_CTL register is shared between SMT thread siblings. If a CPU is offlined then all its sibling's MCA banks are also disabled. Extend the existing vendor check to AMD too. 
Signed-off-by: Yazen Ghannam [ Fix up comment. ] Signed-off-by: Borislav Petkov Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/20170613162835.30750-8-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 5cfbaeb6529a..3c54c2b9efc2 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1912,12 +1912,13 @@ static void mce_disable_error_reporting(void) static void vendor_disable_error_reporting(void) { /* - * Don't clear on Intel CPUs. Some of these MSRs are socket-wide. + * Don't clear on Intel or AMD CPUs. Some of these MSRs are socket-wide. * Disabling them for just a single offlined CPU is bad, since it will * inhibit reporting for all shared resources on the socket like the * last level cache (LLC), the integrated memory controller (iMC), etc. */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL || + boot_cpu_data.x86_vendor == X86_VENDOR_AMD) return; mce_disable_error_reporting(); -- cgit v1.2.3 From 6057077f6ebc0747c2f5b21dfb1eb782f489d0f6 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Tue, 13 Jun 2017 18:28:35 +0200 Subject: x86/mce: Update bootlog description to reflect behavior on AMD The bootlog option is only disabled by default on AMD Fam10h and older systems. Update bootlog description to say this. Change the family value to hex to avoid confusion. 
Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/20170613162835.30750-9-bp@alien8.de Signed-off-by: Ingo Molnar --- Documentation/x86/x86_64/boot-options.txt | 3 ++- arch/x86/kernel/cpu/mcheck/mce.c | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt index 61b611e9eeaf..b297c48389b9 100644 --- a/Documentation/x86/x86_64/boot-options.txt +++ b/Documentation/x86/x86_64/boot-options.txt @@ -36,7 +36,8 @@ Machine check to broadcast MCEs. mce=bootlog Enable logging of machine checks left over from booting. - Disabled by default on AMD because some BIOS leave bogus ones. + Disabled by default on AMD Fam10h and older because some BIOS + leave bogus ones. If your BIOS doesn't do that it's a good idea to enable though to make sure you log even machine check events that result in a reboot. On Intel systems it is enabled by default. diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 3c54c2b9efc2..b58b77808ce4 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1550,7 +1550,7 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) */ clear_bit(10, (unsigned long *)&mce_banks[4].ctl); } - if (c->x86 < 17 && cfg->bootlog < 0) { + if (c->x86 < 0x11 && cfg->bootlog < 0) { /* * Lots of broken BIOS around that don't clear them * by default and leave crap in there. Don't log: @@ -1832,7 +1832,8 @@ void mce_disable_bank(int bank) * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above) * monarchtimeout is how long to wait for other CPUs on machine * check, or 0 to not wait - * mce=bootlog Log MCEs from before booting. Disabled by default on AMD. + * mce=bootlog Log MCEs from before booting. 
Disabled by default on AMD Fam10h + and older. * mce=nobootlog Don't log MCEs from before booting. * mce=bios_cmci_threshold Don't program the CMCI threshold * mce=recovery force enable memcpy_mcsafe() -- cgit v1.2.3 From b867059018a5254cca14450eefb6fb8effa0f6dd Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 20 Jun 2017 23:16:37 +0200 Subject: x86/MCE, xen/mcelog: Make /dev/mcelog registration messages more precise When running under Xen as dom0, /dev/mcelog is being provided by Xen instead of the normal mcelog character device of the MCE core. Convert an error message being issued by the MCE core in this case to an informative message that Xen has registered the device. Signed-off-by: Juergen Gross Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Cc: xen-devel@lists.xenproject.org Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/20170614084059.19294-1-jgross@suse.com --- arch/x86/kernel/cpu/mcheck/dev-mcelog.c | 8 +++++++- drivers/xen/mcelog.c | 2 ++ 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/mcheck/dev-mcelog.c b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c index a80427c30c93..10cec43aac38 100644 --- a/arch/x86/kernel/cpu/mcheck/dev-mcelog.c +++ b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c @@ -415,9 +415,15 @@ static __init int dev_mcelog_init_device(void) /* register character device /dev/mcelog */ err = misc_register(&mce_chrdev_device); if (err) { - pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err); + if (err == -EBUSY) + /* Xen dom0 might have registered the device already. 
*/ + pr_info("Unable to init device /dev/mcelog, already registered"); + else + pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err); + return err; } + mce_register_decode_chain(&dev_mcelog_nb); return 0; } diff --git a/drivers/xen/mcelog.c b/drivers/xen/mcelog.c index a493c7315e94..6cc1c15bcd84 100644 --- a/drivers/xen/mcelog.c +++ b/drivers/xen/mcelog.c @@ -408,6 +408,8 @@ static int __init xen_late_init_mcelog(void) if (ret) goto deregister; + pr_info("/dev/mcelog registered by Xen\n"); + return 0; deregister: -- cgit v1.2.3 From e2de64ec52659870b4fdef5bf08f265ce5fe1ccc Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 26 Jun 2017 14:35:31 +0200 Subject: x86/mce: Always save severity in machine_check_poll() The MCE severity gives a hint as to how to handle the error. The notifier blocks can then use the severity to decide on an action. It's not necessary for machine_check_poll() to filter errors for the notifier chain, since each block will check its own set of conditions before handling an error. Also, there isn't any urgency for machine_check_poll() to make decisions based on severity like in do_machine_check(). If we can assume that a severity is set then we can use it in more notifier blocks. For example, the CEC block could check for a "KEEP" severity rather than checking bits in the status. This isn't possible now since the severity is not set except for "DEFERRED/UCNA" errors with a valid address. Save the severity since we have it, and let the notifier blocks decide if they want to do anything. 
Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/r/1498074402-98633-1-git-send-email-Yazen.Ghannam@amd.com --- arch/x86/kernel/cpu/mcheck/mce.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index b58b77808ce4..6dde0497efc7 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -673,7 +673,6 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) { bool error_seen = false; struct mce m; - int severity; int i; this_cpu_inc(mce_poll_count); @@ -710,11 +709,7 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) mce_read_aux(&m, i); - severity = mce_severity(&m, mca_cfg.tolerant, NULL, false); - - if (severity == MCE_DEFERRED_SEVERITY && mce_is_memory_error(&m)) - if (m.status & MCI_STATUS_ADDRV) - m.severity = severity; + m.severity = mce_severity(&m, mca_cfg.tolerant, NULL, false); /* * Don't get the IP here because it's unlikely to -- cgit v1.2.3