diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2016-03-14 18:43:51 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2016-03-14 18:43:51 -0700 |
commit | d88bfe1d68735595d57bd071294f664c4f054435 (patch) | |
tree | 10a12422117f364a18f3a7b629b10dfe6f99da1a /arch/x86/kernel | |
parent | e71c2c1eeb8de7a083a728c5b7e0b83ed1faf047 (diff) | |
parent | eb1af3b71f9d83e45f2fd2fd649356e98e1c582c (diff) | |
download | linux-d88bfe1d68735595d57bd071294f664c4f054435.tar.bz2 |
Merge branch 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull RAS updates from Ingo Molnar:
 "Various RAS updates:

   - AMD MCE support updates for future CPUs, fixes and 'SMCA' (Scalable
     MCA) error decoding support (Aravind Gopalakrishnan)

   - x86 memcpy_mcsafe() support, to enable smart(er) hardware error
     recovery in NVDIMM drivers, based on an extension of the x86
     exception handling code. (Tony Luck)"

* 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  EDAC/sb_edac: Fix computation of channel address
  x86/mm, x86/mce: Add memcpy_mcsafe()
  x86/mce/AMD: Document some functionality
  x86/mce: Clarify comments regarding deferred error
  x86/mce/AMD: Fix logic to obtain block address
  x86/mce/AMD, EDAC: Enable error decoding of Scalable MCA errors
  x86/mce: Move MCx_CONFIG MSR definitions
  x86/mce: Check for faults tagged in EXTABLE_CLASS_FAULT exception table entries
  x86/mm: Expand the exception table logic to allow new handling options
  x86/mce/AMD: Set MCAX Enable bit
  x86/mce/AMD: Carve out threshold block preparation
  x86/mce/AMD: Fix LVT offset configuration for thresholding
  x86/mce/AMD: Reduce number of blocks scanned per bank
  x86/mce/AMD: Do not perform shared bank check for future processors
  x86/mce: Fix order of AMD MCE init function call

Diffstat (limited to 'arch/x86/kernel')
-rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce-severity.c | 22 |
-rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce.c | 72 |
-rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce_amd.c | 231 |
-rw-r--r-- | arch/x86/kernel/kprobes/core.c | 2 |
-rw-r--r-- | arch/x86/kernel/traps.c | 6 |
-rw-r--r-- | arch/x86/kernel/x8664_ksyms_64.c | 2 |
6 files changed, 233 insertions, 102 deletions
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 9c682c222071..5119766d9889 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -14,6 +14,7 @@ #include <linux/init.h> #include <linux/debugfs.h> #include <asm/mce.h> +#include <asm/uaccess.h> #include "mce-internal.h" @@ -29,7 +30,7 @@ * panic situations) */ -enum context { IN_KERNEL = 1, IN_USER = 2 }; +enum context { IN_KERNEL = 1, IN_USER = 2, IN_KERNEL_RECOV = 3 }; enum ser { SER_REQUIRED = 1, NO_SER = 2 }; enum exception { EXCP_CONTEXT = 1, NO_EXCP = 2 }; @@ -48,6 +49,7 @@ static struct severity { #define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c } #define KERNEL .context = IN_KERNEL #define USER .context = IN_USER +#define KERNEL_RECOV .context = IN_KERNEL_RECOV #define SER .ser = SER_REQUIRED #define NOSER .ser = NO_SER #define EXCP .excp = EXCP_CONTEXT @@ -87,6 +89,10 @@ static struct severity { EXCP, KERNEL, MCGMASK(MCG_STATUS_RIPV, 0) ), MCESEV( + PANIC, "In kernel and no restart IP", + EXCP, KERNEL_RECOV, MCGMASK(MCG_STATUS_RIPV, 0) + ), + MCESEV( DEFERRED, "Deferred error", NOSER, MASK(MCI_STATUS_UC|MCI_STATUS_DEFERRED|MCI_STATUS_POISON, MCI_STATUS_DEFERRED) ), @@ -123,6 +129,11 @@ static struct severity { MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, MCG_STATUS_RIPV) ), MCESEV( + AR, "Action required: data load in error recoverable area of kernel", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), + KERNEL_RECOV + ), + MCESEV( AR, "Action required: data load error in a user process", SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), USER @@ -170,6 +181,9 @@ static struct severity { ) /* always matches. 
keep at end */ }; +#define mc_recoverable(mcg) (((mcg) & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) == \ + (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) + /* * If mcgstatus indicated that ip/cs on the stack were * no good, then "m->cs" will be zero and we will have @@ -183,7 +197,11 @@ static struct severity { */ static int error_context(struct mce *m) { - return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL; + if ((m->cs & 3) == 3) + return IN_USER; + if (mc_recoverable(m->mcgstatus) && ex_has_fault_handler(m->ip)) + return IN_KERNEL_RECOV; + return IN_KERNEL; } /* diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index a006f4cd792b..524f2a8492d7 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -961,6 +961,20 @@ static void mce_clear_state(unsigned long *toclear) } } +static int do_memory_failure(struct mce *m) +{ + int flags = MF_ACTION_REQUIRED; + int ret; + + pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr); + if (!(m->mcgstatus & MCG_STATUS_RIPV)) + flags |= MF_MUST_KILL; + ret = memory_failure(m->addr >> PAGE_SHIFT, MCE_VECTOR, flags); + if (ret) + pr_err("Memory error not recovered"); + return ret; +} + /* * The actual machine check handler. This only handles real * exceptions when something got corrupted coming in through int 18. @@ -998,8 +1012,6 @@ void do_machine_check(struct pt_regs *regs, long error_code) DECLARE_BITMAP(toclear, MAX_NR_BANKS); DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); char *msg = "Unknown"; - u64 recover_paddr = ~0ull; - int flags = MF_ACTION_REQUIRED; int lmce = 0; /* If this CPU is offline, just bail out. */ @@ -1136,22 +1148,13 @@ void do_machine_check(struct pt_regs *regs, long error_code) } /* - * At insane "tolerant" levels we take no action. Otherwise - * we only die if we have no other choice. For less serious - * issues we try to recover, or limit damage to the current - * process. 
+ * If tolerant is at an insane level we drop requests to kill + * processes and continue even when there is no way out. */ - if (cfg->tolerant < 3) { - if (no_way_out) - mce_panic("Fatal machine check on current CPU", &m, msg); - if (worst == MCE_AR_SEVERITY) { - recover_paddr = m.addr; - if (!(m.mcgstatus & MCG_STATUS_RIPV)) - flags |= MF_MUST_KILL; - } else if (kill_it) { - force_sig(SIGBUS, current); - } - } + if (cfg->tolerant == 3) + kill_it = 0; + else if (no_way_out) + mce_panic("Fatal machine check on current CPU", &m, msg); if (worst > 0) mce_report_event(regs); @@ -1159,25 +1162,24 @@ void do_machine_check(struct pt_regs *regs, long error_code) out: sync_core(); - if (recover_paddr == ~0ull) - goto done; + if (worst != MCE_AR_SEVERITY && !kill_it) + goto out_ist; - pr_err("Uncorrected hardware memory error in user-access at %llx", - recover_paddr); - /* - * We must call memory_failure() here even if the current process is - * doomed. We still need to mark the page as poisoned and alert any - * other users of the page. 
- */ - ist_begin_non_atomic(regs); - local_irq_enable(); - if (memory_failure(recover_paddr >> PAGE_SHIFT, MCE_VECTOR, flags) < 0) { - pr_err("Memory error not recovered"); - force_sig(SIGBUS, current); + /* Fault was in user mode and we need to take some action */ + if ((m.cs & 3) == 3) { + ist_begin_non_atomic(regs); + local_irq_enable(); + + if (kill_it || do_memory_failure(&m)) + force_sig(SIGBUS, current); + local_irq_disable(); + ist_end_non_atomic(); + } else { + if (!fixup_exception(regs, X86_TRAP_MC)) + mce_panic("Failed kernel mode recovery", &m, NULL); } - local_irq_disable(); - ist_end_non_atomic(); -done: + +out_ist: ist_exit(regs); } EXPORT_SYMBOL_GPL(do_machine_check); @@ -1617,10 +1619,10 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) case X86_VENDOR_AMD: { u32 ebx = cpuid_ebx(0x80000007); - mce_amd_feature_init(c); mce_flags.overflow_recov = !!(ebx & BIT(0)); mce_flags.succor = !!(ebx & BIT(1)); mce_flags.smca = !!(ebx & BIT(3)); + mce_amd_feature_init(c); break; } diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index e99b15077e94..9d656fd436ef 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -1,5 +1,5 @@ /* - * (c) 2005-2015 Advanced Micro Devices, Inc. + * (c) 2005-2016 Advanced Micro Devices, Inc. * Your use of this code is subject to the terms and conditions of the * GNU general public license version 2. 
See "COPYING" or * http://www.gnu.org/licenses/gpl.html @@ -28,7 +28,7 @@ #include <asm/msr.h> #include <asm/trace/irq_vectors.h> -#define NR_BLOCKS 9 +#define NR_BLOCKS 5 #define THRESHOLD_MAX 0xFFF #define INT_TYPE_APIC 0x00020000 #define MASK_VALID_HI 0x80000000 @@ -49,6 +49,19 @@ #define DEF_LVT_OFF 0x2 #define DEF_INT_TYPE_APIC 0x2 +/* Scalable MCA: */ + +/* Threshold LVT offset is at MSR0xC0000410[15:12] */ +#define SMCA_THR_LVT_OFF 0xF000 + +/* + * OS is required to set the MCAX bit to acknowledge that it is now using the + * new MSR ranges and new registers under each bank. It also means that the OS + * will configure deferred errors in the new MCx_CONFIG register. If the bit is + * not set, uncorrectable errors will cause a system panic. + */ +#define SMCA_MCAX_EN_OFF 0x1 + static const char * const th_names[] = { "load_store", "insn_fetch", @@ -58,6 +71,35 @@ static const char * const th_names[] = { "execution_unit", }; +/* Define HWID to IP type mappings for Scalable MCA */ +struct amd_hwid amd_hwids[] = { + [SMCA_F17H_CORE] = { "f17h_core", 0xB0 }, + [SMCA_DF] = { "data_fabric", 0x2E }, + [SMCA_UMC] = { "umc", 0x96 }, + [SMCA_PB] = { "param_block", 0x5 }, + [SMCA_PSP] = { "psp", 0xFF }, + [SMCA_SMU] = { "smu", 0x1 }, +}; +EXPORT_SYMBOL_GPL(amd_hwids); + +const char * const amd_core_mcablock_names[] = { + [SMCA_LS] = "load_store", + [SMCA_IF] = "insn_fetch", + [SMCA_L2_CACHE] = "l2_cache", + [SMCA_DE] = "decode_unit", + [RES] = "", + [SMCA_EX] = "execution_unit", + [SMCA_FP] = "floating_point", + [SMCA_L3_CACHE] = "l3_cache", +}; +EXPORT_SYMBOL_GPL(amd_core_mcablock_names); + +const char * const amd_df_mcablock_names[] = { + [SMCA_CS] = "coherent_slave", + [SMCA_PIE] = "pie", +}; +EXPORT_SYMBOL_GPL(amd_df_mcablock_names); + static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ @@ -84,6 +126,13 @@ struct thresh_restart { static inline bool is_shared_bank(int bank) { + 
/* + * Scalable MCA provides for only one core to have access to the MSRs of + * a shared bank. + */ + if (mce_flags.smca) + return false; + /* Bank 4 is for northbridge reporting and is thus shared */ return (bank == 4); } @@ -135,6 +184,14 @@ static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) } if (apic != msr) { + /* + * On SMCA CPUs, LVT offset is programmed at a different MSR, and + * the BIOS provides the value. The original field where LVT offset + * was set is reserved. Return early here: + */ + if (mce_flags.smca) + return 0; + pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d " "for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu, apic, b->bank, b->block, b->address, hi, lo); @@ -144,10 +201,7 @@ static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) return 1; }; -/* - * Called via smp_call_function_single(), must be called with correct - * cpu affinity. - */ +/* Reprogram MCx_MISC MSR behind this threshold bank. */ static void threshold_restart_bank(void *_tr) { struct thresh_restart *tr = _tr; @@ -247,27 +301,116 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c) wrmsr(MSR_CU_DEF_ERR, low, high); } +static u32 get_block_address(u32 current_addr, u32 low, u32 high, + unsigned int bank, unsigned int block) +{ + u32 addr = 0, offset = 0; + + if (mce_flags.smca) { + if (!block) { + addr = MSR_AMD64_SMCA_MCx_MISC(bank); + } else { + /* + * For SMCA enabled processors, BLKPTR field of the + * first MISC register (MCx_MISC0) indicates presence of + * additional MISC register set (MISC1-4). 
+ */ + u32 low, high; + + if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high)) + return addr; + + if (!(low & MCI_CONFIG_MCAX)) + return addr; + + if (!rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) && + (low & MASK_BLKPTR_LO)) + addr = MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1); + } + return addr; + } + + /* Fall back to method we used for older processors: */ + switch (block) { + case 0: + addr = MSR_IA32_MCx_MISC(bank); + break; + case 1: + offset = ((low & MASK_BLKPTR_LO) >> 21); + if (offset) + addr = MCG_XBLK_ADDR + offset; + break; + default: + addr = ++current_addr; + } + return addr; +} + +static int +prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, + int offset, u32 misc_high) +{ + unsigned int cpu = smp_processor_id(); + struct threshold_block b; + int new; + + if (!block) + per_cpu(bank_map, cpu) |= (1 << bank); + + memset(&b, 0, sizeof(b)); + b.cpu = cpu; + b.bank = bank; + b.block = block; + b.address = addr; + b.interrupt_capable = lvt_interrupt_supported(bank, misc_high); + + if (!b.interrupt_capable) + goto done; + + b.interrupt_enable = 1; + + if (mce_flags.smca) { + u32 smca_low, smca_high; + u32 smca_addr = MSR_AMD64_SMCA_MCx_CONFIG(bank); + + if (!rdmsr_safe(smca_addr, &smca_low, &smca_high)) { + smca_high |= SMCA_MCAX_EN_OFF; + wrmsr(smca_addr, smca_low, smca_high); + } + + /* Gather LVT offset for thresholding: */ + if (rdmsr_safe(MSR_CU_DEF_ERR, &smca_low, &smca_high)) + goto out; + + new = (smca_low & SMCA_THR_LVT_OFF) >> 12; + } else { + new = (misc_high & MASK_LVTOFF_HI) >> 20; + } + + offset = setup_APIC_mce_threshold(offset, new); + + if ((offset == new) && (mce_threshold_vector != amd_threshold_interrupt)) + mce_threshold_vector = amd_threshold_interrupt; + +done: + mce_threshold_block_init(&b, offset); + +out: + return offset; +} + /* cpu init entry point, called from mce.c with preempt off */ void mce_amd_feature_init(struct cpuinfo_x86 *c) { - struct threshold_block b; - unsigned int cpu = 
smp_processor_id(); u32 low = 0, high = 0, address = 0; unsigned int bank, block; - int offset = -1, new; + int offset = -1; for (bank = 0; bank < mca_cfg.banks; ++bank) { for (block = 0; block < NR_BLOCKS; ++block) { - if (block == 0) - address = MSR_IA32_MCx_MISC(bank); - else if (block == 1) { - address = (low & MASK_BLKPTR_LO) >> 21; - if (!address) - break; - - address += MCG_XBLK_ADDR; - } else - ++address; + address = get_block_address(address, low, high, bank, block); + if (!address) + break; if (rdmsr_safe(address, &low, &high)) break; @@ -279,29 +422,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) (high & MASK_LOCKED_HI)) continue; - if (!block) - per_cpu(bank_map, cpu) |= (1 << bank); - - memset(&b, 0, sizeof(b)); - b.cpu = cpu; - b.bank = bank; - b.block = block; - b.address = address; - b.interrupt_capable = lvt_interrupt_supported(bank, high); - - if (!b.interrupt_capable) - goto init; - - b.interrupt_enable = 1; - new = (high & MASK_LVTOFF_HI) >> 20; - offset = setup_APIC_mce_threshold(offset, new); - - if ((offset == new) && - (mce_threshold_vector != amd_threshold_interrupt)) - mce_threshold_vector = amd_threshold_interrupt; - -init: - mce_threshold_block_init(&b, offset); + offset = prepare_threshold_block(bank, block, address, offset, high); } } @@ -394,16 +515,9 @@ static void amd_threshold_interrupt(void) if (!(per_cpu(bank_map, cpu) & (1 << bank))) continue; for (block = 0; block < NR_BLOCKS; ++block) { - if (block == 0) { - address = MSR_IA32_MCx_MISC(bank); - } else if (block == 1) { - address = (low & MASK_BLKPTR_LO) >> 21; - if (!address) - break; - address += MCG_XBLK_ADDR; - } else { - ++address; - } + address = get_block_address(address, low, high, bank, block); + if (!address) + break; if (rdmsr_safe(address, &low, &high)) break; @@ -623,16 +737,11 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank, if (err) goto out_free; recurse: - if (!block) { - address = (low & MASK_BLKPTR_LO) >> 21; - if (!address) 
- return 0; - address += MCG_XBLK_ADDR; - } else { - ++address; - } + address = get_block_address(address, low, high, bank, ++block); + if (!address) + return 0; - err = allocate_threshold_blocks(cpu, bank, ++block, address); + err = allocate_threshold_blocks(cpu, bank, block, address); if (err) goto out_free; diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 1deffe6cc873..0f05deeff5ce 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -988,7 +988,7 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr) * In case the user-specified fault handler returned * zero, try to fix up. */ - if (fixup_exception(regs)) + if (fixup_exception(regs, trapnr)) return 1; /* diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ade185a46b1d..211c11c7bba4 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -199,7 +199,7 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, } if (!user_mode(regs)) { - if (!fixup_exception(regs)) { + if (!fixup_exception(regs, trapnr)) { tsk->thread.error_code = error_code; tsk->thread.trap_nr = trapnr; die(str, regs, error_code); @@ -453,7 +453,7 @@ do_general_protection(struct pt_regs *regs, long error_code) tsk = current; if (!user_mode(regs)) { - if (fixup_exception(regs)) + if (fixup_exception(regs, X86_TRAP_GP)) return; tsk->thread.error_code = error_code; @@ -699,7 +699,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) conditional_sti(regs); if (!user_mode(regs)) { - if (!fixup_exception(regs)) { + if (!fixup_exception(regs, trapnr)) { task->thread.error_code = error_code; task->thread.trap_nr = trapnr; die(str, regs, error_code); diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index a0695be19864..cd05942bc918 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -37,6 +37,8 @@ EXPORT_SYMBOL(__copy_user_nocache); 
EXPORT_SYMBOL(_copy_from_user); EXPORT_SYMBOL(_copy_to_user); +EXPORT_SYMBOL_GPL(memcpy_mcsafe); + EXPORT_SYMBOL(copy_page); EXPORT_SYMBOL(clear_page); |