From 1717f2096b543cede7a380c858c765c41936bc35 Mon Sep 17 00:00:00 2001 From: Hidehiro Kawai Date: Mon, 14 Dec 2015 11:19:09 +0100 Subject: panic, x86: Fix re-entrance problem due to panic on NMI If panic on NMI happens just after panic() on the same CPU, panic() is recursively called. Kernel stalls, as a result, after failing to acquire panic_lock. To avoid this problem, don't call panic() in NMI context if we've already entered panic(). For that, introduce nmi_panic() macro to reduce code duplication. In the case of panic on NMI, don't return from NMI handlers if another CPU already panicked. Signed-off-by: Hidehiro Kawai Acked-by: Michal Hocko Cc: Aaron Tomlin Cc: Andrew Morton Cc: Andy Lutomirski Cc: Baoquan He Cc: Chris Metcalf Cc: David Hildenbrand Cc: Don Zickus Cc: "Eric W. Biederman" Cc: Frederic Weisbecker Cc: Gobinda Charan Maji Cc: HATAYAMA Daisuke Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Javi Merino Cc: Jonathan Corbet Cc: kexec@lists.infradead.org Cc: linux-doc@vger.kernel.org Cc: lkml Cc: Masami Hiramatsu Cc: Michal Nazarewicz Cc: Nicolas Iooss Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Rasmus Villemoes Cc: Rusty Russell Cc: Seth Jennings Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Ulrich Obergfell Cc: Vitaly Kuznetsov Cc: Vivek Goyal Link: http://lkml.kernel.org/r/20151210014626.25437.13302.stgit@softrs [ Cleanup comments, fixup formatting. ] Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner --- include/linux/kernel.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 350dfb08aee3..750cc5c7c999 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -445,6 +445,26 @@ extern int sysctl_panic_on_stackoverflow; extern bool crash_kexec_post_notifiers; +/* + * panic_cpu is used for synchronizing panic() and crash_kexec() execution. It + * holds a CPU number which is executing panic() currently. A value of + * PANIC_CPU_INVALID means no CPU has entered panic() or crash_kexec(). + */ +extern atomic_t panic_cpu; +#define PANIC_CPU_INVALID -1 + +/* + * A variant of panic() called from NMI context. We return if we've already + * panicked on this CPU. + */ +#define nmi_panic(fmt, ...) \ +do { \ + int cpu = raw_smp_processor_id(); \ + \ + if (atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, cpu) != cpu) \ + panic(fmt, ##__VA_ARGS__); \ +} while (0) + /* * Only to be used by arch init code. If the user over-wrote the default * CONFIG_PANIC_TIMEOUT, honor it. -- cgit v1.2.3 From 58c5661f2144c089bbc2e5d87c9ec1dc1d2964fe Mon Sep 17 00:00:00 2001 From: Hidehiro Kawai Date: Mon, 14 Dec 2015 11:19:10 +0100 Subject: panic, x86: Allow CPUs to save registers even if looping in NMI context Currently, kdump_nmi_shootdown_cpus(), a subroutine of crash_kexec(), sends an NMI IPI to CPUs which haven't called panic() to stop them, save their register information and do some cleanups for crash dumping. However, if such a CPU is infinitely looping in NMI context, we fail to save its register information into the crash dump. For example, this can happen when unknown NMIs are broadcast to all CPUs as follows: CPU 0 CPU 1 =========================== ========================== receive an unknown NMI unknown_nmi_error() panic() receive an unknown NMI spin_trylock(&panic_lock) unknown_nmi_error() crash_kexec() panic() spin_trylock(&panic_lock) panic_smp_self_stop() infinite loop kdump_nmi_shootdown_cpus() issue NMI IPI -----------> blocked until IRET infinite loop... Here, since CPU 1 is in NMI context, the second NMI from CPU 0 is blocked until CPU 1 executes IRET. However, CPU 1 never executes IRET, so the NMI is not handled and the callback function to save registers is never called. In practice, this can happen on some servers which broadcast NMIs to all CPUs when the NMI button is pushed. To save registers in this case, we need to: a) Return from NMI handler instead of looping infinitely or b) Call the callback function directly from the infinite loop Inherently, a) is risky because NMI is also used to prevent corrupted data from being propagated to devices. So, we chose b). This patch does the following: 1. Move the infinite looping of CPUs which haven't called panic() in NMI context (actually done by panic_smp_self_stop()) outside of panic() to enable us to refer pt_regs. Please note that panic_smp_self_stop() is still used for normal context. 2. Call a callback of kdump_nmi_shootdown_cpus() directly to save registers and do some cleanups after setting waiting_for_crash_ipi which is used for counting down the number of CPUs which handled the callback Signed-off-by: Hidehiro Kawai Acked-by: Michal Hocko Cc: Aaron Tomlin Cc: Andrew Morton Cc: Andy Lutomirski Cc: Baoquan He Cc: Chris Metcalf Cc: Dave Young Cc: David Hildenbrand Cc: Don Zickus Cc: Eric Biederman Cc: Frederic Weisbecker Cc: Gobinda Charan Maji Cc: HATAYAMA Daisuke Cc: Hidehiro Kawai Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Javi Merino Cc: Jiang Liu Cc: Jonathan Corbet Cc: kexec@lists.infradead.org Cc: linux-doc@vger.kernel.org Cc: lkml Cc: Masami Hiramatsu Cc: Michal Nazarewicz Cc: Nicolas Iooss Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Rasmus Villemoes Cc: Seth Jennings Cc: Stefan Lippers-Hollmann Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Ulrich Obergfell Cc: Vitaly Kuznetsov Cc: Vivek Goyal Cc: Yasuaki Ishimatsu Link: http://lkml.kernel.org/r/20151210014628.25437.75256.stgit@softrs [ Cleanup comments, fixup formatting. ] Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner --- arch/x86/kernel/nmi.c | 6 +++--- arch/x86/kernel/reboot.c | 20 ++++++++++++++++++++ include/linux/kernel.h | 16 ++++++++++++---- kernel/panic.c | 9 +++++++++ kernel/watchdog.c | 2 +- 5 files changed, 45 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index fca87938d739..424aec4a4c71 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -231,7 +231,7 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs) #endif if (panic_on_unrecovered_nmi) - nmi_panic("NMI: Not continuing"); + nmi_panic(regs, "NMI: Not continuing"); pr_emerg("Dazed and confused, but trying to continue\n"); @@ -256,7 +256,7 @@ io_check_error(unsigned char reason, struct pt_regs *regs) show_regs(regs); if (panic_on_io_nmi) { - nmi_panic("NMI IOCK error: Not continuing"); + nmi_panic(regs, "NMI IOCK error: Not continuing"); /* * If we end up here, it means we have received an NMI while @@ -305,7 +305,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) pr_emerg("Do you have a strange power saving mode enabled?\n"); if (unknown_nmi_panic || panic_on_unrecovered_nmi) - nmi_panic("NMI: Not continuing"); + nmi_panic(regs, "NMI: Not continuing"); pr_emerg("Dazed and confused, but trying to continue\n"); } diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 02693dd9a079..1da13022d544 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -718,6 +718,7 @@ static int crashing_cpu; static nmi_shootdown_cb shootdown_callback; static atomic_t waiting_for_crash_ipi; +static int crash_ipi_issued; static int crash_nmi_callback(unsigned int val, struct pt_regs *regs) { @@ -780,6 +781,9 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback) smp_send_nmi_allbutself(); + /* Kick CPUs looping in NMI context. */ + WRITE_ONCE(crash_ipi_issued, 1); + msecs = 1000; /* Wait at most a second for the other cpus to stop */ while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) { mdelay(1); @@ -788,6 +792,22 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback) /* Leave the nmi callback set */ } + +/* Override the weak function in kernel/panic.c */ +void nmi_panic_self_stop(struct pt_regs *regs) +{ + while (1) { + /* + * Wait for the crash dumping IPI to be issued, and then + * call its callback directly. + */ + if (READ_ONCE(crash_ipi_issued)) + crash_nmi_callback(0, regs); /* Don't return */ + + cpu_relax(); + } +} + #else /* !CONFIG_SMP */ void nmi_shootdown_cpus(nmi_shootdown_cb callback) { diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 750cc5c7c999..7311c3294e25 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -255,6 +255,7 @@ extern long (*panic_blink)(int state); __printf(1, 2) void panic(const char *fmt, ...) __noreturn __cold; +void nmi_panic_self_stop(struct pt_regs *); extern void oops_enter(void); extern void oops_exit(void); void print_oops_end_marker(void); @@ -455,14 +456,21 @@ extern atomic_t panic_cpu; /* * A variant of panic() called from NMI context. We return if we've already - * panicked on this CPU. + * panicked on this CPU. If another CPU already panicked, loop in + * nmi_panic_self_stop() which can provide architecture dependent code such + * as saving register state for crash dump. */ -#define nmi_panic(fmt, ...) \ +#define nmi_panic(regs, fmt, ...) \ do { \ - int cpu = raw_smp_processor_id(); \ + int old_cpu, cpu; \ \ - if (atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, cpu) != cpu) \ + cpu = raw_smp_processor_id(); \ + old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, cpu); \ + \ + if (old_cpu == PANIC_CPU_INVALID) \ panic(fmt, ##__VA_ARGS__); \ + else if (old_cpu != cpu) \ + nmi_panic_self_stop(regs); \ } while (0) /* diff --git a/kernel/panic.c b/kernel/panic.c index 3344524cf6ff..06f31b49b3b4 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -61,6 +61,15 @@ void __weak panic_smp_self_stop(void) cpu_relax(); } +/* + * Stop ourselves in NMI context if another CPU has already panicked. Arch code + * may override this to prepare for crash dumping, e.g. save regs info. + */ +void __weak nmi_panic_self_stop(struct pt_regs *regs) +{ + panic_smp_self_stop(); +} + atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID); /** diff --git a/kernel/watchdog.c b/kernel/watchdog.c index b9be18fae154..84b5035cb6a5 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -351,7 +351,7 @@ static void watchdog_overflow_callback(struct perf_event *event, trigger_allbutself_cpu_backtrace(); if (hardlockup_panic) - nmi_panic("Hard LOCKUP"); + nmi_panic(regs, "Hard LOCKUP"); __this_cpu_write(hard_watchdog_warn, true); return; -- cgit v1.2.3 From 7bbee5ca3896f69f09c68be549cb8997abe6bca6 Mon Sep 17 00:00:00 2001 From: Hidehiro Kawai Date: Mon, 14 Dec 2015 11:19:11 +0100 Subject: kexec: Fix race between panic() and crash_kexec() Currently, panic() and crash_kexec() can be called at the same time. For example (x86 case): CPU 0: oops_end() crash_kexec() mutex_trylock() // acquired nmi_shootdown_cpus() // stop other CPUs CPU 1: panic() crash_kexec() mutex_trylock() // failed to acquire smp_send_stop() // stop other CPUs infinite loop If CPU 1 calls smp_send_stop() before nmi_shootdown_cpus(), kdump fails. In another case: CPU 0: oops_end() crash_kexec() mutex_trylock() // acquired io_check_error() panic() crash_kexec() mutex_trylock() // failed to acquire infinite loop Clearly, this is an undesirable result. To fix this problem, this patch changes crash_kexec() to exclude others by using the panic_cpu atomic. Signed-off-by: Hidehiro Kawai Acked-by: Michal Hocko Cc: Andrew Morton Cc: Baoquan He Cc: Dave Young Cc: "Eric W. Biederman" Cc: HATAYAMA Daisuke Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jonathan Corbet Cc: kexec@lists.infradead.org Cc: linux-doc@vger.kernel.org Cc: Martin Schwidefsky Cc: Masami Hiramatsu Cc: Minfei Huang Cc: Peter Zijlstra Cc: Seth Jennings Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vitaly Kuznetsov Cc: Vivek Goyal Cc: x86-ml Link: http://lkml.kernel.org/r/20151210014630.25437.94161.stgit@softrs Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner --- include/linux/kexec.h | 2 ++ kernel/kexec_core.c | 30 +++++++++++++++++++++++++++++- kernel/panic.c | 8 ++++++-- 3 files changed, 37 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/kexec.h b/include/linux/kexec.h index d140b1e9faa7..7b68d2788a56 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -237,6 +237,7 @@ extern int kexec_purgatory_get_set_symbol(struct kimage *image, unsigned int size, bool get_value); extern void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name); +extern void __crash_kexec(struct pt_regs *); extern void crash_kexec(struct pt_regs *); int kexec_should_crash(struct task_struct *); void crash_save_cpu(struct pt_regs *regs, int cpu); @@ -332,6 +333,7 @@ int __weak arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, #else /* !CONFIG_KEXEC_CORE */ struct pt_regs; struct task_struct; +static inline void __crash_kexec(struct pt_regs *regs) { } static inline void crash_kexec(struct pt_regs *regs) { } static inline int kexec_should_crash(struct task_struct *p) { return 0; } #define kexec_in_progress false diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 11b64a63c0f8..c823f3001e12 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -853,7 +853,12 @@ struct kimage *kexec_image; struct kimage *kexec_crash_image; int kexec_load_disabled; -void crash_kexec(struct pt_regs *regs) +/* + * No panic_cpu check version of crash_kexec(). This function is called + * only when panic_cpu holds the current CPU number; this is the only CPU + * which processes crash_kexec routines. + */ +void __crash_kexec(struct pt_regs *regs) { /* Take the kexec_mutex here to prevent sys_kexec_load * running on one cpu from replacing the crash kernel @@ -876,6 +881,29 @@ void crash_kexec(struct pt_regs *regs) } } +void crash_kexec(struct pt_regs *regs) +{ + int old_cpu, this_cpu; + + /* + * Only one CPU is allowed to execute the crash_kexec() code as with + * panic(). Otherwise parallel calls of panic() and crash_kexec() + * may stop each other. To exclude them, we use panic_cpu here too. + */ + this_cpu = raw_smp_processor_id(); + old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu); + if (old_cpu == PANIC_CPU_INVALID) { + /* This is the 1st CPU which comes here, so go ahead. */ + __crash_kexec(regs); + + /* + * Reset panic_cpu to allow another panic()/crash_kexec() + * call. + */ + atomic_set(&panic_cpu, PANIC_CPU_INVALID); + } +} + size_t crash_get_memory_size(void) { size_t size = 0; diff --git a/kernel/panic.c b/kernel/panic.c index 06f31b49b3b4..b333380c6bb2 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -136,9 +136,11 @@ void panic(const char *fmt, ...) * everything else. * If we want to run this after calling panic_notifiers, pass * the "crash_kexec_post_notifiers" option to the kernel. + * + * Bypass the panic_cpu check and call __crash_kexec directly. */ if (!crash_kexec_post_notifiers) - crash_kexec(NULL); + __crash_kexec(NULL); /* * Note smp_send_stop is the usual smp shutdown function, which @@ -161,9 +163,11 @@ void panic(const char *fmt, ...) * panic_notifiers and dumping kmsg before kdump. * Note: since some panic_notifiers can make crashed kernel * more unstable, it can increase risks of the kdump failure too. + * + * Bypass the panic_cpu check and call __crash_kexec directly. */ if (crash_kexec_post_notifiers) - crash_kexec(NULL); + __crash_kexec(NULL); bust_spinlocks(0); -- cgit v1.2.3