From 3170d8d226c2053355f3946b4b5ded4c006fe6d4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 2 May 2017 20:06:33 -0400 Subject: kill {__,}{get,put}_user_unaligned() no users left Signed-off-by: Al Viro --- arch/powerpc/include/asm/uaccess.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index 5c0d8a8cdae5..9e7b08bbde5b 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -90,9 +90,6 @@ #define __put_user_inatomic(x, ptr) \ __put_user_nosleep((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr))) -#define __get_user_unaligned __get_user -#define __put_user_unaligned __put_user - extern long __put_user_bad(void); /* -- cgit v1.2.3 From 1c0eaf0f56d6128af7f0f252855173fcee85d202 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Fri, 30 Jun 2017 17:37:32 -0500 Subject: powerpc/powernv: Tell OPAL about our MMU mode on POWER9 That will allow OPAL to configure the CPU in an optimal way. Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Michael Neuling Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/opal-api.h | 9 +++++++++ arch/powerpc/platforms/powernv/opal.c | 19 +++++++++++++++++-- arch/powerpc/platforms/powernv/setup.c | 11 ++++++++++- 3 files changed, 36 insertions(+), 3 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index ef930ba500f9..3130a73652c7 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -876,6 +876,15 @@ struct OpalIoPhb4ErrorData { enum { OPAL_REINIT_CPUS_HILE_BE = (1 << 0), OPAL_REINIT_CPUS_HILE_LE = (1 << 1), + + /* These two define the base MMU mode of the host on P9 + * + * On P9 Nimbus DD2.0 and Cumlus (and later), KVM can still + * create hash guests in "radix" mode with care (full core + * switch only). + */ + OPAL_REINIT_CPUS_MMU_HASH = (1 << 2), + OPAL_REINIT_CPUS_MMU_RADIX = (1 << 3), }; typedef struct oppanel_line { diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 59684b4af4d1..9b87abb178f0 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -59,6 +59,8 @@ static struct task_struct *kopald_tsk; void opal_configure_cores(void) { + u64 reinit_flags = 0; + /* Do the actual re-init, This will clobber all FPRs, VRs, etc... * * It will preserve non volatile GPRs and HSPRG0/1. It will @@ -66,11 +68,24 @@ void opal_configure_cores(void) * but it might clobber a bunch. */ #ifdef __BIG_ENDIAN__ - opal_reinit_cpus(OPAL_REINIT_CPUS_HILE_BE); + reinit_flags |= OPAL_REINIT_CPUS_HILE_BE; #else - opal_reinit_cpus(OPAL_REINIT_CPUS_HILE_LE); + reinit_flags |= OPAL_REINIT_CPUS_HILE_LE; #endif + /* + * POWER9 always support running hash: + * ie. Host hash supports hash guests + * Host radix supports hash/radix guests + */ + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + reinit_flags |= OPAL_REINIT_CPUS_MMU_HASH; + if (early_radix_enabled()) + reinit_flags |= OPAL_REINIT_CPUS_MMU_RADIX; + } + + opal_reinit_cpus(reinit_flags); + /* Restore some bits */ if (cur_cpu_spec->cpu_restore) cur_cpu_spec->cpu_restore(); diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index 2dc7e5fb86c3..897aa1400eb8 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -225,6 +225,8 @@ static void pnv_kexec_wait_secondaries_down(void) static void pnv_kexec_cpu_down(int crash_shutdown, int secondary) { + u64 reinit_flags; + if (xive_enabled()) xive_kexec_teardown_cpu(secondary); else @@ -254,8 +256,15 @@ static void pnv_kexec_cpu_down(int crash_shutdown, int secondary) * We might be running as little-endian - now that interrupts * are disabled, reset the HILE bit to big-endian so we don't * take interrupts in the wrong endian later + * + * We reinit to enable both radix and hash on P9 to ensure + * the mode used by the next kernel is always supported. */ - opal_reinit_cpus(OPAL_REINIT_CPUS_HILE_BE); + reinit_flags = OPAL_REINIT_CPUS_HILE_BE; + if (cpu_has_feature(CPU_FTR_ARCH_300)) + reinit_flags |= OPAL_REINIT_CPUS_MMU_RADIX | + OPAL_REINIT_CPUS_MMU_HASH; + opal_reinit_cpus(reinit_flags); } } #endif /* CONFIG_KEXEC_CORE */ -- cgit v1.2.3 From 2400fd822f467cb4c886c879d8ad99feac9cf319 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Thu, 6 Jul 2017 18:46:43 +1000 Subject: powerpc/asm: Mark cr0 as clobbered in mftb() The workaround for the CELL timebase bug does not correctly mark cr0 as being clobbered. This means GCC doesn't know that the asm block changes cr0 and might leave the result of an unrelated comparison in cr0 across the block, which we then trash, leading to basically random behaviour. Fixes: 859deea949c3 ("[POWERPC] Cell timebase bug workaround") Cc: stable@vger.kernel.org # v2.6.19+ Signed-off-by: Oliver O'Halloran [mpe: Tweak change log and flag for stable] Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/reg.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 7e50e47375d6..a3b6575c7842 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -1303,7 +1303,7 @@ static inline void msr_check_and_clear(unsigned long bits) " .llong 0\n" \ ".previous" \ : "=r" (rval) \ - : "i" (CPU_FTR_CELL_TB_BUG), "i" (SPRN_TBRL)); \ + : "i" (CPU_FTR_CELL_TB_BUG), "i" (SPRN_TBRL) : "cr0"); \ rval;}) #else #define mftb() ({unsigned long rval; \ -- cgit v1.2.3 From 01e6a61aceb82e13bec29502a8eb70d9574f97ad Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 11 Jul 2017 22:10:54 +1000 Subject: powerpc/64: Fix atomic64_inc_not_zero() to return an int Although it's not documented anywhere, there is an expectation that atomic64_inc_not_zero() returns a result which fits in an int. This is the behaviour implemented on all arches except powerpc. This has caused at least one bug in practice, in the percpu-refcount code, where the long result from our atomic64_inc_not_zero() was truncated to an int leading to lost references and stuck systems. That was worked around in that code in commit 966d2b04e070 ("percpu-refcount: fix reference leak during percpu-atomic transition"). To the best of my grepping abilities there are no other callers in-tree which truncate the value, but we should fix it anyway. Because the breakage is subtle and potentially very harmful I'm also tagging it for stable. Code generation is largely unaffected because in most cases the callers are just using the result for a test anyway. In particular the case of fget() that was mentioned in commit a6cf7ed5119f ("powerpc/atomic: Implement atomic*_inc_not_zero") generates exactly the same code. Fixes: a6cf7ed5119f ("powerpc/atomic: Implement atomic*_inc_not_zero") Cc: stable@vger.kernel.org # v3.4 Noticed-by: Linus Torvalds Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/atomic.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/atomic.h b/arch/powerpc/include/asm/atomic.h index 2b90335194a7..a2cc8010cd72 100644 --- a/arch/powerpc/include/asm/atomic.h +++ b/arch/powerpc/include/asm/atomic.h @@ -560,7 +560,7 @@ static __inline__ int atomic64_add_unless(atomic64_t *v, long a, long u) * Atomically increments @v by 1, so long as @v is non-zero. * Returns non-zero if @v was non-zero, and zero otherwise. */ -static __inline__ long atomic64_inc_not_zero(atomic64_t *v) +static __inline__ int atomic64_inc_not_zero(atomic64_t *v) { long t1, t2; @@ -579,7 +579,7 @@ static __inline__ long atomic64_inc_not_zero(atomic64_t *v) : "r" (&v->counter) : "cc", "xer", "memory"); - return t1; + return t1 != 0; } #endif /* __powerpc64__ */ -- cgit v1.2.3 From 2104180a53698df5aec35aed5f840a26ade0551d Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 12 Jul 2017 14:35:52 -0700 Subject: powerpc/64s: implement arch-specific hardlockup watchdog Implement an arch-speicfic watchdog rather than use the perf-based hardlockup detector. The new watchdog takes the soft-NMI directly, rather than going through perf. Perf interrupts are to be made maskable in future, so that would prevent the perf detector from working in those regions. Additionally, implement a SMP based detector where all CPUs watch one another by pinging a shared cpumask. This is because powerpc Book3S does not have a true periodic local NMI, but some platforms do implement a true NMI IPI. If a CPU is stuck with interrupts hard disabled, the soft-NMI watchdog does not work, but the SMP watchdog will. Even on platforms without a true NMI IPI to get a good trace from the stuck CPU, other CPUs will notice the lockup sufficiently to report it and panic. [npiggin@gmail.com: honor watchdog disable at boot/hotplug] Link: http://lkml.kernel.org/r/20170621001346.5bb337c9@roar.ozlabs.ibm.com [npiggin@gmail.com: fix false positive warning at CPU unplug] Link: http://lkml.kernel.org/r/20170630080740.20766-1-npiggin@gmail.com [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/20170616065715.18390-6-npiggin@gmail.com Signed-off-by: Nicholas Piggin Reviewed-by: Don Zickus Tested-by: Babu Moger [sparc] Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/Kconfig | 7 +- arch/powerpc/include/asm/nmi.h | 11 + arch/powerpc/include/asm/smp.h | 2 + arch/powerpc/kernel/Makefile | 1 + arch/powerpc/kernel/exceptions-64s.S | 30 ++- arch/powerpc/kernel/kvm.c | 7 + arch/powerpc/kernel/setup_64.c | 19 -- arch/powerpc/kernel/smp.c | 20 +- arch/powerpc/kernel/watchdog.c | 386 +++++++++++++++++++++++++++++++++++ 9 files changed, 458 insertions(+), 25 deletions(-) create mode 100644 arch/powerpc/kernel/watchdog.c (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 63ed758e1d20..fce2f4f20891 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -82,7 +82,7 @@ config NR_IRQS config NMI_IPI bool - depends on SMP && (DEBUGGER || KEXEC_CORE) + depends on SMP && (DEBUGGER || KEXEC_CORE || HARDLOCKUP_DETECTOR) default y config STACKTRACE_SUPPORT @@ -192,12 +192,13 @@ config PPC select HAVE_MEMBLOCK select HAVE_MEMBLOCK_NODE_MAP select HAVE_MOD_ARCH_SPECIFIC - select HAVE_NMI if PERF_EVENTS + select HAVE_NMI if PERF_EVENTS || (PPC64 && PPC_BOOK3S) + select HAVE_HARDLOCKUP_DETECTOR_ARCH if (PPC64 && PPC_BOOK3S) select HAVE_OPROFILE select HAVE_OPTPROBES if PPC64 select HAVE_PERF_EVENTS select HAVE_PERF_EVENTS_NMI if PPC64 - select HAVE_HARDLOCKUP_DETECTOR_PERF if HAVE_PERF_EVENTS_NMI + select HAVE_HARDLOCKUP_DETECTOR_PERF if HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_RCU_TABLE_FREE if SMP diff --git a/arch/powerpc/include/asm/nmi.h b/arch/powerpc/include/asm/nmi.h index ff1ccb375e60..6f8e79cd35d8 100644 --- a/arch/powerpc/include/asm/nmi.h +++ b/arch/powerpc/include/asm/nmi.h @@ -1,4 +1,15 @@ #ifndef _ASM_NMI_H #define _ASM_NMI_H +#ifdef CONFIG_HARDLOCKUP_DETECTOR +extern void arch_touch_nmi_watchdog(void); + +extern void arch_trigger_cpumask_backtrace(const cpumask_t *mask, + bool exclude_self); +#define arch_trigger_cpumask_backtrace arch_trigger_cpumask_backtrace + +#else +static inline void arch_touch_nmi_watchdog(void) {} +#endif + #endif /* _ASM_NMI_H */ diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h index ebddb2111d87..8ea98504f900 100644 --- a/arch/powerpc/include/asm/smp.h +++ b/arch/powerpc/include/asm/smp.h @@ -55,6 +55,8 @@ struct smp_ops_t { int (*cpu_bootable)(unsigned int nr); }; +extern void smp_flush_nmi_ipi(u64 delay_us); +extern int smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us); extern void smp_send_debugger_break(void); extern void start_secondary_resume(void); extern void smp_generic_give_timebase(void); diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 0845eebc5af3..4aa7c147e447 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -38,6 +38,7 @@ obj-$(CONFIG_PPC64) += setup_64.o sys_ppc32.o \ signal_64.o ptrace32.o \ paca.o nvram_64.o firmware.o obj-$(CONFIG_VDSO32) += vdso32/ +obj-$(CONFIG_HARDLOCKUP_DETECTOR) += watchdog.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_ppc970.o cpu_setup_pa6t.o obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_power.o diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 4c18a5fbb4bb..e6d8354d79ef 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1314,6 +1314,31 @@ EXC_REAL_NONE(0x1800, 0x100) EXC_VIRT_NONE(0x5800, 0x100) #endif +#if defined(CONFIG_HARDLOCKUP_DETECTOR) && defined(CONFIG_HAVE_HARDLOCKUP_DETECTOR_ARCH) + +#define MASKED_DEC_HANDLER_LABEL 3f + +#define MASKED_DEC_HANDLER(_H) \ +3: /* soft-nmi */ \ + std r12,PACA_EXGEN+EX_R12(r13); \ + GET_SCRATCH0(r10); \ + std r10,PACA_EXGEN+EX_R13(r13); \ + EXCEPTION_PROLOG_PSERIES_1(soft_nmi_common, _H) + +EXC_COMMON_BEGIN(soft_nmi_common) + mr r10,r1 + ld r1,PACAEMERGSP(r13) + ld r1,PACA_NMI_EMERG_SP(r13) + subi r1,r1,INT_FRAME_SIZE + EXCEPTION_COMMON_NORET_STACK(PACA_EXGEN, 0x900, + system_reset, soft_nmi_interrupt, + ADD_NVGPRS;ADD_RECONCILE) + b ret_from_except + +#else +#define MASKED_DEC_HANDLER_LABEL 2f /* normal return */ +#define MASKED_DEC_HANDLER(_H) +#endif /* * An interrupt came in while soft-disabled. We set paca->irq_happened, then: @@ -1336,7 +1361,7 @@ masked_##_H##interrupt: \ lis r10,0x7fff; \ ori r10,r10,0xffff; \ mtspr SPRN_DEC,r10; \ - b 2f; \ + b MASKED_DEC_HANDLER_LABEL; \ 1: cmpwi r10,PACA_IRQ_DBELL; \ beq 2f; \ cmpwi r10,PACA_IRQ_HMI; \ @@ -1351,7 +1376,8 @@ masked_##_H##interrupt: \ ld r11,PACA_EXGEN+EX_R11(r13); \ GET_SCRATCH0(r13); \ ##_H##rfid; \ - b . + b .; \ + MASKED_DEC_HANDLER(_H) /* * Real mode exceptions actually use this too, but alternate diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c index 9ad37f827a97..1086ea37c832 100644 --- a/arch/powerpc/kernel/kvm.c +++ b/arch/powerpc/kernel/kvm.c @@ -25,6 +25,7 @@ #include #include #include +#include /* hardlockup_detector_disable() */ #include #include @@ -718,6 +719,12 @@ static __init void kvm_free_tmp(void) static int __init kvm_guest_init(void) { + /* + * The hardlockup detector is likely to get false positives in + * KVM guests, so disable it by default. + */ + hardlockup_detector_disable(); + if (!kvm_para_available()) goto free_tmp; diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 074a075a9cdb..af23d4b576ec 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -751,22 +751,3 @@ unsigned long memory_block_size_bytes(void) struct ppc_pci_io ppc_pci_io; EXPORT_SYMBOL(ppc_pci_io); #endif - -#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF -u64 hw_nmi_get_sample_period(int watchdog_thresh) -{ - return ppc_proc_freq * watchdog_thresh; -} - -/* - * The hardlockup detector breaks PMU event based branches and is likely - * to get false positives in KVM guests, so disable it by default. - */ -static int __init disable_hardlockup_detector(void) -{ - hardlockup_detector_disable(); - - return 0; -} -early_initcall(disable_hardlockup_detector); -#endif diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index c6b8bace1766..997c88d54acf 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -435,13 +435,31 @@ static void do_smp_send_nmi_ipi(int cpu) } } +void smp_flush_nmi_ipi(u64 delay_us) +{ + unsigned long flags; + + nmi_ipi_lock_start(&flags); + while (nmi_ipi_busy_count) { + nmi_ipi_unlock_end(&flags); + udelay(1); + if (delay_us) { + delay_us--; + if (!delay_us) + return; + } + nmi_ipi_lock_start(&flags); + } + nmi_ipi_unlock_end(&flags); +} + /* * - cpu is the target CPU (must not be this CPU), or NMI_IPI_ALL_OTHERS. * - fn is the target callback function. * - delay_us > 0 is the delay before giving up waiting for targets to * enter the handler, == 0 specifies indefinite delay. */ -static int smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us) +int smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us) { unsigned long flags; int me = raw_smp_processor_id(); diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c new file mode 100644 index 000000000000..b67f8b03a32d --- /dev/null +++ b/arch/powerpc/kernel/watchdog.c @@ -0,0 +1,386 @@ +/* + * Watchdog support on powerpc systems. + * + * Copyright 2017, IBM Corporation. + * + * This uses code from arch/sparc/kernel/nmi.c and kernel/watchdog.c + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* + * The watchdog has a simple timer that runs on each CPU, once per timer + * period. This is the heartbeat. + * + * Then there are checks to see if the heartbeat has not triggered on a CPU + * for the panic timeout period. Currently the watchdog only supports an + * SMP check, so the heartbeat only turns on when we have 2 or more CPUs. + * + * This is not an NMI watchdog, but Linux uses that name for a generic + * watchdog in some cases, so NMI gets used in some places. + */ + +static cpumask_t wd_cpus_enabled __read_mostly; + +static u64 wd_panic_timeout_tb __read_mostly; /* timebase ticks until panic */ +static u64 wd_smp_panic_timeout_tb __read_mostly; /* panic other CPUs */ + +static u64 wd_timer_period_ms __read_mostly; /* interval between heartbeat */ + +static DEFINE_PER_CPU(struct timer_list, wd_timer); +static DEFINE_PER_CPU(u64, wd_timer_tb); + +/* + * These are for the SMP checker. CPUs clear their pending bit in their + * heartbeat. If the bitmask becomes empty, the time is noted and the + * bitmask is refilled. + * + * All CPUs clear their bit in the pending mask every timer period. + * Once all have cleared, the time is noted and the bits are reset. + * If the time since all clear was greater than the panic timeout, + * we can panic with the list of stuck CPUs. + * + * This will work best with NMI IPIs for crash code so the stuck CPUs + * can be pulled out to get their backtraces. + */ +static unsigned long __wd_smp_lock; +static cpumask_t wd_smp_cpus_pending; +static cpumask_t wd_smp_cpus_stuck; +static u64 wd_smp_last_reset_tb; + +static inline void wd_smp_lock(unsigned long *flags) +{ + /* + * Avoid locking layers if possible. + * This may be called from low level interrupt handlers at some + * point in future. + */ + local_irq_save(*flags); + while (unlikely(test_and_set_bit_lock(0, &__wd_smp_lock))) + cpu_relax(); +} + +static inline void wd_smp_unlock(unsigned long *flags) +{ + clear_bit_unlock(0, &__wd_smp_lock); + local_irq_restore(*flags); +} + +static void wd_lockup_ipi(struct pt_regs *regs) +{ + pr_emerg("Watchdog CPU:%d Hard LOCKUP\n", raw_smp_processor_id()); + print_modules(); + print_irqtrace_events(current); + if (regs) + show_regs(regs); + else + dump_stack(); + + if (hardlockup_panic) + nmi_panic(regs, "Hard LOCKUP"); +} + +static void set_cpu_stuck(int cpu, u64 tb) +{ + cpumask_set_cpu(cpu, &wd_smp_cpus_stuck); + cpumask_clear_cpu(cpu, &wd_smp_cpus_pending); + if (cpumask_empty(&wd_smp_cpus_pending)) { + wd_smp_last_reset_tb = tb; + cpumask_andnot(&wd_smp_cpus_pending, + &wd_cpus_enabled, + &wd_smp_cpus_stuck); + } +} + +static void watchdog_smp_panic(int cpu, u64 tb) +{ + unsigned long flags; + int c; + + wd_smp_lock(&flags); + /* Double check some things under lock */ + if ((s64)(tb - wd_smp_last_reset_tb) < (s64)wd_smp_panic_timeout_tb) + goto out; + if (cpumask_test_cpu(cpu, &wd_smp_cpus_pending)) + goto out; + if (cpumask_weight(&wd_smp_cpus_pending) == 0) + goto out; + + pr_emerg("Watchdog CPU:%d detected Hard LOCKUP other CPUS:%*pbl\n", + cpu, cpumask_pr_args(&wd_smp_cpus_pending)); + + /* + * Try to trigger the stuck CPUs. + */ + for_each_cpu(c, &wd_smp_cpus_pending) { + if (c == cpu) + continue; + smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000); + } + smp_flush_nmi_ipi(1000000); + + /* Take the stuck CPU out of the watch group */ + for_each_cpu(c, &wd_smp_cpus_pending) + set_cpu_stuck(c, tb); + +out: + wd_smp_unlock(&flags); + + printk_safe_flush(); + /* + * printk_safe_flush() seems to require another print + * before anything actually goes out to console. + */ + if (sysctl_hardlockup_all_cpu_backtrace) + trigger_allbutself_cpu_backtrace(); + + if (hardlockup_panic) + nmi_panic(NULL, "Hard LOCKUP"); +} + +static void wd_smp_clear_cpu_pending(int cpu, u64 tb) +{ + if (!cpumask_test_cpu(cpu, &wd_smp_cpus_pending)) { + if (unlikely(cpumask_test_cpu(cpu, &wd_smp_cpus_stuck))) { + unsigned long flags; + + pr_emerg("Watchdog CPU:%d became unstuck\n", cpu); + wd_smp_lock(&flags); + cpumask_clear_cpu(cpu, &wd_smp_cpus_stuck); + wd_smp_unlock(&flags); + } + return; + } + cpumask_clear_cpu(cpu, &wd_smp_cpus_pending); + if (cpumask_empty(&wd_smp_cpus_pending)) { + unsigned long flags; + + wd_smp_lock(&flags); + if (cpumask_empty(&wd_smp_cpus_pending)) { + wd_smp_last_reset_tb = tb; + cpumask_andnot(&wd_smp_cpus_pending, + &wd_cpus_enabled, + &wd_smp_cpus_stuck); + } + wd_smp_unlock(&flags); + } +} + +static void watchdog_timer_interrupt(int cpu) +{ + u64 tb = get_tb(); + + per_cpu(wd_timer_tb, cpu) = tb; + + wd_smp_clear_cpu_pending(cpu, tb); + + if ((s64)(tb - wd_smp_last_reset_tb) >= (s64)wd_smp_panic_timeout_tb) + watchdog_smp_panic(cpu, tb); +} + +void soft_nmi_interrupt(struct pt_regs *regs) +{ + unsigned long flags; + int cpu = raw_smp_processor_id(); + u64 tb; + + if (!cpumask_test_cpu(cpu, &wd_cpus_enabled)) + return; + + nmi_enter(); + tb = get_tb(); + if (tb - per_cpu(wd_timer_tb, cpu) >= wd_panic_timeout_tb) { + per_cpu(wd_timer_tb, cpu) = tb; + + wd_smp_lock(&flags); + if (cpumask_test_cpu(cpu, &wd_smp_cpus_stuck)) { + wd_smp_unlock(&flags); + goto out; + } + set_cpu_stuck(cpu, tb); + + pr_emerg("Watchdog CPU:%d Hard LOCKUP\n", cpu); + print_modules(); + print_irqtrace_events(current); + if (regs) + show_regs(regs); + else + dump_stack(); + + wd_smp_unlock(&flags); + + if (sysctl_hardlockup_all_cpu_backtrace) + trigger_allbutself_cpu_backtrace(); + + if (hardlockup_panic) + nmi_panic(regs, "Hard LOCKUP"); + } + if (wd_panic_timeout_tb < 0x7fffffff) + mtspr(SPRN_DEC, wd_panic_timeout_tb); + +out: + nmi_exit(); +} + +static void wd_timer_reset(unsigned int cpu, struct timer_list *t) +{ + t->expires = jiffies + msecs_to_jiffies(wd_timer_period_ms); + if (wd_timer_period_ms > 1000) + t->expires = __round_jiffies_up(t->expires, cpu); + add_timer_on(t, cpu); +} + +static void wd_timer_fn(unsigned long data) +{ + struct timer_list *t = this_cpu_ptr(&wd_timer); + int cpu = smp_processor_id(); + + watchdog_timer_interrupt(cpu); + + wd_timer_reset(cpu, t); +} + +void arch_touch_nmi_watchdog(void) +{ + int cpu = smp_processor_id(); + + watchdog_timer_interrupt(cpu); +} +EXPORT_SYMBOL(arch_touch_nmi_watchdog); + +static void start_watchdog_timer_on(unsigned int cpu) +{ + struct timer_list *t = per_cpu_ptr(&wd_timer, cpu); + + per_cpu(wd_timer_tb, cpu) = get_tb(); + + setup_pinned_timer(t, wd_timer_fn, 0); + wd_timer_reset(cpu, t); +} + +static void stop_watchdog_timer_on(unsigned int cpu) +{ + struct timer_list *t = per_cpu_ptr(&wd_timer, cpu); + + del_timer_sync(t); +} + +static int start_wd_on_cpu(unsigned int cpu) +{ + if (cpumask_test_cpu(cpu, &wd_cpus_enabled)) { + WARN_ON(1); + return 0; + } + + if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) + return 0; + + if (watchdog_suspended) + return 0; + + if (!cpumask_test_cpu(cpu, &watchdog_cpumask)) + return 0; + + cpumask_set_cpu(cpu, &wd_cpus_enabled); + if (cpumask_weight(&wd_cpus_enabled) == 1) { + cpumask_set_cpu(cpu, &wd_smp_cpus_pending); + wd_smp_last_reset_tb = get_tb(); + } + smp_wmb(); + start_watchdog_timer_on(cpu); + + return 0; +} + +static int stop_wd_on_cpu(unsigned int cpu) +{ + if (!cpumask_test_cpu(cpu, &wd_cpus_enabled)) + return 0; /* Can happen in CPU unplug case */ + + stop_watchdog_timer_on(cpu); + + cpumask_clear_cpu(cpu, &wd_cpus_enabled); + wd_smp_clear_cpu_pending(cpu, get_tb()); + + return 0; +} + +static void watchdog_calc_timeouts(void) +{ + wd_panic_timeout_tb = watchdog_thresh * ppc_tb_freq; + + /* Have the SMP detector trigger a bit later */ + wd_smp_panic_timeout_tb = wd_panic_timeout_tb * 3 / 2; + + /* 2/5 is the factor that the perf based detector uses */ + wd_timer_period_ms = watchdog_thresh * 1000 * 2 / 5; +} + +void watchdog_nmi_reconfigure(void) +{ + int cpu; + + watchdog_calc_timeouts(); + + for_each_cpu(cpu, &wd_cpus_enabled) + stop_wd_on_cpu(cpu); + + for_each_cpu_and(cpu, cpu_online_mask, &watchdog_cpumask) + start_wd_on_cpu(cpu); +} + +/* + * This runs after lockup_detector_init() which sets up watchdog_cpumask. + */ +static int __init powerpc_watchdog_init(void) +{ + int err; + + watchdog_calc_timeouts(); + + err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "powerpc/watchdog:online", + start_wd_on_cpu, stop_wd_on_cpu); + if (err < 0) + pr_warn("Watchdog could not be initialized"); + + return 0; +} +arch_initcall(powerpc_watchdog_init); + +static void handle_backtrace_ipi(struct pt_regs *regs) +{ + nmi_cpu_backtrace(regs); +} + +static void raise_backtrace_ipi(cpumask_t *mask) +{ + unsigned int cpu; + + for_each_cpu(cpu, mask) { + if (cpu == smp_processor_id()) + handle_backtrace_ipi(NULL); + else + smp_send_nmi_ipi(cpu, handle_backtrace_ipi, 1000000); + } +} + +void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self) +{ + nmi_trigger_cpumask_backtrace(mask, exclude_self, raise_backtrace_ipi); +} -- cgit v1.2.3 From dcda9b04713c3f6ff0875652924844fae28286ea Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 12 Jul 2017 14:36:45 -0700 Subject: mm, tree wide: replace __GFP_REPEAT by __GFP_RETRY_MAYFAIL with more useful semantic __GFP_REPEAT was designed to allow retry-but-eventually-fail semantic to the page allocator. This has been true but only for allocations requests larger than PAGE_ALLOC_COSTLY_ORDER. It has been always ignored for smaller sizes. This is a bit unfortunate because there is no way to express the same semantic for those requests and they are considered too important to fail so they might end up looping in the page allocator for ever, similarly to GFP_NOFAIL requests. Now that the whole tree has been cleaned up and accidental or misled usage of __GFP_REPEAT flag has been removed for !costly requests we can give the original flag a better name and more importantly a more useful semantic. Let's rename it to __GFP_RETRY_MAYFAIL which tells the user that the allocator would try really hard but there is no promise of a success. This will work independent of the order and overrides the default allocator behavior. Page allocator users have several levels of guarantee vs. cost options (take GFP_KERNEL as an example) - GFP_KERNEL & ~__GFP_RECLAIM - optimistic allocation without _any_ attempt to free memory at all. The most light weight mode which even doesn't kick the background reclaim. Should be used carefully because it might deplete the memory and the next user might hit the more aggressive reclaim - GFP_KERNEL & ~__GFP_DIRECT_RECLAIM (or GFP_NOWAIT)- optimistic allocation without any attempt to free memory from the current context but can wake kswapd to reclaim memory if the zone is below the low watermark. Can be used from either atomic contexts or when the request is a performance optimization and there is another fallback for a slow path. - (GFP_KERNEL|__GFP_HIGH) & ~__GFP_DIRECT_RECLAIM (aka GFP_ATOMIC) - non sleeping allocation with an expensive fallback so it can access some portion of memory reserves. Usually used from interrupt/bh context with an expensive slow path fallback. - GFP_KERNEL - both background and direct reclaim are allowed and the _default_ page allocator behavior is used. That means that !costly allocation requests are basically nofail but there is no guarantee of that behavior so failures have to be checked properly by callers (e.g. OOM killer victim is allowed to fail currently). - GFP_KERNEL | __GFP_NORETRY - overrides the default allocator behavior and all allocation requests fail early rather than cause disruptive reclaim (one round of reclaim in this implementation). The OOM killer is not invoked. - GFP_KERNEL | __GFP_RETRY_MAYFAIL - overrides the default allocator behavior and all allocation requests try really hard. The request will fail if the reclaim cannot make any progress. The OOM killer won't be triggered. - GFP_KERNEL | __GFP_NOFAIL - overrides the default allocator behavior and all allocation requests will loop endlessly until they succeed. This might be really dangerous especially for larger orders. Existing users of __GFP_REPEAT are changed to __GFP_RETRY_MAYFAIL because they already had their semantic. No new users are added. __alloc_pages_slowpath is changed to bail out for __GFP_RETRY_MAYFAIL if there is no progress and we have already passed the OOM point. This means that all the reclaim opportunities have been exhausted except the most disruptive one (the OOM killer) and a user defined fallback behavior is more sensible than keep retrying in the page allocator. [akpm@linux-foundation.org: fix arch/sparc/kernel/mdesc.c] [mhocko@suse.com: semantic fix] Link: http://lkml.kernel.org/r/20170626123847.GM11534@dhcp22.suse.cz [mhocko@kernel.org: address other thing spotted by Vlastimil] Link: http://lkml.kernel.org/r/20170626124233.GN11534@dhcp22.suse.cz Link: http://lkml.kernel.org/r/20170623085345.11304-3-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Alex Belits Cc: Chris Wilson Cc: Christoph Hellwig Cc: Darrick J. Wong Cc: David Daney Cc: Johannes Weiner Cc: Mel Gorman Cc: NeilBrown Cc: Ralf Baechle Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/DMA-ISA-LPC.txt | 2 +- arch/powerpc/include/asm/book3s/64/pgalloc.h | 2 +- arch/powerpc/kvm/book3s_64_mmu_hv.c | 2 +- arch/sparc/kernel/mdesc.c | 2 +- drivers/mmc/host/wbsd.c | 2 +- drivers/s390/char/vmcp.c | 2 +- drivers/target/target_core_transport.c | 2 +- drivers/vhost/net.c | 2 +- drivers/vhost/scsi.c | 2 +- drivers/vhost/vsock.c | 2 +- include/linux/gfp.h | 56 +++++++++++++++++++++------- include/linux/slab.h | 3 +- include/trace/events/mmflags.h | 2 +- mm/hugetlb.c | 4 +- mm/internal.h | 2 +- mm/page_alloc.c | 14 +++++-- mm/sparse-vmemmap.c | 4 +- mm/util.c | 6 +-- mm/vmalloc.c | 2 +- mm/vmscan.c | 8 ++-- net/core/dev.c | 6 +-- net/core/skbuff.c | 2 +- net/sched/sch_fq.c | 2 +- tools/perf/builtin-kmem.c | 2 +- 24 files changed, 86 insertions(+), 47 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/Documentation/DMA-ISA-LPC.txt b/Documentation/DMA-ISA-LPC.txt index c41331398752..7a065ac4a9d1 100644 --- a/Documentation/DMA-ISA-LPC.txt +++ b/Documentation/DMA-ISA-LPC.txt @@ -42,7 +42,7 @@ requirements you pass the flag GFP_DMA to kmalloc. Unfortunately the memory available for ISA DMA is scarce so unless you allocate the memory during boot-up it's a good idea to also pass -__GFP_REPEAT and __GFP_NOWARN to make the allocator try a bit harder. +__GFP_RETRY_MAYFAIL and __GFP_NOWARN to make the allocator try a bit harder. (This scarcity also means that you should allocate the buffer as early as possible and not release it until the driver is unloaded.) diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h index 20b1485ff1e8..e2329db9d6f4 100644 --- a/arch/powerpc/include/asm/book3s/64/pgalloc.h +++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h @@ -56,7 +56,7 @@ static inline pgd_t *radix__pgd_alloc(struct mm_struct *mm) return (pgd_t *)__get_free_page(pgtable_gfp_flags(mm, PGALLOC_GFP)); #else struct page *page; - page = alloc_pages(pgtable_gfp_flags(mm, PGALLOC_GFP | __GFP_REPEAT), + page = alloc_pages(pgtable_gfp_flags(mm, PGALLOC_GFP | __GFP_RETRY_MAYFAIL), 4); if (!page) return NULL; diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 710e491206ed..8cb0190e2a73 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -93,7 +93,7 @@ int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order) } if (!hpt) - hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT + hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_RETRY_MAYFAIL |__GFP_NOWARN, order - PAGE_SHIFT); if (!hpt) diff --git a/arch/sparc/kernel/mdesc.c b/arch/sparc/kernel/mdesc.c index e4b4e790bf89..fa466ce45bc9 100644 --- a/arch/sparc/kernel/mdesc.c +++ b/arch/sparc/kernel/mdesc.c @@ -205,7 +205,7 @@ static struct mdesc_handle *mdesc_kmalloc(unsigned int mdesc_size) handle_size = (sizeof(struct mdesc_handle) - sizeof(struct mdesc_hdr) + mdesc_size); - base = kmalloc(handle_size + 15, GFP_KERNEL | __GFP_REPEAT); + base = kmalloc(handle_size + 15, GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!base) return NULL; diff --git a/drivers/mmc/host/wbsd.c b/drivers/mmc/host/wbsd.c index e15a9733fcfd..9668616faf16 100644 --- a/drivers/mmc/host/wbsd.c +++ b/drivers/mmc/host/wbsd.c @@ -1386,7 +1386,7 @@ static void wbsd_request_dma(struct wbsd_host *host, int dma) * order for ISA to be able to DMA to it. */ host->dma_buffer = kmalloc(WBSD_DMA_SIZE, - GFP_NOIO | GFP_DMA | __GFP_REPEAT | __GFP_NOWARN); + GFP_NOIO | GFP_DMA | __GFP_RETRY_MAYFAIL | __GFP_NOWARN); if (!host->dma_buffer) goto free; diff --git a/drivers/s390/char/vmcp.c b/drivers/s390/char/vmcp.c index 65f5a794f26d..98749fa817da 100644 --- a/drivers/s390/char/vmcp.c +++ b/drivers/s390/char/vmcp.c @@ -98,7 +98,7 @@ vmcp_write(struct file *file, const char __user *buff, size_t count, } if (!session->response) session->response = (char *)__get_free_pages(GFP_KERNEL - | __GFP_REPEAT | GFP_DMA, + | __GFP_RETRY_MAYFAIL | GFP_DMA, get_order(session->bufsize)); if (!session->response) { mutex_unlock(&session->mutex); diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index f1b3a46bdcaf..1bdc10651bcd 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -252,7 +252,7 @@ int transport_alloc_session_tags(struct se_session *se_sess, int rc; se_sess->sess_cmd_map = kzalloc(tag_num * tag_size, - GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); + GFP_KERNEL | __GFP_NOWARN | __GFP_RETRY_MAYFAIL); if (!se_sess->sess_cmd_map) { se_sess->sess_cmd_map = vzalloc(tag_num * tag_size); if (!se_sess->sess_cmd_map) { diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index e3d7ea1288c6..06d044862e58 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -897,7 +897,7 @@ static int vhost_net_open(struct inode *inode, struct file *f) struct sk_buff **queue; int i; - n = kvmalloc(sizeof *n, GFP_KERNEL | __GFP_REPEAT); + n = kvmalloc(sizeof *n, GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!n) return -ENOMEM; vqs = kmalloc(VHOST_NET_VQ_MAX * sizeof(*vqs), GFP_KERNEL); diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index fd6c8b66f06f..ff02a942c4d5 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -1404,7 +1404,7 @@ static int vhost_scsi_open(struct inode *inode, struct file *f) struct vhost_virtqueue **vqs; int r = -ENOMEM, i; - vs = kzalloc(sizeof(*vs), GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); + vs = kzalloc(sizeof(*vs), GFP_KERNEL | __GFP_NOWARN | __GFP_RETRY_MAYFAIL); if (!vs) { vs = vzalloc(sizeof(*vs)); if (!vs) diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index 3f63e03de8e8..c9de9c41aa97 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -508,7 +508,7 @@ static int vhost_vsock_dev_open(struct inode *inode, struct file *file) /* This struct is large and allocation could fail, fall back to vmalloc * if there is no other way. */ - vsock = kvmalloc(sizeof(*vsock), GFP_KERNEL | __GFP_REPEAT); + vsock = kvmalloc(sizeof(*vsock), GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!vsock) return -ENOMEM; diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 4c6656f1fee7..bcfb9f7c46f5 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -25,7 +25,7 @@ struct vm_area_struct; #define ___GFP_FS 0x80u #define ___GFP_COLD 0x100u #define ___GFP_NOWARN 0x200u -#define ___GFP_REPEAT 0x400u +#define ___GFP_RETRY_MAYFAIL 0x400u #define ___GFP_NOFAIL 0x800u #define ___GFP_NORETRY 0x1000u #define ___GFP_MEMALLOC 0x2000u @@ -136,26 +136,56 @@ struct vm_area_struct; * * __GFP_RECLAIM is shorthand to allow/forbid both direct and kswapd reclaim. * - * __GFP_REPEAT: Try hard to allocate the memory, but the allocation attempt - * _might_ fail. This depends upon the particular VM implementation. + * The default allocator behavior depends on the request size. We have a concept + * of so called costly allocations (with order > PAGE_ALLOC_COSTLY_ORDER). + * !costly allocations are too essential to fail so they are implicitly + * non-failing by default (with some exceptions like OOM victims might fail so + * the caller still has to check for failures) while costly requests try to be + * not disruptive and back off even without invoking the OOM killer. + * The following three modifiers might be used to override some of these + * implicit rules + * + * __GFP_NORETRY: The VM implementation will try only very lightweight + * memory direct reclaim to get some memory under memory pressure (thus + * it can sleep). It will avoid disruptive actions like OOM killer. The + * caller must handle the failure which is quite likely to happen under + * heavy memory pressure. The flag is suitable when failure can easily be + * handled at small cost, such as reduced throughput + * + * __GFP_RETRY_MAYFAIL: The VM implementation will retry memory reclaim + * procedures that have previously failed if there is some indication + * that progress has been made else where. It can wait for other + * tasks to attempt high level approaches to freeing memory such as + * compaction (which removes fragmentation) and page-out. + * There is still a definite limit to the number of retries, but it is + * a larger limit than with __GFP_NORETRY. + * Allocations with this flag may fail, but only when there is + * genuinely little unused memory. While these allocations do not + * directly trigger the OOM killer, their failure indicates that + * the system is likely to need to use the OOM killer soon. The + * caller must handle failure, but can reasonably do so by failing + * a higher-level request, or completing it only in a much less + * efficient manner. + * If the allocation does fail, and the caller is in a position to + * free some non-essential memory, doing so could benefit the system + * as a whole. * * __GFP_NOFAIL: The VM implementation _must_ retry infinitely: the caller - * cannot handle allocation failures. New users should be evaluated carefully - * (and the flag should be used only when there is no reasonable failure - * policy) but it is definitely preferable to use the flag rather than - * opencode endless loop around allocator. - * - * __GFP_NORETRY: The VM implementation must not retry indefinitely and will - * return NULL when direct reclaim and memory compaction have failed to allow - * the allocation to succeed. The OOM killer is not called with the current - * implementation. + * cannot handle allocation failures. The allocation could block + * indefinitely but will never return with failure. Testing for + * failure is pointless. + * New users should be evaluated carefully (and the flag should be + * used only when there is no reasonable failure policy) but it is + * definitely preferable to use the flag rather than opencode endless + * loop around allocator. + * Using this flag for costly allocations is _highly_ discouraged. */ #define __GFP_IO ((__force gfp_t)___GFP_IO) #define __GFP_FS ((__force gfp_t)___GFP_FS) #define __GFP_DIRECT_RECLAIM ((__force gfp_t)___GFP_DIRECT_RECLAIM) /* Caller can reclaim */ #define __GFP_KSWAPD_RECLAIM ((__force gfp_t)___GFP_KSWAPD_RECLAIM) /* kswapd can wake */ #define __GFP_RECLAIM ((__force gfp_t)(___GFP_DIRECT_RECLAIM|___GFP_KSWAPD_RECLAIM)) -#define __GFP_REPEAT ((__force gfp_t)___GFP_REPEAT) +#define __GFP_RETRY_MAYFAIL ((__force gfp_t)___GFP_RETRY_MAYFAIL) #define __GFP_NOFAIL ((__force gfp_t)___GFP_NOFAIL) #define __GFP_NORETRY ((__force gfp_t)___GFP_NORETRY) diff --git a/include/linux/slab.h b/include/linux/slab.h index 04a7f7993e67..41473df6dfb0 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -471,7 +471,8 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags) * * %__GFP_NOWARN - If allocation fails, don't issue any warnings. * - * %__GFP_REPEAT - If allocation fails initially, try once more before failing. + * %__GFP_RETRY_MAYFAIL - Try really hard to succeed the allocation but fail + * eventually. * * There are other flags available as well, but these are not intended * for general use, and so are not documented here. For a full list of diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 10e3663a75a6..8e50d01c645f 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -34,7 +34,7 @@ {(unsigned long)__GFP_FS, "__GFP_FS"}, \ {(unsigned long)__GFP_COLD, "__GFP_COLD"}, \ {(unsigned long)__GFP_NOWARN, "__GFP_NOWARN"}, \ - {(unsigned long)__GFP_REPEAT, "__GFP_REPEAT"}, \ + {(unsigned long)__GFP_RETRY_MAYFAIL, "__GFP_RETRY_MAYFAIL"}, \ {(unsigned long)__GFP_NOFAIL, "__GFP_NOFAIL"}, \ {(unsigned long)__GFP_NORETRY, "__GFP_NORETRY"}, \ {(unsigned long)__GFP_COMP, "__GFP_COMP"}, \ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 1e516520433d..bc48ee783dd9 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1384,7 +1384,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) page = __alloc_pages_node(nid, htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| - __GFP_REPEAT|__GFP_NOWARN, + __GFP_RETRY_MAYFAIL|__GFP_NOWARN, huge_page_order(h)); if (page) { prep_new_huge_page(h, page, nid); @@ -1525,7 +1525,7 @@ static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h, { int order = huge_page_order(h); - gfp_mask |= __GFP_COMP|__GFP_REPEAT|__GFP_NOWARN; + gfp_mask |= __GFP_COMP|__GFP_RETRY_MAYFAIL|__GFP_NOWARN; if (nid == NUMA_NO_NODE) nid = numa_mem_id(); return __alloc_pages_nodemask(gfp_mask, order, nid, nmask); diff --git a/mm/internal.h b/mm/internal.h index 0e4f558412fb..24d88f084705 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -23,7 +23,7 @@ * hints such as HIGHMEM usage. */ #define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\ - __GFP_NOWARN|__GFP_REPEAT|__GFP_NOFAIL|\ + __GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\ __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\ __GFP_ATOMIC) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 64b7d82a9b1a..6d30e914afb6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3284,6 +3284,14 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, /* The OOM killer will not help higher order allocs */ if (order > PAGE_ALLOC_COSTLY_ORDER) goto out; + /* + * We have already exhausted all our reclaim opportunities without any + * success so it is time to admit defeat. We will skip the OOM killer + * because it is very likely that the caller has a more reasonable + * fallback than shooting a random task. + */ + if (gfp_mask & __GFP_RETRY_MAYFAIL) + goto out; /* The OOM killer does not needlessly kill tasks for lowmem */ if (ac->high_zoneidx < ZONE_NORMAL) goto out; @@ -3413,7 +3421,7 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, } /* - * !costly requests are much more important than __GFP_REPEAT + * !costly requests are much more important than __GFP_RETRY_MAYFAIL * costly ones because they are de facto nofail and invoke OOM * killer to move on while costly can fail and users are ready * to cope with that. 1/4 retries is rather arbitrary but we @@ -3920,9 +3928,9 @@ retry: /* * Do not retry costly high order allocations unless they are - * __GFP_REPEAT + * __GFP_RETRY_MAYFAIL */ - if (costly_order && !(gfp_mask & __GFP_REPEAT)) + if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL)) goto nopage; if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index a56c3989f773..c50b1a14d55e 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -56,11 +56,11 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node) if (node_state(node, N_HIGH_MEMORY)) page = alloc_pages_node( - node, GFP_KERNEL | __GFP_ZERO | __GFP_REPEAT, + node, GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL, get_order(size)); else page = alloc_pages( - GFP_KERNEL | __GFP_ZERO | __GFP_REPEAT, + GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL, get_order(size)); if (page) return page_address(page); diff --git a/mm/util.c b/mm/util.c index 26be6407abd7..6520f2d4a226 100644 --- a/mm/util.c +++ b/mm/util.c @@ -339,7 +339,7 @@ EXPORT_SYMBOL(vm_mmap); * Uses kmalloc to get the memory but if the allocation fails then falls back * to the vmalloc allocator. Use kvfree for freeing the memory. * - * Reclaim modifiers - __GFP_NORETRY and __GFP_NOFAIL are not supported. __GFP_REPEAT + * Reclaim modifiers - __GFP_NORETRY and __GFP_NOFAIL are not supported. __GFP_RETRY_MAYFAIL * is supported only for large (>32kB) allocations, and it should be used only if * kmalloc is preferable to the vmalloc fallback, due to visible performance drawbacks. * @@ -367,11 +367,11 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) kmalloc_flags |= __GFP_NOWARN; /* - * We have to override __GFP_REPEAT by __GFP_NORETRY for !costly + * We have to override __GFP_RETRY_MAYFAIL by __GFP_NORETRY for !costly * requests because there is no other way to tell the allocator * that we want to fail rather than retry endlessly. */ - if (!(kmalloc_flags & __GFP_REPEAT) || + if (!(kmalloc_flags & __GFP_RETRY_MAYFAIL) || (size <= PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) kmalloc_flags |= __GFP_NORETRY; } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 6016ab079e2b..8698c1c86c4d 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1795,7 +1795,7 @@ fail: * allocator with @gfp_mask flags. Map them into contiguous * kernel virtual space, using a pagetable protection of @prot. * - * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_REPEAT + * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL * and __GFP_NOFAIL are not supported * * Any use of gfp flags outside of GFP_KERNEL should be consulted diff --git a/mm/vmscan.c b/mm/vmscan.c index e9210f825219..a1af041930a6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2506,18 +2506,18 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, return false; /* Consider stopping depending on scan and reclaim activity */ - if (sc->gfp_mask & __GFP_REPEAT) { + if (sc->gfp_mask & __GFP_RETRY_MAYFAIL) { /* - * For __GFP_REPEAT allocations, stop reclaiming if the + * For __GFP_RETRY_MAYFAIL allocations, stop reclaiming if the * full LRU list has been scanned and we are still failing * to reclaim pages. This full LRU scan is potentially - * expensive but a __GFP_REPEAT caller really wants to succeed + * expensive but a __GFP_RETRY_MAYFAIL caller really wants to succeed */ if (!nr_reclaimed && !nr_scanned) return false; } else { /* - * For non-__GFP_REPEAT allocations which can presumably + * For non-__GFP_RETRY_MAYFAIL allocations which can presumably * fail without consequence, stop if we failed to reclaim * any pages from the last SWAP_CLUSTER_MAX number of * pages that were scanned. This will return to the diff --git a/net/core/dev.c b/net/core/dev.c index 02440518dd69..8515f8fe0460 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7384,7 +7384,7 @@ static int netif_alloc_rx_queues(struct net_device *dev) BUG_ON(count < 1); - rx = kvzalloc(sz, GFP_KERNEL | __GFP_REPEAT); + rx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!rx) return -ENOMEM; @@ -7424,7 +7424,7 @@ static int netif_alloc_netdev_queues(struct net_device *dev) if (count < 1 || count > 0xffff) return -EINVAL; - tx = kvzalloc(sz, GFP_KERNEL | __GFP_REPEAT); + tx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!tx) return -ENOMEM; @@ -7965,7 +7965,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, /* ensure 32-byte alignment of whole construct */ alloc_size += NETDEV_ALIGN - 1; - p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_REPEAT); + p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!p) return NULL; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 8b11341ed69a..f990eb8b30a9 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4747,7 +4747,7 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len, gfp_head = gfp_mask; if (gfp_head & __GFP_DIRECT_RECLAIM) - gfp_head |= __GFP_REPEAT; + gfp_head |= __GFP_RETRY_MAYFAIL; *errcode = -ENOBUFS; skb = alloc_skb(header_len, gfp_head); diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 147fde73a0f5..263d16e3219e 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -648,7 +648,7 @@ static int fq_resize(struct Qdisc *sch, u32 log) return 0; /* If XPS was setup, we can allocate memory on right NUMA node */ - array = kvmalloc_node(sizeof(struct rb_root) << log, GFP_KERNEL | __GFP_REPEAT, + array = kvmalloc_node(sizeof(struct rb_root) << log, GFP_KERNEL | __GFP_RETRY_MAYFAIL, netdev_queue_numa_node_read(sch->dev_queue)); if (!array) return -ENOMEM; diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index 0a8a1c45af87..a1497c516d85 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -643,7 +643,7 @@ static const struct { { "__GFP_FS", "F" }, { "__GFP_COLD", "CO" }, { "__GFP_NOWARN", "NWR" }, - { "__GFP_REPEAT", "R" }, + { "__GFP_RETRY_MAYFAIL", "R" }, { "__GFP_NOFAIL", "NF" }, { "__GFP_NORETRY", "NR" }, { "__GFP_COMP", "C" }, -- cgit v1.2.3