From e8f241893dfbbebe2813c01eac54f263e6a5e59c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 18 Aug 2017 10:53:45 +0100 Subject: genirq: Restore trigger settings in irq_modify_status() irq_modify_status starts by clearing the trigger settings from irq_data before applying the new settings, but doesn't restore them, leaving them to IRQ_TYPE_NONE. That's pretty confusing to the potential request_irq() that could follow. Instead, snapshot the settings before clearing them, and restore them if the irq_modify_status() invocation was not changing the trigger. Fixes: 1e2a7d78499e ("irqdomain: Don't set type when mapping an IRQ") Reported-and-tested-by: jeffy Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Cc: Jon Hunter Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20170818095345.12378-1-marc.zyngier@arm.com --- kernel/irq/chip.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index a3cc37c0c85e..3675c6004f2a 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -1000,7 +1000,7 @@ EXPORT_SYMBOL_GPL(irq_set_chip_and_handler_name); void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) { - unsigned long flags; + unsigned long flags, trigger, tmp; struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); if (!desc) @@ -1014,6 +1014,8 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) irq_settings_clr_and_set(desc, clr, set); + trigger = irqd_get_trigger_type(&desc->irq_data); + irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU | IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT); if (irq_settings_has_no_balance_set(desc)) @@ -1025,7 +1027,11 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) if (irq_settings_is_level(desc)) irqd_set(&desc->irq_data, IRQD_LEVEL); - irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc)); + tmp = irq_settings_get_trigger_mask(desc); + if (tmp != IRQ_TYPE_NONE) + trigger = tmp; + + irqd_set(&desc->irq_data, trigger); irq_put_desc_unlock(desc, flags); } -- cgit v1.2.3 From 7edaeb6841dfb27e362288ab8466ebdc4972e867 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 15 Aug 2017 09:50:13 +0200 Subject: kernel/watchdog: Prevent false positives with turbo modes The hardlockup detector on x86 uses a performance counter based on unhalted CPU cycles and a periodic hrtimer. The hrtimer period is about 2/5 of the performance counter period, so the hrtimer should fire 2-3 times before the performance counter NMI fires. The NMI code checks whether the hrtimer fired since the last invocation. If not, it assumess a hard lockup. The calculation of those periods is based on the nominal CPU frequency. Turbo modes increase the CPU clock frequency and therefore shorten the period of the perf/NMI watchdog. With extreme Turbo-modes (3x nominal frequency) the perf/NMI period is shorter than the hrtimer period which leads to false positives. A simple fix would be to shorten the hrtimer period, but that comes with the side effect of more frequent hrtimer and softlockup thread wakeups, which is not desired. Implement a low pass filter, which checks the perf/NMI period against kernel time. If the perf/NMI fires before 4/5 of the watchdog period has elapsed then the event is ignored and postponed to the next perf/NMI. That solves the problem and avoids the overhead of shorter hrtimer periods and more frequent softlockup thread wakeups. Fixes: 58687acba592 ("lockup_detector: Combine nmi_watchdog and softlockup detector") Reported-and-tested-by: Kan Liang Signed-off-by: Thomas Gleixner Cc: dzickus@redhat.com Cc: prarit@redhat.com Cc: ak@linux.intel.com Cc: babu.moger@oracle.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: acme@redhat.com Cc: stable@vger.kernel.org Cc: atomlin@redhat.com Cc: akpm@linux-foundation.org Cc: torvalds@linux-foundation.org Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1708150931310.1886@nanos --- arch/x86/Kconfig | 1 + include/linux/nmi.h | 8 +++++++ kernel/watchdog.c | 1 + kernel/watchdog_hld.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++ lib/Kconfig.debug | 7 ++++++ 5 files changed, 76 insertions(+) (limited to 'kernel') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 781521b7cf9e..9101bfc85539 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -100,6 +100,7 @@ config X86 select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER select GENERIC_TIME_VSYSCALL + select HARDLOCKUP_CHECK_TIMESTAMP if X86_64 select HAVE_ACPI_APEI if ACPI select HAVE_ACPI_APEI_NMI if ACPI select HAVE_ALIGNED_STRUCT_PAGE if SLUB diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 8aa01fd859fb..a36abe2da13e 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -168,6 +168,14 @@ extern int sysctl_hardlockup_all_cpu_backtrace; #define sysctl_softlockup_all_cpu_backtrace 0 #define sysctl_hardlockup_all_cpu_backtrace 0 #endif + +#if defined(CONFIG_HARDLOCKUP_CHECK_TIMESTAMP) && \ + defined(CONFIG_HARDLOCKUP_DETECTOR) +void watchdog_update_hrtimer_threshold(u64 period); +#else +static inline void watchdog_update_hrtimer_threshold(u64 period) { } +#endif + extern bool is_hardlockup(void); struct ctl_table; extern int proc_watchdog(struct ctl_table *, int , diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 06d3389bca0d..f5d52024f6b7 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -240,6 +240,7 @@ static void set_sample_period(void) * hardlockup detector generates a warning */ sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5); + watchdog_update_hrtimer_threshold(sample_period); } /* Commands for resetting the watchdog */ diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 295a0d84934c..3a09ea1b1d3d 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -37,6 +37,62 @@ void arch_touch_nmi_watchdog(void) } EXPORT_SYMBOL(arch_touch_nmi_watchdog); +#ifdef CONFIG_HARDLOCKUP_CHECK_TIMESTAMP +static DEFINE_PER_CPU(ktime_t, last_timestamp); +static DEFINE_PER_CPU(unsigned int, nmi_rearmed); +static ktime_t watchdog_hrtimer_sample_threshold __read_mostly; + +void watchdog_update_hrtimer_threshold(u64 period) +{ + /* + * The hrtimer runs with a period of (watchdog_threshold * 2) / 5 + * + * So it runs effectively with 2.5 times the rate of the NMI + * watchdog. That means the hrtimer should fire 2-3 times before + * the NMI watchdog expires. The NMI watchdog on x86 is based on + * unhalted CPU cycles, so if Turbo-Mode is enabled the CPU cycles + * might run way faster than expected and the NMI fires in a + * smaller period than the one deduced from the nominal CPU + * frequency. Depending on the Turbo-Mode factor this might be fast + * enough to get the NMI period smaller than the hrtimer watchdog + * period and trigger false positives. + * + * The sample threshold is used to check in the NMI handler whether + * the minimum time between two NMI samples has elapsed. That + * prevents false positives. + * + * Set this to 4/5 of the actual watchdog threshold period so the + * hrtimer is guaranteed to fire at least once within the real + * watchdog threshold. + */ + watchdog_hrtimer_sample_threshold = period * 2; +} + +static bool watchdog_check_timestamp(void) +{ + ktime_t delta, now = ktime_get_mono_fast_ns(); + + delta = now - __this_cpu_read(last_timestamp); + if (delta < watchdog_hrtimer_sample_threshold) { + /* + * If ktime is jiffies based, a stalled timer would prevent + * jiffies from being incremented and the filter would look + * at a stale timestamp and never trigger. + */ + if (__this_cpu_inc_return(nmi_rearmed) < 10) + return false; + } + __this_cpu_write(nmi_rearmed, 0); + __this_cpu_write(last_timestamp, now); + return true; +} +#else +static inline bool watchdog_check_timestamp(void) +{ + return true; +} +#endif + static struct perf_event_attr wd_hw_attr = { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES, @@ -61,6 +117,9 @@ static void watchdog_overflow_callback(struct perf_event *event, return; } + if (!watchdog_check_timestamp()) + return; + /* check for a hardlockup * This is done by making sure our timer interrupt * is incrementing. The timer interrupt should have diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 98fe715522e8..c617b9d1d6cb 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -797,6 +797,13 @@ config HARDLOCKUP_DETECTOR_PERF bool select SOFTLOCKUP_DETECTOR +# +# Enables a timestamp based low pass filter to compensate for perf based +# hard lockup detection which runs too fast due to turbo modes. +# +config HARDLOCKUP_CHECK_TIMESTAMP + bool + # # arch/ can define HAVE_HARDLOCKUP_DETECTOR_ARCH to provide their own hard # lockup detector rather than the perf based detector. -- cgit v1.2.3 From 2ba293c9e7db150943f06b12d3eb7213e7fae624 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Fri, 18 Aug 2017 15:15:58 -0700 Subject: kmod: fix wait on recursive loop Recursive loops with module loading were previously handled in kmod by restricting the number of modprobe calls to 50 and if that limit was breached request_module() would return an error and a user would see the following on their kernel dmesg: request_module: runaway loop modprobe binfmt-464c Starting init:/sbin/init exists but couldn't execute it (error -8) This issue could happen for instance when a 64-bit kernel boots a 32-bit userspace on some architectures and has no 32-bit binary format hanlders. This is visible, for instance, when a CONFIG_MODULES enabled 64-bit MIPS kernel boots a into o32 root filesystem and the binfmt handler for o32 binaries is not built-in. After commit 6d7964a722af ("kmod: throttle kmod thread limit") we now don't have any visible signs of an error and the kernel just waits for the loop to end somehow. Although this *particular* recursive loop could also be addressed by doing a sanity check on search_binary_handler() and disallowing a modular binfmt to be required for modprobe, a generic solution for any recursive kernel kmod issues is still needed. This should catch these loops. We can investigate each loop and address each one separately as they come in, this however puts a stop gap for them as before. Link: http://lkml.kernel.org/r/20170809234635.13443-3-mcgrof@kernel.org Fixes: 6d7964a722af ("kmod: throttle kmod thread limit") Signed-off-by: Luis R. Rodriguez Reported-by: Matt Redfearn Tested-by: Matt Redfearn Cc: "Eric W. Biederman" Cc: Colin Ian King Cc: Dan Carpenter Cc: Daniel Mentz Cc: David Binderman Cc: Dmitry Torokhov Cc: Ingo Molnar Cc: Jessica Yu Cc: Josh Poimboeuf Cc: Kees Cook Cc: Michal Marek Cc: Miroslav Benes Cc: Peter Zijlstra (Intel) Cc: Petr Mladek Cc: Rusty Russell Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 6d016c5d97c8..2f37acde640b 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -70,6 +70,18 @@ static DECLARE_RWSEM(umhelper_sem); static atomic_t kmod_concurrent_max = ATOMIC_INIT(MAX_KMOD_CONCURRENT); static DECLARE_WAIT_QUEUE_HEAD(kmod_wq); +/* + * This is a restriction on having *all* MAX_KMOD_CONCURRENT threads + * running at the same time without returning. When this happens we + * believe you've somehow ended up with a recursive module dependency + * creating a loop. + * + * We have no option but to fail. + * + * Userspace should proactively try to detect and prevent these. + */ +#define MAX_KMOD_ALL_BUSY_TIMEOUT 5 + /* modprobe_path is set via /proc/sys. */ @@ -167,8 +179,17 @@ int __request_module(bool wait, const char *fmt, ...) pr_warn_ratelimited("request_module: kmod_concurrent_max (%u) close to 0 (max_modprobes: %u), for module %s, throttling...", atomic_read(&kmod_concurrent_max), MAX_KMOD_CONCURRENT, module_name); - wait_event_interruptible(kmod_wq, - atomic_dec_if_positive(&kmod_concurrent_max) >= 0); + ret = wait_event_killable_timeout(kmod_wq, + atomic_dec_if_positive(&kmod_concurrent_max) >= 0, + MAX_KMOD_ALL_BUSY_TIMEOUT * HZ); + if (!ret) { + pr_warn_ratelimited("request_module: modprobe %s cannot be processed, kmod busy with %d threads for more than %d seconds now", + module_name, MAX_KMOD_CONCURRENT, MAX_KMOD_ALL_BUSY_TIMEOUT); + return -ETIME; + } else if (ret == -ERESTARTSYS) { + pr_warn_ratelimited("request_module: sigkill sent for modprobe %s, giving up", module_name); + return ret; + } } trace_module_request(module_name, wait, _RET_IP_); -- cgit v1.2.3 From eb61b5911bdc923875cde99eb25203a0e2b06d43 Mon Sep 17 00:00:00 2001 From: Jamie Iles Date: Fri, 18 Aug 2017 15:16:18 -0700 Subject: signal: don't remove SIGNAL_UNKILLABLE for traced tasks. When forcing a signal, SIGNAL_UNKILLABLE is removed to prevent recursive faults, but this is undesirable when tracing. For example, debugging an init process (whether global or namespace), hitting a breakpoint and SIGTRAP will force SIGTRAP and then remove SIGNAL_UNKILLABLE. Everything continues fine, but then once debugging has finished, the init process is left killable which is unlikely what the user expects, resulting in either an accidentally killed init or an init that stops reaping zombies. Link: http://lkml.kernel.org/r/20170815112806.10728-1-jamie.iles@oracle.com Signed-off-by: Jamie Iles Acked-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 7e33f8c583e6..ed804a470dcd 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1194,7 +1194,11 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t) recalc_sigpending_and_wake(t); } } - if (action->sa.sa_handler == SIG_DFL) + /* + * Don't clear SIGNAL_UNKILLABLE for traced tasks, users won't expect + * debugging to leave init killable. + */ + if (action->sa.sa_handler == SIG_DFL && !t->ptrace) t->signal->flags &= ~SIGNAL_UNKILLABLE; ret = specific_send_sig_info(sig, info, t); spin_unlock_irqrestore(&t->sighand->siglock, flags); -- cgit v1.2.3 From 8fbbe2d7cc478d1544f41f2271787c993c23a4f6 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sat, 19 Aug 2017 12:57:51 +0300 Subject: genirq/ipi: Fixup checks against nr_cpu_ids Valid CPU ids are [0, nr_cpu_ids-1] inclusive. Fixes: 3b8e29a82dd1 ("genirq: Implement ipi_send_mask/single()") Fixes: f9bce791ae2a ("genirq: Add a new function to get IPI reverse mapping") Signed-off-by: Alexey Dobriyan Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20170819095751.GB27864@avx2 --- kernel/irq/ipi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c index 1a9abc1c8ea0..259a22aa9934 100644 --- a/kernel/irq/ipi.c +++ b/kernel/irq/ipi.c @@ -165,7 +165,7 @@ irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu) struct irq_data *data = irq_get_irq_data(irq); struct cpumask *ipimask = data ? irq_data_get_affinity_mask(data) : NULL; - if (!data || !ipimask || cpu > nr_cpu_ids) + if (!data || !ipimask || cpu >= nr_cpu_ids) return INVALID_HWIRQ; if (!cpumask_test_cpu(cpu, ipimask)) @@ -195,7 +195,7 @@ static int ipi_send_verify(struct irq_chip *chip, struct irq_data *data, if (!chip->ipi_send_single && !chip->ipi_send_mask) return -EINVAL; - if (cpu > nr_cpu_ids) + if (cpu >= nr_cpu_ids) return -EINVAL; if (dest) { -- cgit v1.2.3 From dd1c1f2f2028a7b851f701fc6a8ebe39dcb95e7c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 21 Aug 2017 17:35:02 +0200 Subject: pids: make task_tgid_nr_ns() safe This was reported many times, and this was even mentioned in commit 52ee2dfdd4f5 ("pids: refactor vnr/nr_ns helpers to make them safe") but somehow nobody bothered to fix the obvious problem: task_tgid_nr_ns() is not safe because task->group_leader points to nowhere after the exiting task passes exit_notify(), rcu_read_lock() can not help. We really need to change __unhash_process() to nullify group_leader, parent, and real_parent, but this needs some cleanups. Until then we can turn task_tgid_nr_ns() into another user of __task_pid_nr_ns() and fix the problem. Reported-by: Troy Kensinger Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- include/linux/pid.h | 4 +++- include/linux/sched.h | 51 +++++++++++++++++++++++++++------------------------ kernel/pid.c | 11 ++++------- 3 files changed, 34 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/include/linux/pid.h b/include/linux/pid.h index 4d179316e431..719582744a2e 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -8,7 +8,9 @@ enum pid_type PIDTYPE_PID, PIDTYPE_PGID, PIDTYPE_SID, - PIDTYPE_MAX + PIDTYPE_MAX, + /* only valid to __task_pid_nr_ns() */ + __PIDTYPE_TGID }; /* diff --git a/include/linux/sched.h b/include/linux/sched.h index 8337e2db0bb2..c05ac5f5aa03 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1163,13 +1163,6 @@ static inline pid_t task_tgid_nr(struct task_struct *tsk) return tsk->tgid; } -extern pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns); - -static inline pid_t task_tgid_vnr(struct task_struct *tsk) -{ - return pid_vnr(task_tgid(tsk)); -} - /** * pid_alive - check that a task structure is not stale * @p: Task structure to be checked. @@ -1185,23 +1178,6 @@ static inline int pid_alive(const struct task_struct *p) return p->pids[PIDTYPE_PID].pid != NULL; } -static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns) -{ - pid_t pid = 0; - - rcu_read_lock(); - if (pid_alive(tsk)) - pid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns); - rcu_read_unlock(); - - return pid; -} - -static inline pid_t task_ppid_nr(const struct task_struct *tsk) -{ - return task_ppid_nr_ns(tsk, &init_pid_ns); -} - static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) { return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns); @@ -1223,6 +1199,33 @@ static inline pid_t task_session_vnr(struct task_struct *tsk) return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL); } +static inline pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) +{ + return __task_pid_nr_ns(tsk, __PIDTYPE_TGID, ns); +} + +static inline pid_t task_tgid_vnr(struct task_struct *tsk) +{ + return __task_pid_nr_ns(tsk, __PIDTYPE_TGID, NULL); +} + +static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns) +{ + pid_t pid = 0; + + rcu_read_lock(); + if (pid_alive(tsk)) + pid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns); + rcu_read_unlock(); + + return pid; +} + +static inline pid_t task_ppid_nr(const struct task_struct *tsk) +{ + return task_ppid_nr_ns(tsk, &init_pid_ns); +} + /* Obsolete, do not use: */ static inline pid_t task_pgrp_nr(struct task_struct *tsk) { diff --git a/kernel/pid.c b/kernel/pid.c index c69c30d827e5..020dedbdf066 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -527,8 +527,11 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, if (!ns) ns = task_active_pid_ns(current); if (likely(pid_alive(task))) { - if (type != PIDTYPE_PID) + if (type != PIDTYPE_PID) { + if (type == __PIDTYPE_TGID) + type = PIDTYPE_PID; task = task->group_leader; + } nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns); } rcu_read_unlock(); @@ -537,12 +540,6 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, } EXPORT_SYMBOL(__task_pid_nr_ns); -pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) -{ - return pid_nr_ns(task_tgid(tsk), ns); -} -EXPORT_SYMBOL(task_tgid_nr_ns); - struct pid_namespace *task_active_pid_ns(struct task_struct *tsk) { return ns_of_pid(task_pid(tsk)); -- cgit v1.2.3