From 37815bf866ab6722a47550f8d25ad3f1a16a680c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 9 May 2015 03:06:23 +0930 Subject: module: Call module notifier on failure after complete_formation() The module notifier call chain for MODULE_STATE_COMING was moved up before the parsing of args, into the complete_formation() call. But if the module failed to load after that, the notifier call chain for MODULE_STATE_GOING was never called and that prevented the users of those call chains from cleaning up anything that was allocated. Link: http://lkml.kernel.org/r/554C52B9.9060700@gmail.com Reported-by: Pontus Fuchs Fixes: 4982223e51e8 "module: set nx before marking module MODULE_STATE_COMING" Cc: stable@vger.kernel.org # 3.16+ Signed-off-by: Steven Rostedt Signed-off-by: Rusty Russell --- kernel/module.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 42a1d2afb217..cfc9e843a924 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3370,6 +3370,9 @@ static int load_module(struct load_info *info, const char __user *uargs, module_bug_cleanup(mod); mutex_unlock(&module_mutex); + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_GOING, mod); + /* we can't deallocate the module until we clear memory protection */ unset_module_init_ro_nx(mod); unset_module_core_ro_nx(mod); -- cgit v1.2.3 From f7bcb70ebae0dcdb5a2d859b09e4465784d99029 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 8 May 2015 13:47:23 -0700 Subject: ktime: Fix ktime_divns to do signed division It was noted that the 32bit implementation of ktime_divns() was doing unsigned division and didn't properly handle negative values. And when a ktime helper was changed to utilize ktime_divns, it caused a regression on some IR blasters. See the following bugzilla for details: https://bugzilla.redhat.com/show_bug.cgi?id=1200353 This patch fixes the problem in ktime_divns by checking and preserving the sign bit, and then reapplying it if appropriate after the division, it also changes the return type to a s64 to make it more obvious this is expected. Nicolas also pointed out that negative dividers would cause infinite loops on 32bit systems, negative dividers is unlikely for users of this function, but out of caution this patch adds checks for negative dividers for both 32-bit (BUG_ON) and 64-bit(WARN_ON) versions to make sure no such use cases creep in. [ tglx: Hand an u64 to do_div() to avoid the compiler warning ] Fixes: 166afb64511e 'ktime: Sanitize ktime_to_us/ms conversion' Reported-and-tested-by: Trevor Cordes Signed-off-by: John Stultz Acked-by: Nicolas Pitre Cc: Ingo Molnar Cc: Josh Boyer Cc: One Thousand Gnomes Cc: Link: http://lkml.kernel.org/r/1431118043-23452-1-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- include/linux/ktime.h | 27 +++++++++++++++++++++------ kernel/time/hrtimer.c | 14 ++++++++------ 2 files changed, 29 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/ktime.h b/include/linux/ktime.h index 5fc3d1083071..2b6a204bd8d4 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -166,19 +166,34 @@ static inline bool ktime_before(const ktime_t cmp1, const ktime_t cmp2) } #if BITS_PER_LONG < 64 -extern u64 __ktime_divns(const ktime_t kt, s64 div); -static inline u64 ktime_divns(const ktime_t kt, s64 div) +extern s64 __ktime_divns(const ktime_t kt, s64 div); +static inline s64 ktime_divns(const ktime_t kt, s64 div) { + /* + * Negative divisors could cause an inf loop, + * so bug out here. + */ + BUG_ON(div < 0); if (__builtin_constant_p(div) && !(div >> 32)) { - u64 ns = kt.tv64; - do_div(ns, div); - return ns; + s64 ns = kt.tv64; + u64 tmp = ns < 0 ? -ns : ns; + + do_div(tmp, div); + return ns < 0 ? -tmp : tmp; } else { return __ktime_divns(kt, div); } } #else /* BITS_PER_LONG < 64 */ -# define ktime_divns(kt, div) (u64)((kt).tv64 / (div)) +static inline s64 ktime_divns(const ktime_t kt, s64 div) +{ + /* + * 32-bit implementation cannot handle negative divisors, + * so catch them on 64bit as well. + */ + WARN_ON(div < 0); + return kt.tv64 / div; +} #endif static inline s64 ktime_to_us(const ktime_t kt) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 76d4bd962b19..93ef7190bdea 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -266,21 +266,23 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) /* * Divide a ktime value by a nanosecond value */ -u64 __ktime_divns(const ktime_t kt, s64 div) +s64 __ktime_divns(const ktime_t kt, s64 div) { - u64 dclc; int sft = 0; + s64 dclc; + u64 tmp; dclc = ktime_to_ns(kt); + tmp = dclc < 0 ? -dclc : dclc; + /* Make sure the divisor is less than 2^32: */ while (div >> 32) { sft++; div >>= 1; } - dclc >>= sft; - do_div(dclc, (unsigned long) div); - - return dclc; + tmp >>= sft; + do_div(tmp, (unsigned long) div); + return dclc < 0 ? -tmp : tmp; } EXPORT_SYMBOL_GPL(__ktime_divns); #endif /* BITS_PER_LONG >= 64 */ -- cgit v1.2.3 From 10d784eae2b41e25d8fc6a88096cd27286093c84 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 8 May 2015 10:51:29 -0700 Subject: sched: always use blk_schedule_flush_plug in io_schedule_out block plug callback could sleep, so we introduce a parameter 'from_schedule' and corresponding drivers can use it to destinguish a schedule plug flush or a plug finish. Unfortunately io_schedule_out still uses blk_flush_plug(). This causes below output (Note, I added a might_sleep() in raid1_unplug to make it trigger faster, but the whole thing doesn't matter if I add might_sleep). In raid1/10, this can cause deadlock. This patch makes io_schedule_out always uses blk_schedule_flush_plug. This should only impact drivers (as far as I know, raid 1/10) which are sensitive to the 'from_schedule' parameter. [ 370.817949] ------------[ cut here ]------------ [ 370.817960] WARNING: CPU: 7 PID: 145 at ../kernel/sched/core.c:7306 __might_sleep+0x7f/0x90() [ 370.817969] do not call blocking ops when !TASK_RUNNING; state=2 set at [] prepare_to_wait+0x2f/0x90 [ 370.817971] Modules linked in: raid1 [ 370.817976] CPU: 7 PID: 145 Comm: kworker/u16:9 Tainted: G W 4.0.0+ #361 [ 370.817977] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.7.5-20140709_153802- 04/01/2014 [ 370.817983] Workqueue: writeback bdi_writeback_workfn (flush-9:1) [ 370.817985] ffffffff81cd83be ffff8800ba8cb298 ffffffff819dd7af 0000000000000001 [ 370.817988] ffff8800ba8cb2e8 ffff8800ba8cb2d8 ffffffff81051afc ffff8800ba8cb2c8 [ 370.817990] ffffffffa00061a8 000000000000041e 0000000000000000 ffff8800ba8cba28 [ 370.817993] Call Trace: [ 370.817999] [] dump_stack+0x4f/0x7b [ 370.818002] [] warn_slowpath_common+0x8c/0xd0 [ 370.818004] [] warn_slowpath_fmt+0x46/0x50 [ 370.818006] [] ? prepare_to_wait+0x2f/0x90 [ 370.818008] [] ? prepare_to_wait+0x2f/0x90 [ 370.818010] [] __might_sleep+0x7f/0x90 [ 370.818014] [] raid1_unplug+0xd3/0x170 [raid1] [ 370.818024] [] blk_flush_plug_list+0x8a/0x1e0 [ 370.818028] [] ? bit_wait+0x50/0x50 [ 370.818031] [] io_schedule_timeout+0x130/0x140 [ 370.818033] [] bit_wait_io+0x36/0x50 [ 370.818034] [] __wait_on_bit+0x65/0x90 [ 370.818041] [] ? ext4_read_block_bitmap_nowait+0xbc/0x630 [ 370.818043] [] ? bit_wait+0x50/0x50 [ 370.818045] [] out_of_line_wait_on_bit+0x72/0x80 [ 370.818047] [] ? autoremove_wake_function+0x40/0x40 [ 370.818050] [] __wait_on_buffer+0x44/0x50 [ 370.818053] [] ext4_wait_block_bitmap+0xe0/0xf0 [ 370.818058] [] ext4_mb_init_cache+0x206/0x790 [ 370.818062] [] ? lru_cache_add+0x1c/0x50 [ 370.818064] [] ext4_mb_init_group+0x11e/0x200 [ 370.818066] [] ext4_mb_load_buddy+0x341/0x360 [ 370.818068] [] ext4_mb_find_by_goal+0x93/0x2f0 [ 370.818070] [] ? ext4_mb_normalize_request+0x1e4/0x5b0 [ 370.818072] [] ext4_mb_regular_allocator+0x67/0x460 [ 370.818074] [] ? ext4_mb_normalize_request+0x1e4/0x5b0 [ 370.818076] [] ext4_mb_new_blocks+0x4cb/0x620 [ 370.818079] [] ext4_ext_map_blocks+0x4c6/0x14d0 [ 370.818081] [] ? ext4_es_lookup_extent+0x4e/0x290 [ 370.818085] [] ext4_map_blocks+0x14d/0x4f0 [ 370.818088] [] ext4_writepages+0x76d/0xe50 [ 370.818094] [] do_writepages+0x21/0x50 [ 370.818097] [] __writeback_single_inode+0x60/0x490 [ 370.818099] [] writeback_sb_inodes+0x2da/0x590 [ 370.818103] [] ? trylock_super+0x1b/0x50 [ 370.818105] [] ? trylock_super+0x1b/0x50 [ 370.818107] [] __writeback_inodes_wb+0x9f/0xd0 [ 370.818109] [] wb_writeback+0x34b/0x3c0 [ 370.818111] [] bdi_writeback_workfn+0x23f/0x550 [ 370.818116] [] process_one_work+0x1c8/0x570 [ 370.818117] [] ? process_one_work+0x14b/0x570 [ 370.818119] [] worker_thread+0x11b/0x470 [ 370.818121] [] ? process_one_work+0x570/0x570 [ 370.818124] [] kthread+0xf8/0x110 [ 370.818126] [] ? kthread_create_on_node+0x210/0x210 [ 370.818129] [] ret_from_fork+0x42/0x70 [ 370.818131] [] ? kthread_create_on_node+0x210/0x210 [ 370.818132] ---[ end trace 7b4deb71e68b6605 ]--- V2: don't change ->in_iowait Cc: NeilBrown Signed-off-by: Shaohua Li Reviewed-by: Jeff Moyer Signed-off-by: Jens Axboe --- kernel/sched/core.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fe22f7510bce..cfeebb499e79 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4387,10 +4387,7 @@ long __sched io_schedule_timeout(long timeout) long ret; current->in_iowait = 1; - if (old_iowait) - blk_schedule_flush_plug(current); - else - blk_flush_plug(current); + blk_schedule_flush_plug(current); delayacct_blkio_start(); rq = raw_rq(); -- cgit v1.2.3 From 1173ff09b9c57be8248427b7be161f7599dccd6b Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 19 May 2015 09:07:27 +0200 Subject: watchdog: fix double lock in watchdog_nmi_enable_all Commit ab992dc38f9a ("watchdog: Fix merge 'conflict'") has introduced an obvious deadlock because of a typo. watchdog_proc_mutex should be unlocked on exit. Thanks to Miroslav Benes who was staring at the code with me and noticed this. Signed-off-by: Michal Hocko Duh-by: Peter Zijlstra Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 506edcc500c4..581a68a04c64 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -621,7 +621,7 @@ void watchdog_nmi_enable_all(void) put_online_cpus(); unlock: - mutex_lock(&watchdog_proc_mutex); + mutex_unlock(&watchdog_proc_mutex); } void watchdog_nmi_disable_all(void) -- cgit v1.2.3 From dead9f29ddcc69551f35529a252d2704047870d3 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 15 May 2015 12:15:21 -0700 Subject: perf: Fix race in BPF program unregister there is a race between perf_event_free_bpf_prog() and free_trace_kprobe(): __free_event() event->destroy(event) tp_perf_event_destroy() perf_trace_destroy() perf_trace_event_unreg() which is dropping event->tp_event->perf_refcount and allows to proceed in: unregister_trace_kprobe() unregister_kprobe_event() trace_remove_event_call() probe_remove_event_call() free_trace_kprobe() while __free_event does: call_rcu(&event->rcu_head, free_event_rcu); free_event_rcu() perf_event_free_bpf_prog() To fix the race simply move perf_event_free_bpf_prog() before event->destroy(), since event->tp_event is still valid at that point. Note, perf_trace_destroy() is not racing with trace_remove_event_call() since they both grab event_mutex. Reported-by: Wang Nan Signed-off-by: Alexei Starovoitov Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: lizefan@huawei.com Cc: pi3orama@163.com Fixes: 2541517c32be ("tracing, perf: Implement BPF programs attached to kprobes") Link: http://lkml.kernel.org/r/1431717321-28772-1-git-send-email-ast@plumgrid.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 1a3bf48743ce..eddf1ed4155e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3442,7 +3442,6 @@ static void free_event_rcu(struct rcu_head *head) if (event->ns) put_pid_ns(event->ns); perf_event_free_filter(event); - perf_event_free_bpf_prog(event); kfree(event); } @@ -3573,6 +3572,8 @@ static void __free_event(struct perf_event *event) put_callchain_buffers(); } + perf_event_free_bpf_prog(event); + if (event->destroy) event->destroy(event); -- cgit v1.2.3 From aa319bcd366349c6f72fcd331da89d3d06090651 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Fri, 22 May 2015 18:30:20 +0300 Subject: perf: Disallow sparse AUX allocations for non-SG PMUs in overwrite mode PMUs that don't support hardware scatter tables require big contiguous chunks of memory and a PMI to switch between them. However, in overwrite using a PMI for this purpose adds extra overhead that the users would like to avoid. Thus, in overwrite mode for such PMUs we can only allow one contiguous chunk for the entire requested buffer. This patch changes the behavior accordingly, so that if the buddy allocator fails to come up with a single high-order chunk for the entire requested buffer, the allocation will fail. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@infradead.org Cc: adrian.hunter@intel.com Cc: hpa@zytor.com Link: http://lkml.kernel.org/r/1432308626-18845-2-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/ring_buffer.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 232f00f273cb..725c416085e3 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -493,6 +493,20 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, rb->aux_pages[rb->aux_nr_pages] = page_address(page++); } + /* + * In overwrite mode, PMUs that don't support SG may not handle more + * than one contiguous allocation, since they rely on PMI to do double + * buffering. In this case, the entire buffer has to be one contiguous + * chunk. + */ + if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) && + overwrite) { + struct page *page = virt_to_page(rb->aux_pages[0]); + + if (page_private(page) != max_order) + goto out; + } + rb->aux_priv = event->pmu->setup_aux(event->cpu, rb->aux_pages, nr_pages, overwrite); if (!rb->aux_priv) -- cgit v1.2.3 From 9b7b819ca1e508195feed5ece558dca66adeef05 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Thu, 4 Jun 2015 23:57:18 +0200 Subject: compat: cleanup coding in compat_get_bitmap() and compat_put_bitmap() In the functions compat_get_bitmap() and compat_put_bitmap() the variable nr_compat_longs stores how many compat_ulong_t words should be copied in a loop. The copy loop itself is this: if (nr_compat_longs-- > 0) { if (__get_user(um, umask)) return -EFAULT; } else { um = 0; } Since nr_compat_longs gets unconditionally decremented in each loop and since it's type is unsigned this could theoretically lead to out of bounds accesses to userspace if nr_compat_longs wraps around to (unsigned)(-1). Although the callers currently do not trigger out-of-bounds accesses, we should better implement the loop in a safe way to completely avoid such warp-arounds. Signed-off-by: Helge Deller Cc: Linus Torvalds Cc: Al Viro --- kernel/compat.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 24f00610c575..333d364be29d 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -912,7 +912,8 @@ long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask, * bitmap. We must however ensure the end of the * kernel bitmap is zeroed. */ - if (nr_compat_longs-- > 0) { + if (nr_compat_longs) { + nr_compat_longs--; if (__get_user(um, umask)) return -EFAULT; } else { @@ -954,7 +955,8 @@ long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, * We dont want to write past the end of the userspace * bitmap. */ - if (nr_compat_longs-- > 0) { + if (nr_compat_longs) { + nr_compat_longs--; if (__put_user(um, umask)) return -EFAULT; } -- cgit v1.2.3