From 4bb0f0e73c8c30917d169c4a0f1ac083690c545b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 1 Aug 2017 12:01:52 -0400 Subject: tracing: Call clear_boot_tracer() at lateinit_sync The clear_boot_tracer function is used to reset the default_bootup_tracer string to prevent it from being accessed after boot, as it originally points to init data. But since clear_boot_tracer() is called via the init_lateinit() call, it races with the initcall for registering the hwlat tracer. If someone adds "ftrace=hwlat" to the kernel command line, depending on how the linker sets up the text, the saved command line may be cleared, and the hwlat tracer never is initialized. Simply have the clear_boot_tracer() be called by initcall_lateinit_sync() as that's for tasks to be called after lateinit. Link: https://bugzilla.kernel.org/show_bug.cgi?id=196551 Cc: stable@vger.kernel.org Fixes: e7c15cd8a ("tracing: Added hardware latency tracer") Reported-by: Zamir SUN Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 42b9355033d4..784fb43b2abe 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8407,4 +8407,4 @@ __init static int clear_boot_tracer(void) } fs_initcall(tracer_init_tracefs); -late_initcall(clear_boot_tracer); +late_initcall_sync(clear_boot_tracer); -- cgit v1.2.3 From 147d88e0b5eb90191bc5c12ca0a3c410b75a13d2 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 1 Aug 2017 14:02:01 +0300 Subject: tracing: Missing error code in tracer_alloc_buffers() If ring_buffer_alloc() or one of the next couple function calls fail then we should return -ENOMEM but the current code returns success. Link: http://lkml.kernel.org/r/20170801110201.ajdkct7vwzixahvx@mwanda Cc: Sebastian Andrzej Siewior Cc: Ingo Molnar Cc: stable@vger.kernel.org Fixes: b32614c03413 ('tracing/rb: Convert to hotplug state machine') Signed-off-by: Dan Carpenter Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 784fb43b2abe..d815fc317e9d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8293,6 +8293,7 @@ __init static int tracer_alloc_buffers(void) if (ret < 0) goto out_free_cpumask; /* Used for event triggers */ + ret = -ENOMEM; temp_buffer = ring_buffer_alloc(PAGE_SIZE, RB_FL_OVERWRITE); if (!temp_buffer) goto out_rm_hp_state; -- cgit v1.2.3 From a7e52ad7ed82e21273eccff93d1477a7b313aabb Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 2 Aug 2017 14:20:54 -0400 Subject: ring-buffer: Have ring_buffer_alloc_read_page() return error on offline CPU Chunyu Hu reported: "per_cpu trace directories and files are created for all possible cpus, but only the cpus which have ever been on-lined have their own per cpu ring buffer (allocated by cpuhp threads). While trace_buffers_open, the open handler for trace file 'trace_pipe_raw' is always trying to access field of ring_buffer_per_cpu, and would panic with the NULL pointer. Align the behavior of trace_pipe_raw with trace_pipe, that returns -NODEV when openning it if that cpu does not have trace ring buffer. Reproduce: cat /sys/kernel/debug/tracing/per_cpu/cpu31/trace_pipe_raw (cpu31 is never on-lined, this is a 16 cores x86_64 box) Tested with: 1) boot with maxcpus=14, read trace_pipe_raw of cpu15. Got -NODEV. 2) oneline cpu15, read trace_pipe_raw of cpu15. 
Get the raw trace data. Call trace: [ 5760.950995] RIP: 0010:ring_buffer_alloc_read_page+0x32/0xe0 [ 5760.961678] tracing_buffers_read+0x1f6/0x230 [ 5760.962695] __vfs_read+0x37/0x160 [ 5760.963498] ? __vfs_read+0x5/0x160 [ 5760.964339] ? security_file_permission+0x9d/0xc0 [ 5760.965451] ? __vfs_read+0x5/0x160 [ 5760.966280] vfs_read+0x8c/0x130 [ 5760.967070] SyS_read+0x55/0xc0 [ 5760.967779] do_syscall_64+0x67/0x150 [ 5760.968687] entry_SYSCALL64_slow_path+0x25/0x25" This was introduced by the addition of the feature to reuse reader pages instead of re-allocating them. The problem is that the allocation of a reader page (which is per cpu) does not check if the cpu is online and set up for the ring buffer. Link: http://lkml.kernel.org/r/1500880866-1177-1-git-send-email-chuhu@redhat.com Cc: stable@vger.kernel.org Fixes: 73a757e63114 ("ring-buffer: Return reader page back into existing ring buffer") Reported-by: Chunyu Hu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 14 +++++++++----- kernel/trace/ring_buffer_benchmark.c | 2 +- kernel/trace/trace.c | 16 +++++++++++----- 3 files changed, 21 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 529cc50d7243..81279c6602ff 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -4386,15 +4386,19 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); * the page that was allocated, with the read page of the buffer. * * Returns: - * The page allocated, or NULL on error. + * The page allocated, or ERR_PTR */ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu) { - struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; + struct ring_buffer_per_cpu *cpu_buffer; struct buffer_data_page *bpage = NULL; unsigned long flags; struct page *page; + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return ERR_PTR(-ENODEV); + + cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); arch_spin_lock(&cpu_buffer->lock); @@ -4412,7 +4416,7 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu) page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY, 0); if (!page) - return NULL; + return ERR_PTR(-ENOMEM); bpage = page_address(page); @@ -4467,8 +4471,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); * * for example: * rpage = ring_buffer_alloc_read_page(buffer, cpu); - * if (!rpage) - * return error; + * if (IS_ERR(rpage)) + * return PTR_ERR(rpage); * ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0); * if (ret >= 0) * process_page(rpage, ret); diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 9fbcaf567886..68ee79afe31c 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -113,7 +113,7 @@ static enum event_status read_page(int cpu) int i; bpage = ring_buffer_alloc_read_page(buffer, cpu); - if (!bpage) + if (IS_ERR(bpage)) return EVENT_DROPPED; ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d815fc317e9d..44004d8aa3b3 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6598,7 +6598,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, { struct ftrace_buffer_info *info = filp->private_data; struct trace_iterator *iter = &info->iter; - ssize_t ret; + ssize_t ret = 0; ssize_t size; if (!count) @@ -6612,10 +6612,15 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, if (!info->spare) { info->spare = 
ring_buffer_alloc_read_page(iter->trace_buffer->buffer, iter->cpu_file); - info->spare_cpu = iter->cpu_file; + if (IS_ERR(info->spare)) { + ret = PTR_ERR(info->spare); + info->spare = NULL; + } else { + info->spare_cpu = iter->cpu_file; + } } if (!info->spare) - return -ENOMEM; + return ret; /* Do we have previous read data to read? */ if (info->read < PAGE_SIZE) @@ -6790,8 +6795,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, ref->ref = 1; ref->buffer = iter->trace_buffer->buffer; ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file); - if (!ref->page) { - ret = -ENOMEM; + if (IS_ERR(ref->page)) { + ret = PTR_ERR(ref->page); + ref->page = NULL; kfree(ref); break; } -- cgit v1.2.3 From 33ba43ed0afc13a29b1314e3e45a9938d310ba13 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 23 Aug 2017 00:06:09 +0200 Subject: bpf: fix map value attribute for hash of maps Currently, iproute2's BPF ELF loader works fine with array of maps when retrieving the fd from a pinned node and doing a selfcheck against the provided map attributes from the object file, but we fail to do the same for hash of maps and thus refuse to get the map from pinned node. Reason is that when allocating hash of maps, fd_htab_map_alloc() will set the value size to sizeof(void *), and any user space map creation requests are forced to set 4 bytes as value size. Thus, selfcheck will complain about exposed 8 bytes on 64 bit archs vs. 4 bytes from object file as value size. Contract is that fdinfo or BPF_MAP_GET_FD_BY_ID returns the value size used to create the map. Fix it by handling it the same way as we do for array of maps, which means that we leave value size at 4 bytes and in the allocation phase round up value size to 8 bytes. alloc_htab_elem() needs an adjustment in order to copy rounded up 8 bytes due to bpf_fd_htab_map_update_elem() calling into htab_map_update_elem() with the pointer of the map pointer as value. Unlike array of maps where we just xchg(), we're using the generic htab_map_update_elem() callback also used from helper calls, which published the key/value already on return, so we need to ensure to memcpy() the right size. Fixes: bcc6b1b7ebf8 ("bpf: Add hash of maps support") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Signed-off-by: David S. 
Miller --- kernel/bpf/hashtab.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 4fb463172aa8..d11c8181f4c5 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -652,12 +652,27 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, } } +static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab) +{ + return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS && + BITS_PER_LONG == 64; +} + +static u32 htab_size_value(const struct bpf_htab *htab, bool percpu) +{ + u32 size = htab->map.value_size; + + if (percpu || fd_htab_map_needs_adjust(htab)) + size = round_up(size, 8); + return size; +} + static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, void *value, u32 key_size, u32 hash, bool percpu, bool onallcpus, struct htab_elem *old_elem) { - u32 size = htab->map.value_size; + u32 size = htab_size_value(htab, percpu); bool prealloc = htab_is_prealloc(htab); struct htab_elem *l_new, **pl_new; void __percpu *pptr; @@ -696,9 +711,6 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, memcpy(l_new->key, key, key_size); if (percpu) { - /* round up value_size to 8 bytes */ - size = round_up(size, 8); - if (prealloc) { pptr = htab_elem_get_ptr(l_new, key_size); } else { @@ -1209,17 +1221,9 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = { static struct bpf_map *fd_htab_map_alloc(union bpf_attr *attr) { - struct bpf_map *map; - if (attr->value_size != sizeof(u32)) return ERR_PTR(-EINVAL); - - /* pointer is stored internally */ - attr->value_size = sizeof(void *); - map = htab_map_alloc(attr); - attr->value_size = sizeof(u32); - - return map; + return htab_map_alloc(attr); } static void fd_htab_map_free(struct bpf_map *map) -- cgit v1.2.3 From 2fe59f507a65dbd734b990a11ebc7488f6f87a24 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 22 Aug 2017 18:43:48 +1000 Subject: timers: Fix excessive granularity of new timers after a nohz idle When a timer base is idle, it is forwarded when a new timer is added to ensure that granularity does not become excessive. When not idle, the timer tick is expected to increment the base. However there are several problems: - If an existing timer is modified, the base is forwarded only after the index is calculated. - The base is not forwarded by add_timer_on. - There is a window after a timer is restarted from a nohz idle, after it is marked not-idle and before the timer tick on this CPU, where a timer may be added but the ancient base does not get forwarded. These result in excessive granularity (a 1 jiffy timeout can blow out to 100s of jiffies), which cause the rcu lockup detector to trigger, among other things. Fix this by keeping track of whether the timer base has been idle since it was last run or forwarded, and if so then forward it before adding a new timer. There is still a case where mod_timer optimises the case of a pending timer mod with the same expiry time, where the timer can see excessive granularity relative to the new, shorter interval. A comment is added, but it's not changed because it is an important fastpath for networking. This has been tested and found to fix the RCU softlockup messages. Testing was also done with tracing to measure requested versus achieved wakeup latencies for all non-deferrable timers in an idle system (with no lockup watchdogs running). 
Wakeup latency relative to absolute latency is calculated (note this suffers from round-up skew at low absolute times) and analysed: max avg std upstream 506.0 1.20 4.68 patched 2.0 1.08 0.15 The bug was noticed due to the lockup detector Kconfig changes dropping it out of people's .configs and resulting in larger base clk skew When the lockup detectors are enabled, no CPU can go idle for longer than 4 seconds, which limits the granularity errors. Sub-optimal timer behaviour is observable on a smaller scale in that case: max avg std upstream 9.0 1.05 0.19 patched 2.0 1.04 0.11 Fixes: Fixes: a683f390b93f ("timers: Forward the wheel clock whenever possible") Signed-off-by: Nicholas Piggin Signed-off-by: Thomas Gleixner Tested-by: Jonathan Cameron Tested-by: David Miller Cc: dzickus@redhat.com Cc: sfr@canb.auug.org.au Cc: mpe@ellerman.id.au Cc: Stephen Boyd Cc: linuxarm@huawei.com Cc: abdhalee@linux.vnet.ibm.com Cc: John Stultz Cc: akpm@linux-foundation.org Cc: paulmck@linux.vnet.ibm.com Cc: torvalds@linux-foundation.org Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20170822084348.21436-1-npiggin@gmail.com --- kernel/time/timer.c | 50 +++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 41 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 8f5d1bf18854..f2674a056c26 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -203,6 +203,7 @@ struct timer_base { bool migration_enabled; bool nohz_active; bool is_idle; + bool must_forward_clk; DECLARE_BITMAP(pending_map, WHEEL_SIZE); struct hlist_head vectors[WHEEL_SIZE]; } ____cacheline_aligned; @@ -856,13 +857,19 @@ get_target_base(struct timer_base *base, unsigned tflags) static inline void forward_timer_base(struct timer_base *base) { - unsigned long jnow = READ_ONCE(jiffies); + unsigned long jnow; /* - * We only forward the base when it's idle and we have a delta between - * base clock and jiffies. + * We only forward the base when we are idle or have just come out of + * idle (must_forward_clk logic), and have a delta between base clock + * and jiffies. In the common case, run_timers will take care of it. */ - if (!base->is_idle || (long) (jnow - base->clk) < 2) + if (likely(!base->must_forward_clk)) + return; + + jnow = READ_ONCE(jiffies); + base->must_forward_clk = base->is_idle; + if ((long)(jnow - base->clk) < 2) return; /* @@ -938,6 +945,11 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) * same array bucket then just return: */ if (timer_pending(timer)) { + /* + * The downside of this optimization is that it can result in + * larger granularity than you would get from adding a new + * timer with this expiry. + */ if (timer->expires == expires) return 1; @@ -948,6 +960,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) * dequeue/enqueue dance. 
*/ base = lock_timer_base(timer, &flags); + forward_timer_base(base); clk = base->clk; idx = calc_wheel_index(expires, clk); @@ -964,6 +977,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) } } else { base = lock_timer_base(timer, &flags); + forward_timer_base(base); } ret = detach_if_pending(timer, base, false); @@ -991,12 +1005,10 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) raw_spin_lock(&base->lock); WRITE_ONCE(timer->flags, (timer->flags & ~TIMER_BASEMASK) | base->cpu); + forward_timer_base(base); } } - /* Try to forward a stale timer base clock */ - forward_timer_base(base); - timer->expires = expires; /* * If 'idx' was calculated above and the base time did not advance @@ -1112,6 +1124,7 @@ void add_timer_on(struct timer_list *timer, int cpu) WRITE_ONCE(timer->flags, (timer->flags & ~TIMER_BASEMASK) | cpu); } + forward_timer_base(base); debug_activate(timer, timer->expires); internal_add_timer(base, timer); @@ -1497,10 +1510,16 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) if (!is_max_delta) expires = basem + (u64)(nextevt - basej) * TICK_NSEC; /* - * If we expect to sleep more than a tick, mark the base idle: + * If we expect to sleep more than a tick, mark the base idle. + * Also the tick is stopped so any added timer must forward + * the base clk itself to keep granularity small. This idle + * logic is only maintained for the BASE_STD base, deferrable + * timers may still see large granularity skew (by design). */ - if ((expires - basem) > TICK_NSEC) + if ((expires - basem) > TICK_NSEC) { + base->must_forward_clk = true; base->is_idle = true; + } } raw_spin_unlock(&base->lock); @@ -1611,6 +1630,19 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h) { struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); + /* + * must_forward_clk must be cleared before running timers so that any + * timer functions that call mod_timer will not try to forward the + * base. idle trcking / clock forwarding logic is only used with + * BASE_STD timers. + * + * The deferrable base does not do idle tracking at all, so we do + * not forward it. This can result in very large variations in + * granularity for deferrable timers, but they can be deferred for + * long periods due to idle. + */ + base->must_forward_clk = false; + __run_timers(base); if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active) __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF])); -- cgit v1.2.3 From a8f0f9e49956a74718874b800251455680085600 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 17 Aug 2017 16:37:25 -0400 Subject: ftrace: Check for null ret_stack on profile function graph entry function There's a small race when function graph shutsdown and the calling of the registered function graph entry callback. The callback must not reference the task's ret_stack without first checking that it is not NULL. Note, when a ret_stack is allocated for a task, it stays allocated until the task exits. The problem here, is that function_graph is shutdown, and a new task was created, which doesn't have its ret_stack allocated. But since some of the functions are still being traced, the callbacks can still be called. The normal function_graph code handles this, but starting with commit 8861dd303c ("ftrace: Access ret_stack->subtime only in the function profiler") the profiler code references the ret_stack on function entry, but doesn't check if it is NULL first. 
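As an aside, a minimal userspace sketch of the defensive pattern this patch adds (structure and function names here are purely illustrative, not the kernel code): a callback that can still fire while the tracer is shutting down must check the lazily allocated per-task stack for NULL before indexing into it.

/*
 * Illustrative-only sketch: a per-task stack that is allocated lazily
 * may still be NULL when a tracing callback runs during shutdown, so
 * the callback bails out instead of dereferencing it.
 */
#include <stdio.h>
#include <stdlib.h>

struct task {
        struct { unsigned long long subtime; } *ret_stack; /* NULL until allocated */
        int curr_ret_stack;                                 /* index of current frame */
};

/* Callback that may run for a task whose ret_stack was never allocated. */
static int profile_entry(struct task *t, int index)
{
        /* If the graph tracer is shutting down, ret_stack can be NULL. */
        if (!t->ret_stack)
                return 0;

        if (index >= 0)
                t->ret_stack[index].subtime = 0;
        return 1;
}

int main(void)
{
        struct task fresh = { .ret_stack = NULL, .curr_ret_stack = 0 };

        /* Without the NULL check this call would dereference a NULL pointer. */
        printf("traced: %d\n", profile_entry(&fresh, fresh.curr_ret_stack));

        fresh.ret_stack = calloc(16, sizeof(*fresh.ret_stack));
        printf("traced: %d\n", profile_entry(&fresh, fresh.curr_ret_stack));
        free(fresh.ret_stack);
        return 0;
}
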
Link: https://bugzilla.kernel.org/show_bug.cgi?id=196611 Cc: stable@vger.kernel.org Fixes: 8861dd303c ("ftrace: Access ret_stack->subtime only in the function profiler") Reported-by: lilydjwg@gmail.com Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 02004ae91860..96cea88fa00f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -889,6 +889,10 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace) function_profile_call(trace->func, 0, NULL, NULL); + /* If function graph is shutting down, ret_stack can be NULL */ + if (!current->ret_stack) + return 0; + if (index >= 0 && index < FTRACE_RETFUNC_DEPTH) current->ret_stack[index].subtime = 0; -- cgit v1.2.3 From 475bb3c69ab05df2a6ecef6acc2393703d134180 Mon Sep 17 00:00:00 2001 From: Chunyu Hu Date: Mon, 14 Aug 2017 18:18:17 +0800 Subject: tracing: Fix kmemleak in tracing_map_array_free() kmemleak reported the below leak when I was doing clear of the hist trigger. With this patch, the kmeamleak is gone. unreferenced object 0xffff94322b63d760 (size 32): comm "bash", pid 1522, jiffies 4403687962 (age 2442.311s) hex dump (first 32 bytes): 00 01 00 00 04 00 00 00 08 00 00 00 ff 00 00 00 ................ 10 00 00 00 00 00 00 00 80 a8 7a f2 31 94 ff ff ..........z.1... backtrace: [] kmemleak_alloc+0x4a/0xa0 [] kmem_cache_alloc_trace+0xca/0x1d0 [] tracing_map_array_alloc+0x26/0x140 [] kretprobe_trampoline+0x0/0x50 [] create_hist_data+0x535/0x750 [] event_hist_trigger_func+0x1f7/0x420 [] event_trigger_write+0xfd/0x1a0 [] __vfs_write+0x37/0x170 [] vfs_write+0xb2/0x1b0 [] SyS_write+0x55/0xc0 [] do_syscall_64+0x67/0x150 [] return_from_SYSCALL_64+0x0/0x6a [] 0xffffffffffffffff unreferenced object 0xffff9431f27aa880 (size 128): comm "bash", pid 1522, jiffies 4403687962 (age 2442.311s) hex dump (first 32 bytes): 00 00 8c 2a 32 94 ff ff 00 f0 8b 2a 32 94 ff ff ...*2......*2... 00 e0 8b 2a 32 94 ff ff 00 d0 8b 2a 32 94 ff ff ...*2......*2... 
backtrace: [] kmemleak_alloc+0x4a/0xa0 [] __kmalloc+0xe8/0x220 [] tracing_map_array_alloc+0xb1/0x140 [] kretprobe_trampoline+0x0/0x50 [] create_hist_data+0x535/0x750 [] event_hist_trigger_func+0x1f7/0x420 [] event_trigger_write+0xfd/0x1a0 [] __vfs_write+0x37/0x170 [] vfs_write+0xb2/0x1b0 [] SyS_write+0x55/0xc0 [] do_syscall_64+0x67/0x150 [] return_from_SYSCALL_64+0x0/0x6a [] 0xffffffffffffffff Link: http://lkml.kernel.org/r/1502705898-27571-1-git-send-email-chuhu@redhat.com Cc: stable@vger.kernel.org Fixes: 08d43a5fa063 ("tracing: Add lock-free tracing_map") Signed-off-by: Chunyu Hu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/tracing_map.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index 0a689bbb78ef..305039b122fa 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -221,16 +221,19 @@ void tracing_map_array_free(struct tracing_map_array *a) if (!a) return; - if (!a->pages) { - kfree(a); - return; - } + if (!a->pages) + goto free; for (i = 0; i < a->n_pages; i++) { if (!a->pages[i]) break; free_page((unsigned long)a->pages[i]); } + + kfree(a->pages); + + free: + kfree(a); } struct tracing_map_array *tracing_map_array_alloc(unsigned int n_elts, -- cgit v1.2.3 From 8b0db1a5bdfcee0dbfa89607672598ae203c9045 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 23 Aug 2017 12:46:27 -0400 Subject: tracing: Fix freeing of filter in create_filter() when set_str is false Performing the following task with kmemleak enabled: # cd /sys/kernel/tracing/events/irq/irq_handler_entry/ # echo 'enable_event:kmem:kmalloc:3 if irq >' > trigger # echo 'enable_event:kmem:kmalloc:3 if irq > 31' > trigger # echo scan > /sys/kernel/debug/kmemleak # cat /sys/kernel/debug/kmemleak unreferenced object 0xffff8800b9290308 (size 32): comm "bash", pid 1114, jiffies 4294848451 (age 141.139s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] kmemleak_alloc+0x4a/0xa0 [] kmem_cache_alloc_trace+0x158/0x290 [] create_filter_start.constprop.28+0x99/0x940 [] create_filter+0xa9/0x160 [] create_event_filter+0xc/0x10 [] set_trigger_filter+0xe5/0x210 [] event_enable_trigger_func+0x324/0x490 [] event_trigger_write+0x1a2/0x260 [] __vfs_write+0xd7/0x380 [] vfs_write+0x101/0x260 [] SyS_write+0xab/0x130 [] entry_SYSCALL_64_fastpath+0x1f/0xbe [] 0xffffffffffffffff The function create_filter() is passed a 'filterp' pointer that gets allocated, and if "set_str" is true, it is up to the caller to free it, even on error. The problem is that the pointer is not freed by create_filter() when set_str is false. This is a bug, and it is not up to the caller to free the filter on error if it doesn't care about the string. 
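To make the ownership rule concrete, here is a small userspace sketch (illustrative names and behaviour, not the tracing code itself): on error, the allocating function hands the partially built object to the caller only when set_str was requested for error reporting, and otherwise frees it itself.

/*
 * Illustrative-only sketch of the ownership rule being enforced: the
 * caller frees the object on error only if it asked for the string
 * (set_str); otherwise the allocator must clean up before returning.
 */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct filter { char *str; };

static void free_filter(struct filter *f)
{
        if (!f)
                return;
        free(f->str);
        free(f);
}

static int create_filter(const char *expr, bool set_str, struct filter **filterp)
{
        struct filter *filter = calloc(1, sizeof(*filter));
        int err = 0;

        if (!filter)
                return -ENOMEM;

        if (set_str)
                filter->str = strdup(expr);

        /* Pretend that parsing the expression "bad" fails. */
        if (strcmp(expr, "bad") == 0)
                err = -EINVAL;

        if (err && !set_str) {
                /* The caller never sees the object, so free it here. */
                free_filter(filter);
                filter = NULL;
        }
        *filterp = filter;      /* may be NULL; with set_str the caller frees it */
        return err;
}

int main(void)
{
        struct filter *f = NULL;
        int err = create_filter("bad", false, &f);

        printf("err=%d filter=%p (nothing leaked)\n", err, (void *)f);
        return 0;
}
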
Link: http://lkml.kernel.org/r/1502705898-27571-2-git-send-email-chuhu@redhat.com Cc: stable@vger.kernel.org Fixes: 38b78eb85 ("tracing: Factorize filter creation") Reported-by: Chunyu Hu Tested-by: Chunyu Hu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_filter.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 59a411ff60c7..181e139a8057 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1959,6 +1959,10 @@ static int create_filter(struct trace_event_call *call, if (err && set_str) append_filter_err(ps, filter); } + if (err && !set_str) { + free_event_filter(filter); + filter = NULL; + } create_filter_finish(ps); *filterp = filter; -- cgit v1.2.3 From 1c08c22c874ac88799cab1f78c40f46110274915 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 24 Aug 2017 12:04:29 -0400 Subject: cpuset: Fix incorrect memory_pressure control file mapping The memory_pressure control file was incorrectly set up without a private value (0, by default). As a result, this control file was treated like memory_migrate on read. By adding back the FILE_MEMORY_PRESSURE private value, the correct memory pressure value will be returned. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo Fixes: 7dbdb199d3bf ("cgroup: replace cftype->mode with CFTYPE_WORLD_WRITABLE") Cc: stable@vger.kernel.org # v4.4+ --- kernel/cgroup/cpuset.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index ca8376e5008c..8362bac0d179 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1891,6 +1891,7 @@ static struct cftype files[] = { { .name = "memory_pressure", .read_u64 = cpuset_read_u64, + .private = FILE_MEMORY_PRESSURE, }, { -- cgit v1.2.3 From 64aee2a965cf2954a038b5522f11d2cd2f0f8f3e Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 22 Jun 2017 15:41:38 +0100 Subject: perf/core: Fix group {cpu,task} validation Regardless of which events form a group, it does not make sense for the events to target different tasks and/or CPUs, as this leaves the group inconsistent and impossible to schedule. The core perf code assumes that these are consistent across (successfully intialised) groups. Core perf code only verifies this when moving SW events into a HW context. Thus, we can violate this requirement for pure SW groups and pure HW groups, unless the relevant PMU driver happens to perform this verification itself. These mismatched groups subsequently wreak havoc elsewhere. For example, we handle watchpoints as SW events, and reserve watchpoint HW on a per-CPU basis at pmu::event_init() time to ensure that any event that is initialised is guaranteed to have a slot at pmu::add() time. However, the core code only checks the group leader's cpu filter (via event_filter_match()), and can thus install follower events onto CPUs violating thier (mismatched) CPU filters, potentially installing them into a CPU without sufficient reserved slots. This can be triggered with the below test case, resulting in warnings from arch backends. 
#define _GNU_SOURCE #include #include #include #include #include #include #include static int perf_event_open(struct perf_event_attr *attr, pid_t pid, int cpu, int group_fd, unsigned long flags) { return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags); } char watched_char; struct perf_event_attr wp_attr = { .type = PERF_TYPE_BREAKPOINT, .bp_type = HW_BREAKPOINT_RW, .bp_addr = (unsigned long)&watched_char, .bp_len = 1, .size = sizeof(wp_attr), }; int main(int argc, char *argv[]) { int leader, ret; cpu_set_t cpus; /* * Force use of CPU0 to ensure our CPU0-bound events get scheduled. */ CPU_ZERO(&cpus); CPU_SET(0, &cpus); ret = sched_setaffinity(0, sizeof(cpus), &cpus); if (ret) { printf("Unable to set cpu affinity\n"); return 1; } /* open leader event, bound to this task, CPU0 only */ leader = perf_event_open(&wp_attr, 0, 0, -1, 0); if (leader < 0) { printf("Couldn't open leader: %d\n", leader); return 1; } /* * Open a follower event that is bound to the same task, but a * different CPU. This means that the group should never be possible to * schedule. */ ret = perf_event_open(&wp_attr, 0, 1, leader, 0); if (ret < 0) { printf("Couldn't open mismatched follower: %d\n", ret); return 1; } else { printf("Opened leader/follower with mismastched CPUs\n"); } /* * Open as many independent events as we can, all bound to the same * task, CPU0 only. */ do { ret = perf_event_open(&wp_attr, 0, 0, -1, 0); } while (ret >= 0); /* * Force enable/disble all events to trigger the erronoeous * installation of the follower event. */ printf("Opened all events. Toggling..\n"); for (;;) { prctl(PR_TASK_PERF_EVENTS_DISABLE, 0, 0, 0, 0); prctl(PR_TASK_PERF_EVENTS_ENABLE, 0, 0, 0, 0); } return 0; } Fix this by validating this requirement regardless of whether we're moving events. Signed-off-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Zhou Chengming Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1498142498-15758-1-git-send-email-mark.rutland@arm.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index ee20d4c546b5..3504125871d2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10032,28 +10032,27 @@ SYSCALL_DEFINE5(perf_event_open, goto err_context; /* - * Do not allow to attach to a group in a different - * task or CPU context: + * Make sure we're both events for the same CPU; + * grouping events for different CPUs is broken; since + * you can never concurrently schedule them anyhow. */ - if (move_group) { - /* - * Make sure we're both on the same task, or both - * per-cpu events. - */ - if (group_leader->ctx->task != ctx->task) - goto err_context; + if (group_leader->cpu != event->cpu) + goto err_context; - /* - * Make sure we're both events for the same CPU; - * grouping events for different CPUs is broken; since - * you can never concurrently schedule them anyhow. - */ - if (group_leader->cpu != event->cpu) - goto err_context; - } else { - if (group_leader->ctx != ctx) - goto err_context; - } + /* + * Make sure we're both on the same task, or both + * per-CPU events. + */ + if (group_leader->ctx->task != ctx->task) + goto err_context; + + /* + * Do not allow to attach to a group in a different task + * or CPU context. 
If we're moving SW events, we'll fix + * this up later, so allow that. + */ + if (!move_group && group_leader->ctx != ctx) + goto err_context; /* * Only a group leader can be exclusive or pinned -- cgit v1.2.3 From 2b7e8665b4ff51c034c55df3cff76518d1a9ee3a Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 25 Aug 2017 15:55:43 -0700 Subject: fork: fix incorrect fput of ->exe_file causing use-after-free Commit 7c051267931a ("mm, fork: make dup_mmap wait for mmap_sem for write killable") made it possible to kill a forking task while it is waiting to acquire its ->mmap_sem for write, in dup_mmap(). However, it was overlooked that this introduced an new error path before a reference is taken on the mm_struct's ->exe_file. Since the ->exe_file of the new mm_struct was already set to the old ->exe_file by the memcpy() in dup_mm(), it was possible for the mmput() in the error path of dup_mm() to drop a reference to ->exe_file which was never taken. This caused the struct file to later be freed prematurely. Fix it by updating mm_init() to NULL out the ->exe_file, in the same place it clears other things like the list of mmaps. This bug was found by syzkaller. It can be reproduced using the following C program: #define _GNU_SOURCE #include #include #include #include #include #include static void *mmap_thread(void *_arg) { for (;;) { mmap(NULL, 0x1000000, PROT_READ, MAP_POPULATE|MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); } } static void *fork_thread(void *_arg) { usleep(rand() % 10000); fork(); } int main(void) { fork(); fork(); fork(); for (;;) { if (fork() == 0) { pthread_t t; pthread_create(&t, NULL, mmap_thread, NULL); pthread_create(&t, NULL, fork_thread, NULL); usleep(rand() % 10000); syscall(__NR_exit_group, 0); } wait(NULL); } } No special kernel config options are needed. It usually causes a NULL pointer dereference in __remove_shared_vm_struct() during exit, or in dup_mmap() (which is usually inlined into copy_process()) during fork. Both are due to a vm_area_struct's ->vm_file being used after it's already been freed. Google Bug Id: 64772007 Link: http://lkml.kernel.org/r/20170823211408.31198-1-ebiggers3@gmail.com Fixes: 7c051267931a ("mm, fork: make dup_mmap wait for mmap_sem for write killable") Signed-off-by: Eric Biggers Tested-by: Mark Rutland Acked-by: Michal Hocko Cc: Dmitry Vyukov Cc: Ingo Molnar Cc: Konstantin Khlebnikov Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Vlastimil Babka Cc: [v4.7+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index e075b7780421..cbbea277b3fb 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -806,6 +806,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_init_cpumask(mm); mm_init_aio(mm); mm_init_owner(mm, p); + RCU_INIT_POINTER(mm->exe_file, NULL); mmu_notifier_mm_init(mm); init_tlb_flush_pending(mm); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS -- cgit v1.2.3 From 3510ca20ece0150af6b10c77a74ff1b5c198e3e2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 27 Aug 2017 13:55:12 -0700 Subject: Minor page waitqueue cleanups Tim Chen and Kan Liang have been battling a customer load that shows extremely long page wakeup lists. The cause seems to be constant NUMA migration of a hot page that is shared across a lot of threads, but the actual root cause for the exact behavior has not been found. 
Tim has a patch that batches the wait list traversal at wakeup time, so that we at least don't get long uninterruptible cases where we traverse and wake up thousands of processes and get nasty latency spikes. That is likely 4.14 material, but we're still discussing the page waitqueue specific parts of it. In the meantime, I've tried to look at making the page wait queues less expensive, and failing miserably. If you have thousands of threads waiting for the same page, it will be painful. We'll need to try to figure out the NUMA balancing issue some day, in addition to avoiding the excessive spinlock hold times. That said, having tried to rewrite the page wait queues, I can at least fix up some of the braindamage in the current situation. In particular: (a) we don't want to continue walking the page wait list if the bit we're waiting for already got set again (which seems to be one of the patterns of the bad load). That makes no progress and just causes pointless cache pollution chasing the pointers. (b) we don't want to put the non-locking waiters always on the front of the queue, and the locking waiters always on the back. Not only is that unfair, it means that we wake up thousands of reading threads that will just end up being blocked by the writer later anyway. Also add a comment about the layout of 'struct wait_page_key' - there is an external user of it in the cachefiles code that means that it has to match the layout of 'struct wait_bit_key' in the two first members. It so happens to match, because 'struct page *' and 'unsigned long *' end up having the same values simply because the page flags are the first member in struct page. Cc: Tim Chen Cc: Kan Liang Cc: Mel Gorman Cc: Christopher Lameter Cc: Andi Kleen Cc: Davidlohr Bueso Cc: Peter Zijlstra Signed-off-by: Linus Torvalds --- kernel/sched/wait.c | 7 ++++--- mm/filemap.c | 11 ++++++----- 2 files changed, 10 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 17f11c6b0a9f..d6afed6d0752 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -70,9 +70,10 @@ static void __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode, list_for_each_entry_safe(curr, next, &wq_head->head, entry) { unsigned flags = curr->flags; - - if (curr->func(curr, mode, wake_flags, key) && - (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) + int ret = curr->func(curr, mode, wake_flags, key); + if (ret < 0) + break; + if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) break; } } diff --git a/mm/filemap.c b/mm/filemap.c index a49702445ce0..baba290c276b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -885,6 +885,7 @@ void __init pagecache_init(void) page_writeback_init(); } +/* This has the same layout as wait_bit_key - see fs/cachefiles/rdwr.c */ struct wait_page_key { struct page *page; int bit_nr; @@ -909,8 +910,10 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, if (wait_page->bit_nr != key->bit_nr) return 0; + + /* Stop walking if it's locked */ if (test_bit(key->bit_nr, &key->page->flags)) - return 0; + return -1; return autoremove_wake_function(wait, mode, sync, key); } @@ -964,6 +967,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, int ret = 0; init_wait(wait); + wait->flags = lock ? 
WQ_FLAG_EXCLUSIVE : 0; wait->func = wake_page_function; wait_page.page = page; wait_page.bit_nr = bit_nr; @@ -972,10 +976,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, spin_lock_irq(&q->lock); if (likely(list_empty(&wait->entry))) { - if (lock) - __add_wait_queue_entry_tail_exclusive(q, wait); - else - __add_wait_queue(q, wait); + __add_wait_queue_entry_tail(q, wait); SetPageWaiters(page); } -- cgit v1.2.3 From 22cf8bc6cb0d48c6d31da8eccbd4201ab9b0c4c6 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 31 Aug 2017 16:15:23 -0700 Subject: kernel/kthread.c: kthread_worker: don't hog the cpu If the worker thread continues getting work, it will hog the cpu and rcu stall complains. Make it a good citizen. This is triggered in a loop block device test. Link: http://lkml.kernel.org/r/5de0a179b3184e1a2183fc503448b0269f24d75b.1503697127.git.shli@fb.com Signed-off-by: Shaohua Li Cc: Petr Mladek Cc: Thomas Gleixner Cc: Tejun Heo Cc: Oleg Nesterov Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kthread.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 26db528c1d88..1c19edf82427 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -637,6 +637,7 @@ repeat: schedule(); try_to_freeze(); + cond_resched(); goto repeat; } EXPORT_SYMBOL_GPL(kthread_worker_fn); -- cgit v1.2.3 From 355627f518978b5167256d27492fe0b343aaf2f2 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 31 Aug 2017 16:15:26 -0700 Subject: mm, uprobes: fix multiple free of ->uprobes_state.xol_area Commit 7c051267931a ("mm, fork: make dup_mmap wait for mmap_sem for write killable") made it possible to kill a forking task while it is waiting to acquire its ->mmap_sem for write, in dup_mmap(). However, it was overlooked that this introduced an new error path before the new mm_struct's ->uprobes_state.xol_area has been set to NULL after being copied from the old mm_struct by the memcpy in dup_mm(). For a task that has previously hit a uprobe tracepoint, this resulted in the 'struct xol_area' being freed multiple times if the task was killed at just the right time while forking. Fix it by setting ->uprobes_state.xol_area to NULL in mm_init() rather than in uprobe_dup_mmap(). With CONFIG_UPROBE_EVENTS=y, the bug can be reproduced by the same C program given by commit 2b7e8665b4ff ("fork: fix incorrect fput of ->exe_file causing use-after-free"), provided that a uprobe tracepoint has been set on the fork_thread() function. For example: $ gcc reproducer.c -o reproducer -lpthread $ nm reproducer | grep fork_thread 0000000000400719 t fork_thread $ echo "p $PWD/reproducer:0x719" > /sys/kernel/debug/tracing/uprobe_events $ echo 1 > /sys/kernel/debug/tracing/events/uprobes/enable $ ./reproducer Here is the use-after-free reported by KASAN: BUG: KASAN: use-after-free in uprobe_clear_state+0x1c4/0x200 Read of size 8 at addr ffff8800320a8b88 by task reproducer/198 CPU: 1 PID: 198 Comm: reproducer Not tainted 4.13.0-rc7-00015-g36fde05f3fb5 #255 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-20170228_101828-anatol 04/01/2014 Call Trace: dump_stack+0xdb/0x185 print_address_description+0x7e/0x290 kasan_report+0x23b/0x350 __asan_report_load8_noabort+0x19/0x20 uprobe_clear_state+0x1c4/0x200 mmput+0xd6/0x360 do_exit+0x740/0x1670 do_group_exit+0x13f/0x380 get_signal+0x597/0x17d0 do_signal+0x99/0x1df0 exit_to_usermode_loop+0x166/0x1e0 syscall_return_slowpath+0x258/0x2c0 entry_SYSCALL_64_fastpath+0xbc/0xbe ... 
Allocated by task 199: save_stack_trace+0x1b/0x20 kasan_kmalloc+0xfc/0x180 kmem_cache_alloc_trace+0xf3/0x330 __create_xol_area+0x10f/0x780 uprobe_notify_resume+0x1674/0x2210 exit_to_usermode_loop+0x150/0x1e0 prepare_exit_to_usermode+0x14b/0x180 retint_user+0x8/0x20 Freed by task 199: save_stack_trace+0x1b/0x20 kasan_slab_free+0xa8/0x1a0 kfree+0xba/0x210 uprobe_clear_state+0x151/0x200 mmput+0xd6/0x360 copy_process.part.8+0x605f/0x65d0 _do_fork+0x1a5/0xbd0 SyS_clone+0x19/0x20 do_syscall_64+0x22f/0x660 return_from_SYSCALL_64+0x0/0x7a Note: without KASAN, you may instead see a "Bad page state" message, or simply a general protection fault. Link: http://lkml.kernel.org/r/20170830033303.17927-1-ebiggers3@gmail.com Fixes: 7c051267931a ("mm, fork: make dup_mmap wait for mmap_sem for write killable") Signed-off-by: Eric Biggers Reported-by: Oleg Nesterov Acked-by: Oleg Nesterov Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Dmitry Vyukov Cc: Ingo Molnar Cc: Konstantin Khlebnikov Cc: Mark Rutland Cc: Michal Hocko Cc: Peter Zijlstra Cc: Vlastimil Babka Cc: [4.7+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 2 -- kernel/fork.c | 8 ++++++++ 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 0e137f98a50c..267f6ef91d97 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1262,8 +1262,6 @@ void uprobe_end_dup_mmap(void) void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm) { - newmm->uprobes_state.xol_area = NULL; - if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) { set_bit(MMF_HAS_UPROBES, &newmm->flags); /* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */ diff --git a/kernel/fork.c b/kernel/fork.c index cbbea277b3fb..b7e9e57b71ea 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -785,6 +785,13 @@ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) #endif } +static void mm_init_uprobes_state(struct mm_struct *mm) +{ +#ifdef CONFIG_UPROBES + mm->uprobes_state.xol_area = NULL; +#endif +} + static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, struct user_namespace *user_ns) { @@ -812,6 +819,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS mm->pmd_huge_pte = NULL; #endif + mm_init_uprobes_state(mm); if (current->mm) { mm->flags = current->mm->flags & MMF_INIT_MASK; -- cgit v1.2.3
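The two fork fixes above (->exe_file and ->uprobes_state.xol_area) follow the same rule. A hedged userspace sketch of that rule, with illustrative names rather than the kernel implementation: any pointer copied from the parent by the initial memcpy() must be reset in mm_init() before the first error path can run the destructor, otherwise the destructor releases the parent's object through the child's stale copy.

/*
 * Illustrative-only sketch: dup_mm() starts by memcpy()ing the parent,
 * so every owned pointer is briefly aliased.  mm_init() clears such
 * fields before any failure path can reach the destructor, which keeps
 * an early failure from freeing the parent's object.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct mm {
        char *exe_file;         /* owned reference, must not be inherited raw */
};

static void mm_destroy(struct mm *mm)
{
        free(mm->exe_file);     /* drops whatever reference mm owns */
        free(mm);
}

static int mm_init(struct mm *mm)
{
        /* Clear fields copied from the parent before any failure path. */
        mm->exe_file = NULL;
        return 0;
}

static struct mm *dup_mm(const struct mm *oldmm, int fail_early)
{
        struct mm *mm = malloc(sizeof(*mm));

        if (!mm)
                return NULL;
        memcpy(mm, oldmm, sizeof(*mm));         /* child now aliases parent pointers */

        if (mm_init(mm))
                goto fail;
        if (fail_early)                         /* e.g. killed before taking mmap_sem */
                goto fail;

        mm->exe_file = strdup(oldmm->exe_file); /* take our own reference */
        return mm;
fail:
        mm_destroy(mm);         /* safe only because mm_init() reset the field */
        return NULL;
}

int main(void)
{
        struct mm parent = { .exe_file = strdup("/bin/true") };
        struct mm *child = dup_mm(&parent, 1);

        printf("early failure handled, child=%p, parent exe_file intact: %s\n",
               (void *)child, parent.exe_file);
        free(parent.exe_file);
        return 0;
}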