From 77f88796cee819b9c4562b0b6b44691b3b7755b1 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 16 Mar 2017 16:54:24 -0400
Subject: cgroup, kthread: close race window where new kthreads can be migrated
 to non-root cgroups

Creation of a kthread goes through a couple interlocked stages between
the kthread itself and its creator.  Once the new kthread starts
running, it initializes itself and wakes up the creator.  The creator
then can further configure the kthread and then let it start doing its
job by waking it up.

In this configuration-by-creator stage, the creator is the only one
that can wake it up but the kthread is visible to userland.  When
altering the kthread's attributes from userland is allowed, this is
fine; however, for cases where CPU affinity is critical,
kthread_bind() is used to first disable affinity changes from userland
and then set the affinity.  This also prevents the kthread from being
migrated into non-root cgroups as that can affect the CPU affinity and
many other things.

Unfortunately, the cgroup side of protection is racy.  While the
PF_NO_SETAFFINITY flag prevents further migrations, userland can win
the race before the creator sets the flag with kthread_bind() and put
the kthread in a non-root cgroup, which can lead to all sorts of
problems including incorrect CPU affinity and starvation.

This bug got triggered by userland which periodically tries to migrate
all processes in the root cpuset cgroup to a non-root one.  Per-cpu
workqueue workers got caught while being created and ended up with
incorrected CPU affinity breaking concurrency management and sometimes
stalling workqueue execution.

This patch adds task->no_cgroup_migration which disallows the task to
be migrated by userland.  kthreadd starts with the flag set making
every child kthread start in the root cgroup with migration
disallowed.  The flag is cleared after the kthread finishes
initialization by which time PF_NO_SETAFFINITY is set if the kthread
should stay in the root cgroup.

It'd be better to wait for the initialization instead of failing but I
couldn't think of a way of implementing that without adding either a
new PF flag, or sleeping and retrying from waiting side.  Even if
userland depends on changing cgroup membership of a kthread, it either
has to be synchronized with kthread_create() or periodically repeat,
so it's unlikely that this would break anything.

v2: Switch to a simpler implementation using a new task_struct bit
    field suggested by Oleg.

Signed-off-by: Tejun Heo <tj@kernel.org>
Suggested-by: Oleg Nesterov <oleg@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Reported-and-debugged-by: Chris Mason <clm@fb.com>
Cc: stable@vger.kernel.org # v4.3+ (we can't close the race on < v4.3)
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cgroup.c | 9 +++++----
 kernel/kthread.c       | 3 +++
 2 files changed, 8 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 0125589c7428..638ef7568495 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2425,11 +2425,12 @@ ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
 		tsk = tsk->group_leader;
 
 	/*
-	 * Workqueue threads may acquire PF_NO_SETAFFINITY and become
-	 * trapped in a cpuset, or RT worker may be born in a cgroup
-	 * with no rt_runtime allocated.  Just say no.
+	 * kthreads may acquire PF_NO_SETAFFINITY during initialization.
+	 * If userland migrates such a kthread to a non-root cgroup, it can
+	 * become trapped in a cpuset, or RT kthread may be born in a
+	 * cgroup with no rt_runtime allocated.  Just say no.
 	 */
-	if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
+	if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
 		ret = -EINVAL;
 		goto out_unlock_rcu;
 	}
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 2f26adea0f84..26db528c1d88 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -20,6 +20,7 @@
 #include <linux/freezer.h>
 #include <linux/ptrace.h>
 #include <linux/uaccess.h>
+#include <linux/cgroup.h>
 #include <trace/events/sched.h>
 
 static DEFINE_SPINLOCK(kthread_create_lock);
@@ -225,6 +226,7 @@ static int kthread(void *_create)
 
 	ret = -EINTR;
 	if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) {
+		cgroup_kthread_ready();
 		__kthread_parkme(self);
 		ret = threadfn(data);
 	}
@@ -538,6 +540,7 @@ int kthreadd(void *unused)
 	set_mems_allowed(node_states[N_MEMORY]);
 
 	current->flags |= PF_NOFREEZE;
+	cgroup_init_kthreadd();
 
 	for (;;) {
 		set_current_state(TASK_INTERRUPTIBLE);
-- 
cgit v1.2.3


From 71fdb70eb48784c1f28cdf2e67c4c587dd7f2594 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 13 Mar 2017 13:46:21 +0100
Subject: sched/clock: Fix clear_sched_clock_stable() preempt wobbly

Paul reported a problems with clear_sched_clock_stable(). Since we run
all of __clear_sched_clock_stable() from workqueue context, there's a
preempt problem.

Solve it by only running the static_key_disable() from workqueue.

Reported-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: fweisbec@gmail.com
Link: http://lkml.kernel.org/r/20170313124621.GA3328@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/clock.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index a08795e21628..fec0f58c8dee 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -141,7 +141,14 @@ static void __set_sched_clock_stable(void)
 	tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE);
 }
 
-static void __clear_sched_clock_stable(struct work_struct *work)
+static void __sched_clock_work(struct work_struct *work)
+{
+	static_branch_disable(&__sched_clock_stable);
+}
+
+static DECLARE_WORK(sched_clock_work, __sched_clock_work);
+
+static void __clear_sched_clock_stable(void)
 {
 	struct sched_clock_data *scd = this_scd();
 
@@ -160,11 +167,11 @@ static void __clear_sched_clock_stable(struct work_struct *work)
 			scd->tick_gtod, gtod_offset,
 			scd->tick_raw,  raw_offset);
 
-	static_branch_disable(&__sched_clock_stable);
 	tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE);
-}
 
-static DECLARE_WORK(sched_clock_work, __clear_sched_clock_stable);
+	if (sched_clock_stable())
+		schedule_work(&sched_clock_work);
+}
 
 void clear_sched_clock_stable(void)
 {
@@ -173,7 +180,7 @@ void clear_sched_clock_stable(void)
 	smp_mb(); /* matches sched_clock_init_late() */
 
 	if (sched_clock_running == 2)
-		schedule_work(&sched_clock_work);
+		__clear_sched_clock_stable();
 }
 
 void sched_clock_init_late(void)
-- 
cgit v1.2.3


From 698eff6355f735d46d1b7113df8b422874cd7988 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 17 Mar 2017 12:48:18 +0100
Subject: sched/clock, x86/perf: Fix "perf test tsc"

People reported that commit:

  5680d8094ffa ("sched/clock: Provide better clock continuity")

broke "perf test tsc".

That commit added another offset to the reported clock value; so
take that into account when computing the provided offset values.

Reported-by: Adrian Hunter <adrian.hunter@intel.com>
Reported-by: Arnaldo Carvalho de Melo <acme@kernel.org>
Tested-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 5680d8094ffa ("sched/clock: Provide better clock continuity")
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/events/core.c       |  9 ++++++---
 arch/x86/include/asm/timer.h |  2 ++
 arch/x86/kernel/tsc.c        |  4 ++--
 include/linux/sched/clock.h  | 13 +++++++------
 kernel/sched/clock.c         | 22 +++++++++++-----------
 5 files changed, 28 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 2aa1ad194db2..580b60f5ac83 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2256,6 +2256,7 @@ void arch_perf_update_userpage(struct perf_event *event,
 			       struct perf_event_mmap_page *userpg, u64 now)
 {
 	struct cyc2ns_data *data;
+	u64 offset;
 
 	userpg->cap_user_time = 0;
 	userpg->cap_user_time_zero = 0;
@@ -2263,11 +2264,13 @@ void arch_perf_update_userpage(struct perf_event *event,
 		!!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED);
 	userpg->pmc_width = x86_pmu.cntval_bits;
 
-	if (!sched_clock_stable())
+	if (!using_native_sched_clock() || !sched_clock_stable())
 		return;
 
 	data = cyc2ns_read_begin();
 
+	offset = data->cyc2ns_offset + __sched_clock_offset;
+
 	/*
 	 * Internal timekeeping for enabled/running/stopped times
 	 * is always in the local_clock domain.
@@ -2275,7 +2278,7 @@ void arch_perf_update_userpage(struct perf_event *event,
 	userpg->cap_user_time = 1;
 	userpg->time_mult = data->cyc2ns_mul;
 	userpg->time_shift = data->cyc2ns_shift;
-	userpg->time_offset = data->cyc2ns_offset - now;
+	userpg->time_offset = offset - now;
 
 	/*
 	 * cap_user_time_zero doesn't make sense when we're using a different
@@ -2283,7 +2286,7 @@ void arch_perf_update_userpage(struct perf_event *event,
 	 */
 	if (!event->attr.use_clockid) {
 		userpg->cap_user_time_zero = 1;
-		userpg->time_zero = data->cyc2ns_offset;
+		userpg->time_zero = offset;
 	}
 
 	cyc2ns_read_end(data);
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index a04eabd43d06..27e9f9d769b8 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -12,6 +12,8 @@ extern int recalibrate_cpu_khz(void);
 
 extern int no_timer_check;
 
+extern bool using_native_sched_clock(void);
+
 /*
  * We use the full linear equation: f(x) = a + b*x, in order to allow
  * a continuous function in the face of dynamic freq changes.
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index c73a7f9e881a..714dfba6a1e7 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -328,7 +328,7 @@ unsigned long long sched_clock(void)
 	return paravirt_sched_clock();
 }
 
-static inline bool using_native_sched_clock(void)
+bool using_native_sched_clock(void)
 {
 	return pv_time_ops.sched_clock == native_sched_clock;
 }
@@ -336,7 +336,7 @@ static inline bool using_native_sched_clock(void)
 unsigned long long
 sched_clock(void) __attribute__((alias("native_sched_clock")));
 
-static inline bool using_native_sched_clock(void) { return true; }
+bool using_native_sched_clock(void) { return true; }
 #endif
 
 int check_tsc_unstable(void)
diff --git a/include/linux/sched/clock.h b/include/linux/sched/clock.h
index 4a68c6791207..34fe92ce1ebd 100644
--- a/include/linux/sched/clock.h
+++ b/include/linux/sched/clock.h
@@ -54,15 +54,16 @@ static inline u64 local_clock(void)
 }
 #else
 extern void sched_clock_init_late(void);
-/*
- * Architectures can set this to 1 if they have specified
- * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig,
- * but then during bootup it turns out that sched_clock()
- * is reliable after all:
- */
 extern int sched_clock_stable(void);
 extern void clear_sched_clock_stable(void);
 
+/*
+ * When sched_clock_stable(), __sched_clock_offset provides the offset
+ * between local_clock() and sched_clock().
+ */
+extern u64 __sched_clock_offset;
+
+
 extern void sched_clock_tick(void);
 extern void sched_clock_idle_sleep_event(void);
 extern void sched_clock_idle_wakeup_event(u64 delta_ns);
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index fec0f58c8dee..24a3e01bf8cb 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -96,10 +96,10 @@ static DEFINE_STATIC_KEY_FALSE(__sched_clock_stable);
 static int __sched_clock_stable_early = 1;
 
 /*
- * We want: ktime_get_ns() + gtod_offset == sched_clock() + raw_offset
+ * We want: ktime_get_ns() + __gtod_offset == sched_clock() + __sched_clock_offset
  */
-static __read_mostly u64 raw_offset;
-static __read_mostly u64 gtod_offset;
+__read_mostly u64 __sched_clock_offset;
+static __read_mostly u64 __gtod_offset;
 
 struct sched_clock_data {
 	u64			tick_raw;
@@ -131,11 +131,11 @@ static void __set_sched_clock_stable(void)
 	/*
 	 * Attempt to make the (initial) unstable->stable transition continuous.
 	 */
-	raw_offset = (scd->tick_gtod + gtod_offset) - (scd->tick_raw);
+	__sched_clock_offset = (scd->tick_gtod + __gtod_offset) - (scd->tick_raw);
 
 	printk(KERN_INFO "sched_clock: Marking stable (%lld, %lld)->(%lld, %lld)\n",
-			scd->tick_gtod, gtod_offset,
-			scd->tick_raw,  raw_offset);
+			scd->tick_gtod, __gtod_offset,
+			scd->tick_raw,  __sched_clock_offset);
 
 	static_branch_enable(&__sched_clock_stable);
 	tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE);
@@ -161,11 +161,11 @@ static void __clear_sched_clock_stable(void)
 	 *
 	 * Still do what we can.
 	 */
-	gtod_offset = (scd->tick_raw + raw_offset) - (scd->tick_gtod);
+	__gtod_offset = (scd->tick_raw + __sched_clock_offset) - (scd->tick_gtod);
 
 	printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n",
-			scd->tick_gtod, gtod_offset,
-			scd->tick_raw,  raw_offset);
+			scd->tick_gtod, __gtod_offset,
+			scd->tick_raw,  __sched_clock_offset);
 
 	tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE);
 
@@ -238,7 +238,7 @@ again:
 	 *		      scd->tick_gtod + TICK_NSEC);
 	 */
 
-	clock = scd->tick_gtod + gtod_offset + delta;
+	clock = scd->tick_gtod + __gtod_offset + delta;
 	min_clock = wrap_max(scd->tick_gtod, old_clock);
 	max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC);
 
@@ -324,7 +324,7 @@ u64 sched_clock_cpu(int cpu)
 	u64 clock;
 
 	if (sched_clock_stable())
-		return sched_clock() + raw_offset;
+		return sched_clock() + __sched_clock_offset;
 
 	if (unlikely(!sched_clock_running))
 		return 0ull;
-- 
cgit v1.2.3


From de5540d088fe97ad583cc7d396586437b32149a5 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Thu, 23 Mar 2017 12:24:43 +0100
Subject: padata: avoid race in reordering

Under extremely heavy uses of padata, crashes occur, and with list
debugging turned on, this happens instead:

[87487.298728] WARNING: CPU: 1 PID: 882 at lib/list_debug.c:33
__list_add+0xae/0x130
[87487.301868] list_add corruption. prev->next should be next
(ffffb17abfc043d0), but was ffff8dba70872c80. (prev=ffff8dba70872b00).
[87487.339011]  [<ffffffff9a53d075>] dump_stack+0x68/0xa3
[87487.342198]  [<ffffffff99e119a1>] ? console_unlock+0x281/0x6d0
[87487.345364]  [<ffffffff99d6b91f>] __warn+0xff/0x140
[87487.348513]  [<ffffffff99d6b9aa>] warn_slowpath_fmt+0x4a/0x50
[87487.351659]  [<ffffffff9a58b5de>] __list_add+0xae/0x130
[87487.354772]  [<ffffffff9add5094>] ? _raw_spin_lock+0x64/0x70
[87487.357915]  [<ffffffff99eefd66>] padata_reorder+0x1e6/0x420
[87487.361084]  [<ffffffff99ef0055>] padata_do_serial+0xa5/0x120

padata_reorder calls list_add_tail with the list to which its adding
locked, which seems correct:

spin_lock(&squeue->serial.lock);
list_add_tail(&padata->list, &squeue->serial.list);
spin_unlock(&squeue->serial.lock);

This therefore leaves only place where such inconsistency could occur:
if padata->list is added at the same time on two different threads.
This pdata pointer comes from the function call to
padata_get_next(pd), which has in it the following block:

next_queue = per_cpu_ptr(pd->pqueue, cpu);
padata = NULL;
reorder = &next_queue->reorder;
if (!list_empty(&reorder->list)) {
       padata = list_entry(reorder->list.next,
                           struct padata_priv, list);
       spin_lock(&reorder->lock);
       list_del_init(&padata->list);
       atomic_dec(&pd->reorder_objects);
       spin_unlock(&reorder->lock);

       pd->processed++;

       goto out;
}
out:
return padata;

I strongly suspect that the problem here is that two threads can race
on reorder list. Even though the deletion is locked, call to
list_entry is not locked, which means it's feasible that two threads
pick up the same padata object and subsequently call list_add_tail on
them at the same time. The fix is thus be hoist that lock outside of
that block.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Acked-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 kernel/padata.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/padata.c b/kernel/padata.c
index 05316c9f32da..3202aa17492c 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -186,19 +186,20 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
 
 	reorder = &next_queue->reorder;
 
+	spin_lock(&reorder->lock);
 	if (!list_empty(&reorder->list)) {
 		padata = list_entry(reorder->list.next,
 				    struct padata_priv, list);
 
-		spin_lock(&reorder->lock);
 		list_del_init(&padata->list);
 		atomic_dec(&pd->reorder_objects);
-		spin_unlock(&reorder->lock);
 
 		pd->processed++;
 
+		spin_unlock(&reorder->lock);
 		goto out;
 	}
+	spin_unlock(&reorder->lock);
 
 	if (__this_cpu_read(pd->pqueue->cpu_index) == next_queue->cpu_index) {
 		padata = ERR_PTR(-ENODATA);
-- 
cgit v1.2.3


From b1977682a3858b5584ffea7cfb7bd863f68db18d Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Fri, 24 Mar 2017 15:57:33 -0700
Subject: bpf: improve verifier packet range checks

llvm can optimize the 'if (ptr > data_end)' checks to be in the order
slightly different than the original C code which will confuse verifier.
Like:
if (ptr + 16 > data_end)
  return TC_ACT_SHOT;
// may be followed by
if (ptr + 14 > data_end)
  return TC_ACT_SHOT;
while llvm can see that 'ptr' is valid for all 16 bytes,
the verifier could not.
Fix verifier logic to account for such case and add a test.

Reported-by: Huapeng Zhou <hzhou@fb.com>
Fixes: 969bf05eb3ce ("bpf: direct packet access")
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c                       |  5 +++--
 tools/testing/selftests/bpf/test_verifier.c | 20 ++++++++++++++++++++
 2 files changed, 23 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 796b68d00119..5e6202e62265 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1973,14 +1973,15 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
 
 	for (i = 0; i < MAX_BPF_REG; i++)
 		if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id)
-			regs[i].range = dst_reg->off;
+			/* keep the maximum range already checked */
+			regs[i].range = max(regs[i].range, dst_reg->off);
 
 	for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
 		if (state->stack_slot_type[i] != STACK_SPILL)
 			continue;
 		reg = &state->spilled_regs[i / BPF_REG_SIZE];
 		if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id)
-			reg->range = dst_reg->off;
+			reg->range = max(reg->range, dst_reg->off);
 	}
 }
 
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index d1555e4240c0..7d761d4cc759 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -3417,6 +3417,26 @@ static struct bpf_test tests[] = {
 		.result = ACCEPT,
 		.prog_type = BPF_PROG_TYPE_LWT_XMIT,
 	},
+	{
+		"overlapping checks for direct packet access",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+				    offsetof(struct __sk_buff, data)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+				    offsetof(struct __sk_buff, data_end)),
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+			BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 4),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 6),
+			BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_3, 1),
+			BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_2, 6),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_LWT_XMIT,
+	},
 	{
 		"invalid access of tc_classid for LWT_IN",
 		.insns = {
-- 
cgit v1.2.3


From 7b09cc5a9debc86c903c2eff8f8a1fdef773c649 Mon Sep 17 00:00:00 2001
From: Pavel Tatashin <pasha.tatashin@oracle.com>
Date: Wed, 22 Mar 2017 16:24:17 -0400
Subject: sched/clock: Fix broken stable to unstable transfer

When it is determined that the clock is actually unstable, and
we switch from stable to unstable, the __clear_sched_clock_stable()
function is eventually called.

In this function we set gtod_offset so the following holds true:

  sched_clock() + raw_offset == ktime_get_ns() + gtod_offset

But instead of getting the latest timestamps, we use the last values
from scd, so instead of sched_clock() we use scd->tick_raw, and
instead of ktime_get_ns() we use scd->tick_gtod.

However, later, when we use gtod_offset sched_clock_local() we do not
add it to scd->tick_gtod to calculate the correct clock value when we
determine the boundaries for min/max clocks.

This can result in tick granularity sched_clock() values, so fix it.

Signed-off-by: Pavel Tatashin <pasha.tatashin@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: hpa@zytor.com
Fixes: 5680d8094ffa ("sched/clock: Provide better clock continuity")
Link: http://lkml.kernel.org/r/1490214265-899964-2-git-send-email-pasha.tatashin@oracle.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/clock.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index 24a3e01bf8cb..00a45c45beca 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -221,7 +221,7 @@ static inline u64 wrap_max(u64 x, u64 y)
  */
 static u64 sched_clock_local(struct sched_clock_data *scd)
 {
-	u64 now, clock, old_clock, min_clock, max_clock;
+	u64 now, clock, old_clock, min_clock, max_clock, gtod;
 	s64 delta;
 
 again:
@@ -238,9 +238,10 @@ again:
 	 *		      scd->tick_gtod + TICK_NSEC);
 	 */
 
-	clock = scd->tick_gtod + __gtod_offset + delta;
-	min_clock = wrap_max(scd->tick_gtod, old_clock);
-	max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC);
+	gtod = scd->tick_gtod + __gtod_offset;
+	clock = gtod + delta;
+	min_clock = wrap_max(gtod, old_clock);
+	max_clock = wrap_max(old_clock, gtod + TICK_NSEC);
 
 	clock = wrap_max(clock, min_clock);
 	clock = wrap_min(clock, max_clock);
-- 
cgit v1.2.3


From ab6434a1377a768a1e6d3e6cf819eb21724a99c2 Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Mon, 27 Mar 2017 14:30:06 -0400
Subject: audit: move audit_signal_info() into kernel/auditsc.c

Commit 5b52330bbfe6 ("audit: fix auditd/kernel connection state
tracking") made inlining audit_signal_info() a bit pointless as
it was always calling into auditd_test_task() so let's remove the
inline function in kernel/audit.h and convert __audit_signal_info()
in kernel/auditsc.c into audit_signal_info().

Reviewed-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit.h   |  8 +-------
 kernel/auditsc.c | 25 +++++++++++++------------
 2 files changed, 14 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.h b/kernel/audit.h
index 0f1cf6d1878a..0d87f8ab8778 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -333,13 +333,7 @@ extern u32 audit_sig_sid;
 extern int audit_filter(int msgtype, unsigned int listtype);
 
 #ifdef CONFIG_AUDITSYSCALL
-extern int __audit_signal_info(int sig, struct task_struct *t);
-static inline int audit_signal_info(int sig, struct task_struct *t)
-{
-	if (auditd_test_task(t) || (audit_signals && !audit_dummy_context()))
-		return __audit_signal_info(sig, t);
-	return 0;
-}
+extern int audit_signal_info(int sig, struct task_struct *t);
 extern void audit_filter_inodes(struct task_struct *, struct audit_context *);
 extern struct list_head *audit_killed_trees(void);
 #else
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index e59ffc7fc522..1c2333155893 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2249,26 +2249,27 @@ void __audit_ptrace(struct task_struct *t)
  * If the audit subsystem is being terminated, record the task (pid)
  * and uid that is doing that.
  */
-int __audit_signal_info(int sig, struct task_struct *t)
+int audit_signal_info(int sig, struct task_struct *t)
 {
 	struct audit_aux_data_pids *axp;
 	struct task_struct *tsk = current;
 	struct audit_context *ctx = tsk->audit_context;
 	kuid_t uid = current_uid(), t_uid = task_uid(t);
 
-	if (auditd_test_task(t)) {
-		if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) {
-			audit_sig_pid = task_tgid_nr(tsk);
-			if (uid_valid(tsk->loginuid))
-				audit_sig_uid = tsk->loginuid;
-			else
-				audit_sig_uid = uid;
-			security_task_getsecid(tsk, &audit_sig_sid);
-		}
-		if (!audit_signals || audit_dummy_context())
-			return 0;
+	if (auditd_test_task(t) &&
+	    (sig == SIGTERM || sig == SIGHUP ||
+	     sig == SIGUSR1 || sig == SIGUSR2)) {
+		audit_sig_pid = task_tgid_nr(tsk);
+		if (uid_valid(tsk->loginuid))
+			audit_sig_uid = tsk->loginuid;
+		else
+			audit_sig_uid = uid;
+		security_task_getsecid(tsk, &audit_sig_sid);
 	}
 
+	if (!audit_signals || audit_dummy_context())
+		return 0;
+
 	/* optimize the common case by putting first signal recipient directly
 	 * in audit_context */
 	if (!ctx->target_pid) {
-- 
cgit v1.2.3


From fce366a9dd0ddc47e7ce05611c266e8574a45116 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 31 Mar 2017 02:24:02 +0200
Subject: bpf, verifier: fix alu ops against map_value{, _adj} register types

While looking into map_value_adj, I noticed that alu operations
directly on the map_value() resp. map_value_adj() register (any
alu operation on a map_value() register will turn it into a
map_value_adj() typed register) are not sufficiently protected
against some of the operations. Two non-exhaustive examples are
provided that the verifier needs to reject:

 i) BPF_AND on r0 (map_value_adj):

  0: (bf) r2 = r10
  1: (07) r2 += -8
  2: (7a) *(u64 *)(r2 +0) = 0
  3: (18) r1 = 0xbf842a00
  5: (85) call bpf_map_lookup_elem#1
  6: (15) if r0 == 0x0 goto pc+2
   R0=map_value(ks=8,vs=48,id=0),min_value=0,max_value=0 R10=fp
  7: (57) r0 &= 8
  8: (7a) *(u64 *)(r0 +0) = 22
   R0=map_value_adj(ks=8,vs=48,id=0),min_value=0,max_value=8 R10=fp
  9: (95) exit

  from 6 to 9: R0=inv,min_value=0,max_value=0 R10=fp
  9: (95) exit
  processed 10 insns

ii) BPF_ADD in 32 bit mode on r0 (map_value_adj):

  0: (bf) r2 = r10
  1: (07) r2 += -8
  2: (7a) *(u64 *)(r2 +0) = 0
  3: (18) r1 = 0xc24eee00
  5: (85) call bpf_map_lookup_elem#1
  6: (15) if r0 == 0x0 goto pc+2
   R0=map_value(ks=8,vs=48,id=0),min_value=0,max_value=0 R10=fp
  7: (04) (u32) r0 += (u32) 0
  8: (7a) *(u64 *)(r0 +0) = 22
   R0=map_value_adj(ks=8,vs=48,id=0),min_value=0,max_value=0 R10=fp
  9: (95) exit

  from 6 to 9: R0=inv,min_value=0,max_value=0 R10=fp
  9: (95) exit
  processed 10 insns

Issue is, while min_value / max_value boundaries for the access
are adjusted appropriately, we change the pointer value in a way
that cannot be sufficiently tracked anymore from its origin.
Operations like BPF_{AND,OR,DIV,MUL,etc} on a destination register
that is PTR_TO_MAP_VALUE{,_ADJ} was probably unintended, in fact,
all the test cases coming with 484611357c19 ("bpf: allow access
into map value arrays") perform BPF_ADD only on the destination
register that is PTR_TO_MAP_VALUE_ADJ.

Only for UNKNOWN_VALUE register types such operations make sense,
f.e. with unknown memory content fetched initially from a constant
offset from the map value memory into a register. That register is
then later tested against lower / upper bounds, so that the verifier
can then do the tracking of min_value / max_value, and properly
check once that UNKNOWN_VALUE register is added to the destination
register with type PTR_TO_MAP_VALUE{,_ADJ}. This is also what the
original use-case is solving. Note, tracking on what is being
added is done through adjust_reg_min_max_vals() and later access
to the map value enforced with these boundaries and the given offset
from the insn through check_map_access_adj().

Tests will fail for non-root environment due to prohibited pointer
arithmetic, in particular in check_alu_op(), we bail out on the
is_pointer_value() check on the dst_reg (which is false in root
case as we allow for pointer arithmetic via env->allow_ptr_leaks).

Similarly to PTR_TO_PACKET, one way to fix it is to restrict the
allowed operations on PTR_TO_MAP_VALUE{,_ADJ} registers to 64 bit
mode BPF_ADD. The test_verifier suite runs fine after the patch
and it also rejects mentioned test cases.

Fixes: 484611357c19 ("bpf: allow access into map value arrays")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5e6202e62265..86deddecff25 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1925,6 +1925,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 		 * register as unknown.
 		 */
 		if (env->allow_ptr_leaks &&
+		    BPF_CLASS(insn->code) == BPF_ALU64 && opcode == BPF_ADD &&
 		    (dst_reg->type == PTR_TO_MAP_VALUE ||
 		     dst_reg->type == PTR_TO_MAP_VALUE_ADJ))
 			dst_reg->type = PTR_TO_MAP_VALUE_ADJ;
-- 
cgit v1.2.3


From 79adffcd6489ef43bda2dfded3d637d7fb4fac80 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 31 Mar 2017 02:24:03 +0200
Subject: bpf, verifier: fix rejection of unaligned access checks for
 map_value_adj

Currently, the verifier doesn't reject unaligned access for map_value_adj
register types. Commit 484611357c19 ("bpf: allow access into map value
arrays") added logic to check_ptr_alignment() extending it from PTR_TO_PACKET
to also PTR_TO_MAP_VALUE_ADJ, but for PTR_TO_MAP_VALUE_ADJ no enforcement
is in place, because reg->id for PTR_TO_MAP_VALUE_ADJ reg types is never
non-zero, meaning, we can cause BPF_H/_W/_DW-based unaligned access for
architectures not supporting efficient unaligned access, and thus worst
case could raise exceptions on some archs that are unable to correct the
unaligned access or perform a different memory access to the actual
requested one and such.

i) Unaligned load with !CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
   on r0 (map_value_adj):

   0: (bf) r2 = r10
   1: (07) r2 += -8
   2: (7a) *(u64 *)(r2 +0) = 0
   3: (18) r1 = 0x42533a00
   5: (85) call bpf_map_lookup_elem#1
   6: (15) if r0 == 0x0 goto pc+11
    R0=map_value(ks=8,vs=48,id=0),min_value=0,max_value=0 R10=fp
   7: (61) r1 = *(u32 *)(r0 +0)
   8: (35) if r1 >= 0xb goto pc+9
    R0=map_value(ks=8,vs=48,id=0),min_value=0,max_value=0 R1=inv,min_value=0,max_value=10 R10=fp
   9: (07) r0 += 3
  10: (79) r7 = *(u64 *)(r0 +0)
    R0=map_value_adj(ks=8,vs=48,id=0),min_value=3,max_value=3 R1=inv,min_value=0,max_value=10 R10=fp
  11: (79) r7 = *(u64 *)(r0 +2)
    R0=map_value_adj(ks=8,vs=48,id=0),min_value=3,max_value=3 R1=inv,min_value=0,max_value=10 R7=inv R10=fp
  [...]

ii) Unaligned store with !CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
    on r0 (map_value_adj):

   0: (bf) r2 = r10
   1: (07) r2 += -8
   2: (7a) *(u64 *)(r2 +0) = 0
   3: (18) r1 = 0x4df16a00
   5: (85) call bpf_map_lookup_elem#1
   6: (15) if r0 == 0x0 goto pc+19
    R0=map_value(ks=8,vs=48,id=0),min_value=0,max_value=0 R10=fp
   7: (07) r0 += 3
   8: (7a) *(u64 *)(r0 +0) = 42
    R0=map_value_adj(ks=8,vs=48,id=0),min_value=3,max_value=3 R10=fp
   9: (7a) *(u64 *)(r0 +2) = 43
    R0=map_value_adj(ks=8,vs=48,id=0),min_value=3,max_value=3 R10=fp
  10: (7a) *(u64 *)(r0 -2) = 44
    R0=map_value_adj(ks=8,vs=48,id=0),min_value=3,max_value=3 R10=fp
  [...]

For the PTR_TO_PACKET type, reg->id is initially zero when skb->data
was fetched, it later receives a reg->id from env->id_gen generator
once another register with UNKNOWN_VALUE type was added to it via
check_packet_ptr_add(). The purpose of this reg->id is twofold: i) it
is used in find_good_pkt_pointers() for setting the allowed access
range for regs with PTR_TO_PACKET of same id once verifier matched
on data/data_end tests, and ii) for check_ptr_alignment() to determine
that when not having efficient unaligned access and register with
UNKNOWN_VALUE was added to PTR_TO_PACKET, that we're only allowed
to access the content bytewise due to unknown unalignment. reg->id
was never intended for PTR_TO_MAP_VALUE{,_ADJ} types and thus is
always zero, the only marking is in PTR_TO_MAP_VALUE_OR_NULL that
was added after 484611357c19 via 57a09bf0a416 ("bpf: Detect identical
PTR_TO_MAP_VALUE_OR_NULL registers"). Above tests will fail for
non-root environment due to prohibited pointer arithmetic.

The fix splits register-type specific checks into their own helper
instead of keeping them combined, so we don't run into a similar
issue in future once we extend check_ptr_alignment() further and
forget to add reg->type checks for some of the checks.

Fixes: 484611357c19 ("bpf: allow access into map value arrays")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 58 +++++++++++++++++++++++++++++++++------------------
 1 file changed, 38 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 86deddecff25..a834068a400e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -765,38 +765,56 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
 	}
 }
 
-static int check_ptr_alignment(struct bpf_verifier_env *env,
-			       struct bpf_reg_state *reg, int off, int size)
+static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
+				   int off, int size)
 {
-	if (reg->type != PTR_TO_PACKET && reg->type != PTR_TO_MAP_VALUE_ADJ) {
-		if (off % size != 0) {
-			verbose("misaligned access off %d size %d\n",
-				off, size);
-			return -EACCES;
-		} else {
-			return 0;
-		}
-	}
-
-	if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
-		/* misaligned access to packet is ok on x86,arm,arm64 */
-		return 0;
-
 	if (reg->id && size != 1) {
-		verbose("Unknown packet alignment. Only byte-sized access allowed\n");
+		verbose("Unknown alignment. Only byte-sized access allowed in packet access.\n");
 		return -EACCES;
 	}
 
 	/* skb->data is NET_IP_ALIGN-ed */
-	if (reg->type == PTR_TO_PACKET &&
-	    (NET_IP_ALIGN + reg->off + off) % size != 0) {
+	if ((NET_IP_ALIGN + reg->off + off) % size != 0) {
 		verbose("misaligned packet access off %d+%d+%d size %d\n",
 			NET_IP_ALIGN, reg->off, off, size);
 		return -EACCES;
 	}
+
 	return 0;
 }
 
+static int check_val_ptr_alignment(const struct bpf_reg_state *reg,
+				   int size)
+{
+	if (size != 1) {
+		verbose("Unknown alignment. Only byte-sized access allowed in value access.\n");
+		return -EACCES;
+	}
+
+	return 0;
+}
+
+static int check_ptr_alignment(const struct bpf_reg_state *reg,
+			       int off, int size)
+{
+	switch (reg->type) {
+	case PTR_TO_PACKET:
+		return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ? 0 :
+		       check_pkt_ptr_alignment(reg, off, size);
+	case PTR_TO_MAP_VALUE_ADJ:
+		return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ? 0 :
+		       check_val_ptr_alignment(reg, size);
+	default:
+		if (off % size != 0) {
+			verbose("misaligned access off %d size %d\n",
+				off, size);
+			return -EACCES;
+		}
+
+		return 0;
+	}
+}
+
 /* check whether memory at (regno + off) is accessible for t = (read | write)
  * if t==write, value_regno is a register which value is stored into memory
  * if t==read, value_regno is a register which will receive the value from memory
@@ -818,7 +836,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
 	if (size < 0)
 		return size;
 
-	err = check_ptr_alignment(env, reg, off, size);
+	err = check_ptr_alignment(reg, off, size);
 	if (err)
 		return err;
 
-- 
cgit v1.2.3


From 7bf8222b9bd0ba867e18b7f4537b61ef2e92eee8 Mon Sep 17 00:00:00 2001
From: Keith Busch <keith.busch@intel.com>
Date: Mon, 3 Apr 2017 15:25:53 -0400
Subject: irq/affinity: Fix CPU spread for unbalanced nodes

The irq_create_affinity_masks routine is responsible for assigning a
number of interrupt vectors to CPUs. The optimal assignemnet will spread
requested vectors to all CPUs, with the fewest CPUs sharing a vector.

The algorithm may fail to assign some vectors to any CPUs if a node's
CPU count is lower than the average number of vectors per node. These
vectors are unusable and create an un-optimal spread.

Recalculate the number of vectors to assign at each node iteration by using
the remaining number of vectors and nodes to be assigned, not exceeding the
number of CPUs in that node. This will guarantee that every CPU is assigned
at least one vector.

Signed-off-by: Keith Busch <keith.busch@intel.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: linux-nvme@lists.infradead.org
Link: http://lkml.kernel.org/r/1491247553-7603-1-git-send-email-keith.busch@intel.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/affinity.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 4544b115f5eb..dc529116f7e6 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -59,7 +59,7 @@ static int get_nodes_in_cpumask(const struct cpumask *mask, nodemask_t *nodemsk)
 struct cpumask *
 irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
 {
-	int n, nodes, vecs_per_node, cpus_per_vec, extra_vecs, curvec;
+	int n, nodes, cpus_per_vec, extra_vecs, curvec;
 	int affv = nvecs - affd->pre_vectors - affd->post_vectors;
 	int last_affv = affv + affd->pre_vectors;
 	nodemask_t nodemsk = NODE_MASK_NONE;
@@ -94,19 +94,21 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
 		goto done;
 	}
 
-	/* Spread the vectors per node */
-	vecs_per_node = affv / nodes;
-	/* Account for rounding errors */
-	extra_vecs = affv - (nodes * vecs_per_node);
-
 	for_each_node_mask(n, nodemsk) {
-		int ncpus, v, vecs_to_assign = vecs_per_node;
+		int ncpus, v, vecs_to_assign, vecs_per_node;
+
+		/* Spread the vectors per node */
+		vecs_per_node = (affv - curvec) / nodes;
 
 		/* Get the cpus on this node which are in the mask */
 		cpumask_and(nmsk, cpu_online_mask, cpumask_of_node(n));
 
 		/* Calculate the number of cpus per vector */
 		ncpus = cpumask_weight(nmsk);
+		vecs_to_assign = min(vecs_per_node, ncpus);
+
+		/* Account for rounding errors */
+		extra_vecs = ncpus - vecs_to_assign;
 
 		for (v = 0; curvec < last_affv && v < vecs_to_assign;
 		     curvec++, v++) {
@@ -115,14 +117,14 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
 			/* Account for extra vectors to compensate rounding errors */
 			if (extra_vecs) {
 				cpus_per_vec++;
-				if (!--extra_vecs)
-					vecs_per_node++;
+				--extra_vecs;
 			}
 			irq_spread_init_one(masks + curvec, nmsk, cpus_per_vec);
 		}
 
 		if (curvec >= last_affv)
 			break;
+		--nodes;
 	}
 
 done:
-- 
cgit v1.2.3


From 62277de758b155dc04b78f195a1cb5208c37b2df Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Date: Fri, 17 Jun 2016 17:33:59 +0000
Subject: ring-buffer: Fix return value check in test_ringbuffer()

In case of error, the function kthread_run() returns ERR_PTR()
and never returns NULL. The NULL test in the return value check
should be replaced with IS_ERR().

Link: http://lkml.kernel.org/r/1466184839-14927-1-git-send-email-weiyj_lk@163.com

Cc: stable@vger.kernel.org
Fixes: 6c43e554a ("ring-buffer: Add ring buffer startup selftest")
Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 96fc3c043ad6..54e7a90db848 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -4826,9 +4826,9 @@ static __init int test_ringbuffer(void)
 		rb_data[cpu].cnt = cpu;
 		rb_threads[cpu] = kthread_create(rb_test, &rb_data[cpu],
 						 "rbtester/%d", cpu);
-		if (WARN_ON(!rb_threads[cpu])) {
+		if (WARN_ON(IS_ERR(rb_threads[cpu]))) {
 			pr_cont("FAILED\n");
-			ret = -1;
+			ret = PTR_ERR(rb_threads[cpu]);
 			goto out_free;
 		}
 
@@ -4838,9 +4838,9 @@ static __init int test_ringbuffer(void)
 
 	/* Now create the rb hammer! */
 	rb_hammer = kthread_run(rb_hammer_test, NULL, "rbhammer");
-	if (WARN_ON(!rb_hammer)) {
+	if (WARN_ON(IS_ERR(rb_hammer))) {
 		pr_cont("FAILED\n");
-		ret = -1;
+		ret = PTR_ERR(rb_hammer);
 		goto out_free;
 	}
 
-- 
cgit v1.2.3


From 5380e5644afbba9e3d229c36771134976f05c91e Mon Sep 17 00:00:00 2001
From: Liping Zhang <zlpnobody@gmail.com>
Date: Fri, 7 Apr 2017 23:51:06 +0800
Subject: sysctl: don't print negative flag for proc_douintvec

I saw some very confusing sysctl output on my system:
  # cat /proc/sys/net/core/xfrm_aevent_rseqth
  -2
  # cat /proc/sys/net/core/xfrm_aevent_etime
  -10
  # cat /proc/sys/net/ipv4/tcp_notsent_lowat
  -4294967295

Because we forget to set the *negp flag in proc_douintvec, so it will
become a garbage value.

Since the value related to proc_douintvec is always an unsigned integer,
so we can set *negp to false explictily to fix this issue.

Fixes: e7d316a02f68 ("sysctl: handle error writing UINT_MAX to u32 fields")
Signed-off-by: Liping Zhang <zlpnobody@gmail.com>
Cc: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sysctl.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index acf0a5a06da7..8cca4c7bb21f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2136,6 +2136,7 @@ static int do_proc_douintvec_conv(bool *negp, unsigned long *lvalp,
 		*valp = *lvalp;
 	} else {
 		unsigned int val = *valp;
+		*negp = false;
 		*lvalp = (unsigned long)val;
 	}
 	return 0;
-- 
cgit v1.2.3


From 5402e97af667e35e54177af8f6575518bf251d51 Mon Sep 17 00:00:00 2001
From: "bsegall@google.com" <bsegall@google.com>
Date: Fri, 7 Apr 2017 16:04:51 -0700
Subject: ptrace: fix PTRACE_LISTEN race corrupting task->state

In PT_SEIZED + LISTEN mode STOP/CONT signals cause a wakeup against
__TASK_TRACED.  If this races with the ptrace_unfreeze_traced at the end
of a PTRACE_LISTEN, this can wake the task /after/ the check against
__TASK_TRACED, but before the reset of state to TASK_TRACED.  This
causes it to instead clobber TASK_WAKING, allowing a subsequent wakeup
against TRACED while the task is still on the rq wake_list, corrupting
it.

Oleg said:
 "The kernel can crash or this can lead to other hard-to-debug problems.
  In short, "task->state = TASK_TRACED" in ptrace_unfreeze_traced()
  assumes that nobody else can wake it up, but PTRACE_LISTEN breaks the
  contract. Obviusly it is very wrong to manipulate task->state if this
  task is already running, or WAKING, or it sleeps again"

[akpm@linux-foundation.org: coding-style fixes]
Fixes: 9899d11f ("ptrace: ensure arch_ptrace/ptrace_request can never race with SIGKILL")
Link: http://lkml.kernel.org/r/xm26y3vfhmkp.fsf_-_@bsegall-linux.mtv.corp.google.com
Signed-off-by: Ben Segall <bsegall@google.com>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/ptrace.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 0af928712174..266ddcc1d8bb 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -184,11 +184,17 @@ static void ptrace_unfreeze_traced(struct task_struct *task)
 
 	WARN_ON(!task->ptrace || task->parent != current);
 
+	/*
+	 * PTRACE_LISTEN can allow ptrace_trap_notify to wake us up remotely.
+	 * Recheck state under the lock to close this race.
+	 */
 	spin_lock_irq(&task->sighand->siglock);
-	if (__fatal_signal_pending(task))
-		wake_up_state(task, __TASK_TRACED);
-	else
-		task->state = TASK_TRACED;
+	if (task->state == __TASK_TRACED) {
+		if (__fatal_signal_pending(task))
+			wake_up_state(task, __TASK_TRACED);
+		else
+			task->state = TASK_TRACED;
+	}
 	spin_unlock_irq(&task->sighand->siglock);
 }
 
-- 
cgit v1.2.3


From 425fffd886bae3d127a08fa6a17f2e31e24ed7ff Mon Sep 17 00:00:00 2001
From: Liping Zhang <zlpnobody@gmail.com>
Date: Fri, 7 Apr 2017 23:51:07 +0800
Subject: sysctl: report EINVAL if value is larger than UINT_MAX for
 proc_douintvec

Currently, inputting the following command will succeed but actually the
value will be truncated:

  # echo 0x12ffffffff > /proc/sys/net/ipv4/tcp_notsent_lowat

This is not friendly to the user, so instead, we should report error
when the value is larger than UINT_MAX.

Fixes: e7d316a02f68 ("sysctl: handle error writing UINT_MAX to u32 fields")
Signed-off-by: Liping Zhang <zlpnobody@gmail.com>
Cc: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sysctl.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8cca4c7bb21f..8c8714fcb53c 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2133,6 +2133,8 @@ static int do_proc_douintvec_conv(bool *negp, unsigned long *lvalp,
 	if (write) {
 		if (*negp)
 			return -EINVAL;
+		if (*lvalp > UINT_MAX)
+			return -EINVAL;
 		*valp = *lvalp;
 	} else {
 		unsigned int val = *valp;
-- 
cgit v1.2.3


From 264d509637d95f9404e52ced5003ad352e0f6a26 Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Mon, 10 Apr 2017 11:16:59 -0400
Subject: audit: make sure we don't let the retry queue grow without bounds

The retry queue is intended to provide a temporary buffer in the case
of transient errors when communicating with auditd, it is not meant
as a long life queue, that functionality is provided by the hold
queue.

This patch fixes a problem identified by Seth where the retry queue
could grow uncontrollably if an auditd instance did not connect to
the kernel to drain the queues.  This commit fixes this by doing the
following:

* Make sure we always call auditd_reset() if we decide the connection
with audit is really dead.  There were some cases in
kauditd_hold_skb() where we did not reset the connection, this patch
relocates the reset calls to kauditd_thread() so all the error
conditions are caught and the connection reset.  As a side effect,
this means we could move auditd_reset() and get rid of the forward
definition at the top of kernel/audit.c.

* We never checked the status of the auditd connection when
processing the main audit queue which meant that the retry queue
could grow unchecked.  This patch adds a call to auditd_reset()
after the main queue has been processed if auditd is not connected,
the auditd_reset() call will make sure the retry and hold queues are
correctly managed/flushed so that the retry queue remains reasonable.

Cc: <stable@vger.kernel.org> # 4.10.x-: 5b52330bbfe6
Reported-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit.c | 67 ++++++++++++++++++++++++++++------------------------------
 1 file changed, 32 insertions(+), 35 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index 2f4964cfde0b..a871bf80fde1 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -160,7 +160,6 @@ static LIST_HEAD(audit_freelist);
 
 /* queue msgs to send via kauditd_task */
 static struct sk_buff_head audit_queue;
-static void kauditd_hold_skb(struct sk_buff *skb);
 /* queue msgs due to temporary unicast send problems */
 static struct sk_buff_head audit_retry_queue;
 /* queue msgs waiting for new auditd connection */
@@ -453,30 +452,6 @@ static void auditd_set(int pid, u32 portid, struct net *net)
 	spin_unlock_irqrestore(&auditd_conn.lock, flags);
 }
 
-/**
- * auditd_reset - Disconnect the auditd connection
- *
- * Description:
- * Break the auditd/kauditd connection and move all the queued records into the
- * hold queue in case auditd reconnects.
- */
-static void auditd_reset(void)
-{
-	struct sk_buff *skb;
-
-	/* if it isn't already broken, break the connection */
-	rcu_read_lock();
-	if (auditd_conn.pid)
-		auditd_set(0, 0, NULL);
-	rcu_read_unlock();
-
-	/* flush all of the main and retry queues to the hold queue */
-	while ((skb = skb_dequeue(&audit_retry_queue)))
-		kauditd_hold_skb(skb);
-	while ((skb = skb_dequeue(&audit_queue)))
-		kauditd_hold_skb(skb);
-}
-
 /**
  * kauditd_print_skb - Print the audit record to the ring buffer
  * @skb: audit record
@@ -505,9 +480,6 @@ static void kauditd_rehold_skb(struct sk_buff *skb)
 {
 	/* put the record back in the queue at the same place */
 	skb_queue_head(&audit_hold_queue, skb);
-
-	/* fail the auditd connection */
-	auditd_reset();
 }
 
 /**
@@ -544,9 +516,6 @@ static void kauditd_hold_skb(struct sk_buff *skb)
 	/* we have no other options - drop the message */
 	audit_log_lost("kauditd hold queue overflow");
 	kfree_skb(skb);
-
-	/* fail the auditd connection */
-	auditd_reset();
 }
 
 /**
@@ -566,6 +535,30 @@ static void kauditd_retry_skb(struct sk_buff *skb)
 	skb_queue_tail(&audit_retry_queue, skb);
 }
 
+/**
+ * auditd_reset - Disconnect the auditd connection
+ *
+ * Description:
+ * Break the auditd/kauditd connection and move all the queued records into the
+ * hold queue in case auditd reconnects.
+ */
+static void auditd_reset(void)
+{
+	struct sk_buff *skb;
+
+	/* if it isn't already broken, break the connection */
+	rcu_read_lock();
+	if (auditd_conn.pid)
+		auditd_set(0, 0, NULL);
+	rcu_read_unlock();
+
+	/* flush all of the main and retry queues to the hold queue */
+	while ((skb = skb_dequeue(&audit_retry_queue)))
+		kauditd_hold_skb(skb);
+	while ((skb = skb_dequeue(&audit_queue)))
+		kauditd_hold_skb(skb);
+}
+
 /**
  * auditd_send_unicast_skb - Send a record via unicast to auditd
  * @skb: audit record
@@ -758,6 +751,7 @@ static int kauditd_thread(void *dummy)
 					NULL, kauditd_rehold_skb);
 		if (rc < 0) {
 			sk = NULL;
+			auditd_reset();
 			goto main_queue;
 		}
 
@@ -767,6 +761,7 @@ static int kauditd_thread(void *dummy)
 					NULL, kauditd_hold_skb);
 		if (rc < 0) {
 			sk = NULL;
+			auditd_reset();
 			goto main_queue;
 		}
 
@@ -775,16 +770,18 @@ main_queue:
 		 * unicast, dump failed record sends to the retry queue; if
 		 * sk == NULL due to previous failures we will just do the
 		 * multicast send and move the record to the retry queue */
-		kauditd_send_queue(sk, portid, &audit_queue, 1,
-				   kauditd_send_multicast_skb,
-				   kauditd_retry_skb);
+		rc = kauditd_send_queue(sk, portid, &audit_queue, 1,
+					kauditd_send_multicast_skb,
+					kauditd_retry_skb);
+		if (sk == NULL || rc < 0)
+			auditd_reset();
+		sk = NULL;
 
 		/* drop our netns reference, no auditd sends past this line */
 		if (net) {
 			put_net(net);
 			net = NULL;
 		}
-		sk = NULL;
 
 		/* we have processed all the queues so wake everyone */
 		wake_up(&audit_backlog_wait);
-- 
cgit v1.2.3


From bfb0b80db5f9dca5ac0a5fd0edb765ee555e5a8e Mon Sep 17 00:00:00 2001
From: Zefan Li <lizefan@huawei.com>
Date: Fri, 7 Apr 2017 16:51:55 +0800
Subject: cgroup: avoid attaching a cgroup root to two different superblocks

Run this:

    touch file0
    for ((; ;))
    {
        mount -t cpuset xxx file0
    }

And this concurrently:

    touch file1
    for ((; ;))
    {
        mount -t cpuset xxx file1
    }

We'll trigger a warning like this:

 ------------[ cut here ]------------
 WARNING: CPU: 1 PID: 4675 at lib/percpu-refcount.c:317 percpu_ref_kill_and_confirm+0x92/0xb0
 percpu_ref_kill_and_confirm called more than once on css_release!
 CPU: 1 PID: 4675 Comm: mount Not tainted 4.11.0-rc5+ #5
 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2007
 Call Trace:
  dump_stack+0x63/0x84
  __warn+0xd1/0xf0
  warn_slowpath_fmt+0x5f/0x80
  percpu_ref_kill_and_confirm+0x92/0xb0
  cgroup_kill_sb+0x95/0xb0
  deactivate_locked_super+0x43/0x70
  deactivate_super+0x46/0x60
 ...
 ---[ end trace a79f61c2a2633700 ]---

Here's a race:

  Thread A				Thread B

  cgroup1_mount()
    # alloc a new cgroup root
    cgroup_setup_root()
					cgroup1_mount()
					  # no sb yet, returns NULL
					  kernfs_pin_sb()

					  # but succeeds in getting the refcnt,
					  # so re-use cgroup root
					  percpu_ref_tryget_live()
    # alloc sb with cgroup root
    cgroup_do_mount()

  cgroup_kill_sb()
					  # alloc another sb with same root
					  cgroup_do_mount()

					cgroup_kill_sb()

We end up using the same cgroup root for two different superblocks,
so percpu_ref_kill() will be called twice on the same root when the
two superblocks are destroyed.

We should fix to make sure the superblock pinning is really successful.

Cc: stable@vger.kernel.org # 3.16+
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Zefan Li <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cgroup-v1.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 1dc22f6b49f5..12e19f0636ea 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -1146,7 +1146,7 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
 		 * path is super cold.  Let's just sleep a bit and retry.
 		 */
 		pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
-		if (IS_ERR(pinned_sb) ||
+		if (IS_ERR_OR_NULL(pinned_sb) ||
 		    !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
 			mutex_unlock(&cgroup_mutex);
 			if (!IS_ERR_OR_NULL(pinned_sb))
-- 
cgit v1.2.3


From 96a94cc5158859943b7e4e72ae69e572815f5413 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 11 Apr 2017 12:10:58 +0200
Subject: bpf: reference may_access_skb() from __bpf_prog_run()

It took me quite some time to figure out how this was linked,
so in order to save the next person the effort of finding it
add a comment in __bpf_prog_run() that indicates what exactly
determines that a program can access the ctx == skb.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/core.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index f45827e205d3..b4f1cb0c5ac7 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1162,12 +1162,12 @@ out:
 	LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */
 		off = IMM;
 load_word:
-		/* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are
-		 * only appearing in the programs where ctx ==
-		 * skb. All programs keep 'ctx' in regs[BPF_REG_CTX]
-		 * == BPF_R6, bpf_convert_filter() saves it in BPF_R6,
-		 * internal BPF verifier will check that BPF_R6 ==
-		 * ctx.
+		/* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are only
+		 * appearing in the programs where ctx == skb
+		 * (see may_access_skb() in the verifier). All programs
+		 * keep 'ctx' in regs[BPF_REG_CTX] == BPF_R6,
+		 * bpf_convert_filter() saves it in BPF_R6, internal BPF
+		 * verifier will check that BPF_R6 == ctx.
 		 *
 		 * BPF_ABS and BPF_IND are wrappers of function calls,
 		 * so they scratch BPF_R1-BPF_R5 registers, preserve
-- 
cgit v1.2.3


From 3412386b531244f24a27c79ee003506a52a00848 Mon Sep 17 00:00:00 2001
From: Keith Busch <keith.busch@intel.com>
Date: Thu, 13 Apr 2017 13:28:12 -0400
Subject: irq/affinity: Fix extra vecs calculation

This fixes a math error calculating the extra_vecs. The error assumed
only 1 cpu per vector, but the value needs to account for the actual
number of cpus per vector in order to get the correct remainder for
extra CPU assignment.

Fixes: 7bf8222b9bd0 ("irq/affinity: Fix CPU spread for unbalanced nodes")
Reported-by: Xiaolong Ye <xiaolong.ye@intel.com>
Signed-off-by: Keith Busch <keith.busch@intel.com>
Link: http://lkml.kernel.org/r/1492104492-19943-1-git-send-email-keith.busch@intel.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/affinity.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index dc529116f7e6..d052947fe785 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -108,7 +108,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
 		vecs_to_assign = min(vecs_per_node, ncpus);
 
 		/* Account for rounding errors */
-		extra_vecs = ncpus - vecs_to_assign;
+		extra_vecs = ncpus - vecs_to_assign * (ncpus / vecs_to_assign);
 
 		for (v = 0; curvec < last_affv && v < vecs_to_assign;
 		     curvec++, v++) {
-- 
cgit v1.2.3


From 82cc4fc2e70ec5baeff8f776f2773abc8b2cc0ae Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Fri, 14 Apr 2017 17:45:45 -0400
Subject: ftrace: Fix removing of second function probe

When two function probes are added to set_ftrace_filter, and then one of
them is removed, the update to the function locations is not performed, and
the record keeping of the function states are corrupted, and causes an
ftrace_bug() to occur.

This is easily reproducable by adding two probes, removing one, and then
adding it back again.

 # cd /sys/kernel/debug/tracing
 # echo schedule:traceoff > set_ftrace_filter
 # echo do_IRQ:traceoff > set_ftrace_filter
 # echo \!do_IRQ:traceoff > /debug/tracing/set_ftrace_filter
 # echo do_IRQ:traceoff > set_ftrace_filter

Causes:
 ------------[ cut here ]------------
 WARNING: CPU: 2 PID: 1098 at kernel/trace/ftrace.c:2369 ftrace_get_addr_curr+0x143/0x220
 Modules linked in: [...]
 CPU: 2 PID: 1098 Comm: bash Not tainted 4.10.0-test+ #405
 Hardware name: Hewlett-Packard HP Compaq Pro 6300 SFF/339A, BIOS K01 v02.05 05/07/2012
 Call Trace:
  dump_stack+0x68/0x9f
  __warn+0x111/0x130
  ? trace_irq_work_interrupt+0xa0/0xa0
  warn_slowpath_null+0x1d/0x20
  ftrace_get_addr_curr+0x143/0x220
  ? __fentry__+0x10/0x10
  ftrace_replace_code+0xe3/0x4f0
  ? ftrace_int3_handler+0x90/0x90
  ? printk+0x99/0xb5
  ? 0xffffffff81000000
  ftrace_modify_all_code+0x97/0x110
  arch_ftrace_update_code+0x10/0x20
  ftrace_run_update_code+0x1c/0x60
  ftrace_run_modify_code.isra.48.constprop.62+0x8e/0xd0
  register_ftrace_function_probe+0x4b6/0x590
  ? ftrace_startup+0x310/0x310
  ? debug_lockdep_rcu_enabled.part.4+0x1a/0x30
  ? update_stack_state+0x88/0x110
  ? ftrace_regex_write.isra.43.part.44+0x1d3/0x320
  ? preempt_count_sub+0x18/0xd0
  ? mutex_lock_nested+0x104/0x800
  ? ftrace_regex_write.isra.43.part.44+0x1d3/0x320
  ? __unwind_start+0x1c0/0x1c0
  ? _mutex_lock_nest_lock+0x800/0x800
  ftrace_trace_probe_callback.isra.3+0xc0/0x130
  ? func_set_flag+0xe0/0xe0
  ? __lock_acquire+0x642/0x1790
  ? __might_fault+0x1e/0x20
  ? trace_get_user+0x398/0x470
  ? strcmp+0x35/0x60
  ftrace_trace_onoff_callback+0x48/0x70
  ftrace_regex_write.isra.43.part.44+0x251/0x320
  ? match_records+0x420/0x420
  ftrace_filter_write+0x2b/0x30
  __vfs_write+0xd7/0x330
  ? do_loop_readv_writev+0x120/0x120
  ? locks_remove_posix+0x90/0x2f0
  ? do_lock_file_wait+0x160/0x160
  ? __lock_is_held+0x93/0x100
  ? rcu_read_lock_sched_held+0x5c/0xb0
  ? preempt_count_sub+0x18/0xd0
  ? __sb_start_write+0x10a/0x230
  ? vfs_write+0x222/0x240
  vfs_write+0xef/0x240
  SyS_write+0xab/0x130
  ? SyS_read+0x130/0x130
  ? trace_hardirqs_on_caller+0x182/0x280
  ? trace_hardirqs_on_thunk+0x1a/0x1c
  entry_SYSCALL_64_fastpath+0x18/0xad
 RIP: 0033:0x7fe61c157c30
 RSP: 002b:00007ffe87890258 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
 RAX: ffffffffffffffda RBX: ffffffff8114a410 RCX: 00007fe61c157c30
 RDX: 0000000000000010 RSI: 000055814798f5e0 RDI: 0000000000000001
 RBP: ffff8800c9027f98 R08: 00007fe61c422740 R09: 00007fe61ca53700
 R10: 0000000000000073 R11: 0000000000000246 R12: 0000558147a36400
 R13: 00007ffe8788f160 R14: 0000000000000024 R15: 00007ffe8788f15c
  ? trace_hardirqs_off_caller+0xc0/0x110
 ---[ end trace 99fa09b3d9869c2c ]---
 Bad trampoline accounting at: ffffffff81cc3b00 (do_IRQ+0x0/0x150)

Cc: stable@vger.kernel.org
Fixes: 59df055f1991 ("ftrace: trace different functions with a different tracer")
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index b9691ee8f6c1..27bb2e61276e 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3755,23 +3755,24 @@ static void __enable_ftrace_function_probe(struct ftrace_ops_hash *old_hash)
 	ftrace_probe_registered = 1;
 }
 
-static void __disable_ftrace_function_probe(void)
+static bool __disable_ftrace_function_probe(void)
 {
 	int i;
 
 	if (!ftrace_probe_registered)
-		return;
+		return false;
 
 	for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
 		struct hlist_head *hhd = &ftrace_func_hash[i];
 		if (hhd->first)
-			return;
+			return false;
 	}
 
 	/* no more funcs left */
 	ftrace_shutdown(&trace_probe_ops, 0);
 
 	ftrace_probe_registered = 0;
+	return true;
 }
 
 
@@ -3901,6 +3902,7 @@ static void
 __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
 				  void *data, int flags)
 {
+	struct ftrace_ops_hash old_hash_ops;
 	struct ftrace_func_entry *rec_entry;
 	struct ftrace_func_probe *entry;
 	struct ftrace_func_probe *p;
@@ -3912,6 +3914,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
 	struct hlist_node *tmp;
 	char str[KSYM_SYMBOL_LEN];
 	int i, ret;
+	bool disabled;
 
 	if (glob && (strcmp(glob, "*") == 0 || !strlen(glob)))
 		func_g.search = NULL;
@@ -3930,6 +3933,10 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
 
 	mutex_lock(&trace_probe_ops.func_hash->regex_lock);
 
+	old_hash_ops.filter_hash = old_hash;
+	/* Probes only have filters */
+	old_hash_ops.notrace_hash = NULL;
+
 	hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
 	if (!hash)
 		/* Hmm, should report this somehow */
@@ -3967,12 +3974,17 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
 		}
 	}
 	mutex_lock(&ftrace_lock);
-	__disable_ftrace_function_probe();
+	disabled = __disable_ftrace_function_probe();
 	/*
 	 * Remove after the disable is called. Otherwise, if the last
 	 * probe is removed, a null hash means *all enabled*.
 	 */
 	ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
+
+	/* still need to update the function call sites */
+	if (ftrace_enabled && !disabled)
+		ftrace_run_modify_code(&trace_probe_ops, FTRACE_UPDATE_CALLS,
+				       &old_hash_ops);
 	synchronize_sched();
 	if (!ret)
 		free_ftrace_hash_rcu(old_hash);
-- 
cgit v1.2.3


From 330c418638612d7658b6314e6a244fcb5f7efac5 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 16 Apr 2017 23:17:37 +0900
Subject: Revert "cgroup: avoid attaching a cgroup root to two different
 superblocks"

This reverts commit bfb0b80db5f9dca5ac0a5fd0edb765ee555e5a8e.

Andrei reports CRIU test hangs with the patch applied.  The bug fixed
by the patch isn't too likely to trigger in actual uses.  Revert the
patch for now.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Andrei Vagin <avagin@virtuozzo.com>
Link: http://lkml.kernel.org/r/20170414232737.GC20350@outlook.office365.com
---
 kernel/cgroup/cgroup-v1.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 12e19f0636ea..1dc22f6b49f5 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -1146,7 +1146,7 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
 		 * path is super cold.  Let's just sleep a bit and retry.
 		 */
 		pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
-		if (IS_ERR_OR_NULL(pinned_sb) ||
+		if (IS_ERR(pinned_sb) ||
 		    !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
 			mutex_unlock(&cgroup_mutex);
 			if (!IS_ERR_OR_NULL(pinned_sb))
-- 
cgit v1.2.3


From 6b1bb01bcc5be91452bd3b5bda4300f179fd4053 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 17 Apr 2017 03:12:06 +0200
Subject: bpf: fix cb access in socket filter programs on tail calls

Commit ff936a04e5f2 ("bpf: fix cb access in socket filter programs")
added a fix for socket filter programs such that in i) AF_PACKET the
20 bytes of skb->cb[] area gets zeroed before use in order to not leak
data, and ii) socket filter programs attached to TCP/UDP sockets need
to save/restore these 20 bytes since they are also used by protocol
layers at that time.

The problem is that bpf_prog_run_save_cb() and bpf_prog_run_clear_cb()
only look at the actual attached program to determine whether to zero
or save/restore the skb->cb[] parts. There can be cases where the
actual attached program does not access the skb->cb[], but the program
tail calls into another program which does access this area. In such
a case, the zero or save/restore is currently not performed.

Since the programs we tail call into are unknown at verification time
and can dynamically change, we need to assume that whenever the attached
program performs a tail call, that later programs could access the
skb->cb[], and therefore we need to always set cb_access to 1.

Fixes: ff936a04e5f2 ("bpf: fix cb access in socket filter programs")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/syscall.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 7af0dcc5d755..ee5c969d5963 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -617,6 +617,13 @@ static void fixup_bpf_calls(struct bpf_prog *prog)
 			if (insn->imm == BPF_FUNC_xdp_adjust_head)
 				prog->xdp_adjust_head = 1;
 			if (insn->imm == BPF_FUNC_tail_call) {
+				/* If we tail call into other programs, we
+				 * cannot make any assumptions since they
+				 * can be replaced dynamically during runtime
+				 * in the program array.
+				 */
+				prog->cb_access = 1;
+
 				/* mark bpf_tail_call as different opcode
 				 * to avoid conditional branch in
 				 * interpeter for every normal call
-- 
cgit v1.2.3


From c2002f983767ea0a53acbb3e21f771e7a7e2ed28 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 17 Apr 2017 03:12:07 +0200
Subject: bpf: fix checking xdp_adjust_head on tail calls

Commit 17bedab27231 ("bpf: xdp: Allow head adjustment in XDP prog")
added the xdp_adjust_head bit to the BPF prog in order to tell drivers
that the program that is to be attached requires support for the XDP
bpf_xdp_adjust_head() helper such that drivers not supporting this
helper can reject the program. There are also drivers that do support
the helper, but need to check for xdp_adjust_head bit in order to move
packet metadata prepended by the firmware away for making headroom.

For these cases, the current check for xdp_adjust_head bit is insufficient
since there can be cases where the program itself does not use the
bpf_xdp_adjust_head() helper, but tail calls into another program that
uses bpf_xdp_adjust_head(). As such, the xdp_adjust_head bit is still
set to 0. Since the first program has no control over which program it
calls into, we need to assume that bpf_xdp_adjust_head() helper is used
upon tail calls. Thus, for the very same reasons in cb_access, set the
xdp_adjust_head bit to 1 when the main program uses tail calls.

Fixes: 17bedab27231 ("bpf: xdp: Allow head adjustment in XDP prog")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Cc: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/syscall.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ee5c969d5963..821f9e807de5 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -623,6 +623,7 @@ static void fixup_bpf_calls(struct bpf_prog *prog)
 				 * in the program array.
 				 */
 				prog->cb_access = 1;
+				prog->xdp_adjust_head = 1;
 
 				/* mark bpf_tail_call as different opcode
 				 * to avoid conditional branch in
-- 
cgit v1.2.3


From d879d0b8c183aabeb9a65eba91f3f9e3c7e7b905 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Mon, 17 Apr 2017 11:44:27 +0900
Subject: ftrace: Fix function pid filter on instances

When function tracer has a pid filter, it adds a probe to sched_switch
to track if current task can be ignored.  The probe checks the
ftrace_ignore_pid from current tr to filter tasks.  But it misses to
delete the probe when removing an instance so that it can cause a crash
due to the invalid tr pointer (use-after-free).

This is easily reproducible with the following:

  # cd /sys/kernel/debug/tracing
  # mkdir instances/buggy
  # echo $$ > instances/buggy/set_ftrace_pid
  # rmdir instances/buggy

  ============================================================================
  BUG: KASAN: use-after-free in ftrace_filter_pid_sched_switch_probe+0x3d/0x90
  Read of size 8 by task kworker/0:1/17
  CPU: 0 PID: 17 Comm: kworker/0:1 Tainted: G    B           4.11.0-rc3  #198
  Call Trace:
   dump_stack+0x68/0x9f
   kasan_object_err+0x21/0x70
   kasan_report.part.1+0x22b/0x500
   ? ftrace_filter_pid_sched_switch_probe+0x3d/0x90
   kasan_report+0x25/0x30
   __asan_load8+0x5e/0x70
   ftrace_filter_pid_sched_switch_probe+0x3d/0x90
   ? fpid_start+0x130/0x130
   __schedule+0x571/0xce0
   ...

To fix it, use ftrace_clear_pids() to unregister the probe.  As
instance_rmdir() already updated ftrace codes, it can just free the
filter safely.

Link: http://lkml.kernel.org/r/20170417024430.21194-2-namhyung@kernel.org

Fixes: 0c8916c34203 ("tracing: Add rmdir to remove multibuffer instances")
Cc: Ingo Molnar <mingo@kernel.org>
Cc: stable@vger.kernel.org
Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 9 +++++++++
 kernel/trace/trace.c  | 1 +
 kernel/trace/trace.h  | 2 ++
 3 files changed, 12 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 27bb2e61276e..dd3e91d68dc7 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5566,6 +5566,15 @@ static void clear_ftrace_pids(struct trace_array *tr)
 	trace_free_pid_list(pid_list);
 }
 
+void ftrace_clear_pids(struct trace_array *tr)
+{
+	mutex_lock(&ftrace_lock);
+
+	clear_ftrace_pids(tr);
+
+	mutex_unlock(&ftrace_lock);
+}
+
 static void ftrace_pid_reset(struct trace_array *tr)
 {
 	mutex_lock(&ftrace_lock);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f35109514a01..d484452ae648 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7402,6 +7402,7 @@ static int instance_rmdir(const char *name)
 
 	tracing_set_nop(tr);
 	event_trace_del_tracer(tr);
+	ftrace_clear_pids(tr);
 	ftrace_destroy_function_files(tr);
 	tracefs_remove_recursive(tr->dir);
 	free_trace_buffers(tr);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index ae1cce91fead..d19d52d600d6 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -896,6 +896,7 @@ int using_ftrace_ops_list_func(void);
 void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer);
 void ftrace_init_tracefs_toplevel(struct trace_array *tr,
 				  struct dentry *d_tracer);
+void ftrace_clear_pids(struct trace_array *tr);
 #else
 static inline int ftrace_trace_task(struct trace_array *tr)
 {
@@ -914,6 +915,7 @@ ftrace_init_global_array_ops(struct trace_array *tr) { }
 static inline void ftrace_reset_array_ops(struct trace_array *tr) { }
 static inline void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d) { }
 static inline void ftrace_init_tracefs_toplevel(struct trace_array *tr, struct dentry *d) { }
+static inline void ftrace_clear_pids(struct trace_array *tr) { }
 /* ftace_func_t type is not defined, use macro instead of static inline */
 #define ftrace_init_array_ops(tr, func) do { } while (0)
 #endif /* CONFIG_FUNCTION_TRACER */
-- 
cgit v1.2.3


From 395102db441abb8fd18fec5dd81428b5120232af Mon Sep 17 00:00:00 2001
From: Daniel Jordan <daniel.m.jordan@oracle.com>
Date: Mon, 10 Apr 2017 11:50:52 -0400
Subject: sparc64: Use LOCKDEP_SMALL, not PROVE_LOCKING_SMALL

CONFIG_PROVE_LOCKING_SMALL shrinks the memory usage of lockdep so the
kernel text, data, and bss fit in the required 32MB limit, but this
option is not set for every config that enables lockdep.

A 4.10 kernel fails to boot with the console output

    Kernel: Using 8 locked TLB entries for main kernel image.
    hypervisor_tlb_lock[2000000:0:8000000071c007c3:1]: errors with f
    Program terminated

with these config options

    CONFIG_LOCKDEP=y
    CONFIG_LOCK_STAT=y
    CONFIG_PROVE_LOCKING=n

To fix, rename CONFIG_PROVE_LOCKING_SMALL to CONFIG_LOCKDEP_SMALL, and
enable this option with CONFIG_LOCKDEP=y so we get the reduced memory
usage every time lockdep is turned on.

Tested that CONFIG_LOCKDEP_SMALL is set to 'y' if and only if
CONFIG_LOCKDEP is set to 'y'.  When other lockdep-related config options
that select CONFIG_LOCKDEP are enabled (e.g. CONFIG_LOCK_STAT or
CONFIG_PROVE_LOCKING), verified that CONFIG_LOCKDEP_SMALL is also
enabled.

Fixes: e6b5f1be7afe ("config: Adding the new config parameter CONFIG_PROVE_LOCKING_SMALL for sparc")
Signed-off-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Reviewed-by: Babu Moger <babu.moger@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/sparc/Kconfig                 | 2 +-
 kernel/locking/lockdep_internals.h | 6 +++---
 lib/Kconfig.debug                  | 6 +++---
 3 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 68ac5c7cd982..a59deaef21e5 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -43,7 +43,7 @@ config SPARC
 	select ARCH_HAS_SG_CHAIN
 	select CPU_NO_EFFICIENT_FFS
 	select HAVE_ARCH_HARDENED_USERCOPY
-	select PROVE_LOCKING_SMALL if PROVE_LOCKING
+	select LOCKDEP_SMALL if LOCKDEP
 	select ARCH_WANT_RELAX_ORDER
 
 config SPARC32
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index c2b88490d857..c08fbd2f5ba9 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -46,13 +46,13 @@ enum {
 		(LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ)
 
 /*
- * CONFIG_PROVE_LOCKING_SMALL is defined for sparc. Sparc requires .text,
+ * CONFIG_LOCKDEP_SMALL is defined for sparc. Sparc requires .text,
  * .data and .bss to fit in required 32MB limit for the kernel. With
- * PROVE_LOCKING we could go over this limit and cause system boot-up problems.
+ * CONFIG_LOCKDEP we could go over this limit and cause system boot-up problems.
  * So, reduce the static allocations for lockdeps related structures so that
  * everything fits in current required size limit.
  */
-#ifdef CONFIG_PROVE_LOCKING_SMALL
+#ifdef CONFIG_LOCKDEP_SMALL
 /*
  * MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies
  * we track.
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 97d62c2da6c2..fa16c0f82d6e 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1103,9 +1103,6 @@ config PROVE_LOCKING
 
 	 For more details, see Documentation/locking/lockdep-design.txt.
 
-config PROVE_LOCKING_SMALL
-	bool
-
 config LOCKDEP
 	bool
 	depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT
@@ -1114,6 +1111,9 @@ config LOCKDEP
 	select KALLSYMS
 	select KALLSYMS_ALL
 
+config LOCKDEP_SMALL
+	bool
+
 config LOCK_STAT
 	bool "Lock usage statistics"
 	depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT
-- 
cgit v1.2.3