From ea6eb5e7d15e1838de335609994b4546e2abcaaf Mon Sep 17 00:00:00 2001
From: Andreas Ziegler <andreas.ziegler@fau.de>
Date: Thu, 17 Jan 2019 14:30:23 +0100
Subject: tracing: uprobes: Fix typo in pr_fmt string

The subsystem-specific message prefix for uprobes was also
"trace_kprobe: " instead of "trace_uprobe: " as described in
the original commit message.

Link: http://lkml.kernel.org/r/20190117133023.19292-1-andreas.ziegler@fau.de

Cc: Ingo Molnar <mingo@redhat.com>
Cc: stable@vger.kernel.org
Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Fixes: 7257634135c24 ("tracing/probe: Show subsystem name in messages")
Signed-off-by: Andreas Ziegler <andreas.ziegler@fau.de>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_uprobe.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index e335576b9411..19a1a8e19062 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -5,7 +5,7 @@
  * Copyright (C) IBM Corporation, 2010-2012
  * Author:	Srikar Dronamraju <srikar@linux.vnet.ibm.com>
  */
-#define pr_fmt(fmt)	"trace_kprobe: " fmt
+#define pr_fmt(fmt)	"trace_uprobe: " fmt
 
 #include <linux/ctype.h>
 #include <linux/module.h>
-- 
cgit v1.2.3


From 0722069a5374b904ec1a67f91249f90e1cfae259 Mon Sep 17 00:00:00 2001
From: Andreas Ziegler <andreas.ziegler@fau.de>
Date: Wed, 16 Jan 2019 15:16:29 +0100
Subject: tracing/uprobes: Fix output for multiple string arguments

When printing multiple uprobe arguments as strings the output for the
earlier arguments would also include all later string arguments.

This is best explained in an example:

Consider adding a uprobe to a function receiving two strings as
parameters which is at offset 0xa0 in strlib.so and we want to print
both parameters when the uprobe is hit (on x86_64):

$ echo 'p:func /lib/strlib.so:0xa0 +0(%di):string +0(%si):string' > \
    /sys/kernel/debug/tracing/uprobe_events

When the function is called as func("foo", "bar") and we hit the probe,
the trace file shows a line like the following:

  [...] func: (0x7f7e683706a0) arg1="foobar" arg2="bar"

Note the extra "bar" printed as part of arg1. This behaviour stacks up
for additional string arguments.

The strings are stored in a dynamically growing part of the uprobe
buffer by fetch_store_string() after copying them from userspace via
strncpy_from_user(). The return value of strncpy_from_user() is then
directly used as the required size for the string. However, this does
not take the terminating null byte into account as the documentation
for strncpy_from_user() cleary states that it "[...] returns the
length of the string (not including the trailing NUL)" even though the
null byte will be copied to the destination.

Therefore, subsequent calls to fetch_store_string() will overwrite
the terminating null byte of the most recently fetched string with
the first character of the current string, leading to the
"accumulation" of strings in earlier arguments in the output.

Fix this by incrementing the return value of strncpy_from_user() by
one if we did not hit the maximum buffer size.

Link: http://lkml.kernel.org/r/20190116141629.5752-1-andreas.ziegler@fau.de

Cc: Ingo Molnar <mingo@redhat.com>
Cc: stable@vger.kernel.org
Fixes: 5baaa59ef09e ("tracing/probes: Implement 'memory' fetch method for uprobes")
Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Andreas Ziegler <andreas.ziegler@fau.de>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_uprobe.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 19a1a8e19062..9bde07c06362 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -160,6 +160,13 @@ fetch_store_string(unsigned long addr, void *dest, void *base)
 	if (ret >= 0) {
 		if (ret == maxlen)
 			dst[ret - 1] = '\0';
+		else
+			/*
+			 * Include the terminating null byte. In this case it
+			 * was copied by strncpy_from_user but not accounted
+			 * for in ret.
+			 */
+			ret++;
 		*(u32 *)dest = make_data_loc(ret, (void *)dst - base);
 	}
 
-- 
cgit v1.2.3


From 1a51c5da5acc6c188c917ba572eebac5f8793432 Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Thu, 10 Jan 2019 17:17:16 -0800
Subject: perf core: Fix perf_proc_update_handler() bug

The perf_proc_update_handler() handles /proc/sys/kernel/perf_event_max_sample_rate
syctl variable.  When the PMU IRQ handler timing monitoring is disabled, i.e,
when /proc/sys/kernel/perf_cpu_time_max_percent is equal to 0 or 100,
then no modification to sysctl_perf_event_sample_rate is allowed to prevent
possible hang from wrong values.

The problem is that the test to prevent modification is made after the
sysctl variable is modified in perf_proc_update_handler().

You get an error:

  $ echo 10001 >/proc/sys/kernel/perf_event_max_sample_rate
  echo: write error: invalid argument

But the value is still modified causing all sorts of inconsistencies:

  $ cat /proc/sys/kernel/perf_event_max_sample_rate
  10001

This patch fixes the problem by moving the parsing of the value after
the test.

Committer testing:

  # echo 100 > /proc/sys/kernel/perf_cpu_time_max_percent
  # echo 10001 > /proc/sys/kernel/perf_event_max_sample_rate
  -bash: echo: write error: Invalid argument
  # cat /proc/sys/kernel/perf_event_max_sample_rate
  10001
  #

Signed-off-by: Stephane Eranian <eranian@google.com>
Reviewed-by: Andi Kleen <ak@linux.intel.com>
Reviewed-by: Jiri Olsa <jolsa@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1547169436-6266-1-git-send-email-eranian@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 kernel/events/core.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 3cd13a30f732..e5ede6918050 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -436,18 +436,18 @@ int perf_proc_update_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos)
 {
-	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
-
-	if (ret || !write)
-		return ret;
-
+	int ret;
+	int perf_cpu = sysctl_perf_cpu_time_max_percent;
 	/*
 	 * If throttling is disabled don't allow the write:
 	 */
-	if (sysctl_perf_cpu_time_max_percent == 100 ||
-	    sysctl_perf_cpu_time_max_percent == 0)
+	if (write && (perf_cpu == 100 || perf_cpu == 0))
 		return -EINVAL;
 
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	if (ret || !write)
+		return ret;
+
 	max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
 	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
 	update_perf_cpu_limits();
-- 
cgit v1.2.3


From 34d66caf251df91ff27b24a3a786810d29989eca Mon Sep 17 00:00:00 2001
From: Zhenzhong Duan <zhenzhong.duan@oracle.com>
Date: Thu, 17 Jan 2019 02:10:59 -0800
Subject: x86/speculation: Remove redundant arch_smt_update() invocation

With commit a74cfffb03b7 ("x86/speculation: Rework SMT state change"),
arch_smt_update() is invoked from each individual CPU hotplug function.

Therefore the extra arch_smt_update() call in the sysfs SMT control is
redundant.

Fixes: a74cfffb03b7 ("x86/speculation: Rework SMT state change")
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@oracle.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: <konrad.wilk@oracle.com>
Cc: <dwmw@amazon.co.uk>
Cc: <bp@suse.de>
Cc: <srinivas.eeda@oracle.com>
Cc: <peterz@infradead.org>
Cc: <hpa@zytor.com>
Link: https://lkml.kernel.org/r/e2e064f2-e8ef-42ca-bf4f-76b612964752@default
---
 kernel/cpu.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 91d5c38eb7e5..c0c7f64573ed 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -2090,10 +2090,8 @@ static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
 		 */
 		cpuhp_offline_cpu_device(cpu);
 	}
-	if (!ret) {
+	if (!ret)
 		cpu_smt_control = ctrlval;
-		arch_smt_update();
-	}
 	cpu_maps_update_done();
 	return ret;
 }
@@ -2104,7 +2102,6 @@ static int cpuhp_smt_enable(void)
 
 	cpu_maps_update_begin();
 	cpu_smt_control = CPU_SMT_ENABLED;
-	arch_smt_update();
 	for_each_present_cpu(cpu) {
 		/* Skip online CPUs and CPUs on offline nodes */
 		if (cpu_online(cpu) || !node_online(cpu_to_node(cpu)))
-- 
cgit v1.2.3


From 81f5c6f5db37bf2360b64c304b27b8f499b48367 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Tue, 29 Jan 2019 16:38:16 -0800
Subject: bpf: btf: allow typedef func_proto

Current implementation does not allow typedef func_proto.
But it is actually allowed.
  -bash-4.4$ cat t.c
  typedef int (f) (int);
  f *g;
  -bash-4.4$ clang -O2 -g -c -target bpf t.c -Xclang -target-feature -Xclang +dwarfris
  -bash-4.4$ pahole -JV t.o
  File t.o:
  [1] PTR (anon) type_id=2
  [2] TYPEDEF f type_id=3
  [3] FUNC_PROTO (anon) return=4 args=(4 (anon))
  [4] INT int size=4 bit_offset=0 nr_bits=32 encoding=SIGNED
  -bash-4.4$

This patch related btf verifier to allow such (typedef func_proto)
patterns.

Fixes: 2667a2626f4d ("bpf: btf: Add BTF_KIND_FUNC and BTF_KIND_FUNC_PROTO")
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/btf.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index befe570be5ba..c57bd10340ed 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -1459,7 +1459,8 @@ static int btf_modifier_resolve(struct btf_verifier_env *env,
 
 		/* "typedef void new_void", "const void"...etc */
 		if (!btf_type_is_void(next_type) &&
-		    !btf_type_is_fwd(next_type)) {
+		    !btf_type_is_fwd(next_type) &&
+		    !btf_type_is_func_proto(next_type)) {
 			btf_verifier_log_type(env, v->t, "Invalid type_id");
 			return -EINVAL;
 		}
-- 
cgit v1.2.3


From b284909abad48b07d3071a9fc9b5692b3e64914b Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Wed, 30 Jan 2019 07:13:58 -0600
Subject: cpu/hotplug: Fix "SMT disabled by BIOS" detection for KVM

With the following commit:

  73d5e2b47264 ("cpu/hotplug: detect SMT disabled by BIOS")

... the hotplug code attempted to detect when SMT was disabled by BIOS,
in which case it reported SMT as permanently disabled.  However, that
code broke a virt hotplug scenario, where the guest is booted with only
primary CPU threads, and a sibling is brought online later.

The problem is that there doesn't seem to be a way to reliably
distinguish between the HW "SMT disabled by BIOS" case and the virt
"sibling not yet brought online" case.  So the above-mentioned commit
was a bit misguided, as it permanently disabled SMT for both cases,
preventing future virt sibling hotplugs.

Going back and reviewing the original problems which were attempted to
be solved by that commit, when SMT was disabled in BIOS:

  1) /sys/devices/system/cpu/smt/control showed "on" instead of
     "notsupported"; and

  2) vmx_vm_init() was incorrectly showing the L1TF_MSG_SMT warning.

I'd propose that we instead consider #1 above to not actually be a
problem.  Because, at least in the virt case, it's possible that SMT
wasn't disabled by BIOS and a sibling thread could be brought online
later.  So it makes sense to just always default the smt control to "on"
to allow for that possibility (assuming cpuid indicates that the CPU
supports SMT).

The real problem is #2, which has a simple fix: change vmx_vm_init() to
query the actual current SMT state -- i.e., whether any siblings are
currently online -- instead of looking at the SMT "control" sysfs value.

So fix it by:

  a) reverting the original "fix" and its followup fix:

     73d5e2b47264 ("cpu/hotplug: detect SMT disabled by BIOS")
     bc2d8d262cba ("cpu/hotplug: Fix SMT supported evaluation")

     and

  b) changing vmx_vm_init() to query the actual current SMT state --
     instead of the sysfs control value -- to determine whether the L1TF
     warning is needed.  This also requires the 'sched_smt_present'
     variable to exported, instead of 'cpu_smt_control'.

Fixes: 73d5e2b47264 ("cpu/hotplug: detect SMT disabled by BIOS")
Reported-by: Igor Mammedov <imammedo@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Joe Mario <jmario@redhat.com>
Cc: Jiri Kosina <jikos@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: kvm@vger.kernel.org
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/e3a85d585da28cc333ecbc1e78ee9216e6da9396.1548794349.git.jpoimboe@redhat.com
---
 arch/x86/kernel/cpu/bugs.c |  2 +-
 arch/x86/kvm/vmx/vmx.c     |  3 ++-
 include/linux/cpu.h        |  2 --
 kernel/cpu.c               | 33 ++++-----------------------------
 kernel/sched/fair.c        |  1 +
 kernel/smp.c               |  2 --
 6 files changed, 8 insertions(+), 35 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 1de0f4170178..01874d54f4fd 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -71,7 +71,7 @@ void __init check_bugs(void)
 	 * identify_boot_cpu() initialized SMT support information, let the
 	 * core code know.
 	 */
-	cpu_smt_check_topology_early();
+	cpu_smt_check_topology();
 
 	if (!IS_ENABLED(CONFIG_SMP)) {
 		pr_info("CPU: ");
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 4341175339f3..95d618045001 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -26,6 +26,7 @@
 #include <linux/mod_devicetable.h>
 #include <linux/mm.h>
 #include <linux/sched.h>
+#include <linux/sched/smt.h>
 #include <linux/slab.h>
 #include <linux/tboot.h>
 #include <linux/trace_events.h>
@@ -6823,7 +6824,7 @@ static int vmx_vm_init(struct kvm *kvm)
 			 * Warn upon starting the first VM in a potentially
 			 * insecure environment.
 			 */
-			if (cpu_smt_control == CPU_SMT_ENABLED)
+			if (sched_smt_active())
 				pr_warn_once(L1TF_MSG_SMT);
 			if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER)
 				pr_warn_once(L1TF_MSG_L1D);
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 218df7f4d3e1..5041357d0297 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -180,12 +180,10 @@ enum cpuhp_smt_control {
 #if defined(CONFIG_SMP) && defined(CONFIG_HOTPLUG_SMT)
 extern enum cpuhp_smt_control cpu_smt_control;
 extern void cpu_smt_disable(bool force);
-extern void cpu_smt_check_topology_early(void);
 extern void cpu_smt_check_topology(void);
 #else
 # define cpu_smt_control		(CPU_SMT_ENABLED)
 static inline void cpu_smt_disable(bool force) { }
-static inline void cpu_smt_check_topology_early(void) { }
 static inline void cpu_smt_check_topology(void) { }
 #endif
 
diff --git a/kernel/cpu.c b/kernel/cpu.c
index c0c7f64573ed..d1c6d152da89 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -376,9 +376,6 @@ void __weak arch_smt_update(void) { }
 
 #ifdef CONFIG_HOTPLUG_SMT
 enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED;
-EXPORT_SYMBOL_GPL(cpu_smt_control);
-
-static bool cpu_smt_available __read_mostly;
 
 void __init cpu_smt_disable(bool force)
 {
@@ -397,25 +394,11 @@ void __init cpu_smt_disable(bool force)
 
 /*
  * The decision whether SMT is supported can only be done after the full
- * CPU identification. Called from architecture code before non boot CPUs
- * are brought up.
- */
-void __init cpu_smt_check_topology_early(void)
-{
-	if (!topology_smt_supported())
-		cpu_smt_control = CPU_SMT_NOT_SUPPORTED;
-}
-
-/*
- * If SMT was disabled by BIOS, detect it here, after the CPUs have been
- * brought online. This ensures the smt/l1tf sysfs entries are consistent
- * with reality. cpu_smt_available is set to true during the bringup of non
- * boot CPUs when a SMT sibling is detected. Note, this may overwrite
- * cpu_smt_control's previous setting.
+ * CPU identification. Called from architecture code.
  */
 void __init cpu_smt_check_topology(void)
 {
-	if (!cpu_smt_available)
+	if (!topology_smt_supported())
 		cpu_smt_control = CPU_SMT_NOT_SUPPORTED;
 }
 
@@ -428,18 +411,10 @@ early_param("nosmt", smt_cmdline_disable);
 
 static inline bool cpu_smt_allowed(unsigned int cpu)
 {
-	if (topology_is_primary_thread(cpu))
+	if (cpu_smt_control == CPU_SMT_ENABLED)
 		return true;
 
-	/*
-	 * If the CPU is not a 'primary' thread and the booted_once bit is
-	 * set then the processor has SMT support. Store this information
-	 * for the late check of SMT support in cpu_smt_check_topology().
-	 */
-	if (per_cpu(cpuhp_state, cpu).booted_once)
-		cpu_smt_available = true;
-
-	if (cpu_smt_control == CPU_SMT_ENABLED)
+	if (topology_is_primary_thread(cpu))
 		return true;
 
 	/*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 50aa2aba69bd..310d0637fe4b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5980,6 +5980,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
 
 #ifdef CONFIG_SCHED_SMT
 DEFINE_STATIC_KEY_FALSE(sched_smt_present);
+EXPORT_SYMBOL_GPL(sched_smt_present);
 
 static inline void set_idle_cores(int cpu, int val)
 {
diff --git a/kernel/smp.c b/kernel/smp.c
index 163c451af42e..f4cf1b0bb3b8 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -584,8 +584,6 @@ void __init smp_init(void)
 		num_nodes, (num_nodes > 1 ? "s" : ""),
 		num_cpus,  (num_cpus  > 1 ? "s" : ""));
 
-	/* Final decision about SMT support */
-	cpu_smt_check_topology();
 	/* Any cleanup work */
 	smp_cpus_done(setup_max_cpus);
 }
-- 
cgit v1.2.3


From 2c1cf00eeacb784781cf1c9896b8af001246d339 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Thu, 31 Jan 2019 13:57:58 +0100
Subject: relay: check return of create_buf_file() properly

If create_buf_file() returns an error, don't try to reference it later
as a valid dentry pointer.

This problem was exposed when debugfs started to return errors instead
of just NULL for some calls when they do not succeed properly.

Also, the check for WARN_ON(dentry) was just wrong :)

Reported-by: Kees Cook <keescook@chromium.org>
Reported-and-tested-by: syzbot+16c3a70e1e9b29346c43@syzkaller.appspotmail.com
Reported-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: David Rientjes <rientjes@google.com>
Fixes: ff9fb72bc077 ("debugfs: return error values, not NULL")
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 kernel/relay.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/relay.c b/kernel/relay.c
index 04f248644e06..9e0f52375487 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -428,6 +428,8 @@ static struct dentry *relay_create_buf_file(struct rchan *chan,
 	dentry = chan->cb->create_buf_file(tmpname, chan->parent,
 					   S_IRUSR, buf,
 					   &chan->is_global);
+	if (IS_ERR(dentry))
+		dentry = NULL;
 
 	kfree(tmpname);
 
@@ -461,7 +463,7 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu)
 		dentry = chan->cb->create_buf_file(NULL, NULL,
 						   S_IRUSR, buf,
 						   &chan->is_global);
-		if (WARN_ON(dentry))
+		if (IS_ERR_OR_NULL(dentry))
 			goto free_buf;
 	}
 
-- 
cgit v1.2.3


From 6cab5e90ab2bd323c9f3811b6c70a4687df51e27 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Mon, 28 Jan 2019 18:43:34 -0800
Subject: bpf: run bpf programs with preemption disabled

Disabled preemption is necessary for proper access to per-cpu maps
from BPF programs.

But the sender side of socket filters didn't have preemption disabled:
unix_dgram_sendmsg->sk_filter->sk_filter_trim_cap->bpf_prog_run_save_cb->BPF_PROG_RUN

and a combination of af_packet with tun device didn't disable either:
tpacket_snd->packet_direct_xmit->packet_pick_tx_queue->ndo_select_queue->
  tun_select_queue->tun_ebpf_select_queue->bpf_prog_run_clear_cb->BPF_PROG_RUN

Disable preemption before executing BPF programs (both classic and extended).

Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/filter.h | 21 ++++++++++++++++++---
 kernel/bpf/cgroup.c    |  2 +-
 2 files changed, 19 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index ad106d845b22..e532fcc6e4b5 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -591,8 +591,8 @@ static inline u8 *bpf_skb_cb(struct sk_buff *skb)
 	return qdisc_skb_cb(skb)->data;
 }
 
-static inline u32 bpf_prog_run_save_cb(const struct bpf_prog *prog,
-				       struct sk_buff *skb)
+static inline u32 __bpf_prog_run_save_cb(const struct bpf_prog *prog,
+					 struct sk_buff *skb)
 {
 	u8 *cb_data = bpf_skb_cb(skb);
 	u8 cb_saved[BPF_SKB_CB_LEN];
@@ -611,15 +611,30 @@ static inline u32 bpf_prog_run_save_cb(const struct bpf_prog *prog,
 	return res;
 }
 
+static inline u32 bpf_prog_run_save_cb(const struct bpf_prog *prog,
+				       struct sk_buff *skb)
+{
+	u32 res;
+
+	preempt_disable();
+	res = __bpf_prog_run_save_cb(prog, skb);
+	preempt_enable();
+	return res;
+}
+
 static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog,
 					struct sk_buff *skb)
 {
 	u8 *cb_data = bpf_skb_cb(skb);
+	u32 res;
 
 	if (unlikely(prog->cb_access))
 		memset(cb_data, 0, BPF_SKB_CB_LEN);
 
-	return BPF_PROG_RUN(prog, skb);
+	preempt_disable();
+	res = BPF_PROG_RUN(prog, skb);
+	preempt_enable();
+	return res;
 }
 
 static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog,
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index ab612fe9862f..d17d05570a3f 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -572,7 +572,7 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
 	bpf_compute_and_save_data_end(skb, &saved_data_end);
 
 	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
-				 bpf_prog_run_save_cb);
+				 __bpf_prog_run_save_cb);
 	bpf_restore_data_end(skb, saved_data_end);
 	__skb_pull(skb, offset);
 	skb->sk = save_sk;
-- 
cgit v1.2.3


From a89fac57b5d080771efd4d71feaae19877cf68f0 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Wed, 30 Jan 2019 18:12:43 -0800
Subject: bpf: fix lockdep false positive in percpu_freelist

Lockdep warns about false positive:
[   12.492084] 00000000e6b28347 (&head->lock){+...}, at: pcpu_freelist_push+0x2a/0x40
[   12.492696] but this lock was taken by another, HARDIRQ-safe lock in the past:
[   12.493275]  (&rq->lock){-.-.}
[   12.493276]
[   12.493276]
[   12.493276] and interrupts could create inverse lock ordering between them.
[   12.493276]
[   12.494435]
[   12.494435] other info that might help us debug this:
[   12.494979]  Possible interrupt unsafe locking scenario:
[   12.494979]
[   12.495518]        CPU0                    CPU1
[   12.495879]        ----                    ----
[   12.496243]   lock(&head->lock);
[   12.496502]                                local_irq_disable();
[   12.496969]                                lock(&rq->lock);
[   12.497431]                                lock(&head->lock);
[   12.497890]   <Interrupt>
[   12.498104]     lock(&rq->lock);
[   12.498368]
[   12.498368]  *** DEADLOCK ***
[   12.498368]
[   12.498837] 1 lock held by dd/276:
[   12.499110]  #0: 00000000c58cb2ee (rcu_read_lock){....}, at: trace_call_bpf+0x5e/0x240
[   12.499747]
[   12.499747] the shortest dependencies between 2nd lock and 1st lock:
[   12.500389]  -> (&rq->lock){-.-.} {
[   12.500669]     IN-HARDIRQ-W at:
[   12.500934]                       _raw_spin_lock+0x2f/0x40
[   12.501373]                       scheduler_tick+0x4c/0xf0
[   12.501812]                       update_process_times+0x40/0x50
[   12.502294]                       tick_periodic+0x27/0xb0
[   12.502723]                       tick_handle_periodic+0x1f/0x60
[   12.503203]                       timer_interrupt+0x11/0x20
[   12.503651]                       __handle_irq_event_percpu+0x43/0x2c0
[   12.504167]                       handle_irq_event_percpu+0x20/0x50
[   12.504674]                       handle_irq_event+0x37/0x60
[   12.505139]                       handle_level_irq+0xa7/0x120
[   12.505601]                       handle_irq+0xa1/0x150
[   12.506018]                       do_IRQ+0x77/0x140
[   12.506411]                       ret_from_intr+0x0/0x1d
[   12.506834]                       _raw_spin_unlock_irqrestore+0x53/0x60
[   12.507362]                       __setup_irq+0x481/0x730
[   12.507789]                       setup_irq+0x49/0x80
[   12.508195]                       hpet_time_init+0x21/0x32
[   12.508644]                       x86_late_time_init+0xb/0x16
[   12.509106]                       start_kernel+0x390/0x42a
[   12.509554]                       secondary_startup_64+0xa4/0xb0
[   12.510034]     IN-SOFTIRQ-W at:
[   12.510305]                       _raw_spin_lock+0x2f/0x40
[   12.510772]                       try_to_wake_up+0x1c7/0x4e0
[   12.511220]                       swake_up_locked+0x20/0x40
[   12.511657]                       swake_up_one+0x1a/0x30
[   12.512070]                       rcu_process_callbacks+0xc5/0x650
[   12.512553]                       __do_softirq+0xe6/0x47b
[   12.512978]                       irq_exit+0xc3/0xd0
[   12.513372]                       smp_apic_timer_interrupt+0xa9/0x250
[   12.513876]                       apic_timer_interrupt+0xf/0x20
[   12.514343]                       default_idle+0x1c/0x170
[   12.514765]                       do_idle+0x199/0x240
[   12.515159]                       cpu_startup_entry+0x19/0x20
[   12.515614]                       start_kernel+0x422/0x42a
[   12.516045]                       secondary_startup_64+0xa4/0xb0
[   12.516521]     INITIAL USE at:
[   12.516774]                      _raw_spin_lock_irqsave+0x38/0x50
[   12.517258]                      rq_attach_root+0x16/0xd0
[   12.517685]                      sched_init+0x2f2/0x3eb
[   12.518096]                      start_kernel+0x1fb/0x42a
[   12.518525]                      secondary_startup_64+0xa4/0xb0
[   12.518986]   }
[   12.519132]   ... key      at: [<ffffffff82b7bc28>] __key.71384+0x0/0x8
[   12.519649]   ... acquired at:
[   12.519892]    pcpu_freelist_pop+0x7b/0xd0
[   12.520221]    bpf_get_stackid+0x1d2/0x4d0
[   12.520563]    ___bpf_prog_run+0x8b4/0x11a0
[   12.520887]
[   12.521008] -> (&head->lock){+...} {
[   12.521292]    HARDIRQ-ON-W at:
[   12.521539]                     _raw_spin_lock+0x2f/0x40
[   12.521950]                     pcpu_freelist_push+0x2a/0x40
[   12.522396]                     bpf_get_stackid+0x494/0x4d0
[   12.522828]                     ___bpf_prog_run+0x8b4/0x11a0
[   12.523296]    INITIAL USE at:
[   12.523537]                    _raw_spin_lock+0x2f/0x40
[   12.523944]                    pcpu_freelist_populate+0xc0/0x120
[   12.524417]                    htab_map_alloc+0x405/0x500
[   12.524835]                    __do_sys_bpf+0x1a3/0x1a90
[   12.525253]                    do_syscall_64+0x4a/0x180
[   12.525659]                    entry_SYSCALL_64_after_hwframe+0x49/0xbe
[   12.526167]  }
[   12.526311]  ... key      at: [<ffffffff838f7668>] __key.13130+0x0/0x8
[   12.526812]  ... acquired at:
[   12.527047]    __lock_acquire+0x521/0x1350
[   12.527371]    lock_acquire+0x98/0x190
[   12.527680]    _raw_spin_lock+0x2f/0x40
[   12.527994]    pcpu_freelist_push+0x2a/0x40
[   12.528325]    bpf_get_stackid+0x494/0x4d0
[   12.528645]    ___bpf_prog_run+0x8b4/0x11a0
[   12.528970]
[   12.529092]
[   12.529092] stack backtrace:
[   12.529444] CPU: 0 PID: 276 Comm: dd Not tainted 5.0.0-rc3-00018-g2fa53f892422 #475
[   12.530043] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-2.el7 04/01/2014
[   12.530750] Call Trace:
[   12.530948]  dump_stack+0x5f/0x8b
[   12.531248]  check_usage_backwards+0x10c/0x120
[   12.531598]  ? ___bpf_prog_run+0x8b4/0x11a0
[   12.531935]  ? mark_lock+0x382/0x560
[   12.532229]  mark_lock+0x382/0x560
[   12.532496]  ? print_shortest_lock_dependencies+0x180/0x180
[   12.532928]  __lock_acquire+0x521/0x1350
[   12.533271]  ? find_get_entry+0x17f/0x2e0
[   12.533586]  ? find_get_entry+0x19c/0x2e0
[   12.533902]  ? lock_acquire+0x98/0x190
[   12.534196]  lock_acquire+0x98/0x190
[   12.534482]  ? pcpu_freelist_push+0x2a/0x40
[   12.534810]  _raw_spin_lock+0x2f/0x40
[   12.535099]  ? pcpu_freelist_push+0x2a/0x40
[   12.535432]  pcpu_freelist_push+0x2a/0x40
[   12.535750]  bpf_get_stackid+0x494/0x4d0
[   12.536062]  ___bpf_prog_run+0x8b4/0x11a0

It has been explained that is a false positive here:
https://lkml.org/lkml/2018/7/25/756
Recap:
- stackmap uses pcpu_freelist
- The lock in pcpu_freelist is a percpu lock
- stackmap is only used by tracing bpf_prog
- A tracing bpf_prog cannot be run if another bpf_prog
  has already been running (ensured by the percpu bpf_prog_active counter).

Eric pointed out that this lockdep splats stops other
legit lockdep splats in selftests/bpf/test_progs.c.

Fix this by calling local_irq_save/restore for stackmap.

Another false positive had also been worked around by calling
local_irq_save in commit 89ad2fa3f043 ("bpf: fix lockdep splat").
That commit added unnecessary irq_save/restore to fast path of
bpf hash map. irqs are already disabled at that point, since htab
is holding per bucket spin_lock with irqsave.

Let's reduce overhead for htab by introducing __pcpu_freelist_push/pop
function w/o irqsave and convert pcpu_freelist_push/pop to irqsave
to be used elsewhere (right now only in stackmap).
It stops lockdep false positive in stackmap with a bit of acceptable overhead.

Fixes: 557c0c6e7df8 ("bpf: convert stackmap to pre-allocation")
Reported-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/hashtab.c         |  4 ++--
 kernel/bpf/percpu_freelist.c | 41 +++++++++++++++++++++++++++++------------
 kernel/bpf/percpu_freelist.h |  4 ++++
 3 files changed, 35 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 4b7c76765d9d..f9274114c88d 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -686,7 +686,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
 	}
 
 	if (htab_is_prealloc(htab)) {
-		pcpu_freelist_push(&htab->freelist, &l->fnode);
+		__pcpu_freelist_push(&htab->freelist, &l->fnode);
 	} else {
 		atomic_dec(&htab->count);
 		l->htab = htab;
@@ -748,7 +748,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 		} else {
 			struct pcpu_freelist_node *l;
 
-			l = pcpu_freelist_pop(&htab->freelist);
+			l = __pcpu_freelist_pop(&htab->freelist);
 			if (!l)
 				return ERR_PTR(-E2BIG);
 			l_new = container_of(l, struct htab_elem, fnode);
diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c
index 673fa6fe2d73..0c1b4ba9e90e 100644
--- a/kernel/bpf/percpu_freelist.c
+++ b/kernel/bpf/percpu_freelist.c
@@ -28,8 +28,8 @@ void pcpu_freelist_destroy(struct pcpu_freelist *s)
 	free_percpu(s->freelist);
 }
 
-static inline void __pcpu_freelist_push(struct pcpu_freelist_head *head,
-					struct pcpu_freelist_node *node)
+static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head,
+					 struct pcpu_freelist_node *node)
 {
 	raw_spin_lock(&head->lock);
 	node->next = head->first;
@@ -37,12 +37,22 @@ static inline void __pcpu_freelist_push(struct pcpu_freelist_head *head,
 	raw_spin_unlock(&head->lock);
 }
 
-void pcpu_freelist_push(struct pcpu_freelist *s,
+void __pcpu_freelist_push(struct pcpu_freelist *s,
 			struct pcpu_freelist_node *node)
 {
 	struct pcpu_freelist_head *head = this_cpu_ptr(s->freelist);
 
-	__pcpu_freelist_push(head, node);
+	___pcpu_freelist_push(head, node);
+}
+
+void pcpu_freelist_push(struct pcpu_freelist *s,
+			struct pcpu_freelist_node *node)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	__pcpu_freelist_push(s, node);
+	local_irq_restore(flags);
 }
 
 void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
@@ -63,7 +73,7 @@ void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
 	for_each_possible_cpu(cpu) {
 again:
 		head = per_cpu_ptr(s->freelist, cpu);
-		__pcpu_freelist_push(head, buf);
+		___pcpu_freelist_push(head, buf);
 		i++;
 		buf += elem_size;
 		if (i == nr_elems)
@@ -74,14 +84,12 @@ again:
 	local_irq_restore(flags);
 }
 
-struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
+struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s)
 {
 	struct pcpu_freelist_head *head;
 	struct pcpu_freelist_node *node;
-	unsigned long flags;
 	int orig_cpu, cpu;
 
-	local_irq_save(flags);
 	orig_cpu = cpu = raw_smp_processor_id();
 	while (1) {
 		head = per_cpu_ptr(s->freelist, cpu);
@@ -89,16 +97,25 @@ struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
 		node = head->first;
 		if (node) {
 			head->first = node->next;
-			raw_spin_unlock_irqrestore(&head->lock, flags);
+			raw_spin_unlock(&head->lock);
 			return node;
 		}
 		raw_spin_unlock(&head->lock);
 		cpu = cpumask_next(cpu, cpu_possible_mask);
 		if (cpu >= nr_cpu_ids)
 			cpu = 0;
-		if (cpu == orig_cpu) {
-			local_irq_restore(flags);
+		if (cpu == orig_cpu)
 			return NULL;
-		}
 	}
 }
+
+struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
+{
+	struct pcpu_freelist_node *ret;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	ret = __pcpu_freelist_pop(s);
+	local_irq_restore(flags);
+	return ret;
+}
diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h
index 3049aae8ea1e..c3960118e617 100644
--- a/kernel/bpf/percpu_freelist.h
+++ b/kernel/bpf/percpu_freelist.h
@@ -22,8 +22,12 @@ struct pcpu_freelist_node {
 	struct pcpu_freelist_node *next;
 };
 
+/* pcpu_freelist_* do spin_lock_irqsave. */
 void pcpu_freelist_push(struct pcpu_freelist *, struct pcpu_freelist_node *);
 struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *);
+/* __pcpu_freelist_* do spin_lock only. caller must disable irqs. */
+void __pcpu_freelist_push(struct pcpu_freelist *, struct pcpu_freelist_node *);
+struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *);
 void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
 			    u32 nr_elems);
 int pcpu_freelist_init(struct pcpu_freelist *);
-- 
cgit v1.2.3


From e16ec34039c701594d55d08a5aa49ee3e1abc821 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Wed, 30 Jan 2019 18:12:44 -0800
Subject: bpf: fix potential deadlock in bpf_prog_register

Lockdep found a potential deadlock between cpu_hotplug_lock, bpf_event_mutex, and cpuctx_mutex:
[   13.007000] WARNING: possible circular locking dependency detected
[   13.007587] 5.0.0-rc3-00018-g2fa53f892422-dirty #477 Not tainted
[   13.008124] ------------------------------------------------------
[   13.008624] test_progs/246 is trying to acquire lock:
[   13.009030] 0000000094160d1d (tracepoints_mutex){+.+.}, at: tracepoint_probe_register_prio+0x2d/0x300
[   13.009770]
[   13.009770] but task is already holding lock:
[   13.010239] 00000000d663ef86 (bpf_event_mutex){+.+.}, at: bpf_probe_register+0x1d/0x60
[   13.010877]
[   13.010877] which lock already depends on the new lock.
[   13.010877]
[   13.011532]
[   13.011532] the existing dependency chain (in reverse order) is:
[   13.012129]
[   13.012129] -> #4 (bpf_event_mutex){+.+.}:
[   13.012582]        perf_event_query_prog_array+0x9b/0x130
[   13.013016]        _perf_ioctl+0x3aa/0x830
[   13.013354]        perf_ioctl+0x2e/0x50
[   13.013668]        do_vfs_ioctl+0x8f/0x6a0
[   13.014003]        ksys_ioctl+0x70/0x80
[   13.014320]        __x64_sys_ioctl+0x16/0x20
[   13.014668]        do_syscall_64+0x4a/0x180
[   13.015007]        entry_SYSCALL_64_after_hwframe+0x49/0xbe
[   13.015469]
[   13.015469] -> #3 (&cpuctx_mutex){+.+.}:
[   13.015910]        perf_event_init_cpu+0x5a/0x90
[   13.016291]        perf_event_init+0x1b2/0x1de
[   13.016654]        start_kernel+0x2b8/0x42a
[   13.016995]        secondary_startup_64+0xa4/0xb0
[   13.017382]
[   13.017382] -> #2 (pmus_lock){+.+.}:
[   13.017794]        perf_event_init_cpu+0x21/0x90
[   13.018172]        cpuhp_invoke_callback+0xb3/0x960
[   13.018573]        _cpu_up+0xa7/0x140
[   13.018871]        do_cpu_up+0xa4/0xc0
[   13.019178]        smp_init+0xcd/0xd2
[   13.019483]        kernel_init_freeable+0x123/0x24f
[   13.019878]        kernel_init+0xa/0x110
[   13.020201]        ret_from_fork+0x24/0x30
[   13.020541]
[   13.020541] -> #1 (cpu_hotplug_lock.rw_sem){++++}:
[   13.021051]        static_key_slow_inc+0xe/0x20
[   13.021424]        tracepoint_probe_register_prio+0x28c/0x300
[   13.021891]        perf_trace_event_init+0x11f/0x250
[   13.022297]        perf_trace_init+0x6b/0xa0
[   13.022644]        perf_tp_event_init+0x25/0x40
[   13.023011]        perf_try_init_event+0x6b/0x90
[   13.023386]        perf_event_alloc+0x9a8/0xc40
[   13.023754]        __do_sys_perf_event_open+0x1dd/0xd30
[   13.024173]        do_syscall_64+0x4a/0x180
[   13.024519]        entry_SYSCALL_64_after_hwframe+0x49/0xbe
[   13.024968]
[   13.024968] -> #0 (tracepoints_mutex){+.+.}:
[   13.025434]        __mutex_lock+0x86/0x970
[   13.025764]        tracepoint_probe_register_prio+0x2d/0x300
[   13.026215]        bpf_probe_register+0x40/0x60
[   13.026584]        bpf_raw_tracepoint_open.isra.34+0xa4/0x130
[   13.027042]        __do_sys_bpf+0x94f/0x1a90
[   13.027389]        do_syscall_64+0x4a/0x180
[   13.027727]        entry_SYSCALL_64_after_hwframe+0x49/0xbe
[   13.028171]
[   13.028171] other info that might help us debug this:
[   13.028171]
[   13.028807] Chain exists of:
[   13.028807]   tracepoints_mutex --> &cpuctx_mutex --> bpf_event_mutex
[   13.028807]
[   13.029666]  Possible unsafe locking scenario:
[   13.029666]
[   13.030140]        CPU0                    CPU1
[   13.030510]        ----                    ----
[   13.030875]   lock(bpf_event_mutex);
[   13.031166]                                lock(&cpuctx_mutex);
[   13.031645]                                lock(bpf_event_mutex);
[   13.032135]   lock(tracepoints_mutex);
[   13.032441]
[   13.032441]  *** DEADLOCK ***
[   13.032441]
[   13.032911] 1 lock held by test_progs/246:
[   13.033239]  #0: 00000000d663ef86 (bpf_event_mutex){+.+.}, at: bpf_probe_register+0x1d/0x60
[   13.033909]
[   13.033909] stack backtrace:
[   13.034258] CPU: 1 PID: 246 Comm: test_progs Not tainted 5.0.0-rc3-00018-g2fa53f892422-dirty #477
[   13.034964] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-2.el7 04/01/2014
[   13.035657] Call Trace:
[   13.035859]  dump_stack+0x5f/0x8b
[   13.036130]  print_circular_bug.isra.37+0x1ce/0x1db
[   13.036526]  __lock_acquire+0x1158/0x1350
[   13.036852]  ? lock_acquire+0x98/0x190
[   13.037154]  lock_acquire+0x98/0x190
[   13.037447]  ? tracepoint_probe_register_prio+0x2d/0x300
[   13.037876]  __mutex_lock+0x86/0x970
[   13.038167]  ? tracepoint_probe_register_prio+0x2d/0x300
[   13.038600]  ? tracepoint_probe_register_prio+0x2d/0x300
[   13.039028]  ? __mutex_lock+0x86/0x970
[   13.039337]  ? __mutex_lock+0x24a/0x970
[   13.039649]  ? bpf_probe_register+0x1d/0x60
[   13.039992]  ? __bpf_trace_sched_wake_idle_without_ipi+0x10/0x10
[   13.040478]  ? tracepoint_probe_register_prio+0x2d/0x300
[   13.040906]  tracepoint_probe_register_prio+0x2d/0x300
[   13.041325]  bpf_probe_register+0x40/0x60
[   13.041649]  bpf_raw_tracepoint_open.isra.34+0xa4/0x130
[   13.042068]  ? __might_fault+0x3e/0x90
[   13.042374]  __do_sys_bpf+0x94f/0x1a90
[   13.042678]  do_syscall_64+0x4a/0x180
[   13.042975]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[   13.043382] RIP: 0033:0x7f23b10a07f9
[   13.045155] RSP: 002b:00007ffdef42fdd8 EFLAGS: 00000202 ORIG_RAX: 0000000000000141
[   13.045759] RAX: ffffffffffffffda RBX: 00007ffdef42ff70 RCX: 00007f23b10a07f9
[   13.046326] RDX: 0000000000000070 RSI: 00007ffdef42fe10 RDI: 0000000000000011
[   13.046893] RBP: 00007ffdef42fdf0 R08: 0000000000000038 R09: 00007ffdef42fe10
[   13.047462] R10: 0000000000000000 R11: 0000000000000202 R12: 0000000000000000
[   13.048029] R13: 0000000000000016 R14: 00007f23b1db4690 R15: 0000000000000000

Since tracepoints_mutex will be taken in tracepoint_probe_register/unregister()
there is no need to take bpf_event_mutex too.
bpf_event_mutex is protecting modifications to prog array used in kprobe/perf bpf progs.
bpf_raw_tracepoints don't need to take this mutex.

Fixes: c4f6699dfcb8 ("bpf: introduce BPF_RAW_TRACEPOINT")
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/trace/bpf_trace.c | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 8b068adb9da1..f1a86a0d881d 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1204,22 +1204,12 @@ static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *
 
 int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
 {
-	int err;
-
-	mutex_lock(&bpf_event_mutex);
-	err = __bpf_probe_register(btp, prog);
-	mutex_unlock(&bpf_event_mutex);
-	return err;
+	return __bpf_probe_register(btp, prog);
 }
 
 int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
 {
-	int err;
-
-	mutex_lock(&bpf_event_mutex);
-	err = tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, prog);
-	mutex_unlock(&bpf_event_mutex);
-	return err;
+	return tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, prog);
 }
 
 int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
-- 
cgit v1.2.3


From 7c4cd051add3d00bbff008a133c936c515eaa8fe Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 30 Jan 2019 18:12:45 -0800
Subject: bpf: Fix syscall's stackmap lookup potential deadlock

The map_lookup_elem used to not acquiring spinlock
in order to optimize the reader.

It was true until commit 557c0c6e7df8 ("bpf: convert stackmap to pre-allocation")
The syscall's map_lookup_elem(stackmap) calls bpf_stackmap_copy().
bpf_stackmap_copy() may find the elem no longer needed after the copy is done.
If that is the case, pcpu_freelist_push() saves this elem for reuse later.
This push requires a spinlock.

If a tracing bpf_prog got run in the middle of the syscall's
map_lookup_elem(stackmap) and this tracing bpf_prog is calling
bpf_get_stackid(stackmap) which also requires the same pcpu_freelist's
spinlock, it may end up with a dead lock situation as reported by
Eric Dumazet in https://patchwork.ozlabs.org/patch/1030266/

The situation is the same as the syscall's map_update_elem() which
needs to acquire the pcpu_freelist's spinlock and could race
with tracing bpf_prog.  Hence, this patch fixes it by protecting
bpf_stackmap_copy() with this_cpu_inc(bpf_prog_active)
to prevent tracing bpf_prog from running.

A later syscall's map_lookup_elem commit f1a2e44a3aec ("bpf: add queue and stack maps")
also acquires a spinlock and races with tracing bpf_prog similarly.
Hence, this patch is forward looking and protects the majority
of the map lookups.  bpf_map_offload_lookup_elem() is the exception
since it is for network bpf_prog only (i.e. never called by tracing
bpf_prog).

Fixes: 557c0c6e7df8 ("bpf: convert stackmap to pre-allocation")
Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/syscall.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index b155cd17c1bd..8577bb7f8be6 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -713,8 +713,13 @@ static int map_lookup_elem(union bpf_attr *attr)
 
 	if (bpf_map_is_dev_bound(map)) {
 		err = bpf_map_offload_lookup_elem(map, key, value);
-	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
-		   map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
+		goto done;
+	}
+
+	preempt_disable();
+	this_cpu_inc(bpf_prog_active);
+	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
 		err = bpf_percpu_hash_copy(map, key, value);
 	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
 		err = bpf_percpu_array_copy(map, key, value);
@@ -744,7 +749,10 @@ static int map_lookup_elem(union bpf_attr *attr)
 		}
 		rcu_read_unlock();
 	}
+	this_cpu_dec(bpf_prog_active);
+	preempt_enable();
 
+done:
 	if (err)
 		goto free_value;
 
-- 
cgit v1.2.3


From 8fb335e078378c8426fabeed1ebee1fbf915690c Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@gmail.com>
Date: Fri, 1 Feb 2019 14:20:24 -0800
Subject: kernel/exit.c: release ptraced tasks before zap_pid_ns_processes

Currently, exit_ptrace() adds all ptraced tasks in a dead list, then
zap_pid_ns_processes() waits on all tasks in a current pidns, and only
then are tasks from the dead list released.

zap_pid_ns_processes() can get stuck on waiting tasks from the dead
list.  In this case, we will have one unkillable process with one or
more dead children.

Thanks to Oleg for the advice to release tasks in find_child_reaper().

Link: http://lkml.kernel.org/r/20190110175200.12442-1-avagin@gmail.com
Fixes: 7c8bd2322c7f ("exit: ptrace: shift "reap dead" code from exit_ptrace() to forget_original_parent()")
Signed-off-by: Andrei Vagin <avagin@gmail.com>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 3fb7be001964..2639a30a8aa5 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -558,12 +558,14 @@ static struct task_struct *find_alive_thread(struct task_struct *p)
 	return NULL;
 }
 
-static struct task_struct *find_child_reaper(struct task_struct *father)
+static struct task_struct *find_child_reaper(struct task_struct *father,
+						struct list_head *dead)
 	__releases(&tasklist_lock)
 	__acquires(&tasklist_lock)
 {
 	struct pid_namespace *pid_ns = task_active_pid_ns(father);
 	struct task_struct *reaper = pid_ns->child_reaper;
+	struct task_struct *p, *n;
 
 	if (likely(reaper != father))
 		return reaper;
@@ -579,6 +581,12 @@ static struct task_struct *find_child_reaper(struct task_struct *father)
 		panic("Attempted to kill init! exitcode=0x%08x\n",
 			father->signal->group_exit_code ?: father->exit_code);
 	}
+
+	list_for_each_entry_safe(p, n, dead, ptrace_entry) {
+		list_del_init(&p->ptrace_entry);
+		release_task(p);
+	}
+
 	zap_pid_ns_processes(pid_ns);
 	write_lock_irq(&tasklist_lock);
 
@@ -668,7 +676,7 @@ static void forget_original_parent(struct task_struct *father,
 		exit_ptrace(father, dead);
 
 	/* Can drop and reacquire tasklist_lock */
-	reaper = find_child_reaper(father);
+	reaper = find_child_reaper(father, dead);
 	if (list_empty(&father->children))
 		return;
 
-- 
cgit v1.2.3


From 1b69ac6b40ebd85eed73e4dbccde2a36961ab990 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Fri, 1 Feb 2019 14:20:42 -0800
Subject: psi: fix aggregation idle shut-off

psi has provisions to shut off the periodic aggregation worker when
there is a period of no task activity - and thus no data that needs
aggregating.  However, while developing psi monitoring, Suren noticed
that the aggregation clock currently won't stay shut off for good.

Debugging this revealed a flaw in the idle design: an aggregation run
will see no task activity and decide to go to sleep; shortly thereafter,
the kworker thread that executed the aggregation will go idle and cause
a scheduling change, during which the psi callback will kick the
!pending worker again.  This will ping-pong forever, and is equivalent
to having no shut-off logic at all (but with more code!)

Fix this by exempting aggregation workers from psi's clock waking logic
when the state change is them going to sleep.  To do this, tag workers
with the last work function they executed, and if in psi we see a worker
going to sleep after aggregating psi data, we will not reschedule the
aggregation work item.

What if the worker is also executing other items before or after?

Any psi state times that were incurred by work items preceding the
aggregation work will have been collected from the per-cpu buckets
during the aggregation itself.  If there are work items following the
aggregation work, the worker's last_func tag will be overwritten and the
aggregator will be kept alive to process this genuine new activity.

If the aggregation work is the last thing the worker does, and we decide
to go idle, the brief period of non-idle time incurred between the
aggregation run and the kworker's dequeue will be stranded in the
per-cpu buckets until the clock is woken by later activity.  But that
should not be a problem.  The buckets can hold 4s worth of time, and
future activity will wake the clock with a 2s delay, giving us 2s worth
of data we can leave behind when disabling aggregation.  If it takes a
worker more than two seconds to go idle after it finishes its last work
item, we likely have bigger problems in the system, and won't notice one
sample that was averaged with a bogus per-CPU weight.

Link: http://lkml.kernel.org/r/20190116193501.1910-1-hannes@cmpxchg.org
Fixes: eb414681d5a0 ("psi: pressure stall information for CPU, memory, and IO")
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: Suren Baghdasaryan <surenb@google.com>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sched/psi.c          | 21 +++++++++++++++++----
 kernel/workqueue.c          | 23 +++++++++++++++++++++++
 kernel/workqueue_internal.h |  6 +++++-
 3 files changed, 45 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index fe24de3fbc93..c3484785b179 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -124,6 +124,7 @@
  * sampling of the aggregate task states would be.
  */
 
+#include "../workqueue_internal.h"
 #include <linux/sched/loadavg.h>
 #include <linux/seq_file.h>
 #include <linux/proc_fs.h>
@@ -480,9 +481,6 @@ static void psi_group_change(struct psi_group *group, int cpu,
 			groupc->tasks[t]++;
 
 	write_seqcount_end(&groupc->seq);
-
-	if (!delayed_work_pending(&group->clock_work))
-		schedule_delayed_work(&group->clock_work, PSI_FREQ);
 }
 
 static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
@@ -513,6 +511,7 @@ void psi_task_change(struct task_struct *task, int clear, int set)
 {
 	int cpu = task_cpu(task);
 	struct psi_group *group;
+	bool wake_clock = true;
 	void *iter = NULL;
 
 	if (!task->pid)
@@ -530,8 +529,22 @@ void psi_task_change(struct task_struct *task, int clear, int set)
 	task->psi_flags &= ~clear;
 	task->psi_flags |= set;
 
-	while ((group = iterate_groups(task, &iter)))
+	/*
+	 * Periodic aggregation shuts off if there is a period of no
+	 * task changes, so we wake it back up if necessary. However,
+	 * don't do this if the task change is the aggregation worker
+	 * itself going to sleep, or we'll ping-pong forever.
+	 */
+	if (unlikely((clear & TSK_RUNNING) &&
+		     (task->flags & PF_WQ_WORKER) &&
+		     wq_worker_last_func(task) == psi_update_work))
+		wake_clock = false;
+
+	while ((group = iterate_groups(task, &iter))) {
 		psi_group_change(group, cpu, clear, set);
+		if (wake_clock && !delayed_work_pending(&group->clock_work))
+			schedule_delayed_work(&group->clock_work, PSI_FREQ);
+	}
 }
 
 void psi_memstall_tick(struct task_struct *task, int cpu)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 392be4b252f6..fc5d23d752a5 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -909,6 +909,26 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task)
 	return to_wakeup ? to_wakeup->task : NULL;
 }
 
+/**
+ * wq_worker_last_func - retrieve worker's last work function
+ *
+ * Determine the last function a worker executed. This is called from
+ * the scheduler to get a worker's last known identity.
+ *
+ * CONTEXT:
+ * spin_lock_irq(rq->lock)
+ *
+ * Return:
+ * The last work function %current executed as a worker, NULL if it
+ * hasn't executed any work yet.
+ */
+work_func_t wq_worker_last_func(struct task_struct *task)
+{
+	struct worker *worker = kthread_data(task);
+
+	return worker->last_func;
+}
+
 /**
  * worker_set_flags - set worker flags and adjust nr_running accordingly
  * @worker: self
@@ -2184,6 +2204,9 @@ __acquires(&pool->lock)
 	if (unlikely(cpu_intensive))
 		worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
 
+	/* tag the worker for identification in schedule() */
+	worker->last_func = worker->current_func;
+
 	/* we're done with it, release */
 	hash_del(&worker->hentry);
 	worker->current_work = NULL;
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
index 66fbb5a9e633..cb68b03ca89a 100644
--- a/kernel/workqueue_internal.h
+++ b/kernel/workqueue_internal.h
@@ -53,6 +53,9 @@ struct worker {
 
 	/* used only by rescuers to point to the target workqueue */
 	struct workqueue_struct	*rescue_wq;	/* I: the workqueue to rescue */
+
+	/* used by the scheduler to determine a worker's last known identity */
+	work_func_t		last_func;
 };
 
 /**
@@ -67,9 +70,10 @@ static inline struct worker *current_wq_worker(void)
 
 /*
  * Scheduler hooks for concurrency managed workqueue.  Only to be used from
- * sched/core.c and workqueue.c.
+ * sched/ and workqueue.c.
  */
 void wq_worker_waking_up(struct task_struct *task, int cpu);
 struct task_struct *wq_worker_sleeping(struct task_struct *task);
+work_func_t wq_worker_last_func(struct task_struct *task);
 
 #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
-- 
cgit v1.2.3